1
|
package eu.dnetlib.pace.clustering;
|
2
|
|
3
|
import java.util.Collection;
|
4
|
import java.util.Map;
|
5
|
import java.util.Set;
|
6
|
import java.util.StringTokenizer;
|
7
|
|
8
|
import com.google.common.collect.Sets;
|
9
|
|
10
|
/**
|
11
|
* The Class Acronyms.
|
12
|
*/
|
13
|
public class Acronyms extends AbstractClusteringFunction {
|
14
|
|
15
|
/**
|
16
|
* Instantiates a new acronyms.
|
17
|
*
|
18
|
* @param params
|
19
|
* the params
|
20
|
*/
|
21
|
public Acronyms(final Map<String, Integer> params) {
|
22
|
super(params);
|
23
|
}
|
24
|
|
25
|
/*
|
26
|
* (non-Javadoc)
|
27
|
*
|
28
|
* @see eu.dnetlib.pace.clustering.AbstractClusteringFunction#doApply(java.lang.String)
|
29
|
*/
|
30
|
@Override
|
31
|
protected Collection<String> doApply(final String s) {
|
32
|
return extractAcronyms(s, param("max"), param("minLen"), param("maxLen"));
|
33
|
}
|
34
|
|
35
|
/**
|
36
|
* Extract acronyms.
|
37
|
*
|
38
|
* @param s
|
39
|
* the s
|
40
|
* @param maxAcronyms
|
41
|
* the max acronyms
|
42
|
* @param minLen
|
43
|
* the min len
|
44
|
* @param maxLen
|
45
|
* the max len
|
46
|
* @return the sets the
|
47
|
*/
|
48
|
private Set<String> extractAcronyms(final String s, final int maxAcronyms, final int minLen, final int maxLen) {
|
49
|
|
50
|
final Set<String> acronyms = Sets.newLinkedHashSet();
|
51
|
|
52
|
for (int i = 0; i < maxAcronyms; i++) {
|
53
|
|
54
|
final StringTokenizer st = new StringTokenizer(s);
|
55
|
final StringBuilder sb = new StringBuilder();
|
56
|
|
57
|
while (st.hasMoreTokens()) {
|
58
|
final String token = st.nextToken();
|
59
|
if (sb.length() > maxLen) {
|
60
|
break;
|
61
|
}
|
62
|
if ((token.length() > 1) && (i < token.length())) {
|
63
|
sb.append(token.charAt(i));
|
64
|
}
|
65
|
}
|
66
|
String acronym = sb.toString();
|
67
|
if (acronym.length() > minLen) {
|
68
|
acronyms.add(acronym);
|
69
|
}
|
70
|
}
|
71
|
return acronyms;
|
72
|
}
|
73
|
|
74
|
}
|