1
|
package eu.dnetlib.pace.clustering;
|
2
|
|
3
|
import java.util.Collection;
|
4
|
import java.util.Map;
|
5
|
import java.util.Set;
|
6
|
import java.util.StringTokenizer;
|
7
|
|
8
|
import com.google.common.collect.Sets;
|
9
|
|
10
|
public class Acronyms extends AbstractClusteringFunction {
|
11
|
|
12
|
public Acronyms(Map<String, Integer> params) {
|
13
|
super(params);
|
14
|
}
|
15
|
|
16
|
@Override
|
17
|
protected Collection<String> doApply(String s) {
|
18
|
return extractAcronyms(s, param("max"), param("minLen"), param("maxLen"));
|
19
|
}
|
20
|
|
21
|
private Set<String> extractAcronyms(final String s, int maxAcronyms, int minLen, int maxLen) {
|
22
|
|
23
|
final Set<String> acronyms = Sets.newLinkedHashSet();
|
24
|
|
25
|
for (int i = 0; i < maxAcronyms; i++) {
|
26
|
|
27
|
final StringTokenizer st = new StringTokenizer(s);
|
28
|
final StringBuilder sb = new StringBuilder();
|
29
|
|
30
|
while (st.hasMoreTokens()) {
|
31
|
final String token = st.nextToken();
|
32
|
if (sb.length() > maxLen) {
|
33
|
break;
|
34
|
}
|
35
|
if (token.length() > 1 && i < token.length()) {
|
36
|
sb.append(token.charAt(i));
|
37
|
}
|
38
|
}
|
39
|
String acronym = sb.toString();
|
40
|
if (acronym.length() > minLen) {
|
41
|
acronyms.add(acronym);
|
42
|
}
|
43
|
}
|
44
|
return acronyms;
|
45
|
}
|
46
|
|
47
|
}
|