1 |
26600
|
sandro.lab
|
package eu.dnetlib.pace.clustering;
|
2 |
|
|
|
3 |
|
|
import java.util.Collection;
|
4 |
|
|
import java.util.Map;
|
5 |
|
|
import java.util.Set;
|
6 |
|
|
import java.util.StringTokenizer;
|
7 |
|
|
|
8 |
|
|
import com.google.common.collect.Sets;
|
9 |
|
|
|
10 |
28483
|
claudio.at
|
/**
|
11 |
|
|
* The Class Acronyms.
|
12 |
|
|
*/
|
13 |
26600
|
sandro.lab
|
public class Acronyms extends AbstractClusteringFunction {
|
14 |
|
|
|
15 |
28483
|
claudio.at
|
/**
|
16 |
|
|
* Instantiates a new acronyms.
|
17 |
|
|
*
|
18 |
|
|
* @param params
|
19 |
|
|
* the params
|
20 |
|
|
*/
|
21 |
|
|
public Acronyms(final Map<String, Integer> params) {
|
22 |
26600
|
sandro.lab
|
super(params);
|
23 |
|
|
}
|
24 |
|
|
|
25 |
28483
|
claudio.at
|
/*
|
26 |
|
|
* (non-Javadoc)
|
27 |
|
|
*
|
28 |
|
|
* @see eu.dnetlib.pace.clustering.AbstractClusteringFunction#doApply(java.lang.String)
|
29 |
|
|
*/
|
30 |
26600
|
sandro.lab
|
@Override
|
31 |
28483
|
claudio.at
|
protected Collection<String> doApply(final String s) {
|
32 |
26600
|
sandro.lab
|
return extractAcronyms(s, param("max"), param("minLen"), param("maxLen"));
|
33 |
|
|
}
|
34 |
28483
|
claudio.at
|
|
35 |
|
|
/**
|
36 |
|
|
* Extract acronyms.
|
37 |
|
|
*
|
38 |
|
|
* @param s
|
39 |
|
|
* the s
|
40 |
|
|
* @param maxAcronyms
|
41 |
|
|
* the max acronyms
|
42 |
|
|
* @param minLen
|
43 |
|
|
* the min len
|
44 |
|
|
* @param maxLen
|
45 |
|
|
* the max len
|
46 |
|
|
* @return the sets the
|
47 |
|
|
*/
|
48 |
|
|
private Set<String> extractAcronyms(final String s, final int maxAcronyms, final int minLen, final int maxLen) {
|
49 |
|
|
|
50 |
26600
|
sandro.lab
|
final Set<String> acronyms = Sets.newLinkedHashSet();
|
51 |
28483
|
claudio.at
|
|
52 |
26600
|
sandro.lab
|
for (int i = 0; i < maxAcronyms; i++) {
|
53 |
28483
|
claudio.at
|
|
54 |
26600
|
sandro.lab
|
final StringTokenizer st = new StringTokenizer(s);
|
55 |
|
|
final StringBuilder sb = new StringBuilder();
|
56 |
28483
|
claudio.at
|
|
57 |
26600
|
sandro.lab
|
while (st.hasMoreTokens()) {
|
58 |
|
|
final String token = st.nextToken();
|
59 |
|
|
if (sb.length() > maxLen) {
|
60 |
|
|
break;
|
61 |
|
|
}
|
62 |
28483
|
claudio.at
|
if ((token.length() > 1) && (i < token.length())) {
|
63 |
26600
|
sandro.lab
|
sb.append(token.charAt(i));
|
64 |
|
|
}
|
65 |
|
|
}
|
66 |
|
|
String acronym = sb.toString();
|
67 |
|
|
if (acronym.length() > minLen) {
|
68 |
|
|
acronyms.add(acronym);
|
69 |
|
|
}
|
70 |
|
|
}
|
71 |
|
|
return acronyms;
|
72 |
|
|
}
|
73 |
|
|
|
74 |
|
|
}
|