1
|
package eu.dnetlib.pace.clustering;
|
2
|
|
3
|
import java.util.Collection;
|
4
|
import java.util.LinkedHashSet;
|
5
|
import java.util.Map;
|
6
|
import java.util.StringTokenizer;
|
7
|
|
8
|
public class Ngrams extends AbstractClusteringFunction {
|
9
|
|
10
|
public Ngrams(Map<String, Integer> params) {
|
11
|
super(params);
|
12
|
}
|
13
|
|
14
|
@Override
|
15
|
protected Collection<String> doApply(String s) {
|
16
|
return getNgrams(s, param("ngramLen"), param("max"), param("maxPerToken"), param("minNgramLen"));
|
17
|
}
|
18
|
|
19
|
protected Collection<String> getNgrams(String s, int ngramLen, int max, int maxPerToken, int minNgramLen) {
|
20
|
|
21
|
final Collection<String> ngrams = new LinkedHashSet<String>();
|
22
|
final StringTokenizer st = new StringTokenizer(s);
|
23
|
|
24
|
while (st.hasMoreTokens()) {
|
25
|
final String token = st.nextToken();
|
26
|
if (!token.isEmpty()) {
|
27
|
|
28
|
for (int i = 0; i < maxPerToken && ngramLen + i <= token.length(); i++) {
|
29
|
String ngram = (token + " ").substring(i, ngramLen + i).trim();
|
30
|
if (ngrams.size() >= max) {
|
31
|
return ngrams;
|
32
|
}
|
33
|
if (ngram.length() >= minNgramLen) {
|
34
|
ngrams.add(ngram);
|
35
|
}
|
36
|
}
|
37
|
}
|
38
|
}
|
39
|
//System.out.println(ngrams + " n: " + ngrams.size());
|
40
|
return ngrams;
|
41
|
}
|
42
|
|
43
|
}
|