Revision 28483
Added by Claudio Atzori about 10 years ago
Ngrams.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
import java.util.Collection; |
|
4 |
import java.util.LinkedHashSet; |
|
5 |
import java.util.Map; |
|
6 |
import java.util.StringTokenizer; |
|
7 |
|
|
8 |
/** |
|
9 |
* The Class Ngrams. |
|
10 |
*/ |
|
11 |
public class Ngrams extends AbstractClusteringFunction { |
|
12 |
|
|
13 |
/** |
|
14 |
* Instantiates a new ngrams. |
|
15 |
* |
|
16 |
* @param params |
|
17 |
* the params |
|
18 |
*/ |
|
19 |
public Ngrams(final Map<String, Integer> params) { |
|
20 |
super(params); |
|
21 |
} |
|
22 |
|
|
23 |
/* |
|
24 |
* (non-Javadoc) |
|
25 |
* |
|
26 |
* @see eu.dnetlib.pace.clustering.AbstractClusteringFunction#doApply(java.lang.String) |
|
27 |
*/ |
|
28 |
@Override |
|
29 |
protected Collection<String> doApply(final String s) { |
|
30 |
return getNgrams(s, param("ngramLen"), param("max"), param("maxPerToken"), param("minNgramLen")); |
|
31 |
} |
|
32 |
|
|
33 |
/** |
|
34 |
* Gets the ngrams. |
|
35 |
* |
|
36 |
* @param s |
|
37 |
* the s |
|
38 |
* @param ngramLen |
|
39 |
* the ngram len |
|
40 |
* @param max |
|
41 |
* the max |
|
42 |
* @param maxPerToken |
|
43 |
* the max per token |
|
44 |
* @param minNgramLen |
|
45 |
* the min ngram len |
|
46 |
* @return the ngrams |
|
47 |
*/ |
|
48 |
protected Collection<String> getNgrams(final String s, final int ngramLen, final int max, final int maxPerToken, final int minNgramLen) { |
|
49 |
|
|
50 |
final Collection<String> ngrams = new LinkedHashSet<String>(); |
|
51 |
final StringTokenizer st = new StringTokenizer(s); |
|
52 |
|
|
53 |
while (st.hasMoreTokens()) { |
|
54 |
final String token = st.nextToken(); |
|
55 |
if (!token.isEmpty()) { |
|
56 |
|
|
57 |
for (int i = 0; (i < maxPerToken) && ((ngramLen + i) <= token.length()); i++) { |
|
58 |
String ngram = (token + " ").substring(i, ngramLen + i).trim(); |
|
59 |
if (ngrams.size() >= max) return ngrams; |
|
60 |
if (ngram.length() >= minNgramLen) { |
|
61 |
ngrams.add(ngram); |
|
62 |
} |
|
63 |
} |
|
64 |
} |
|
65 |
} |
|
66 |
// System.out.println(ngrams + " n: " + ngrams.size()); |
|
67 |
return ngrams; |
|
68 |
} |
|
69 |
|
|
70 |
} |
|
0 | 71 |
Also available in: Unified diff
branch 1.2