Project

General

Profile

1
package eu.dnetlib.pace.clustering;
2

    
3
import java.util.Collection;
4
import java.util.LinkedHashSet;
5
import java.util.Map;
6
import java.util.StringTokenizer;
7

    
8
public class Ngrams extends AbstractClusteringFunction {
9

    
10
	public Ngrams(Map<String, Integer> params) {
11
		super(params);
12
	}
13
	
14
	@Override
15
	protected Collection<String> doApply(String s) {
16
		return getNgrams(s, param("ngramLen"), param("max"), param("maxPerToken"), param("minNgramLen"));
17
	}
18

    
19
	protected Collection<String> getNgrams(String s, int ngramLen, int max, int maxPerToken, int minNgramLen) {
20

    
21
		final Collection<String> ngrams = new LinkedHashSet<String>();
22
		final StringTokenizer st = new StringTokenizer(s);
23

    
24
		while (st.hasMoreTokens()) {
25
			final String token = st.nextToken();
26
			if (!token.isEmpty()) {
27

    
28
				for (int i = 0; i < maxPerToken && ngramLen + i <= token.length(); i++) {
29
					String ngram = (token + "    ").substring(i, ngramLen + i).trim();
30
					if (ngrams.size() >= max) {
31
						return ngrams;
32
					}
33
					if (ngram.length() >= minNgramLen) {
34
						ngrams.add(ngram);
35
					}
36
				}
37
			}
38
		}
39
		//System.out.println(ngrams + " n: " + ngrams.size());
40
		return ngrams;
41
	}
42

    
43
}
(10-10/13)