Project

General

Profile

« Previous | Next » 

Revision 28483

branch 1.2

View differences:

Ngrams.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import java.util.Collection;
4
import java.util.LinkedHashSet;
5
import java.util.Map;
6
import java.util.StringTokenizer;
7

  
8
/**
9
 * The Class Ngrams.
10
 */
11
public class Ngrams extends AbstractClusteringFunction {
12

  
13
	/**
14
	 * Instantiates a new ngrams.
15
	 * 
16
	 * @param params
17
	 *            the params
18
	 */
19
	public Ngrams(final Map<String, Integer> params) {
20
		super(params);
21
	}
22

  
23
	/*
24
	 * (non-Javadoc)
25
	 * 
26
	 * @see eu.dnetlib.pace.clustering.AbstractClusteringFunction#doApply(java.lang.String)
27
	 */
28
	@Override
29
	protected Collection<String> doApply(final String s) {
30
		return getNgrams(s, param("ngramLen"), param("max"), param("maxPerToken"), param("minNgramLen"));
31
	}
32

  
33
	/**
34
	 * Gets the ngrams.
35
	 * 
36
	 * @param s
37
	 *            the s
38
	 * @param ngramLen
39
	 *            the ngram len
40
	 * @param max
41
	 *            the max
42
	 * @param maxPerToken
43
	 *            the max per token
44
	 * @param minNgramLen
45
	 *            the min ngram len
46
	 * @return the ngrams
47
	 */
48
	protected Collection<String> getNgrams(final String s, final int ngramLen, final int max, final int maxPerToken, final int minNgramLen) {
49

  
50
		final Collection<String> ngrams = new LinkedHashSet<String>();
51
		final StringTokenizer st = new StringTokenizer(s);
52

  
53
		while (st.hasMoreTokens()) {
54
			final String token = st.nextToken();
55
			if (!token.isEmpty()) {
56

  
57
				for (int i = 0; (i < maxPerToken) && ((ngramLen + i) <= token.length()); i++) {
58
					String ngram = (token + "    ").substring(i, ngramLen + i).trim();
59
					if (ngrams.size() >= max) return ngrams;
60
					if (ngram.length() >= minNgramLen) {
61
						ngrams.add(ngram);
62
					}
63
				}
64
			}
65
		}
66
		// System.out.println(ngrams + " n: " + ngrams.size());
67
		return ngrams;
68
	}
69

  
70
}
0 71

  

Also available in: Unified diff