Project

General

Profile

1
package eu.dnetlib.pace.clustering;
2

    
3
import java.util.Collection;
4
import java.util.Map;
5
import java.util.Set;
6

    
7
import com.google.common.collect.Sets;
8

    
9
/**
10
 * The Class SuffixPrefix.
11
 */
12
public class SuffixPrefix extends AbstractClusteringFunction {
13

    
14
	/**
15
	 * Instantiates a new suffix prefix.
16
	 * 
17
	 * @param params
18
	 *            the params
19
	 */
20
	public SuffixPrefix(final Map<String, Integer> params) {
21
		super(params);
22
	}
23

    
24
	/*
25
	 * (non-Javadoc)
26
	 * 
27
	 * @see eu.dnetlib.pace.clustering.AbstractClusteringFunction#doApply(java.lang.String)
28
	 */
29
	@Override
30
	protected Collection<String> doApply(final String s) {
31
		return suffixPrefix(s, param("len"), param("max"));
32
	}
33

    
34
	/**
35
	 * Suffix prefix.
36
	 * 
37
	 * @param s
38
	 *            the s
39
	 * @param len
40
	 *            the len
41
	 * @param max
42
	 *            the max
43
	 * @return the collection
44
	 */
45
	private Collection<String> suffixPrefix(final String s, final int len, final int max) {
46
		final Set<String> bigrams = Sets.newLinkedHashSet();
47
		int i = 0;
48
		while ((++i < s.length()) && (bigrams.size() < max)) {
49
			int j = s.indexOf(" ", i);
50

    
51
			int offset = (j + len + 1) < s.length() ? j + len + 1 : s.length();
52

    
53
			if ((j - len) > 0) {
54
				String bigram = s.substring(j - len, offset).replaceAll(" ", "").trim();
55
				if (bigram.length() >= 4) {
56
					bigrams.add(bigram);
57
				}
58
			}
59
		}
60
		return bigrams;
61
	}
62

    
63
}
(12-12/12)