1
|
package eu.dnetlib.pace.clustering;
|
2
|
|
3
|
import java.util.Collection;
|
4
|
import java.util.Map;
|
5
|
import java.util.Set;
|
6
|
|
7
|
import com.google.common.collect.Sets;
|
8
|
|
9
|
/**
|
10
|
* The Class SuffixPrefix.
|
11
|
*/
|
12
|
public class SuffixPrefix extends AbstractClusteringFunction {
|
13
|
|
14
|
/**
|
15
|
* Instantiates a new suffix prefix.
|
16
|
*
|
17
|
* @param params
|
18
|
* the params
|
19
|
*/
|
20
|
public SuffixPrefix(final Map<String, Integer> params) {
|
21
|
super(params);
|
22
|
}
|
23
|
|
24
|
/*
|
25
|
* (non-Javadoc)
|
26
|
*
|
27
|
* @see eu.dnetlib.pace.clustering.AbstractClusteringFunction#doApply(java.lang.String)
|
28
|
*/
|
29
|
@Override
|
30
|
protected Collection<String> doApply(final String s) {
|
31
|
return suffixPrefix(s, param("len"), param("max"));
|
32
|
}
|
33
|
|
34
|
/**
|
35
|
* Suffix prefix.
|
36
|
*
|
37
|
* @param s
|
38
|
* the s
|
39
|
* @param len
|
40
|
* the len
|
41
|
* @param max
|
42
|
* the max
|
43
|
* @return the collection
|
44
|
*/
|
45
|
private Collection<String> suffixPrefix(final String s, final int len, final int max) {
|
46
|
final Set<String> bigrams = Sets.newLinkedHashSet();
|
47
|
int i = 0;
|
48
|
while ((++i < s.length()) && (bigrams.size() < max)) {
|
49
|
int j = s.indexOf(" ", i);
|
50
|
|
51
|
int offset = (j + len + 1) < s.length() ? j + len + 1 : s.length();
|
52
|
|
53
|
if ((j - len) > 0) {
|
54
|
String bigram = s.substring(j - len, offset).replaceAll(" ", "").trim();
|
55
|
if (bigram.length() >= 4) {
|
56
|
bigrams.add(bigram);
|
57
|
}
|
58
|
}
|
59
|
}
|
60
|
return bigrams;
|
61
|
}
|
62
|
|
63
|
}
|