1
|
package eu.dnetlib.pace.clustering;
|
2
|
|
3
|
import java.util.Set;
|
4
|
|
5
|
import org.apache.commons.lang.StringUtils;
|
6
|
|
7
|
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
8
|
|
9
|
public class NGramUtils extends AbstractPaceFunctions {
|
10
|
|
11
|
private static final int SIZE = 100;
|
12
|
|
13
|
private static Set<String> stopwords = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
|
14
|
|
15
|
public static String cleanupForOrdering(String s) {
|
16
|
NGramUtils utils = new NGramUtils();
|
17
|
return (utils.filterStopWords(utils.normalize(s), stopwords) + StringUtils.repeat(" ", SIZE)).substring(0, SIZE).replaceAll(" ", "");
|
18
|
}
|
19
|
|
20
|
}
|