Project

General

Profile

1
package eu.dnetlib.pace.clustering;
2

    
3
import java.util.Set;
4

    
5
import org.apache.commons.lang.StringUtils;
6

    
7
import eu.dnetlib.pace.common.AbstractPaceFunctions;
8

    
9
public class NGramUtils extends AbstractPaceFunctions {
10

    
11
	private static final int SIZE = 100;
12

    
13
	private static Set<String> stopwords = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
14

    
15
	public static String cleanupForOrdering(String s) {
16
		NGramUtils utils = new NGramUtils();
17
		return (utils.filterStopWords(utils.normalize(s), stopwords) +  StringUtils.repeat(" ", SIZE)).substring(0, SIZE).replaceAll(" ", "");
18
	}
19

    
20
}
(9-9/17)