Project

General

Profile

1 26600 sandro.lab
package eu.dnetlib.pace.common;
2
3
import java.io.IOException;
import java.io.InputStream;
import java.text.Normalizer;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.regex.Pattern;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;

import com.google.common.base.Joiner;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;

import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList;
import eu.dnetlib.pace.model.FieldListImpl;
21 26600 sandro.lab
22
/**
23
 * Set of common functions
24 33026 claudio.at
 *
25 26600 sandro.lab
 * @author claudio
26
 *
27
 */
28
public abstract class AbstractPaceFunctions {
29 33026 claudio.at
30 26600 sandro.lab
	protected static Set<String> stopwords = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
31
32 33026 claudio.at
	protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
33 26600 sandro.lab
34
	private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
35
	private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎";
36
	private static final String aliases_to = "0123456789+-=()n0123456789+-=()";
37 33026 claudio.at
38 38059 claudio.at
	protected final static FieldList EMPTY_FIELD = new FieldListImpl();
39
40 33026 claudio.at
	protected String concat(final List<String> l) {
41 26600 sandro.lab
		return Joiner.on(" ").skipNulls().join(l);
42
	}
43
44 33026 claudio.at
	protected String cleanup(final String s) {
45 37195 claudio.at
		final String s1 = nfd(s);
46
		final String s2 = fixAliases(s1);
47
		final String s3 = s2.replaceAll("&ndash;", " ");
48
		final String s4 = s3.replaceAll("&amp;", " ");
49
		final String s5 = s4.replaceAll("&minus;", " ");
50
		final String s6 = s5.replaceAll("([0-9]+)", " $1 ");
51
		final String s7 = s6.replaceAll("[^\\p{ASCII}]|\\p{Punct}", " ");
52
		final String s8 = s7.replaceAll("\\n", " ");
53
		final String s9 = s8.replaceAll("(?m)\\s+", " ");
54
		final String s10 = s9.trim();
55 33026 claudio.at
		return s10;
56 26600 sandro.lab
	}
57 33026 claudio.at
58 37195 claudio.at
	protected String finalCleanup(final String s) {
59
		return s.toLowerCase();
60
	}
61
62 33026 claudio.at
	protected boolean checkNumbers(final String a, final String b) {
63 37195 claudio.at
		final String numbersA = getNumbers(a);
64
		final String numbersB = getNumbers(b);
65
		final String romansA = getRomans(a);
66
		final String romansB = getRomans(b);
67
		return !numbersA.equals(numbersB) || !romansA.equals(romansB);
68 26600 sandro.lab
	}
69
70 33026 claudio.at
	protected String getRomans(final String s) {
71 26600 sandro.lab
		final StringBuilder sb = new StringBuilder();
72 37195 claudio.at
		for (final String t : s.split(" ")) {
73 26600 sandro.lab
			sb.append(isRoman(t) ? t : "");
74
		}
75
		return sb.toString();
76
	}
77
78 33026 claudio.at
	protected boolean isRoman(final String s) {
79 26600 sandro.lab
		return s.replaceAll("^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$", "qwertyuiop").equals("qwertyuiop");
80
	}
81
82 33026 claudio.at
	protected String getNumbers(final String s) {
83 26600 sandro.lab
		return s.replaceAll("\\D", "");
84
	}
85
86 33026 claudio.at
	protected String fixAliases(final String s) {
87 26600 sandro.lab
		final StringBuilder sb = new StringBuilder();
88 37195 claudio.at
		for (final char ch : Lists.charactersOf(s)) {
89
			final int i = StringUtils.indexOf(aliases_from, ch);
90 26600 sandro.lab
			sb.append(i >= 0 ? aliases_to.charAt(i) : ch);
91
		}
92
		return sb.toString();
93
	}
94
95 33026 claudio.at
	protected String removeSymbols(final String s) {
96 26600 sandro.lab
		final StringBuilder sb = new StringBuilder();
97
98 37195 claudio.at
		for (final char ch : Lists.charactersOf(s)) {
99 26600 sandro.lab
			sb.append(StringUtils.contains(alpha, ch) ? ch : " ");
100
		}
101
		return sb.toString().replaceAll("\\s+", " ");
102
	}
103 33026 claudio.at
104 38059 claudio.at
	protected String getFirstValue(final Field values) {
105
		return (values != null) && !Iterables.isEmpty(values) ? Iterables.getFirst(values, EMPTY_FIELD).stringValue() : null;
106 33135 claudio.at
	}
107
108
	protected boolean notNull(final String s) {
109
		return s != null;
110
	}
111
112 33026 claudio.at
	// ///////////////////////
113
114
	protected String normalize(final String s) {
115
		return nfd(s).toLowerCase().replaceAll("(\\W|\\p{InCombiningDiacriticalMarks}|\\p{Punct}|\\d|\\n)+", " ").trim();
116 26600 sandro.lab
	}
117 33026 claudio.at
118
	private String nfd(final String s) {
119
		return Normalizer.normalize(s, Normalizer.Form.NFD);
120
	}
121
122
	protected String filterStopWords(final String s, final Set<String> stopwords) {
123 26600 sandro.lab
		final StringTokenizer st = new StringTokenizer(s);
124
		final StringBuilder sb = new StringBuilder();
125
		while (st.hasMoreTokens()) {
126
			final String token = st.nextToken();
127
			if (!stopwords.contains(token)) {
128
				sb.append(token);
129
				sb.append(" ");
130
			}
131
		}
132
		return sb.toString().trim();
133
	}
134 33026 claudio.at
135
	protected Collection<String> filterBlacklisted(final Collection<String> set, final Set<String> ngramBlacklist) {
136 26600 sandro.lab
		final Set<String> newset = Sets.newLinkedHashSet();
137 37195 claudio.at
		for (final String s : set) {
138 26600 sandro.lab
			if (!ngramBlacklist.contains(s)) {
139
				newset.add(s);
140
			}
141
		}
142
		return newset;
143
	}
144 33026 claudio.at
145
	// ////////////////////
146
147
	public static Set<String> loadFromClasspath(final String classpath) {
148 26600 sandro.lab
		final Set<String> h = Sets.newHashSet();
149
		try {
150 37195 claudio.at
			for (final String s : IOUtils.readLines(NGramUtils.class.getResourceAsStream(classpath))) {
151 26600 sandro.lab
				h.add(s);
152
			}
153 37195 claudio.at
		} catch (final Throwable e) {
154 26600 sandro.lab
			return Sets.newHashSet();
155
		}
156
		return h;
157 33026 claudio.at
	}
158
159 26600 sandro.lab
}