1 |
26600
|
sandro.lab
|
package eu.dnetlib.pace.common;
|
2 |
|
|
|
3 |
|
|
import java.text.Normalizer;
|
4 |
|
|
import java.util.Collection;
|
5 |
|
|
import java.util.List;
|
6 |
|
|
import java.util.Set;
|
7 |
|
|
import java.util.StringTokenizer;
|
8 |
|
|
|
9 |
|
|
import org.apache.commons.io.IOUtils;
|
10 |
43280
|
claudio.at
|
import org.apache.commons.lang.StringUtils;
|
11 |
26600
|
sandro.lab
|
|
12 |
|
|
import com.google.common.base.Joiner;
|
13 |
38059
|
claudio.at
|
import com.google.common.collect.Iterables;
|
14 |
26600
|
sandro.lab
|
import com.google.common.collect.Lists;
|
15 |
|
|
import com.google.common.collect.Sets;
|
16 |
|
|
|
17 |
|
|
import eu.dnetlib.pace.clustering.NGramUtils;
|
18 |
33135
|
claudio.at
|
import eu.dnetlib.pace.model.Field;
|
19 |
38059
|
claudio.at
|
import eu.dnetlib.pace.model.FieldList;
|
20 |
|
|
import eu.dnetlib.pace.model.FieldListImpl;
|
21 |
26600
|
sandro.lab
|
|
22 |
|
|
/**
|
23 |
|
|
* Set of common string cleanup, normalization and filtering functions made available to subclasses.
|
24 |
33026
|
claudio.at
|
*
|
25 |
26600
|
sandro.lab
|
* @author claudio
|
26 |
|
|
*
|
27 |
|
|
*/
|
28 |
|
|
public abstract class AbstractPaceFunctions {
|
29 |
33026
|
claudio.at
|
|
30 |
26600
|
sandro.lab
|
/** Entries read at class initialization from the classpath resource {@code stopwords_en.txt}, one per line; empty when the resource cannot be loaded. */
protected static Set<String> stopwords = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
|
31 |
|
|
|
32 |
33026
|
claudio.at
|
/** Entries read at class initialization from the classpath resource {@code ngram_blacklist.txt}, one per line; empty when the resource cannot be loaded. */
protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
|
33 |
26600
|
sandro.lab
|
|
34 |
|
|
/** Characters that {@code removeSymbols} keeps: ASCII letters, ASCII digits and the space. */
private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
|
35 |
|
|
/** Superscript and subscript characters replaced by {@code fixAliases}; the character at index i maps to index i of {@code aliases_to}. */
private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎";
|
36 |
|
|
/** Plain replacement characters for {@code aliases_from}, aligned with it index by index. */
private static final String aliases_to = "0123456789+-=()n0123456789+-=()";
|
37 |
33026
|
claudio.at
|
|
38 |
38059
|
claudio.at
|
/** Shared {@code FieldListImpl} built with the no-argument constructor; passed as the default value to {@code Iterables.getFirst} in {@code getFirstValue}. */
protected final static FieldList EMPTY_FIELD = new FieldListImpl();
|
39 |
|
|
|
40 |
33026
|
claudio.at
|
protected String concat(final List<String> l) {
|
41 |
26600
|
sandro.lab
|
return Joiner.on(" ").skipNulls().join(l);
|
42 |
|
|
}
|
43 |
|
|
|
44 |
33026
|
claudio.at
|
protected String cleanup(final String s) {
|
45 |
37195
|
claudio.at
|
final String s1 = nfd(s);
|
46 |
|
|
final String s2 = fixAliases(s1);
|
47 |
|
|
final String s3 = s2.replaceAll("–", " ");
|
48 |
|
|
final String s4 = s3.replaceAll("&", " ");
|
49 |
|
|
final String s5 = s4.replaceAll("−", " ");
|
50 |
|
|
final String s6 = s5.replaceAll("([0-9]+)", " $1 ");
|
51 |
|
|
final String s7 = s6.replaceAll("[^\\p{ASCII}]|\\p{Punct}", " ");
|
52 |
|
|
final String s8 = s7.replaceAll("\\n", " ");
|
53 |
|
|
final String s9 = s8.replaceAll("(?m)\\s+", " ");
|
54 |
|
|
final String s10 = s9.trim();
|
55 |
33026
|
claudio.at
|
return s10;
|
56 |
26600
|
sandro.lab
|
}
|
57 |
33026
|
claudio.at
|
|
58 |
37195
|
claudio.at
|
protected String finalCleanup(final String s) {
|
59 |
|
|
return s.toLowerCase();
|
60 |
|
|
}
|
61 |
|
|
|
62 |
33026
|
claudio.at
|
protected boolean checkNumbers(final String a, final String b) {
|
63 |
37195
|
claudio.at
|
final String numbersA = getNumbers(a);
|
64 |
|
|
final String numbersB = getNumbers(b);
|
65 |
|
|
final String romansA = getRomans(a);
|
66 |
|
|
final String romansB = getRomans(b);
|
67 |
|
|
return !numbersA.equals(numbersB) || !romansA.equals(romansB);
|
68 |
26600
|
sandro.lab
|
}
|
69 |
|
|
|
70 |
33026
|
claudio.at
|
protected String getRomans(final String s) {
|
71 |
26600
|
sandro.lab
|
final StringBuilder sb = new StringBuilder();
|
72 |
37195
|
claudio.at
|
for (final String t : s.split(" ")) {
|
73 |
26600
|
sandro.lab
|
sb.append(isRoman(t) ? t : "");
|
74 |
|
|
}
|
75 |
|
|
return sb.toString();
|
76 |
|
|
}
|
77 |
|
|
|
78 |
33026
|
claudio.at
|
protected boolean isRoman(final String s) {
|
79 |
26600
|
sandro.lab
|
return s.replaceAll("^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$", "qwertyuiop").equals("qwertyuiop");
|
80 |
|
|
}
|
81 |
|
|
|
82 |
33026
|
claudio.at
|
protected String getNumbers(final String s) {
|
83 |
26600
|
sandro.lab
|
return s.replaceAll("\\D", "");
|
84 |
|
|
}
|
85 |
|
|
|
86 |
33026
|
claudio.at
|
protected String fixAliases(final String s) {
|
87 |
26600
|
sandro.lab
|
final StringBuilder sb = new StringBuilder();
|
88 |
37195
|
claudio.at
|
for (final char ch : Lists.charactersOf(s)) {
|
89 |
|
|
final int i = StringUtils.indexOf(aliases_from, ch);
|
90 |
26600
|
sandro.lab
|
sb.append(i >= 0 ? aliases_to.charAt(i) : ch);
|
91 |
|
|
}
|
92 |
|
|
return sb.toString();
|
93 |
|
|
}
|
94 |
|
|
|
95 |
33026
|
claudio.at
|
protected String removeSymbols(final String s) {
|
96 |
26600
|
sandro.lab
|
final StringBuilder sb = new StringBuilder();
|
97 |
|
|
|
98 |
37195
|
claudio.at
|
for (final char ch : Lists.charactersOf(s)) {
|
99 |
26600
|
sandro.lab
|
sb.append(StringUtils.contains(alpha, ch) ? ch : " ");
|
100 |
|
|
}
|
101 |
|
|
return sb.toString().replaceAll("\\s+", " ");
|
102 |
|
|
}
|
103 |
33026
|
claudio.at
|
|
104 |
38059
|
claudio.at
|
protected String getFirstValue(final Field values) {
|
105 |
|
|
return (values != null) && !Iterables.isEmpty(values) ? Iterables.getFirst(values, EMPTY_FIELD).stringValue() : null;
|
106 |
33135
|
claudio.at
|
}
|
107 |
|
|
|
108 |
|
|
protected boolean notNull(final String s) {
|
109 |
|
|
return s != null;
|
110 |
|
|
}
|
111 |
|
|
|
112 |
33026
|
claudio.at
|
// ///////////////////////
|
113 |
|
|
|
114 |
|
|
protected String normalize(final String s) {
|
115 |
|
|
return nfd(s).toLowerCase().replaceAll("(\\W|\\p{InCombiningDiacriticalMarks}|\\p{Punct}|\\d|\\n)+", " ").trim();
|
116 |
26600
|
sandro.lab
|
}
|
117 |
33026
|
claudio.at
|
|
118 |
|
|
private String nfd(final String s) {
|
119 |
|
|
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
120 |
|
|
}
|
121 |
|
|
|
122 |
|
|
protected String filterStopWords(final String s, final Set<String> stopwords) {
|
123 |
26600
|
sandro.lab
|
final StringTokenizer st = new StringTokenizer(s);
|
124 |
|
|
final StringBuilder sb = new StringBuilder();
|
125 |
|
|
while (st.hasMoreTokens()) {
|
126 |
|
|
final String token = st.nextToken();
|
127 |
|
|
if (!stopwords.contains(token)) {
|
128 |
|
|
sb.append(token);
|
129 |
|
|
sb.append(" ");
|
130 |
|
|
}
|
131 |
|
|
}
|
132 |
|
|
return sb.toString().trim();
|
133 |
|
|
}
|
134 |
33026
|
claudio.at
|
|
135 |
|
|
protected Collection<String> filterBlacklisted(final Collection<String> set, final Set<String> ngramBlacklist) {
|
136 |
26600
|
sandro.lab
|
final Set<String> newset = Sets.newLinkedHashSet();
|
137 |
37195
|
claudio.at
|
for (final String s : set) {
|
138 |
26600
|
sandro.lab
|
if (!ngramBlacklist.contains(s)) {
|
139 |
|
|
newset.add(s);
|
140 |
|
|
}
|
141 |
|
|
}
|
142 |
|
|
return newset;
|
143 |
|
|
}
|
144 |
33026
|
claudio.at
|
|
145 |
|
|
// ////////////////////
|
146 |
|
|
|
147 |
|
|
public static Set<String> loadFromClasspath(final String classpath) {
|
148 |
26600
|
sandro.lab
|
final Set<String> h = Sets.newHashSet();
|
149 |
|
|
try {
|
150 |
37195
|
claudio.at
|
for (final String s : IOUtils.readLines(NGramUtils.class.getResourceAsStream(classpath))) {
|
151 |
26600
|
sandro.lab
|
h.add(s);
|
152 |
|
|
}
|
153 |
37195
|
claudio.at
|
} catch (final Throwable e) {
|
154 |
26600
|
sandro.lab
|
return Sets.newHashSet();
|
155 |
|
|
}
|
156 |
|
|
return h;
|
157 |
33026
|
claudio.at
|
}
|
158 |
|
|
|
159 |
26600
|
sandro.lab
|
}
|