Project

General

Profile

1
package eu.dnetlib.pace.common;
2

    
3
import java.io.IOException;
import java.io.InputStream;
import java.text.Normalizer;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.regex.Pattern;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;

import com.google.common.base.Joiner;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;

import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList;
import eu.dnetlib.pace.model.FieldListImpl;
21

    
22
/**
23
 * Set of common functions
24
 *
25
 * @author claudio
26
 *
27
 */
28
public abstract class AbstractPaceFunctions {
29

    
30
	protected static Set<String> stopwords = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
31

    
32
	protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
33

    
34
	private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
35
	private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎";
36
	private static final String aliases_to = "0123456789+-=()n0123456789+-=()";
37

    
38
	protected final static FieldList EMPTY_FIELD = new FieldListImpl();
39

    
40
	protected String concat(final List<String> l) {
41
		return Joiner.on(" ").skipNulls().join(l);
42
	}
43

    
44
	protected String cleanup(final String s) {
45
		final String s1 = nfd(s);
46
		final String s2 = fixAliases(s1);
47
		final String s3 = s2.replaceAll("&ndash;", " ");
48
		final String s4 = s3.replaceAll("&amp;", " ");
49
		final String s5 = s4.replaceAll("&minus;", " ");
50
		final String s6 = s5.replaceAll("([0-9]+)", " $1 ");
51
		final String s7 = s6.replaceAll("[^\\p{ASCII}]|\\p{Punct}", " ");
52
		final String s8 = s7.replaceAll("\\n", " ");
53
		final String s9 = s8.replaceAll("(?m)\\s+", " ");
54
		final String s10 = s9.trim();
55
		return s10;
56
	}
57

    
58
	protected String finalCleanup(final String s) {
59
		return s.toLowerCase();
60
	}
61

    
62
	protected boolean checkNumbers(final String a, final String b) {
63
		final String numbersA = getNumbers(a);
64
		final String numbersB = getNumbers(b);
65
		final String romansA = getRomans(a);
66
		final String romansB = getRomans(b);
67
		return !numbersA.equals(numbersB) || !romansA.equals(romansB);
68
	}
69

    
70
	protected String getRomans(final String s) {
71
		final StringBuilder sb = new StringBuilder();
72
		for (final String t : s.split(" ")) {
73
			sb.append(isRoman(t) ? t : "");
74
		}
75
		return sb.toString();
76
	}
77

    
78
	protected boolean isRoman(final String s) {
79
		return s.replaceAll("^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$", "qwertyuiop").equals("qwertyuiop");
80
	}
81

    
82
	protected String getNumbers(final String s) {
83
		return s.replaceAll("\\D", "");
84
	}
85

    
86
	protected String fixAliases(final String s) {
87
		final StringBuilder sb = new StringBuilder();
88
		for (final char ch : Lists.charactersOf(s)) {
89
			final int i = StringUtils.indexOf(aliases_from, ch);
90
			sb.append(i >= 0 ? aliases_to.charAt(i) : ch);
91
		}
92
		return sb.toString();
93
	}
94

    
95
	protected String removeSymbols(final String s) {
96
		final StringBuilder sb = new StringBuilder();
97

    
98
		for (final char ch : Lists.charactersOf(s)) {
99
			sb.append(StringUtils.contains(alpha, ch) ? ch : " ");
100
		}
101
		return sb.toString().replaceAll("\\s+", " ");
102
	}
103

    
104
	protected String getFirstValue(final Field values) {
105
		return (values != null) && !Iterables.isEmpty(values) ? Iterables.getFirst(values, EMPTY_FIELD).stringValue() : null;
106
	}
107

    
108
	protected boolean notNull(final String s) {
109
		return s != null;
110
	}
111

    
112
	// ///////////////////////
113

    
114
	protected String normalize(final String s) {
115
		return nfd(s).toLowerCase().replaceAll("(\\W|\\p{InCombiningDiacriticalMarks}|\\p{Punct}|\\d|\\n)+", " ").trim();
116
	}
117

    
118
	private String nfd(final String s) {
119
		return Normalizer.normalize(s, Normalizer.Form.NFD);
120
	}
121

    
122
	protected String filterStopWords(final String s, final Set<String> stopwords) {
123
		final StringTokenizer st = new StringTokenizer(s);
124
		final StringBuilder sb = new StringBuilder();
125
		while (st.hasMoreTokens()) {
126
			final String token = st.nextToken();
127
			if (!stopwords.contains(token)) {
128
				sb.append(token);
129
				sb.append(" ");
130
			}
131
		}
132
		return sb.toString().trim();
133
	}
134

    
135
	protected Collection<String> filterBlacklisted(final Collection<String> set, final Set<String> ngramBlacklist) {
136
		final Set<String> newset = Sets.newLinkedHashSet();
137
		for (final String s : set) {
138
			if (!ngramBlacklist.contains(s)) {
139
				newset.add(s);
140
			}
141
		}
142
		return newset;
143
	}
144

    
145
	// ////////////////////
146

    
147
	public static Set<String> loadFromClasspath(final String classpath) {
148
		final Set<String> h = Sets.newHashSet();
149
		try {
150
			for (final String s : IOUtils.readLines(NGramUtils.class.getResourceAsStream(classpath))) {
151
				h.add(s);
152
			}
153
		} catch (final Throwable e) {
154
			return Sets.newHashSet();
155
		}
156
		return h;
157
	}
158

    
159
}
    (1-1/1)