Revision 33135
Added by Claudio Atzori over 9 years ago
modules/dnet-pace-core/trunk/src/test/java/eu/dnetlib/pace/AbstractPaceTest.java | ||
---|---|---|
1 | 1 |
package eu.dnetlib.pace; |
2 | 2 |
|
3 |
import java.io.IOException; |
|
4 |
import java.io.StringWriter; |
|
5 |
|
|
6 |
import org.apache.commons.io.IOUtils; |
|
7 |
|
|
8 |
import eu.dnetlib.pace.config.Config; |
|
9 |
import eu.dnetlib.pace.config.DynConf; |
|
3 | 10 |
import eu.dnetlib.pace.config.Type; |
4 | 11 |
import eu.dnetlib.pace.model.Field; |
12 |
import eu.dnetlib.pace.model.FieldValueImpl; |
|
5 | 13 |
|
6 | 14 |
public abstract class AbstractPaceTest { |
7 | 15 |
|
16 |
protected Config getResultConf() { |
|
17 |
return DynConf.load(readFromClasspath("/eu/dnetlib/pace/config/result.pace.conf")); |
|
18 |
} |
|
19 |
|
|
20 |
protected Config getOrganizationConf() { |
|
21 |
return DynConf.load(readFromClasspath("/eu/dnetlib/pace/config/organization.pace.conf")); |
|
22 |
} |
|
23 |
|
|
24 |
private String readFromClasspath(final String filename) { |
|
25 |
StringWriter sw = new StringWriter(); |
|
26 |
try { |
|
27 |
IOUtils.copy(getClass().getResourceAsStream(filename), sw); |
|
28 |
return sw.toString(); |
|
29 |
} catch (IOException e) { |
|
30 |
throw new RuntimeException("cannot load resource from classpath: " + filename); |
|
31 |
} |
|
32 |
} |
|
33 |
|
|
8 | 34 |
protected Field title(final String s) { |
9 |
return new Field(Type.String, "title", s); |
|
35 |
return new FieldValueImpl(Type.String, "title", s);
|
|
10 | 36 |
} |
11 | 37 |
|
12 | 38 |
} |
modules/dnet-pace-core/trunk/src/test/resources/eu/dnetlib/pace/config/organization.pace.conf | ||
---|---|---|
1 |
pace.conf { |
|
2 |
conditions { }, |
|
3 |
model { |
|
4 |
legalname { algo = JaroWinkler, type = String, weight = 0.6, ignoreMissing = false, path = organization/metadata/legalname/value }, |
|
5 |
legalshortname { algo = JaroWinkler, type = String, weight = 0.4, ignoreMissing = true, path = organization/metadata/legalshortname/value } |
|
6 |
} |
|
7 |
} |
modules/dnet-pace-core/trunk/src/test/resources/eu/dnetlib/pace/config/result.pace.conf | ||
---|---|---|
1 |
pace.conf { |
|
2 |
clustering { |
|
3 |
acronyms { fields = [title], params = { max = 1, minLen = 2, maxLen = 4} }, |
|
4 |
ngrampairs { fields = [title], params = { max = 1, ngramLen = 3} }, |
|
5 |
suffixprefix { fields = [title], params = { max = 1, len = 3 } } |
|
6 |
}, |
|
7 |
strictconditions { |
|
8 |
exactMatch { fields = [pid] } |
|
9 |
}, |
|
10 |
conditions { |
|
11 |
yearMatch { fields = [dateofacceptance] }, |
|
12 |
titleVersionMatch { fields = [title] }, |
|
13 |
sizeMatch { fields = [authors] } |
|
14 |
}, |
|
15 |
model { |
|
16 |
pid { algo = ExactMatch, type = String, weight = 0.0, ignoreMissing = true, path = pid/value, overrideMatch = true }, |
|
17 |
title { algo = JaroWinkler, type = String, weight = 1.0, ignoreMissing = false, path = result/metadata/title/value }, |
|
18 |
dateofacceptance { algo = Null, type = String, weight = 0.0, ignoreMissing = true, path = result/metadata/dateofacceptance/value } , |
|
19 |
authors { algo = Null, type = List, weight = 0.0, ignoreMissing = true, path = result/author/metadata/fullname/value } |
|
20 |
}, |
|
21 |
blacklists = { |
|
22 |
title = [ |
|
23 |
"^(Corpus Oral Dialectal \\(COD\\)\\.).*$", |
|
24 |
"^(Kiri Karl Morgensternile).*$", |
|
25 |
"^(\\[Eksliibris Aleksandr).*\\]$", |
|
26 |
"^(\\[Eksliibris Aleksandr).*$", |
|
27 |
"^(Eksliibris Aleksandr).*$", |
|
28 |
"^(Kiri A\\. de Vignolles).*$", |
|
29 |
"^(2 kirja Karl Morgensternile).*$", |
|
30 |
"^(Pirita kloostri idaosa arheoloogilised).*$", |
|
31 |
"^(Kiri tundmatule).*$", |
|
32 |
"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$", |
|
33 |
"^(Eksliibris Nikolai Birukovile).*$", |
|
34 |
"^(Eksliibris Nikolai Issakovile).*$", |
|
35 |
"^(WHP Cruise Summary Information of section).*$", |
|
36 |
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$", |
|
37 |
"^(Measurement of the spin\\-dependent structure function).*" |
|
38 |
] } |
|
39 |
} |
modules/dnet-pace-core/trunk/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java | ||
---|---|---|
1 | 1 |
package eu.dnetlib.pace.clustering; |
2 | 2 |
|
3 | 3 |
import java.util.Collection; |
4 |
import java.util.List; |
|
5 | 4 |
import java.util.Map; |
6 | 5 |
import java.util.Map.Entry; |
7 | 6 |
import java.util.Set; |
... | ... | |
12 | 11 |
|
13 | 12 |
import eu.dnetlib.pace.config.Config; |
14 | 13 |
import eu.dnetlib.pace.model.Document; |
15 |
import eu.dnetlib.pace.model.Field; |
|
14 |
import eu.dnetlib.pace.model.FieldListImpl;
|
|
16 | 15 |
import eu.dnetlib.pace.model.MapDocument; |
17 | 16 |
|
18 | 17 |
public class BlacklistAwareClusteringCombiner extends ClusteringCombiner { |
... | ... | |
24 | 23 |
} |
25 | 24 |
|
26 | 25 |
private MapDocument filter(final MapDocument a, final Map<String, Iterable<String>> blacklists) { |
27 |
final Map<String, List<Field>> filtered = Maps.newHashMap(a.getFieldMap());
|
|
26 |
final Map<String, FieldListImpl> filtered = Maps.newHashMap(a.getFieldMap());
|
|
28 | 27 |
if (blacklists != null) { |
29 |
for (final Entry<String, List<Field>> e : filtered.entrySet()) { |
|
30 |
filtered.put(e.getKey(), Lists.newArrayList(Iterables.filter(e.getValue(), new FieldFilter(e.getKey(), blacklists)))); |
|
28 |
for (final Entry<String, FieldListImpl> e : filtered.entrySet()) { |
|
29 |
|
|
30 |
FieldListImpl fl = new FieldListImpl(); |
|
31 |
fl.addAll(Lists.newArrayList(Iterables.filter(e.getValue(), new FieldFilter(e.getKey(), blacklists)))); |
|
32 |
filtered.put(e.getKey(), fl); |
|
31 | 33 |
} |
32 | 34 |
} |
33 | 35 |
return new MapDocument(a.getIdentifier(), filtered); |
... | ... | |
35 | 37 |
|
36 | 38 |
/** |
37 | 39 |
* Tries to match the fields in the regex blacklist. |
38 |
*
|
|
40 |
* |
|
39 | 41 |
* @param fieldName |
40 | 42 |
* @param value |
41 | 43 |
* @return true if the field matches, false otherwise |
modules/dnet-pace-core/trunk/src/main/java/eu/dnetlib/pace/model/FieldListImpl.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.model; |
|
2 |
|
|
3 |
import java.util.Collection; |
|
4 |
import java.util.Iterator; |
|
5 |
import java.util.List; |
|
6 |
import java.util.ListIterator; |
|
7 |
|
|
8 |
import com.google.common.base.Function; |
|
9 |
import com.google.common.base.Joiner; |
|
10 |
import com.google.common.collect.Iterables; |
|
11 |
import com.google.common.collect.Lists; |
|
12 |
|
|
13 |
import eu.dnetlib.pace.config.Type; |
|
14 |
|
|
15 |
/** |
|
16 |
* The Class FieldListImpl. |
|
17 |
*/ |
|
18 |
public class FieldListImpl extends AbstractField implements FieldList { |
|
19 |
|
|
20 |
/** The fields. */ |
|
21 |
private List<Field> fields; |
|
22 |
|
|
23 |
/** |
|
24 |
* Instantiates a new field list impl. |
|
25 |
*/ |
|
26 |
public FieldListImpl() { |
|
27 |
fields = Lists.newArrayList(); |
|
28 |
} |
|
29 |
|
|
30 |
/** |
|
31 |
* Instantiates a new field list impl. |
|
32 |
* |
|
33 |
* @param name |
|
34 |
* the name |
|
35 |
*/ |
|
36 |
public FieldListImpl(final String name) { |
|
37 |
super(Type.List, name); |
|
38 |
fields = Lists.newArrayList(); |
|
39 |
} |
|
40 |
|
|
41 |
/* |
|
42 |
* (non-Javadoc) |
|
43 |
* |
|
44 |
* @see java.util.List#add(java.lang.Object) |
|
45 |
*/ |
|
46 |
@Override |
|
47 |
public boolean add(final Field f) { |
|
48 |
return fields.add(f); |
|
49 |
} |
|
50 |
|
|
51 |
/* |
|
52 |
* (non-Javadoc) |
|
53 |
* |
|
54 |
* @see java.util.List#add(int, java.lang.Object) |
|
55 |
*/ |
|
56 |
@Override |
|
57 |
public void add(final int i, final Field f) { |
|
58 |
fields.add(i, f); |
|
59 |
} |
|
60 |
|
|
61 |
/* |
|
62 |
* (non-Javadoc) |
|
63 |
* |
|
64 |
* @see java.util.List#addAll(java.util.Collection) |
|
65 |
*/ |
|
66 |
@Override |
|
67 |
public boolean addAll(final Collection<? extends Field> f) { |
|
68 |
return fields.addAll(f); |
|
69 |
} |
|
70 |
|
|
71 |
/* |
|
72 |
* (non-Javadoc) |
|
73 |
* |
|
74 |
* @see java.util.List#addAll(int, java.util.Collection) |
|
75 |
*/ |
|
76 |
@Override |
|
77 |
public boolean addAll(final int i, final Collection<? extends Field> f) { |
|
78 |
return fields.addAll(i, f); |
|
79 |
} |
|
80 |
|
|
81 |
/* |
|
82 |
* (non-Javadoc) |
|
83 |
* |
|
84 |
* @see java.util.List#clear() |
|
85 |
*/ |
|
86 |
@Override |
|
87 |
public void clear() { |
|
88 |
fields.clear(); |
|
89 |
} |
|
90 |
|
|
91 |
/* |
|
92 |
* (non-Javadoc) |
|
93 |
* |
|
94 |
* @see java.util.List#contains(java.lang.Object) |
|
95 |
*/ |
|
96 |
@Override |
|
97 |
public boolean contains(final Object o) { |
|
98 |
return fields.contains(o); |
|
99 |
} |
|
100 |
|
|
101 |
/* |
|
102 |
* (non-Javadoc) |
|
103 |
* |
|
104 |
* @see java.util.List#containsAll(java.util.Collection) |
|
105 |
*/ |
|
106 |
@Override |
|
107 |
public boolean containsAll(final Collection<?> f) { |
|
108 |
return fields.containsAll(f); |
|
109 |
} |
|
110 |
|
|
111 |
/* |
|
112 |
* (non-Javadoc) |
|
113 |
* |
|
114 |
* @see java.util.List#get(int) |
|
115 |
*/ |
|
116 |
@Override |
|
117 |
public Field get(final int i) { |
|
118 |
return fields.get(i); |
|
119 |
} |
|
120 |
|
|
121 |
/* |
|
122 |
* (non-Javadoc) |
|
123 |
* |
|
124 |
* @see java.util.List#indexOf(java.lang.Object) |
|
125 |
*/ |
|
126 |
@Override |
|
127 |
public int indexOf(final Object o) { |
|
128 |
return fields.indexOf(o); |
|
129 |
} |
|
130 |
|
|
131 |
/* |
|
132 |
* (non-Javadoc) |
|
133 |
* |
|
134 |
* @see eu.dnetlib.pace.model.Field#isEmpty() |
|
135 |
*/ |
|
136 |
@Override |
|
137 |
public boolean isEmpty() { |
|
138 |
return fields.isEmpty(); |
|
139 |
} |
|
140 |
|
|
141 |
/* |
|
142 |
* (non-Javadoc) |
|
143 |
* |
|
144 |
* @see java.lang.Iterable#iterator() |
|
145 |
*/ |
|
146 |
@Override |
|
147 |
public Iterator<Field> iterator() { |
|
148 |
return fields.iterator(); |
|
149 |
} |
|
150 |
|
|
151 |
/* |
|
152 |
* (non-Javadoc) |
|
153 |
* |
|
154 |
* @see java.util.List#lastIndexOf(java.lang.Object) |
|
155 |
*/ |
|
156 |
@Override |
|
157 |
public int lastIndexOf(final Object o) { |
|
158 |
return fields.lastIndexOf(o); |
|
159 |
} |
|
160 |
|
|
161 |
/* |
|
162 |
* (non-Javadoc) |
|
163 |
* |
|
164 |
* @see java.util.List#listIterator() |
|
165 |
*/ |
|
166 |
@Override |
|
167 |
public ListIterator<Field> listIterator() { |
|
168 |
return fields.listIterator(); |
|
169 |
} |
|
170 |
|
|
171 |
/* |
|
172 |
* (non-Javadoc) |
|
173 |
* |
|
174 |
* @see java.util.List#listIterator(int) |
|
175 |
*/ |
|
176 |
@Override |
|
177 |
public ListIterator<Field> listIterator(final int i) { |
|
178 |
return fields.listIterator(i); |
|
179 |
} |
|
180 |
|
|
181 |
/* |
|
182 |
* (non-Javadoc) |
|
183 |
* |
|
184 |
* @see java.util.List#remove(java.lang.Object) |
|
185 |
*/ |
|
186 |
@Override |
|
187 |
public boolean remove(final Object o) { |
|
188 |
return fields.remove(o); |
|
189 |
} |
|
190 |
|
|
191 |
/* |
|
192 |
* (non-Javadoc) |
|
193 |
* |
|
194 |
* @see java.util.List#remove(int) |
|
195 |
*/ |
|
196 |
@Override |
|
197 |
public Field remove(final int i) { |
|
198 |
return fields.remove(i); |
|
199 |
} |
|
200 |
|
|
201 |
/* |
|
202 |
* (non-Javadoc) |
|
203 |
* |
|
204 |
* @see java.util.List#removeAll(java.util.Collection) |
|
205 |
*/ |
|
206 |
@Override |
|
207 |
public boolean removeAll(final Collection<?> f) { |
|
208 |
return fields.removeAll(f); |
|
209 |
} |
|
210 |
|
|
211 |
/* |
|
212 |
* (non-Javadoc) |
|
213 |
* |
|
214 |
* @see java.util.List#retainAll(java.util.Collection) |
|
215 |
*/ |
|
216 |
@Override |
|
217 |
public boolean retainAll(final Collection<?> f) { |
|
218 |
return fields.retainAll(f); |
|
219 |
} |
|
220 |
|
|
221 |
/* |
|
222 |
* (non-Javadoc) |
|
223 |
* |
|
224 |
* @see java.util.List#set(int, java.lang.Object) |
|
225 |
*/ |
|
226 |
@Override |
|
227 |
public Field set(final int i, final Field f) { |
|
228 |
return fields.set(i, f); |
|
229 |
} |
|
230 |
|
|
231 |
/* |
|
232 |
* (non-Javadoc) |
|
233 |
* |
|
234 |
* @see java.util.List#size() |
|
235 |
*/ |
|
236 |
@Override |
|
237 |
public int size() { |
|
238 |
return fields.size(); |
|
239 |
} |
|
240 |
|
|
241 |
/* |
|
242 |
* (non-Javadoc) |
|
243 |
* |
|
244 |
* @see java.util.List#subList(int, int) |
|
245 |
*/ |
|
246 |
@Override |
|
247 |
public List<Field> subList(final int from, final int to) { |
|
248 |
return fields.subList(from, to); |
|
249 |
} |
|
250 |
|
|
251 |
/* |
|
252 |
* (non-Javadoc) |
|
253 |
* |
|
254 |
* @see java.util.List#toArray() |
|
255 |
*/ |
|
256 |
@Override |
|
257 |
public Object[] toArray() { |
|
258 |
return fields.toArray(); |
|
259 |
} |
|
260 |
|
|
261 |
/* |
|
262 |
* (non-Javadoc) |
|
263 |
* |
|
264 |
* @see java.util.List#toArray(java.lang.Object[]) |
|
265 |
*/ |
|
266 |
@Override |
|
267 |
public <T> T[] toArray(final T[] t) { |
|
268 |
return fields.toArray(t); |
|
269 |
} |
|
270 |
|
|
271 |
/* |
|
272 |
* (non-Javadoc) |
|
273 |
* |
|
274 |
* @see eu.dnetlib.pace.model.Field#stringValue() |
|
275 |
*/ |
|
276 |
@Override |
|
277 |
public String stringValue() { |
|
278 |
return Joiner.on(" ").join(stringList()); |
|
279 |
} |
|
280 |
|
|
281 |
/* |
|
282 |
* (non-Javadoc) |
|
283 |
* |
|
284 |
* @see eu.dnetlib.pace.model.FieldList#stringList() |
|
285 |
*/ |
|
286 |
@Override |
|
287 |
public List<String> stringList() { |
|
288 |
return Lists.newArrayList(Iterables.transform(fields, new Function<Field, String>() { |
|
289 |
|
|
290 |
@Override |
|
291 |
public String apply(final Field f) { |
|
292 |
return f.stringValue(); |
|
293 |
} |
|
294 |
})); |
|
295 |
} |
|
296 |
|
|
297 |
@Override |
|
298 |
public String toString() { |
|
299 |
return stringList().toString(); |
|
300 |
} |
|
301 |
|
|
302 |
} |
modules/dnet-pace-core/trunk/src/main/java/eu/dnetlib/pace/model/FieldValueImpl.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.model; |
|
2 |
|
|
3 |
import java.util.Iterator; |
|
4 |
import java.util.List; |
|
5 |
|
|
6 |
import org.apache.commons.collections.iterators.SingletonIterator; |
|
7 |
|
|
8 |
import eu.dnetlib.pace.config.Type; |
|
9 |
|
|
10 |
/** |
|
11 |
* The Class FieldValueImpl. |
|
12 |
*/ |
|
13 |
public class FieldValueImpl extends AbstractField implements FieldValue { |
|
14 |
|
|
15 |
/** The value. */ |
|
16 |
private Object value = null; |
|
17 |
|
|
18 |
/** |
|
19 |
* Instantiates a new field value impl. |
|
20 |
*/ |
|
21 |
public FieldValueImpl() {} |
|
22 |
|
|
23 |
/** |
|
24 |
* Instantiates a new field value impl. |
|
25 |
* |
|
26 |
* @param type |
|
27 |
* the type |
|
28 |
* @param name |
|
29 |
* the name |
|
30 |
* @param value |
|
31 |
* the value |
|
32 |
*/ |
|
33 |
public FieldValueImpl(final Type type, final String name, final Object value) { |
|
34 |
super(type, name); |
|
35 |
this.value = value; |
|
36 |
} |
|
37 |
|
|
38 |
/* |
|
39 |
* (non-Javadoc) |
|
40 |
* |
|
41 |
* @see eu.dnetlib.pace.model.Field#isEmpty() |
|
42 |
*/ |
|
43 |
@Override |
|
44 |
public boolean isEmpty() { |
|
45 |
if (value == null) return false; |
|
46 |
|
|
47 |
switch (type) { |
|
48 |
case String: |
|
49 |
return value.toString().isEmpty(); |
|
50 |
case List: |
|
51 |
List<?> list = (List<?>) value; |
|
52 |
return list.isEmpty() || ((FieldValueImpl) list.get(0)).isEmpty(); |
|
53 |
default: |
|
54 |
return true; |
|
55 |
} |
|
56 |
} |
|
57 |
|
|
58 |
/* |
|
59 |
* (non-Javadoc) |
|
60 |
* |
|
61 |
* @see eu.dnetlib.pace.model.FieldValue#getValue() |
|
62 |
*/ |
|
63 |
@Override |
|
64 |
public Object getValue() { |
|
65 |
return value; |
|
66 |
} |
|
67 |
|
|
68 |
/* |
|
69 |
* (non-Javadoc) |
|
70 |
* |
|
71 |
* @see eu.dnetlib.pace.model.FieldValue#setValue(java.lang.Object) |
|
72 |
*/ |
|
73 |
@Override |
|
74 |
public void setValue(final Object value) { |
|
75 |
this.value = value; |
|
76 |
} |
|
77 |
|
|
78 |
/* |
|
79 |
* (non-Javadoc) |
|
80 |
* |
|
81 |
* @see eu.dnetlib.pace.model.Field#stringValue() |
|
82 |
*/ |
|
83 |
@Override |
|
84 |
// @SuppressWarnings("unchecked") |
|
85 |
public String stringValue() { |
|
86 |
return String.valueOf(getValue()); |
|
87 |
// switch (getType()) { |
|
88 |
// |
|
89 |
// case Int: |
|
90 |
// return String.valueOf(getValue()); |
|
91 |
// case List: |
|
92 |
// return Joiner.on(" ").join((List<String>) getValue()); |
|
93 |
// case String: |
|
94 |
// return (String) getValue(); |
|
95 |
// default: |
|
96 |
// throw new IllegalArgumentException("Unknown type: " + getType().toString()); |
|
97 |
// } |
|
98 |
} |
|
99 |
|
|
100 |
/* |
|
101 |
* (non-Javadoc) |
|
102 |
* |
|
103 |
* @see java.lang.Iterable#iterator() |
|
104 |
*/ |
|
105 |
@Override |
|
106 |
@SuppressWarnings("unchecked") |
|
107 |
public Iterator<Field> iterator() { |
|
108 |
return new SingletonIterator(this); |
|
109 |
} |
|
110 |
|
|
111 |
} |
modules/dnet-pace-core/trunk/src/main/java/eu/dnetlib/pace/model/FieldDef.java | ||
---|---|---|
1 | 1 |
package eu.dnetlib.pace.model; |
2 | 2 |
|
3 |
import java.util.List; |
|
4 |
|
|
5 |
import com.google.common.base.Splitter; |
|
6 |
import com.google.common.collect.Lists; |
|
7 |
|
|
3 | 8 |
import eu.dnetlib.pace.config.Type; |
4 | 9 |
import eu.dnetlib.pace.distance.DistanceAlgo; |
5 | 10 |
|
6 | 11 |
/** |
7 |
* The schema is composed by field definitions (FieldDef). Each field has a type, a name, and an associated distance |
|
8 |
* algorithm. |
|
12 |
* The schema is composed of field definitions (FieldDef). Each field has a type, a name, and an associated distance algorithm. |
|
9 | 13 |
*/ |
10 | 14 |
public class FieldDef { |
11 | 15 |
|
... | ... | |
13 | 17 |
|
14 | 18 |
private String name; |
15 | 19 |
|
20 |
private String path; |
|
21 |
|
|
16 | 22 |
private DistanceAlgo algo; |
17 | 23 |
|
18 | 24 |
private boolean ignoreMissing; |
19 | 25 |
|
20 |
public FieldDef(String name, DistanceAlgo algo, boolean ignoreMissing) {
|
|
26 |
public FieldDef(final String name, final String path, final DistanceAlgo algo, final boolean ignoreMissing) {
|
|
21 | 27 |
this.name = name; |
28 |
this.path = path; |
|
22 | 29 |
this.algo = algo; |
23 | 30 |
this.ignoreMissing = ignoreMissing; |
24 | 31 |
} |
25 | 32 |
|
26 |
//def apply(s: String): Field[A] |
|
27 |
public Field apply(Type type, String s) {
|
|
33 |
// def apply(s: String): Field[A]
|
|
34 |
public Field apply(final Type type, final String s) {
|
|
28 | 35 |
switch (type) { |
29 | 36 |
case Int: |
30 |
return new Field(type, name, Integer.parseInt(s)); |
|
37 |
return new FieldValueImpl(type, name, Integer.parseInt(s));
|
|
31 | 38 |
case String: |
32 |
return new Field(type, name, s); |
|
39 |
return new FieldValueImpl(type, name, s); |
|
40 |
case List: |
|
41 |
return new FieldListImpl(name); |
|
33 | 42 |
default: |
34 | 43 |
throw new IllegalArgumentException("Casting not implemented for type " + type); |
35 | 44 |
} |
36 | 45 |
} |
37 | 46 |
|
38 | 47 |
public String getName() { |
39 |
return name.split(PATH_SEPARATOR)[0];
|
|
48 |
return name; |
|
40 | 49 |
} |
41 | 50 |
|
51 |
public String getPath() { |
|
52 |
return path; |
|
53 |
} |
|
54 |
|
|
55 |
public List<String> getPathList() { |
|
56 |
return Lists.newArrayList(Splitter.on(PATH_SEPARATOR).split(getPath())); |
|
57 |
} |
|
58 |
|
|
42 | 59 |
public DistanceAlgo getAlgo() { |
43 | 60 |
return algo; |
44 | 61 |
} |
... | ... | |
47 | 64 |
return ignoreMissing; |
48 | 65 |
} |
49 | 66 |
|
50 |
public String getPath() { |
|
51 |
return name; |
|
52 |
} |
|
53 |
|
|
54 | 67 |
@Override |
55 | 68 |
public String toString() { |
56 |
return getPath() + " { algo='" + getAlgo().getClass().getSimpleName() + "' weigth='" + getAlgo().getWeight() + "' ignoreMissing='"
|
|
57 |
+ isIgnoreMissing() + "' }"; |
|
69 |
return getName() + " { \n\talgo='" + getAlgo().getClass().getSimpleName() + "' \n\tweigth='" + getAlgo().getWeight() + "' \n\tignoreMissing='"
|
|
70 |
+ isIgnoreMissing() + "'\n }";
|
|
58 | 71 |
} |
72 |
|
|
59 | 73 |
} |
modules/dnet-pace-core/trunk/src/main/java/eu/dnetlib/pace/model/MapDocumentComparator.java | ||
---|---|---|
4 | 4 |
|
5 | 5 |
import eu.dnetlib.pace.clustering.NGramUtils; |
6 | 6 |
|
7 |
/** |
|
8 |
* The Class MapDocumentComparator. |
|
9 |
*/ |
|
7 | 10 |
public class MapDocumentComparator implements Comparator<Document> { |
8 | 11 |
|
12 |
/** The comparator field. */ |
|
9 | 13 |
private String comparatorField; |
10 | 14 |
|
11 |
public MapDocumentComparator(String comparatorField) { |
|
15 |
/** |
|
16 |
* Instantiates a new map document comparator. |
|
17 |
* |
|
18 |
* @param comparatorField |
|
19 |
* the comparator field |
|
20 |
*/ |
|
21 |
public MapDocumentComparator(final String comparatorField) { |
|
12 | 22 |
this.comparatorField = comparatorField; |
13 | 23 |
} |
14 | 24 |
|
25 |
/* |
|
26 |
* (non-Javadoc) |
|
27 |
* |
|
28 |
* @see java.util.Comparator#compare(java.lang.Object, java.lang.Object) |
|
29 |
*/ |
|
15 | 30 |
@Override |
16 |
public int compare(Document d1, Document d2) {
|
|
31 |
public int compare(final Document d1, final Document d2) {
|
|
17 | 32 |
|
18 |
if (d1.values(comparatorField).isEmpty() || d2.values(comparatorField).isEmpty()) { |
|
19 |
return 0; |
|
20 |
} |
|
33 |
if (d1.values(comparatorField).isEmpty() || d2.values(comparatorField).isEmpty()) return 0; |
|
21 | 34 |
|
22 |
Object o1 = d1.values(comparatorField).get(0).getValue();
|
|
23 |
Object o2 = d2.values(comparatorField).get(0).getValue();
|
|
35 |
String o1 = d1.values(comparatorField).get(0).stringValue();
|
|
36 |
String o2 = d2.values(comparatorField).get(0).stringValue();
|
|
24 | 37 |
|
25 |
if (o1 == null || o2 == null) { |
|
26 |
return 0; |
|
27 |
} |
|
38 |
if ((o1 == null) || (o2 == null)) return 0; |
|
28 | 39 |
|
29 |
String to1 = NGramUtils.cleanupForOrdering(o1.toString());
|
|
30 |
String to2 = NGramUtils.cleanupForOrdering(o2.toString());
|
|
40 |
String to1 = NGramUtils.cleanupForOrdering(o1); |
|
41 |
String to2 = NGramUtils.cleanupForOrdering(o2); |
|
31 | 42 |
|
32 | 43 |
return to1.compareTo(to2); |
33 | 44 |
} |
modules/dnet-pace-core/trunk/src/main/java/eu/dnetlib/pace/model/MapDocumentSerializer.java | ||
---|---|---|
1 | 1 |
package eu.dnetlib.pace.model; |
2 | 2 |
|
3 |
import java.lang.reflect.Type; |
|
4 |
|
|
3 | 5 |
import com.google.gson.Gson; |
6 |
import com.google.gson.GsonBuilder; |
|
7 |
import com.google.gson.InstanceCreator; |
|
8 |
import com.google.gson.JsonDeserializationContext; |
|
9 |
import com.google.gson.JsonDeserializer; |
|
10 |
import com.google.gson.JsonElement; |
|
11 |
import com.google.gson.JsonParseException; |
|
4 | 12 |
|
5 |
public class MapDocumentSerializer { |
|
6 |
public static MapDocument decode(byte[] bytes) { |
|
7 |
return new Gson().fromJson(new String(bytes), MapDocument.class); |
|
13 |
/** |
|
14 |
* The Class MapDocumentSerializer. |
|
15 |
*/ |
|
16 |
public class MapDocumentSerializer implements InstanceCreator<MapDocument> { |
|
17 |
|
|
18 |
@Override |
|
19 |
public MapDocument createInstance(final Type type) { |
|
20 |
return new MapDocument(); |
|
8 | 21 |
} |
9 | 22 |
|
10 |
public static String toString(MapDocument doc) { |
|
23 |
/** |
|
24 |
* Decode. |
|
25 |
* |
|
26 |
* @param bytes |
|
27 |
* the bytes |
|
28 |
* @return the map document |
|
29 |
*/ |
|
30 |
public static MapDocument decode(final byte[] bytes) { |
|
31 |
|
|
32 |
GsonBuilder gson = new GsonBuilder(); |
|
33 |
|
|
34 |
gson.registerTypeAdapter(Field.class, new JsonDeserializer<Field>() { |
|
35 |
|
|
36 |
@Override |
|
37 |
public Field deserialize(final JsonElement json, final Type typeOfT, final JsonDeserializationContext context) throws JsonParseException { |
|
38 |
FieldListImpl fl = new FieldListImpl(); |
|
39 |
if (json.isJsonObject()) { |
|
40 |
String name = json.getAsJsonObject().get("name").getAsString(); |
|
41 |
String type = json.getAsJsonObject().get("type").getAsString(); |
|
42 |
String value = json.getAsJsonObject().get("value").getAsString(); |
|
43 |
fl.add(new FieldValueImpl(eu.dnetlib.pace.config.Type.valueOf(type), name, value)); |
|
44 |
} |
|
45 |
return fl; |
|
46 |
} |
|
47 |
}); |
|
48 |
|
|
49 |
return gson.create().fromJson(new String(bytes), MapDocument.class); |
|
50 |
} |
|
51 |
|
|
52 |
/** |
|
53 |
* To string. |
|
54 |
* |
|
55 |
* @param doc |
|
56 |
* the doc |
|
57 |
* @return the string |
|
58 |
*/ |
|
59 |
public static String toString(final MapDocument doc) { |
|
11 | 60 |
return new Gson().toJson(doc); |
12 | 61 |
} |
13 | 62 |
|
14 |
public static byte[] toByteArray(MapDocument doc) { |
|
63 |
/** |
|
64 |
* To byte array. |
|
65 |
* |
|
66 |
* @param doc |
|
67 |
* the doc |
|
68 |
* @return the byte[] |
|
69 |
*/ |
|
70 |
public static byte[] toByteArray(final MapDocument doc) { |
|
15 | 71 |
return toString(doc).getBytes(); |
16 | 72 |
} |
17 | 73 |
|
modules/dnet-pace-core/trunk/src/main/java/eu/dnetlib/pace/model/AbstractField.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.model; |
|
2 |
|
|
3 |
import eu.dnetlib.pace.config.Type; |
|
4 |
|
|
5 |
/** |
|
6 |
* The Class AbstractField. |
|
7 |
*/ |
|
8 |
public abstract class AbstractField implements Field { |
|
9 |
|
|
10 |
/** The type. */ |
|
11 |
protected Type type = Type.String; |
|
12 |
|
|
13 |
/** The name. */ |
|
14 |
protected String name; |
|
15 |
|
|
16 |
/** |
|
17 |
* Instantiates a new abstract field. |
|
18 |
*/ |
|
19 |
protected AbstractField() {} |
|
20 |
|
|
21 |
/** |
|
22 |
* Instantiates a new abstract field. |
|
23 |
* |
|
24 |
* @param type |
|
25 |
* the type |
|
26 |
* @param name |
|
27 |
* the name |
|
28 |
*/ |
|
29 |
protected AbstractField(final Type type, final String name) { |
|
30 |
this.type = type; |
|
31 |
this.name = name; |
|
32 |
} |
|
33 |
|
|
34 |
/* |
|
35 |
* (non-Javadoc) |
|
36 |
* |
|
37 |
* @see eu.dnetlib.pace.model.Field#getName() |
|
38 |
*/ |
|
39 |
@Override |
|
40 |
public String getName() { |
|
41 |
return name; |
|
42 |
} |
|
43 |
|
|
44 |
/* |
|
45 |
* (non-Javadoc) |
|
46 |
* |
|
47 |
* @see eu.dnetlib.pace.model.Field#getType() |
|
48 |
*/ |
|
49 |
@Override |
|
50 |
public Type getType() { |
|
51 |
return type; |
|
52 |
} |
|
53 |
|
|
54 |
/* |
|
55 |
* (non-Javadoc) |
|
56 |
* |
|
57 |
* @see eu.dnetlib.pace.model.Field#setName(java.lang.String) |
|
58 |
*/ |
|
59 |
@Override |
|
60 |
public void setName(final String name) { |
|
61 |
this.name = name; |
|
62 |
} |
|
63 |
|
|
64 |
/* |
|
65 |
* (non-Javadoc) |
|
66 |
* |
|
67 |
* @see eu.dnetlib.pace.model.Field#setType(eu.dnetlib.pace.config.Type) |
|
68 |
*/ |
|
69 |
@Override |
|
70 |
public void setType(final Type type) { |
|
71 |
this.type = type; |
|
72 |
} |
|
73 |
|
|
74 |
} |
modules/dnet-pace-core/trunk/src/main/java/eu/dnetlib/pace/model/MapDocument.java | ||
---|---|---|
1 | 1 |
package eu.dnetlib.pace.model; |
2 | 2 |
|
3 |
import java.util.List; |
|
4 | 3 |
import java.util.Map; |
5 | 4 |
import java.util.Set; |
6 | 5 |
|
... | ... | |
8 | 7 |
import com.google.common.collect.Lists; |
9 | 8 |
import com.google.common.collect.Maps; |
10 | 9 |
|
10 |
/** |
|
11 |
* The Class MapDocument. |
|
12 |
*/ |
|
11 | 13 |
public class MapDocument implements Document { |
12 | 14 |
|
15 |
/** The identifier. */ |
|
13 | 16 |
private String identifier; |
14 |
private Map<String, List<Field>> fieldMap; |
|
15 | 17 |
|
18 |
/** The field map. */ |
|
19 |
private Map<String, FieldListImpl> fieldMap; |
|
20 |
|
|
21 |
/** |
|
22 |
* Instantiates a new map document. |
|
23 |
*/ |
|
16 | 24 |
public MapDocument() { |
17 | 25 |
identifier = null; |
18 | 26 |
fieldMap = Maps.newHashMap(); |
19 | 27 |
} |
20 | 28 |
|
21 |
public MapDocument(String identifier, Map<String, List<Field>> fieldMap) { |
|
29 |
/** |
|
30 |
* Instantiates a new map document. |
|
31 |
* |
|
32 |
* @param identifier |
|
33 |
* the identifier |
|
34 |
* @param fieldMap |
|
35 |
* the field map |
|
36 |
*/ |
|
37 |
public MapDocument(final String identifier, final Map<String, FieldListImpl> fieldMap) { |
|
22 | 38 |
this.setIdentifier(identifier); |
23 | 39 |
this.fieldMap = fieldMap; |
24 | 40 |
} |
25 |
|
|
26 |
public MapDocument(String identifier, byte[] data) { |
|
41 |
|
|
42 |
/** |
|
43 |
* Instantiates a new map document. |
|
44 |
* |
|
45 |
* @param identifier |
|
46 |
* the identifier |
|
47 |
* @param data |
|
48 |
* the data |
|
49 |
*/ |
|
50 |
public MapDocument(final String identifier, final byte[] data) { |
|
27 | 51 |
MapDocument doc = MapDocumentSerializer.decode(data); |
28 |
|
|
52 |
|
|
29 | 53 |
this.fieldMap = doc.fieldMap; |
30 | 54 |
this.identifier = doc.identifier; |
31 | 55 |
} |
32 | 56 |
|
57 |
/* |
|
58 |
* (non-Javadoc) |
|
59 |
* |
|
60 |
* @see eu.dnetlib.pace.model.document.Document#fields() |
|
61 |
*/ |
|
33 | 62 |
@Override |
34 | 63 |
public Iterable<Field> fields() { |
35 | 64 |
return Lists.newArrayList(Iterables.concat(fieldMap.values())); |
36 | 65 |
} |
37 | 66 |
|
67 |
/* |
|
68 |
* (non-Javadoc) |
|
69 |
* |
|
70 |
* @see eu.dnetlib.pace.model.document.Document#values(java.lang.String) |
|
71 |
*/ |
|
38 | 72 |
@Override |
39 |
public List<Field> values(String name) {
|
|
73 |
public FieldList values(final String name) {
|
|
40 | 74 |
return fieldMap.get(name); |
41 | 75 |
} |
42 |
|
|
76 |
|
|
77 |
/* |
|
78 |
* (non-Javadoc) |
|
79 |
* |
|
80 |
* @see eu.dnetlib.pace.model.document.Document#fieldNames() |
|
81 |
*/ |
|
43 | 82 |
@Override |
44 | 83 |
public Set<String> fieldNames() { |
45 | 84 |
return fieldMap.keySet(); |
46 | 85 |
} |
47 | 86 |
|
87 |
/* |
|
88 |
* (non-Javadoc) |
|
89 |
* |
|
90 |
* @see java.lang.Object#toString() |
|
91 |
*/ |
|
48 | 92 |
@Override |
49 | 93 |
public String toString() { |
50 | 94 |
return MapDocumentSerializer.toString(this); |
51 |
//return String.format("Document(%s)", fieldMap.toString()); |
|
95 |
// return String.format("Document(%s)", fieldMap.toString());
|
|
52 | 96 |
} |
53 | 97 |
|
98 |
/** |
|
99 |
* To byte array. |
|
100 |
* |
|
101 |
* @return the byte[] |
|
102 |
*/ |
|
54 | 103 |
public byte[] toByteArray() { |
55 | 104 |
return MapDocumentSerializer.toByteArray(this); |
56 | 105 |
} |
57 | 106 |
|
107 |
/* |
|
108 |
* (non-Javadoc) |
|
109 |
* |
|
110 |
* @see eu.dnetlib.pace.model.document.Document#getIdentifier() |
|
111 |
*/ |
|
58 | 112 |
@Override |
59 | 113 |
public String getIdentifier() { |
60 | 114 |
return identifier; |
61 | 115 |
} |
62 | 116 |
|
63 |
public void setIdentifier(String identifier) { |
|
117 |
/** |
|
118 |
* Sets the identifier. |
|
119 |
* |
|
120 |
* @param identifier |
|
121 |
* the new identifier |
|
122 |
*/ |
|
123 |
public void setIdentifier(final String identifier) { |
|
64 | 124 |
this.identifier = identifier; |
65 | 125 |
} |
66 | 126 |
|
67 |
public Map<String, List<Field>> getFieldMap() { |
|
127 |
/** |
|
128 |
* Gets the field map. |
|
129 |
* |
|
130 |
* @return the field map |
|
131 |
*/ |
|
132 |
public Map<String, FieldListImpl> getFieldMap() { |
|
68 | 133 |
return fieldMap; |
69 | 134 |
} |
70 | 135 |
|
71 |
public void setFieldMap(Map<String, List<Field>> fieldMap) { |
|
136 |
/** |
|
137 |
* Sets the field map. |
|
138 |
* |
|
139 |
* @param fieldMap |
|
140 |
* the field map |
|
141 |
*/ |
|
142 |
public void setFieldMap(final Map<String, FieldListImpl> fieldMap) { |
|
72 | 143 |
this.fieldMap = fieldMap; |
73 | 144 |
} |
74 | 145 |
|
modules/dnet-pace-core/trunk/src/main/java/eu/dnetlib/pace/model/Document.java | ||
---|---|---|
1 | 1 |
package eu.dnetlib.pace.model; |
2 | 2 |
|
3 |
import java.util.List; |
|
4 | 3 |
import java.util.Set; |
5 | 4 |
|
5 |
/** |
|
6 |
* The Interface Document. Models the common operations available on a Pace Document. |
|
7 |
*/ |
|
6 | 8 |
public interface Document { |
9 |
|
|
10 |
/** |
|
11 |
* Gets the identifier. |
|
12 |
* |
|
13 |
* @return the identifier |
|
14 |
*/ |
|
7 | 15 |
String getIdentifier(); |
8 | 16 |
|
17 |
/** |
|
18 |
* Fields. |
|
19 |
* |
|
20 |
* @return the iterable |
|
21 |
*/ |
|
9 | 22 |
Iterable<Field> fields(); |
10 | 23 |
|
11 |
List<Field> values(String name); |
|
12 |
|
|
24 |
/** |
|
25 |
* Values. |
|
26 |
* |
|
27 |
* @param name |
|
28 |
* the name |
|
29 |
* @return the field list |
|
30 |
*/ |
|
31 |
FieldList values(String name); |
|
32 |
|
|
33 |
/** |
|
34 |
* Field names. |
|
35 |
* |
|
36 |
* @return the set of field names |
|
37 |
*/ |
|
13 | 38 |
Set<String> fieldNames(); |
14 | 39 |
} |
modules/dnet-pace-core/trunk/src/main/java/eu/dnetlib/pace/model/FieldList.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.model; |
|
2 |
|
|
3 |
import java.util.List; |
|
4 |
|
|
5 |
/** |
|
6 |
* The Interface FieldList. |
|
7 |
*/ |
|
8 |
public interface FieldList extends List<Field>, Field { |
|
9 |
|
|
10 |
/** |
|
11 |
* String list. |
|
12 |
* |
|
13 |
* @return the list |
|
14 |
*/ |
|
15 |
public List<String> stringList(); |
|
16 |
|
|
17 |
} |
modules/dnet-pace-core/trunk/src/main/java/eu/dnetlib/pace/model/FieldValue.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.model; |
|
2 |
|
|
3 |
/** |
|
4 |
* The Interface FieldValue. |
|
5 |
*/ |
|
6 |
public interface FieldValue extends Field { |
|
7 |
|
|
8 |
/** |
|
9 |
* Gets the value. |
|
10 |
* |
|
11 |
* @return the value |
|
12 |
*/ |
|
13 |
public Object getValue(); |
|
14 |
|
|
15 |
/** |
|
16 |
* Sets the value. |
|
17 |
* |
|
18 |
* @param value |
|
19 |
* the new value |
|
20 |
*/ |
|
21 |
public void setValue(final Object value); |
|
22 |
|
|
23 |
} |
modules/dnet-pace-core/trunk/src/main/java/eu/dnetlib/pace/model/Field.java | ||
---|---|---|
1 | 1 |
package eu.dnetlib.pace.model; |
2 | 2 |
|
3 |
import java.util.List; |
|
4 |
|
|
5 |
import com.google.common.base.Joiner; |
|
6 |
|
|
7 | 3 |
import eu.dnetlib.pace.config.Type; |
8 | 4 |
|
9 |
public class Field { |
|
5 |
/** |
|
6 |
* The Interface Field. |
|
7 |
*/ |
|
8 |
public interface Field extends Iterable<Field> { |
|
10 | 9 |
|
11 |
private Type type = Type.String; |
|
12 |
private String name; |
|
13 |
private Object value = null; |
|
10 |
/** |
|
11 |
* Gets the name. |
|
12 |
* |
|
13 |
* @return the name |
|
14 |
*/ |
|
15 |
public String getName(); |
|
14 | 16 |
|
15 |
public Field() { |
|
16 |
} |
|
17 |
/** |
|
18 |
* Sets the name. |
|
19 |
* |
|
20 |
* @param name |
|
21 |
* the new name |
|
22 |
*/ |
|
23 |
public void setName(String name); |
|
17 | 24 |
|
18 |
public Field(Type type, String name, Object value) { |
|
19 |
this.type = type; |
|
20 |
this.name = name; |
|
21 |
this.value = value; |
|
22 |
} |
|
25 |
/** |
|
26 |
* Gets the type. |
|
27 |
* |
|
28 |
* @return the type |
|
29 |
*/ |
|
30 |
public Type getType(); |
|
23 | 31 |
|
24 |
public boolean isEmpty() { |
|
25 |
if (value == null) { |
|
26 |
return false; |
|
27 |
} |
|
32 |
/** |
|
33 |
* Sets the type. |
|
34 |
* |
|
35 |
* @param type |
|
36 |
* the new type |
|
37 |
*/ |
|
38 |
public void setType(Type type); |
|
28 | 39 |
|
29 |
switch (type) { |
|
30 |
case String: |
|
31 |
return value.toString().isEmpty(); |
|
32 |
case List: |
|
33 |
List<?> list = (List<?>) value; |
|
34 |
return list.isEmpty() || ((Field) list.get(0)).isEmpty(); |
|
35 |
default: |
|
36 |
return true; |
|
37 |
} |
|
38 |
} |
|
40 |
/** |
|
41 |
* Checks if is empty. |
|
42 |
* |
|
43 |
* @return true, if is empty |
|
44 |
*/ |
|
45 |
public boolean isEmpty(); |
|
39 | 46 |
|
40 |
public Object getValue() { |
|
41 |
return value; |
|
42 |
} |
|
47 |
/** |
|
48 |
* String value. |
|
49 |
* |
|
50 |
* @return the string |
|
51 |
*/ |
|
52 |
public String stringValue(); |
|
43 | 53 |
|
44 |
public void setValue(Object value) { |
|
45 |
this.value = value; |
|
46 |
} |
|
47 |
|
|
48 |
public Type getType() { |
|
49 |
return type; |
|
50 |
} |
|
51 |
|
|
52 |
public void setType(Type type) { |
|
53 |
this.type = type; |
|
54 |
} |
|
55 |
|
|
56 |
public String getName() { |
|
57 |
return name; |
|
58 |
} |
|
59 |
|
|
60 |
public void setName(String name) { |
|
61 |
this.name = name; |
|
62 |
} |
|
63 |
|
|
64 |
@SuppressWarnings("unchecked") |
|
65 |
public String stringValue() { |
|
66 |
switch(getType()) { |
|
67 |
case Int: |
|
68 |
return String.valueOf(getValue()); |
|
69 |
case List: |
|
70 |
return Joiner.on(" ").join((List<String>) getValue()); |
|
71 |
case String: |
|
72 |
return (String) getValue(); |
|
73 |
default: |
|
74 |
throw new IllegalArgumentException("Unknown type: " + getType().toString()); |
|
75 |
} |
|
76 |
} |
|
77 |
|
|
78 | 54 |
} |
modules/dnet-pace-core/trunk/src/main/java/eu/dnetlib/pace/model/DocumentBuilder.java | ||
---|---|---|
1 | 1 |
package eu.dnetlib.pace.model; |
2 | 2 |
|
3 |
import java.util.List; |
|
4 | 3 |
import java.util.Map; |
5 | 4 |
|
5 |
/** |
|
6 |
* The Class DocumentBuilder. |
|
7 |
*/ |
|
6 | 8 |
public class DocumentBuilder { |
7 | 9 |
|
8 |
public static MapDocument newInstance(final String identifier, final Map<String, List<Field>> fieldMap) { |
|
10 |
/** |
|
11 |
* New instance. |
|
12 |
* |
|
13 |
* @param identifier |
|
14 |
* the identifier |
|
15 |
* @param fieldMap |
|
16 |
* the field map |
|
17 |
* @return the map document |
|
18 |
*/ |
|
19 |
public static MapDocument newInstance(final String identifier, final Map<String, FieldListImpl> fieldMap) { |
|
9 | 20 |
return new MapDocument(identifier, fieldMap); |
10 | 21 |
} |
11 | 22 |
|
23 |
/** |
|
24 |
* New instance. |
|
25 |
* |
|
26 |
* @param identifier |
|
27 |
* the identifier |
|
28 |
* @param fieldMap |
|
29 |
* the field map |
|
30 |
* @return the map document |
|
31 |
*/ |
|
12 | 32 |
public static MapDocument newInstance(final String identifier, final byte[] fieldMap) { |
13 | 33 |
return new MapDocument(identifier, fieldMap); |
14 | 34 |
} |
modules/dnet-pace-core/trunk/src/main/java/eu/dnetlib/pace/config/ConfigurableModel.java | ||
---|---|---|
4 | 4 |
import java.util.List; |
5 | 5 |
import java.util.Map; |
6 | 6 |
import java.util.Map.Entry; |
7 |
import java.util.Set; |
|
7 | 8 |
|
8 | 9 |
import com.google.common.base.Function; |
9 | 10 |
import com.google.common.base.Predicate; |
10 | 11 |
import com.google.common.collect.Iterables; |
11 | 12 |
import com.google.common.collect.Lists; |
12 | 13 |
import com.google.common.collect.Maps; |
14 |
import com.google.common.collect.Sets; |
|
13 | 15 |
|
14 | 16 |
import eu.dnetlib.pace.clustering.Acronyms; |
15 | 17 |
import eu.dnetlib.pace.clustering.Clustering; |
... | ... | |
20 | 22 |
import eu.dnetlib.pace.clustering.SuffixPrefix; |
21 | 23 |
import eu.dnetlib.pace.condition.AlwaysTrueCondition; |
22 | 24 |
import eu.dnetlib.pace.condition.ConditionAlgo; |
25 |
import eu.dnetlib.pace.condition.DoiExactMatch; |
|
26 |
import eu.dnetlib.pace.condition.ExactMatch; |
|
27 |
import eu.dnetlib.pace.condition.SizeMatch; |
|
23 | 28 |
import eu.dnetlib.pace.condition.TitleVersionMatch; |
24 | 29 |
import eu.dnetlib.pace.condition.YearMatch; |
25 | 30 |
import eu.dnetlib.pace.distance.DistanceAlgo; |
... | ... | |
29 | 34 |
import eu.dnetlib.pace.distance.Level2Levenstein; |
30 | 35 |
import eu.dnetlib.pace.distance.Levenstein; |
31 | 36 |
import eu.dnetlib.pace.distance.NullDistanceAlgo; |
37 |
import eu.dnetlib.pace.distance.SortedJaroWinkler; |
|
38 |
import eu.dnetlib.pace.distance.SortedLevel2JaroWinkler; |
|
32 | 39 |
import eu.dnetlib.pace.distance.SubStringLevenstein; |
33 | 40 |
import eu.dnetlib.pace.distance.YearLevenstein; |
34 | 41 |
import eu.dnetlib.pace.model.ClusteringDef; |
... | ... | |
51 | 58 |
} |
52 | 59 |
|
53 | 60 |
@Override |
61 |
public List<CondDef> strictConditions() { |
|
62 |
return parseConds("strict"); |
|
63 |
} |
|
64 |
|
|
65 |
@Override |
|
54 | 66 |
public List<CondDef> conditions() { |
55 | 67 |
return parseConds(""); |
56 | 68 |
} |
... | ... | |
72 | 84 |
|
73 | 85 |
@Override |
74 | 86 |
public FieldDef identifierFieldDef() { |
75 |
return new FieldDef(identifierField(), new NullDistanceAlgo(), false); |
|
87 |
return new FieldDef(identifierField(), null, new NullDistanceAlgo(), false);
|
|
76 | 88 |
} |
77 | 89 |
|
78 | 90 |
private List<FieldDef> parseFields(final String base) { |
... | ... | |
85 | 97 |
|
86 | 98 |
final String name = e.getKey(); |
87 | 99 |
|
100 |
final String path = config.getString(String.format("pace.conf.model%s.%s.path", base, name)); |
|
88 | 101 |
double weight = config.getDouble(String.format("pace.conf.model%s.%s.weight", base, name)); |
89 |
boolean ignoreMissing = config.getBoolean(String.format("pace.conf.model%s.%s.ignoreMissing", base, name));
|
|
102 |
Boolean ignoreMissing = config.getBoolean(String.format("pace.conf.model%s.%s.ignoreMissing", base, name));
|
|
90 | 103 |
// Type type = Type.valueOf(config.getString(String.format("pace.conf.model%s.%s.type", base, name))); |
91 | 104 |
|
92 |
return new FieldDef(name, getAlgo(base, name, weight), ignoreMissing); |
|
105 |
return new FieldDef(name, path, getAlgo(base, name, weight), ignoreMissing);
|
|
93 | 106 |
} |
94 | 107 |
|
95 | 108 |
private DistanceAlgo getAlgo(final String base, final String name, final double w) { |
... | ... | |
108 | 121 |
return new SubStringLevenstein(w, config.getInt(String.format("pace.conf.model%s.%s.limit", base, name))); |
109 | 122 |
case YearLevenstein: |
110 | 123 |
return new YearLevenstein(w, config.getInt(String.format("pace.conf.model%s.%s.limit", base, name))); |
124 |
case SortedJaroWinkler: |
|
125 |
return new SortedJaroWinkler(w); |
|
126 |
case SortedLevel2JaroWinkler: |
|
127 |
return new SortedLevel2JaroWinkler(w); |
|
111 | 128 |
case Null: |
112 | 129 |
return new NullDistanceAlgo(); |
113 | 130 |
default: |
... | ... | |
119 | 136 |
|
120 | 137 |
public List<CondDef> parseConds(final String base) { |
121 | 138 |
@SuppressWarnings("unchecked") |
122 |
final Map<String, ?> modelMap = (Map<String, ?>) config.getObject("pace.conf.conditions");
|
|
139 |
final Map<String, ?> modelMap = (Map<String, ?>) config.getObject(String.format("pace.conf.%sconditions", base));
|
|
123 | 140 |
return Lists.newArrayList(Iterables.transform(filter(modelMap).entrySet(), new Function<Entry<String, ?>, CondDef>() { |
124 | 141 |
|
125 | 142 |
@Override |
126 | 143 |
public CondDef apply(final Entry<String, ?> e) { |
127 | 144 |
|
128 | 145 |
final Cond condName = Cond.valueOf(e.getKey()); |
129 |
final List<String> fields = config.getList(String.format("pace.conf.conditions%s.%s.fields", base, e.getKey())); |
|
146 |
final List<String> fieldList = config.getList(String.format("pace.conf.%sconditions.%s.fields", base, e.getKey())); |
|
147 |
final Set<String> fieldSet = Sets.newHashSet(fieldList); |
|
130 | 148 |
|
149 |
final List<FieldDef> fields = Lists.newArrayList(Iterables.filter(fields(), new Predicate<FieldDef>() { |
|
150 |
|
|
151 |
@Override |
|
152 |
public boolean apply(final FieldDef fd) { |
|
153 |
return fieldSet.contains(fd.getName()); |
|
154 |
} |
|
155 |
})); |
|
156 |
|
|
131 | 157 |
return new CondDef(getCondAlgo(fields, condName)); |
132 | 158 |
} |
133 | 159 |
|
134 |
private ConditionAlgo getCondAlgo(final List<String> fields, final Cond condName) {
|
|
160 |
private ConditionAlgo getCondAlgo(final List<FieldDef> fields, final Cond condName) {
|
|
135 | 161 |
switch (condName) { |
136 | 162 |
case yearMatch: |
137 | 163 |
return new YearMatch(fields); |
138 | 164 |
case titleVersionMatch: |
139 | 165 |
return new TitleVersionMatch(fields); |
166 |
case sizeMatch: |
|
167 |
return new SizeMatch(fields); |
|
168 |
case exactMatch: |
|
169 |
return new ExactMatch(fields); |
|
170 |
case doiExactMatch: |
|
171 |
return new DoiExactMatch(fields); |
|
140 | 172 |
default: |
141 | 173 |
return new AlwaysTrueCondition(fields); |
142 | 174 |
} |
... | ... | |
156 | 188 |
final List<String> fields = config.getList(String.format("pace.conf.clustering%s.%s.fields", base, e.getKey())); |
157 | 189 |
@SuppressWarnings("unchecked") |
158 | 190 |
final Map<String, Integer> params = |
159 |
(Map<String, Integer>) config.getObject(String.format("pace.conf.clustering%s.%s.params", base, e.getKey()));
|
|
191 |
(Map<String, Integer>) config.getObject(String.format("pace.conf.clustering%s.%s.params", base, e.getKey())); |
|
160 | 192 |
|
161 | 193 |
return new ClusteringDef(clustering, getClusteringFunction(params, clustering), fields); |
162 | 194 |
} |
modules/dnet-pace-core/trunk/src/main/java/eu/dnetlib/pace/config/Algo.java | ||
---|---|---|
1 | 1 |
package eu.dnetlib.pace.config; |
2 | 2 |
|
3 |
/** |
|
4 |
* Enumerates the distance Algos. |
|
5 |
*/ |
|
3 | 6 |
public enum Algo { |
4 |
JaroWinkler, JaroWinklerTitle, Levenstein, Level2JaroWinkler, Level2Levenstein, SubStringLevenstein, YearLevenstein, Null |
|
7 |
|
|
8 |
/** The Jaro winkler. */ |
|
9 |
JaroWinkler, |
|
10 |
/** The Jaro winkler title. */ |
|
11 |
JaroWinklerTitle, |
|
12 |
/** The Levenstein. */ |
|
13 |
Levenstein, |
|
14 |
/** The Level2 jaro winkler. */ |
|
15 |
Level2JaroWinkler, |
|
16 |
/** The Level2 levenstein. */ |
|
17 |
Level2Levenstein, |
|
18 |
/** The Sub string levenstein. */ |
|
19 |
SubStringLevenstein, |
|
20 |
/** The Year levenstein. */ |
|
21 |
YearLevenstein, |
|
22 |
/** The Sorted jaro winkler. */ |
|
23 |
SortedJaroWinkler, |
|
24 |
/** The Sorted level2 jaro winkler. */ |
|
25 |
SortedLevel2JaroWinkler, |
|
26 |
/** The Null. */ |
|
27 |
Null |
|
5 | 28 |
} |
modules/dnet-pace-core/trunk/src/main/java/eu/dnetlib/pace/config/Cond.java | ||
---|---|---|
1 | 1 |
package eu.dnetlib.pace.config; |
2 | 2 |
|
3 |
/** |
|
4 |
* The Enum Cond. |
|
5 |
*/ |
|
3 | 6 |
public enum Cond { |
4 |
yearMatch, titleVersionMatch |
|
7 |
|
|
8 |
/** The year match. */ |
|
9 |
yearMatch, |
|
10 |
/** The title version match. */ |
|
11 |
titleVersionMatch, |
|
12 |
/** The size match. */ |
|
13 |
sizeMatch, |
Also available in: Unified diff
merged branch ProtoMapping