Project

General

Profile

« Previous | Next » 

Revision 33135

merged branch ProtoMapping

View differences:

modules/dnet-pace-core/trunk/src/test/java/eu/dnetlib/pace/AbstractPaceTest.java
1 1
package eu.dnetlib.pace;
2 2

  
3
import java.io.IOException;
4
import java.io.StringWriter;
5

  
6
import org.apache.commons.io.IOUtils;
7

  
8
import eu.dnetlib.pace.config.Config;
9
import eu.dnetlib.pace.config.DynConf;
3 10
import eu.dnetlib.pace.config.Type;
4 11
import eu.dnetlib.pace.model.Field;
12
import eu.dnetlib.pace.model.FieldValueImpl;
5 13

  
6 14
public abstract class AbstractPaceTest {
7 15

  
16
	protected Config getResultConf() {
17
		return DynConf.load(readFromClasspath("/eu/dnetlib/pace/config/result.pace.conf"));
18
	}
19

  
20
	protected Config getOrganizationConf() {
21
		return DynConf.load(readFromClasspath("/eu/dnetlib/pace/config/organization.pace.conf"));
22
	}
23

  
24
	private String readFromClasspath(final String filename) {
25
		StringWriter sw = new StringWriter();
26
		try {
27
			IOUtils.copy(getClass().getResourceAsStream(filename), sw);
28
			return sw.toString();
29
		} catch (IOException e) {
30
			throw new RuntimeException("cannot load resource from classpath: " + filename);
31
		}
32
	}
33

  
8 34
	protected Field title(final String s) {
9
		return new Field(Type.String, "title", s);
35
		return new FieldValueImpl(Type.String, "title", s);
10 36
	}
11 37

  
12 38
}
modules/dnet-pace-core/trunk/src/test/resources/eu/dnetlib/pace/config/organization.pace.conf
1
pace.conf { 
2
	conditions { },
3
	model {
4
		legalname { algo = JaroWinkler, type = String, weight = 0.6, ignoreMissing = false, path = organization/metadata/legalname/value }, 
5
		legalshortname { algo = JaroWinkler, type = String, weight = 0.4, ignoreMissing = true, path = organization/metadata/legalshortname/value } 
6
	} 
7
}
modules/dnet-pace-core/trunk/src/test/resources/eu/dnetlib/pace/config/result.pace.conf
1
pace.conf {
2
	clustering {
3
		acronyms { fields = [title], params = { max = 1, minLen = 2, maxLen = 4} },
4
		ngrampairs { fields = [title], params = { max = 1, ngramLen = 3} },
5
		suffixprefix { fields = [title], params = { max = 1, len = 3 } } 
6
	},
7
	strictconditions {
8
		exactMatch { fields = [pid] }
9
	}, 
10
	conditions { 
11
		yearMatch { fields = [dateofacceptance] },
12
		titleVersionMatch { fields = [title] },
13
		sizeMatch { fields = [authors] } 
14
	},
15
	model {
16
		pid { algo = ExactMatch, type = String, weight = 0.0, ignoreMissing = true, path = pid/value, overrideMatch = true }, 	
17
		title { algo = JaroWinkler, type = String, weight = 1.0, ignoreMissing = false, path = result/metadata/title/value },
18
		dateofacceptance { algo = Null, type = String, weight = 0.0, ignoreMissing = true, path = result/metadata/dateofacceptance/value } ,
19
		authors { algo = Null, type = List, weight = 0.0, ignoreMissing = true, path = result/author/metadata/fullname/value } 		
20
	},
21
	blacklists = {
22
		title = [
23
			"^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
24
			"^(Kiri Karl Morgensternile).*$",
25
			"^(\\[Eksliibris Aleksandr).*\\]$",
26
			"^(\\[Eksliibris Aleksandr).*$",
27
			"^(Eksliibris Aleksandr).*$",
28
			"^(Kiri A\\. de Vignolles).*$",
29
			"^(2 kirja Karl Morgensternile).*$",
30
			"^(Pirita kloostri idaosa arheoloogilised).*$",
31
			"^(Kiri tundmatule).*$",
32
			"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
33
			"^(Eksliibris Nikolai Birukovile).*$",
34
			"^(Eksliibris Nikolai Issakovile).*$",
35
			"^(WHP Cruise Summary Information of section).*$",
36
			"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
37
			"^(Measurement of the spin\\-dependent structure function).*"
38
		] } 
39
}
modules/dnet-pace-core/trunk/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java
1 1
package eu.dnetlib.pace.clustering;
2 2

  
3 3
import java.util.Collection;
4
import java.util.List;
5 4
import java.util.Map;
6 5
import java.util.Map.Entry;
7 6
import java.util.Set;
......
12 11

  
13 12
import eu.dnetlib.pace.config.Config;
14 13
import eu.dnetlib.pace.model.Document;
15
import eu.dnetlib.pace.model.Field;
14
import eu.dnetlib.pace.model.FieldListImpl;
16 15
import eu.dnetlib.pace.model.MapDocument;
17 16

  
18 17
public class BlacklistAwareClusteringCombiner extends ClusteringCombiner {
......
24 23
	}
25 24

  
26 25
	private MapDocument filter(final MapDocument a, final Map<String, Iterable<String>> blacklists) {
27
		final Map<String, List<Field>> filtered = Maps.newHashMap(a.getFieldMap());
26
		final Map<String, FieldListImpl> filtered = Maps.newHashMap(a.getFieldMap());
28 27
		if (blacklists != null) {
29
			for (final Entry<String, List<Field>> e : filtered.entrySet()) {
30
				filtered.put(e.getKey(), Lists.newArrayList(Iterables.filter(e.getValue(), new FieldFilter(e.getKey(), blacklists))));
28
			for (final Entry<String, FieldListImpl> e : filtered.entrySet()) {
29

  
30
				FieldListImpl fl = new FieldListImpl();
31
				fl.addAll(Lists.newArrayList(Iterables.filter(e.getValue(), new FieldFilter(e.getKey(), blacklists))));
32
				filtered.put(e.getKey(), fl);
31 33
			}
32 34
		}
33 35
		return new MapDocument(a.getIdentifier(), filtered);
......
35 37

  
36 38
	/**
37 39
	 * Tries to match the fields in the regex blacklist.
38
	 * 
40
	 *
39 41
	 * @param fieldName
40 42
	 * @param value
41 43
	 * @return true if the field matches, false otherwise
modules/dnet-pace-core/trunk/src/main/java/eu/dnetlib/pace/model/FieldListImpl.java
1
package eu.dnetlib.pace.model;
2

  
3
import java.util.Collection;
4
import java.util.Iterator;
5
import java.util.List;
6
import java.util.ListIterator;
7

  
8
import com.google.common.base.Function;
9
import com.google.common.base.Joiner;
10
import com.google.common.collect.Iterables;
11
import com.google.common.collect.Lists;
12

  
13
import eu.dnetlib.pace.config.Type;
14

  
15
/**
16
 * The Class FieldListImpl.
17
 */
18
public class FieldListImpl extends AbstractField implements FieldList {
19

  
20
	/** The fields. */
21
	private List<Field> fields;
22

  
23
	/**
24
	 * Instantiates a new field list impl.
25
	 */
26
	public FieldListImpl() {
27
		fields = Lists.newArrayList();
28
	}
29

  
30
	/**
31
	 * Instantiates a new field list impl.
32
	 * 
33
	 * @param name
34
	 *            the name
35
	 */
36
	public FieldListImpl(final String name) {
37
		super(Type.List, name);
38
		fields = Lists.newArrayList();
39
	}
40

  
41
	/*
42
	 * (non-Javadoc)
43
	 * 
44
	 * @see java.util.List#add(java.lang.Object)
45
	 */
46
	@Override
47
	public boolean add(final Field f) {
48
		return fields.add(f);
49
	}
50

  
51
	/*
52
	 * (non-Javadoc)
53
	 * 
54
	 * @see java.util.List#add(int, java.lang.Object)
55
	 */
56
	@Override
57
	public void add(final int i, final Field f) {
58
		fields.add(i, f);
59
	}
60

  
61
	/*
62
	 * (non-Javadoc)
63
	 * 
64
	 * @see java.util.List#addAll(java.util.Collection)
65
	 */
66
	@Override
67
	public boolean addAll(final Collection<? extends Field> f) {
68
		return fields.addAll(f);
69
	}
70

  
71
	/*
72
	 * (non-Javadoc)
73
	 * 
74
	 * @see java.util.List#addAll(int, java.util.Collection)
75
	 */
76
	@Override
77
	public boolean addAll(final int i, final Collection<? extends Field> f) {
78
		return fields.addAll(i, f);
79
	}
80

  
81
	/*
82
	 * (non-Javadoc)
83
	 * 
84
	 * @see java.util.List#clear()
85
	 */
86
	@Override
87
	public void clear() {
88
		fields.clear();
89
	}
90

  
91
	/*
92
	 * (non-Javadoc)
93
	 * 
94
	 * @see java.util.List#contains(java.lang.Object)
95
	 */
96
	@Override
97
	public boolean contains(final Object o) {
98
		return fields.contains(o);
99
	}
100

  
101
	/*
102
	 * (non-Javadoc)
103
	 * 
104
	 * @see java.util.List#containsAll(java.util.Collection)
105
	 */
106
	@Override
107
	public boolean containsAll(final Collection<?> f) {
108
		return fields.containsAll(f);
109
	}
110

  
111
	/*
112
	 * (non-Javadoc)
113
	 * 
114
	 * @see java.util.List#get(int)
115
	 */
116
	@Override
117
	public Field get(final int i) {
118
		return fields.get(i);
119
	}
120

  
121
	/*
122
	 * (non-Javadoc)
123
	 * 
124
	 * @see java.util.List#indexOf(java.lang.Object)
125
	 */
126
	@Override
127
	public int indexOf(final Object o) {
128
		return fields.indexOf(o);
129
	}
130

  
131
	/*
132
	 * (non-Javadoc)
133
	 * 
134
	 * @see eu.dnetlib.pace.model.Field#isEmpty()
135
	 */
136
	@Override
137
	public boolean isEmpty() {
138
		return fields.isEmpty();
139
	}
140

  
141
	/*
142
	 * (non-Javadoc)
143
	 * 
144
	 * @see java.lang.Iterable#iterator()
145
	 */
146
	@Override
147
	public Iterator<Field> iterator() {
148
		return fields.iterator();
149
	}
150

  
151
	/*
152
	 * (non-Javadoc)
153
	 * 
154
	 * @see java.util.List#lastIndexOf(java.lang.Object)
155
	 */
156
	@Override
157
	public int lastIndexOf(final Object o) {
158
		return fields.lastIndexOf(o);
159
	}
160

  
161
	/*
162
	 * (non-Javadoc)
163
	 * 
164
	 * @see java.util.List#listIterator()
165
	 */
166
	@Override
167
	public ListIterator<Field> listIterator() {
168
		return fields.listIterator();
169
	}
170

  
171
	/*
172
	 * (non-Javadoc)
173
	 * 
174
	 * @see java.util.List#listIterator(int)
175
	 */
176
	@Override
177
	public ListIterator<Field> listIterator(final int i) {
178
		return fields.listIterator(i);
179
	}
180

  
181
	/*
182
	 * (non-Javadoc)
183
	 * 
184
	 * @see java.util.List#remove(java.lang.Object)
185
	 */
186
	@Override
187
	public boolean remove(final Object o) {
188
		return fields.remove(o);
189
	}
190

  
191
	/*
192
	 * (non-Javadoc)
193
	 * 
194
	 * @see java.util.List#remove(int)
195
	 */
196
	@Override
197
	public Field remove(final int i) {
198
		return fields.remove(i);
199
	}
200

  
201
	/*
202
	 * (non-Javadoc)
203
	 * 
204
	 * @see java.util.List#removeAll(java.util.Collection)
205
	 */
206
	@Override
207
	public boolean removeAll(final Collection<?> f) {
208
		return fields.removeAll(f);
209
	}
210

  
211
	/*
212
	 * (non-Javadoc)
213
	 * 
214
	 * @see java.util.List#retainAll(java.util.Collection)
215
	 */
216
	@Override
217
	public boolean retainAll(final Collection<?> f) {
218
		return fields.retainAll(f);
219
	}
220

  
221
	/*
222
	 * (non-Javadoc)
223
	 * 
224
	 * @see java.util.List#set(int, java.lang.Object)
225
	 */
226
	@Override
227
	public Field set(final int i, final Field f) {
228
		return fields.set(i, f);
229
	}
230

  
231
	/*
232
	 * (non-Javadoc)
233
	 * 
234
	 * @see java.util.List#size()
235
	 */
236
	@Override
237
	public int size() {
238
		return fields.size();
239
	}
240

  
241
	/*
242
	 * (non-Javadoc)
243
	 * 
244
	 * @see java.util.List#subList(int, int)
245
	 */
246
	@Override
247
	public List<Field> subList(final int from, final int to) {
248
		return fields.subList(from, to);
249
	}
250

  
251
	/*
252
	 * (non-Javadoc)
253
	 * 
254
	 * @see java.util.List#toArray()
255
	 */
256
	@Override
257
	public Object[] toArray() {
258
		return fields.toArray();
259
	}
260

  
261
	/*
262
	 * (non-Javadoc)
263
	 * 
264
	 * @see java.util.List#toArray(java.lang.Object[])
265
	 */
266
	@Override
267
	public <T> T[] toArray(final T[] t) {
268
		return fields.toArray(t);
269
	}
270

  
271
	/*
272
	 * (non-Javadoc)
273
	 * 
274
	 * @see eu.dnetlib.pace.model.Field#stringValue()
275
	 */
276
	@Override
277
	public String stringValue() {
278
		return Joiner.on(" ").join(stringList());
279
	}
280

  
281
	/*
282
	 * (non-Javadoc)
283
	 * 
284
	 * @see eu.dnetlib.pace.model.FieldList#stringList()
285
	 */
286
	@Override
287
	public List<String> stringList() {
288
		return Lists.newArrayList(Iterables.transform(fields, new Function<Field, String>() {
289

  
290
			@Override
291
			public String apply(final Field f) {
292
				return f.stringValue();
293
			}
294
		}));
295
	}
296

  
297
	@Override
298
	public String toString() {
299
		return stringList().toString();
300
	}
301

  
302
}
modules/dnet-pace-core/trunk/src/main/java/eu/dnetlib/pace/model/FieldValueImpl.java
1
package eu.dnetlib.pace.model;
2

  
3
import java.util.Iterator;
4
import java.util.List;
5

  
6
import org.apache.commons.collections.iterators.SingletonIterator;
7

  
8
import eu.dnetlib.pace.config.Type;
9

  
10
/**
11
 * The Class FieldValueImpl.
12
 */
13
public class FieldValueImpl extends AbstractField implements FieldValue {
14

  
15
	/** The value. */
16
	private Object value = null;
17

  
18
	/**
19
	 * Instantiates a new field value impl.
20
	 */
21
	public FieldValueImpl() {}
22

  
23
	/**
24
	 * Instantiates a new field value impl.
25
	 * 
26
	 * @param type
27
	 *            the type
28
	 * @param name
29
	 *            the name
30
	 * @param value
31
	 *            the value
32
	 */
33
	public FieldValueImpl(final Type type, final String name, final Object value) {
34
		super(type, name);
35
		this.value = value;
36
	}
37

  
38
	/*
39
	 * (non-Javadoc)
40
	 * 
41
	 * @see eu.dnetlib.pace.model.Field#isEmpty()
42
	 */
43
	@Override
44
	public boolean isEmpty() {
45
		if (value == null) return false;
46

  
47
		switch (type) {
48
		case String:
49
			return value.toString().isEmpty();
50
		case List:
51
			List<?> list = (List<?>) value;
52
			return list.isEmpty() || ((FieldValueImpl) list.get(0)).isEmpty();
53
		default:
54
			return true;
55
		}
56
	}
57

  
58
	/*
59
	 * (non-Javadoc)
60
	 * 
61
	 * @see eu.dnetlib.pace.model.FieldValue#getValue()
62
	 */
63
	@Override
64
	public Object getValue() {
65
		return value;
66
	}
67

  
68
	/*
69
	 * (non-Javadoc)
70
	 * 
71
	 * @see eu.dnetlib.pace.model.FieldValue#setValue(java.lang.Object)
72
	 */
73
	@Override
74
	public void setValue(final Object value) {
75
		this.value = value;
76
	}
77

  
78
	/*
79
	 * (non-Javadoc)
80
	 * 
81
	 * @see eu.dnetlib.pace.model.Field#stringValue()
82
	 */
83
	@Override
84
	// @SuppressWarnings("unchecked")
85
	public String stringValue() {
86
		return String.valueOf(getValue());
87
		// switch (getType()) {
88
		//
89
		// case Int:
90
		// return String.valueOf(getValue());
91
		// case List:
92
		// return Joiner.on(" ").join((List<String>) getValue());
93
		// case String:
94
		// return (String) getValue();
95
		// default:
96
		// throw new IllegalArgumentException("Unknown type: " + getType().toString());
97
		// }
98
	}
99

  
100
	/*
101
	 * (non-Javadoc)
102
	 * 
103
	 * @see java.lang.Iterable#iterator()
104
	 */
105
	@Override
106
	@SuppressWarnings("unchecked")
107
	public Iterator<Field> iterator() {
108
		return new SingletonIterator(this);
109
	}
110

  
111
}
modules/dnet-pace-core/trunk/src/main/java/eu/dnetlib/pace/model/FieldDef.java
1 1
package eu.dnetlib.pace.model;
2 2

  
3
import java.util.List;
4

  
5
import com.google.common.base.Splitter;
6
import com.google.common.collect.Lists;
7

  
3 8
import eu.dnetlib.pace.config.Type;
4 9
import eu.dnetlib.pace.distance.DistanceAlgo;
5 10

  
6 11
/**
7
 * The schema is composed by field definitions (FieldDef). Each field has a type, a name, and an associated distance
8
 * algorithm.
12
 * The schema is composed of field definitions (FieldDef). Each field has a type, a name, and an associated distance algorithm.
9 13
 */
10 14
public class FieldDef {
11 15

  
......
13 17

  
14 18
	private String name;
15 19

  
20
	private String path;
21

  
16 22
	private DistanceAlgo algo;
17 23

  
18 24
	private boolean ignoreMissing;
19 25

  
20
	public FieldDef(String name, DistanceAlgo algo, boolean ignoreMissing) {
26
	public FieldDef(final String name, final String path, final DistanceAlgo algo, final boolean ignoreMissing) {
21 27
		this.name = name;
28
		this.path = path;
22 29
		this.algo = algo;
23 30
		this.ignoreMissing = ignoreMissing;
24 31
	}
25 32

  
26
	//def apply(s: String): Field[A]
27
	public Field apply(Type type, String s) {
33
	// def apply(s: String): Field[A]
34
	public Field apply(final Type type, final String s) {
28 35
		switch (type) {
29 36
		case Int:
30
			return new Field(type, name, Integer.parseInt(s));
37
			return new FieldValueImpl(type, name, Integer.parseInt(s));
31 38
		case String:
32
			return new Field(type, name, s);
39
			return new FieldValueImpl(type, name, s);
40
		case List:
41
			return new FieldListImpl(name);
33 42
		default:
34 43
			throw new IllegalArgumentException("Casting not implemented for type " + type);
35 44
		}
36 45
	}
37 46

  
38 47
	public String getName() {
39
		return name.split(PATH_SEPARATOR)[0];
48
		return name;
40 49
	}
41 50

  
51
	public String getPath() {
52
		return path;
53
	}
54

  
55
	public List<String> getPathList() {
56
		return Lists.newArrayList(Splitter.on(PATH_SEPARATOR).split(getPath()));
57
	}
58

  
42 59
	public DistanceAlgo getAlgo() {
43 60
		return algo;
44 61
	}
......
47 64
		return ignoreMissing;
48 65
	}
49 66

  
50
	public String getPath() {
51
		return name;
52
	}
53

  
54 67
	@Override
55 68
	public String toString() {
56
		return getPath() + " { algo='" + getAlgo().getClass().getSimpleName() + "' weigth='" + getAlgo().getWeight() + "' ignoreMissing='"
57
				+ isIgnoreMissing() + "' }";
69
		return getName() + " { \n\talgo='" + getAlgo().getClass().getSimpleName() + "' \n\tweigth='" + getAlgo().getWeight() + "' \n\tignoreMissing='"
70
				+ isIgnoreMissing() + "'\n }";
58 71
	}
72

  
59 73
}
modules/dnet-pace-core/trunk/src/main/java/eu/dnetlib/pace/model/MapDocumentComparator.java
4 4

  
5 5
import eu.dnetlib.pace.clustering.NGramUtils;
6 6

  
7
/**
8
 * The Class MapDocumentComparator.
9
 */
7 10
public class MapDocumentComparator implements Comparator<Document> {
8 11

  
12
	/** The comparator field. */
9 13
	private String comparatorField;
10 14

  
11
	public MapDocumentComparator(String comparatorField) {
15
	/**
16
	 * Instantiates a new map document comparator.
17
	 * 
18
	 * @param comparatorField
19
	 *            the comparator field
20
	 */
21
	public MapDocumentComparator(final String comparatorField) {
12 22
		this.comparatorField = comparatorField;
13 23
	}
14 24

  
25
	/*
26
	 * (non-Javadoc)
27
	 * 
28
	 * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object)
29
	 */
15 30
	@Override
16
	public int compare(Document d1, Document d2) {
31
	public int compare(final Document d1, final Document d2) {
17 32

  
18
		if (d1.values(comparatorField).isEmpty() || d2.values(comparatorField).isEmpty()) {
19
			return 0;
20
		}
33
		if (d1.values(comparatorField).isEmpty() || d2.values(comparatorField).isEmpty()) return 0;
21 34

  
22
		Object o1 = d1.values(comparatorField).get(0).getValue();
23
		Object o2 = d2.values(comparatorField).get(0).getValue();
35
		String o1 = d1.values(comparatorField).get(0).stringValue();
36
		String o2 = d2.values(comparatorField).get(0).stringValue();
24 37

  
25
		if (o1 == null || o2 == null) {
26
			return 0;
27
		}
38
		if ((o1 == null) || (o2 == null)) return 0;
28 39

  
29
		String to1 = NGramUtils.cleanupForOrdering(o1.toString());
30
		String to2 = NGramUtils.cleanupForOrdering(o2.toString());
40
		String to1 = NGramUtils.cleanupForOrdering(o1);
41
		String to2 = NGramUtils.cleanupForOrdering(o2);
31 42

  
32 43
		return to1.compareTo(to2);
33 44
	}
modules/dnet-pace-core/trunk/src/main/java/eu/dnetlib/pace/model/MapDocumentSerializer.java
1 1
package eu.dnetlib.pace.model;
2 2

  
3
import java.lang.reflect.Type;
4

  
3 5
import com.google.gson.Gson;
6
import com.google.gson.GsonBuilder;
7
import com.google.gson.InstanceCreator;
8
import com.google.gson.JsonDeserializationContext;
9
import com.google.gson.JsonDeserializer;
10
import com.google.gson.JsonElement;
11
import com.google.gson.JsonParseException;
4 12

  
5
public class MapDocumentSerializer {
6
	public static MapDocument decode(byte[] bytes) {
7
		return new Gson().fromJson(new String(bytes), MapDocument.class);
13
/**
14
 * The Class MapDocumentSerializer.
15
 */
16
public class MapDocumentSerializer implements InstanceCreator<MapDocument> {
17

  
18
	@Override
19
	public MapDocument createInstance(final Type type) {
20
		return new MapDocument();
8 21
	}
9 22

  
10
	public static String toString(MapDocument doc) {
23
	/**
24
	 * Decode.
25
	 *
26
	 * @param bytes
27
	 *            the bytes
28
	 * @return the map document
29
	 */
30
	public static MapDocument decode(final byte[] bytes) {
31

  
32
		GsonBuilder gson = new GsonBuilder();
33

  
34
		gson.registerTypeAdapter(Field.class, new JsonDeserializer<Field>() {
35

  
36
			@Override
37
			public Field deserialize(final JsonElement json, final Type typeOfT, final JsonDeserializationContext context) throws JsonParseException {
38
				FieldListImpl fl = new FieldListImpl();
39
				if (json.isJsonObject()) {
40
					String name = json.getAsJsonObject().get("name").getAsString();
41
					String type = json.getAsJsonObject().get("type").getAsString();
42
					String value = json.getAsJsonObject().get("value").getAsString();
43
					fl.add(new FieldValueImpl(eu.dnetlib.pace.config.Type.valueOf(type), name, value));
44
				}
45
				return fl;
46
			}
47
		});
48

  
49
		return gson.create().fromJson(new String(bytes), MapDocument.class);
50
	}
51

  
52
	/**
53
	 * To string.
54
	 *
55
	 * @param doc
56
	 *            the doc
57
	 * @return the string
58
	 */
59
	public static String toString(final MapDocument doc) {
11 60
		return new Gson().toJson(doc);
12 61
	}
13 62

  
14
	public static byte[] toByteArray(MapDocument doc) {
63
	/**
64
	 * To byte array.
65
	 *
66
	 * @param doc
67
	 *            the doc
68
	 * @return the byte[]
69
	 */
70
	public static byte[] toByteArray(final MapDocument doc) {
15 71
		return toString(doc).getBytes();
16 72
	}
17 73

  
modules/dnet-pace-core/trunk/src/main/java/eu/dnetlib/pace/model/AbstractField.java
1
package eu.dnetlib.pace.model;
2

  
3
import eu.dnetlib.pace.config.Type;
4

  
5
/**
6
 * The Class AbstractField.
7
 */
8
public abstract class AbstractField implements Field {
9

  
10
	/** The type. */
11
	protected Type type = Type.String;
12

  
13
	/** The name. */
14
	protected String name;
15

  
16
	/**
17
	 * Instantiates a new abstract field.
18
	 */
19
	protected AbstractField() {}
20

  
21
	/**
22
	 * Instantiates a new abstract field.
23
	 *
24
	 * @param type
25
	 *            the type
26
	 * @param name
27
	 *            the name
28
	 */
29
	protected AbstractField(final Type type, final String name) {
30
		this.type = type;
31
		this.name = name;
32
	}
33

  
34
	/*
35
	 * (non-Javadoc)
36
	 * 
37
	 * @see eu.dnetlib.pace.model.Field#getName()
38
	 */
39
	@Override
40
	public String getName() {
41
		return name;
42
	}
43

  
44
	/*
45
	 * (non-Javadoc)
46
	 * 
47
	 * @see eu.dnetlib.pace.model.Field#getType()
48
	 */
49
	@Override
50
	public Type getType() {
51
		return type;
52
	}
53

  
54
	/*
55
	 * (non-Javadoc)
56
	 * 
57
	 * @see eu.dnetlib.pace.model.Field#setName(java.lang.String)
58
	 */
59
	@Override
60
	public void setName(final String name) {
61
		this.name = name;
62
	}
63

  
64
	/*
65
	 * (non-Javadoc)
66
	 * 
67
	 * @see eu.dnetlib.pace.model.Field#setType(eu.dnetlib.pace.config.Type)
68
	 */
69
	@Override
70
	public void setType(final Type type) {
71
		this.type = type;
72
	}
73

  
74
}
modules/dnet-pace-core/trunk/src/main/java/eu/dnetlib/pace/model/MapDocument.java
1 1
package eu.dnetlib.pace.model;
2 2

  
3
import java.util.List;
4 3
import java.util.Map;
5 4
import java.util.Set;
6 5

  
......
8 7
import com.google.common.collect.Lists;
9 8
import com.google.common.collect.Maps;
10 9

  
10
/**
11
 * The Class MapDocument.
12
 */
11 13
public class MapDocument implements Document {
12 14

  
15
	/** The identifier. */
13 16
	private String identifier;
14
	private Map<String, List<Field>> fieldMap;
15 17

  
18
	/** The field map. */
19
	private Map<String, FieldListImpl> fieldMap;
20

  
21
	/**
22
	 * Instantiates a new map document.
23
	 */
16 24
	public MapDocument() {
17 25
		identifier = null;
18 26
		fieldMap = Maps.newHashMap();
19 27
	}
20 28

  
21
	public MapDocument(String identifier, Map<String, List<Field>> fieldMap) {
29
	/**
30
	 * Instantiates a new map document.
31
	 *
32
	 * @param identifier
33
	 *            the identifier
34
	 * @param fieldMap
35
	 *            the field map
36
	 */
37
	public MapDocument(final String identifier, final Map<String, FieldListImpl> fieldMap) {
22 38
		this.setIdentifier(identifier);
23 39
		this.fieldMap = fieldMap;
24 40
	}
25
	
26
	public MapDocument(String identifier, byte[] data) {
41

  
42
	/**
43
	 * Instantiates a new map document.
44
	 *
45
	 * @param identifier
46
	 *            the identifier (note: not used by this constructor; the identifier decoded from data is kept)
47
	 * @param data
48
	 *            the data
49
	 */
50
	public MapDocument(final String identifier, final byte[] data) {
27 51
		MapDocument doc = MapDocumentSerializer.decode(data);
28
		
52

  
29 53
		this.fieldMap = doc.fieldMap;
30 54
		this.identifier = doc.identifier;
31 55
	}
32 56

  
57
	/*
58
	 * (non-Javadoc)
59
	 *
60
	 * @see eu.dnetlib.pace.model.Document#fields()
61
	 */
33 62
	@Override
34 63
	public Iterable<Field> fields() {
35 64
		return Lists.newArrayList(Iterables.concat(fieldMap.values()));
36 65
	}
37 66

  
67
	/*
68
	 * (non-Javadoc)
69
	 *
70
	 * @see eu.dnetlib.pace.model.Document#values(java.lang.String)
71
	 */
38 72
	@Override
39
	public List<Field> values(String name) {
73
	public FieldList values(final String name) {
40 74
		return fieldMap.get(name);
41 75
	}
42
	
76

  
77
	/*
78
	 * (non-Javadoc)
79
	 *
80
	 * @see eu.dnetlib.pace.model.Document#fieldNames()
81
	 */
43 82
	@Override
44 83
	public Set<String> fieldNames() {
45 84
		return fieldMap.keySet();
46 85
	}
47 86

  
87
	/*
88
	 * (non-Javadoc)
89
	 *
90
	 * @see java.lang.Object#toString()
91
	 */
48 92
	@Override
49 93
	public String toString() {
50 94
		return MapDocumentSerializer.toString(this);
51
		//return String.format("Document(%s)", fieldMap.toString());
95
		// return String.format("Document(%s)", fieldMap.toString());
52 96
	}
53 97

  
98
	/**
99
	 * To byte array.
100
	 *
101
	 * @return the byte[]
102
	 */
54 103
	public byte[] toByteArray() {
55 104
		return MapDocumentSerializer.toByteArray(this);
56 105
	}
57 106

  
107
	/*
108
	 * (non-Javadoc)
109
	 *
110
	 * @see eu.dnetlib.pace.model.Document#getIdentifier()
111
	 */
58 112
	@Override
59 113
	public String getIdentifier() {
60 114
		return identifier;
61 115
	}
62 116

  
63
	public void setIdentifier(String identifier) {
117
	/**
118
	 * Sets the identifier.
119
	 *
120
	 * @param identifier
121
	 *            the new identifier
122
	 */
123
	public void setIdentifier(final String identifier) {
64 124
		this.identifier = identifier;
65 125
	}
66 126

  
67
	public Map<String, List<Field>> getFieldMap() {
127
	/**
128
	 * Gets the field map.
129
	 *
130
	 * @return the field map
131
	 */
132
	public Map<String, FieldListImpl> getFieldMap() {
68 133
		return fieldMap;
69 134
	}
70 135

  
71
	public void setFieldMap(Map<String, List<Field>> fieldMap) {
136
	/**
137
	 * Sets the field map.
138
	 *
139
	 * @param fieldMap
140
	 *            the field map
141
	 */
142
	public void setFieldMap(final Map<String, FieldListImpl> fieldMap) {
72 143
		this.fieldMap = fieldMap;
73 144
	}
74 145

  
modules/dnet-pace-core/trunk/src/main/java/eu/dnetlib/pace/model/Document.java
1 1
package eu.dnetlib.pace.model;
2 2

  
3
import java.util.List;
4 3
import java.util.Set;
5 4

  
5
/**
6
 * The Interface Document. Models the common operations available on a Pace Document.
7
 */
6 8
public interface Document {
9

  
10
	/**
11
	 * Gets the identifier.
12
	 * 
13
	 * @return the identifier
14
	 */
7 15
	String getIdentifier();
8 16

  
17
	/**
18
	 * Fields.
19
	 * 
20
	 * @return the iterable
21
	 */
9 22
	Iterable<Field> fields();
10 23

  
11
	List<Field> values(String name);
12
	
24
	/**
25
	 * Values.
26
	 * 
27
	 * @param name
28
	 *            the name
29
	 * @return the field list
30
	 */
31
	FieldList values(String name);
32

  
33
	/**
34
	 * Field names.
35
	 * 
36
	 * @return the sets the
37
	 */
13 38
	Set<String> fieldNames();
14 39
}
modules/dnet-pace-core/trunk/src/main/java/eu/dnetlib/pace/model/FieldList.java
1
package eu.dnetlib.pace.model;
2

  
3
import java.util.List;
4

  
5
/**
6
 * The Interface FieldList.
7
 */
8
public interface FieldList extends List<Field>, Field {
9

  
10
	/**
11
	 * String list.
12
	 * 
13
	 * @return the list
14
	 */
15
	public List<String> stringList();
16

  
17
}
modules/dnet-pace-core/trunk/src/main/java/eu/dnetlib/pace/model/FieldValue.java
1
package eu.dnetlib.pace.model;
2

  
3
/**
4
 * The Interface FieldValue.
5
 */
6
public interface FieldValue extends Field {
7

  
8
	/**
9
	 * Gets the value.
10
	 * 
11
	 * @return the value
12
	 */
13
	public Object getValue();
14

  
15
	/**
16
	 * Sets the value.
17
	 * 
18
	 * @param value
19
	 *            the new value
20
	 */
21
	public void setValue(final Object value);
22

  
23
}
modules/dnet-pace-core/trunk/src/main/java/eu/dnetlib/pace/model/Field.java
1 1
package eu.dnetlib.pace.model;
2 2

  
3
import java.util.List;
4

  
5
import com.google.common.base.Joiner;
6

  
7 3
import eu.dnetlib.pace.config.Type;
8 4

  
9
public class Field {
5
/**
6
 * The Interface Field.
7
 */
8
public interface Field extends Iterable<Field> {
10 9

  
11
	private Type type = Type.String;
12
	private String name;
13
	private Object value = null;
10
	/**
11
	 * Gets the name.
12
	 * 
13
	 * @return the name
14
	 */
15
	public String getName();
14 16

  
15
	public Field() {
16
	}
17
	/**
18
	 * Sets the name.
19
	 * 
20
	 * @param name
21
	 *            the new name
22
	 */
23
	public void setName(String name);
17 24

  
18
	public Field(Type type, String name, Object value) {
19
		this.type = type;
20
		this.name = name;
21
		this.value = value;
22
	}
25
	/**
26
	 * Gets the type.
27
	 * 
28
	 * @return the type
29
	 */
30
	public Type getType();
23 31

  
24
	public boolean isEmpty() {
25
		if (value == null) {
26
			return false;
27
		}
32
	/**
33
	 * Sets the type.
34
	 * 
35
	 * @param type
36
	 *            the new type
37
	 */
38
	public void setType(Type type);
28 39

  
29
		switch (type) {
30
		case String:
31
			return value.toString().isEmpty();
32
		case List:
33
			List<?> list = (List<?>) value;
34
			return list.isEmpty() || ((Field) list.get(0)).isEmpty();
35
		default:
36
			return true;
37
		}
38
	}
40
	/**
41
	 * Checks if is empty.
42
	 * 
43
	 * @return true, if is empty
44
	 */
45
	public boolean isEmpty();
39 46

  
40
	public Object getValue() {
41
		return value;
42
	}
47
	/**
48
	 * String value.
49
	 * 
50
	 * @return the string
51
	 */
52
	public String stringValue();
43 53

  
44
	public void setValue(Object value) {
45
		this.value = value;
46
	}
47

  
48
	public Type getType() {
49
		return type;
50
	}
51

  
52
	public void setType(Type type) {
53
		this.type = type;
54
	}
55

  
56
	public String getName() {
57
		return name;
58
	}
59

  
60
	public void setName(String name) {
61
		this.name = name;
62
	}
63
	
64
	@SuppressWarnings("unchecked")
65
	public String stringValue() {
66
		switch(getType()) {
67
		case Int:
68
			return String.valueOf(getValue());
69
		case List:
70
			return Joiner.on(" ").join((List<String>) getValue());
71
		case String:
72
			return (String) getValue();
73
		default:
74
			throw new IllegalArgumentException("Unknown type: " + getType().toString());
75
		}
76
	}		
77

  
78 54
}
modules/dnet-pace-core/trunk/src/main/java/eu/dnetlib/pace/model/DocumentBuilder.java
1 1
package eu.dnetlib.pace.model;
2 2

  
3
import java.util.List;
4 3
import java.util.Map;
5 4

  
5
/**
6
 * The Class DocumentBuilder.
7
 */
6 8
public class DocumentBuilder {
7 9

  
8
	public static MapDocument newInstance(final String identifier, final Map<String, List<Field>> fieldMap) {
10
	/**
11
	 * New instance.
12
	 *
13
	 * @param identifier
14
	 *            the identifier
15
	 * @param fieldMap
16
	 *            the field map
17
	 * @return the map document
18
	 */
19
	public static MapDocument newInstance(final String identifier, final Map<String, FieldListImpl> fieldMap) {
9 20
		return new MapDocument(identifier, fieldMap);
10 21
	}
11 22

  
23
	/**
24
	 * New instance.
25
	 *
26
	 * @param identifier
27
	 *            the identifier
28
	 * @param fieldMap
29
	 *            the field map
30
	 * @return the map document
31
	 */
12 32
	public static MapDocument newInstance(final String identifier, final byte[] fieldMap) {
13 33
		return new MapDocument(identifier, fieldMap);
14 34
	}
modules/dnet-pace-core/trunk/src/main/java/eu/dnetlib/pace/config/ConfigurableModel.java
4 4
import java.util.List;
5 5
import java.util.Map;
6 6
import java.util.Map.Entry;
7
import java.util.Set;
7 8

  
8 9
import com.google.common.base.Function;
9 10
import com.google.common.base.Predicate;
10 11
import com.google.common.collect.Iterables;
11 12
import com.google.common.collect.Lists;
12 13
import com.google.common.collect.Maps;
14
import com.google.common.collect.Sets;
13 15

  
14 16
import eu.dnetlib.pace.clustering.Acronyms;
15 17
import eu.dnetlib.pace.clustering.Clustering;
......
20 22
import eu.dnetlib.pace.clustering.SuffixPrefix;
21 23
import eu.dnetlib.pace.condition.AlwaysTrueCondition;
22 24
import eu.dnetlib.pace.condition.ConditionAlgo;
25
import eu.dnetlib.pace.condition.DoiExactMatch;
26
import eu.dnetlib.pace.condition.ExactMatch;
27
import eu.dnetlib.pace.condition.SizeMatch;
23 28
import eu.dnetlib.pace.condition.TitleVersionMatch;
24 29
import eu.dnetlib.pace.condition.YearMatch;
25 30
import eu.dnetlib.pace.distance.DistanceAlgo;
......
29 34
import eu.dnetlib.pace.distance.Level2Levenstein;
30 35
import eu.dnetlib.pace.distance.Levenstein;
31 36
import eu.dnetlib.pace.distance.NullDistanceAlgo;
37
import eu.dnetlib.pace.distance.SortedJaroWinkler;
38
import eu.dnetlib.pace.distance.SortedLevel2JaroWinkler;
32 39
import eu.dnetlib.pace.distance.SubStringLevenstein;
33 40
import eu.dnetlib.pace.distance.YearLevenstein;
34 41
import eu.dnetlib.pace.model.ClusteringDef;
......
51 58
	}
52 59

  
53 60
	@Override
61
	public List<CondDef> strictConditions() {
62
		return parseConds("strict");
63
	}
64

  
65
	@Override
54 66
	public List<CondDef> conditions() {
55 67
		return parseConds("");
56 68
	}
......
72 84

  
73 85
	@Override
74 86
	public FieldDef identifierFieldDef() {
75
		return new FieldDef(identifierField(), new NullDistanceAlgo(), false);
87
		return new FieldDef(identifierField(), null, new NullDistanceAlgo(), false);
76 88
	}
77 89

  
78 90
	private List<FieldDef> parseFields(final String base) {
......
85 97

  
86 98
				final String name = e.getKey();
87 99

  
100
				final String path = config.getString(String.format("pace.conf.model%s.%s.path", base, name));
88 101
				double weight = config.getDouble(String.format("pace.conf.model%s.%s.weight", base, name));
89
				boolean ignoreMissing = config.getBoolean(String.format("pace.conf.model%s.%s.ignoreMissing", base, name));
102
				Boolean ignoreMissing = config.getBoolean(String.format("pace.conf.model%s.%s.ignoreMissing", base, name));
90 103
				// Type type = Type.valueOf(config.getString(String.format("pace.conf.model%s.%s.type", base, name)));
91 104

  
92
				return new FieldDef(name, getAlgo(base, name, weight), ignoreMissing);
105
				return new FieldDef(name, path, getAlgo(base, name, weight), ignoreMissing);
93 106
			}
94 107

  
95 108
			private DistanceAlgo getAlgo(final String base, final String name, final double w) {
......
108 121
					return new SubStringLevenstein(w, config.getInt(String.format("pace.conf.model%s.%s.limit", base, name)));
109 122
				case YearLevenstein:
110 123
					return new YearLevenstein(w, config.getInt(String.format("pace.conf.model%s.%s.limit", base, name)));
124
				case SortedJaroWinkler:
125
					return new SortedJaroWinkler(w);
126
				case SortedLevel2JaroWinkler:
127
					return new SortedLevel2JaroWinkler(w);
111 128
				case Null:
112 129
					return new NullDistanceAlgo();
113 130
				default:
......
119 136

  
120 137
	public List<CondDef> parseConds(final String base) {
121 138
		@SuppressWarnings("unchecked")
122
		final Map<String, ?> modelMap = (Map<String, ?>) config.getObject("pace.conf.conditions");
139
		final Map<String, ?> modelMap = (Map<String, ?>) config.getObject(String.format("pace.conf.%sconditions", base));
123 140
		return Lists.newArrayList(Iterables.transform(filter(modelMap).entrySet(), new Function<Entry<String, ?>, CondDef>() {
124 141

  
125 142
			@Override
126 143
			public CondDef apply(final Entry<String, ?> e) {
127 144

  
128 145
				final Cond condName = Cond.valueOf(e.getKey());
129
				final List<String> fields = config.getList(String.format("pace.conf.conditions%s.%s.fields", base, e.getKey()));
146
				final List<String> fieldList = config.getList(String.format("pace.conf.%sconditions.%s.fields", base, e.getKey()));
147
				final Set<String> fieldSet = Sets.newHashSet(fieldList);
130 148

  
149
				final List<FieldDef> fields = Lists.newArrayList(Iterables.filter(fields(), new Predicate<FieldDef>() {
150

  
151
					@Override
152
					public boolean apply(final FieldDef fd) {
153
						return fieldSet.contains(fd.getName());
154
					}
155
				}));
156

  
131 157
				return new CondDef(getCondAlgo(fields, condName));
132 158
			}
133 159

  
134
			private ConditionAlgo getCondAlgo(final List<String> fields, final Cond condName) {
160
			private ConditionAlgo getCondAlgo(final List<FieldDef> fields, final Cond condName) {
135 161
				switch (condName) {
136 162
				case yearMatch:
137 163
					return new YearMatch(fields);
138 164
				case titleVersionMatch:
139 165
					return new TitleVersionMatch(fields);
166
				case sizeMatch:
167
					return new SizeMatch(fields);
168
				case exactMatch:
169
					return new ExactMatch(fields);
170
				case doiExactMatch:
171
					return new DoiExactMatch(fields);
140 172
				default:
141 173
					return new AlwaysTrueCondition(fields);
142 174
				}
......
156 188
				final List<String> fields = config.getList(String.format("pace.conf.clustering%s.%s.fields", base, e.getKey()));
157 189
				@SuppressWarnings("unchecked")
158 190
				final Map<String, Integer> params =
159
						(Map<String, Integer>) config.getObject(String.format("pace.conf.clustering%s.%s.params", base, e.getKey()));
191
				(Map<String, Integer>) config.getObject(String.format("pace.conf.clustering%s.%s.params", base, e.getKey()));
160 192

  
161 193
				return new ClusteringDef(clustering, getClusteringFunction(params, clustering), fields);
162 194
			}
modules/dnet-pace-core/trunk/src/main/java/eu/dnetlib/pace/config/Algo.java
1 1
package eu.dnetlib.pace.config;
2 2

  
3
/**
4
 * Enumerates the distance Algos.
5
 */
3 6
public enum Algo {
4
	JaroWinkler, JaroWinklerTitle, Levenstein, Level2JaroWinkler, Level2Levenstein, SubStringLevenstein, YearLevenstein, Null
7

  
8
	/** The Jaro winkler. */
9
	JaroWinkler,
10
	/** The Jaro winkler title. */
11
	JaroWinklerTitle,
12
	/** The Levenstein. */
13
	Levenstein,
14
	/** The Level2 jaro winkler. */
15
	Level2JaroWinkler,
16
	/** The Level2 levenstein. */
17
	Level2Levenstein,
18
	/** The Sub string levenstein. */
19
	SubStringLevenstein,
20
	/** The Year levenstein. */
21
	YearLevenstein,
22
	/** The Sorted jaro winkler. */
23
	SortedJaroWinkler,
24
	/** The Sorted level2 jaro winkler. */
25
	SortedLevel2JaroWinkler,
26
	/** The Null. */
27
	Null
5 28
}
modules/dnet-pace-core/trunk/src/main/java/eu/dnetlib/pace/config/Cond.java
1 1
package eu.dnetlib.pace.config;
2 2

  
3
/**
4
 * The Enum Cond.
5
 */
3 6
public enum Cond {
4
	yearMatch, titleVersionMatch
7

  
8
	/** The year match. */
9
	yearMatch,
10
	/** The title version match. */
11
	titleVersionMatch,
12
	/** The size match. */
13
	sizeMatch,
... This diff was truncated because it exceeds the maximum size that can be displayed.

Also available in: Unified diff