Revision 45806
Added by Eri Katsari about 7 years ago
MyComparator.java | ||
---|---|---|
6 | 6 |
public class MyComparator { |
7 | 7 |
private static double FIELDS_SIMILARITY_THRESHOLD = 0.7; |
8 | 8 |
private static double RECORD_SIMILARITY_THRESHOLD = 0.8; |
9 |
private static String SEPARATOR = ","; |
|
9 |
private static final String LINE_SEPERATOR = "\t.\t"; |
|
10 |
private static final String FIELD_DELIM = "\t"; |
|
10 | 11 |
private static Map<String, String> sourceRecordMappings = new HashMap<>(); |
11 | 12 |
|
12 | 13 |
static { |
13 | 14 |
//TODO remove later!!! make it configurable |
14 |
sourceRecordMappings.put("http://www.eurocris.org/ontologies/cerif/1.3#name",
|
|
15 |
"http://www.w3.org/2000/01/rdf-schema#label");
|
|
16 |
sourceRecordMappings.put("http://lod.openaire.eu/vocab/year", "http://purl.org/dc/terms/issued");
|
|
15 |
sourceRecordMappings.put("<http://www.eurocris.org/ontologies/cerif/1.3#name>",
|
|
16 |
"<http://www.w3.org/2000/01/rdf-schema#label>");
|
|
17 |
sourceRecordMappings.put("<http://lod.openaire.eu/vocab/year>", "<http://purl.org/dc/terms/issued>");
|
|
17 | 18 |
} |
18 | 19 |
|
19 | 20 |
public static double findMatchingPair(String source, String target) { |
21 |
Map<String, String> sourceRecordMap = getRecordsFiledMap(source); |
|
22 |
Map<String, String> targetRecordMap = getRecordsFiledMap(target); |
|
20 | 23 |
|
21 |
String[] sourceFields = source.split(SEPARATOR); |
|
22 |
Map<String, String> sourceFieldsMap = new HashMap<>(); |
|
23 |
for (int j = 0; j < sourceFields.length; j++) { |
|
24 |
String[] split = sourceFields[j].split("\t"); |
|
25 |
sourceFieldsMap.put(split[0], split[1]); |
|
26 |
} |
|
27 |
//get target fields |
|
28 |
String[] targetFields = target.split(","); |
|
29 |
Map<String, String> targetFieldsMap = new HashMap<>(); |
|
30 |
for (int j = 0; j < targetFields.length; j++) { |
|
31 |
String[] split = targetFields[j].split("\t"); |
|
32 |
targetFieldsMap.put(split[0], split[1]); |
|
33 |
} |
|
34 | 24 |
//similarity counters |
35 | 25 |
int matchedFields = 0; |
36 |
double totalFields = (double) targetFields.length;
|
|
26 |
double totalFields = (double) sourceRecordMappings.size();
|
|
37 | 27 |
double recordSimilarity = 0.0; |
38 | 28 |
|
39 |
for (Map.Entry<String, String> sourceField : sourceFieldsMap.entrySet()) {
|
|
29 |
for (Map.Entry<String, String> sourceField : sourceRecordMap.entrySet()) {
|
|
40 | 30 |
String correspondingTargetField = sourceRecordMappings.get(sourceField.getKey()); |
41 |
String targetFieldValue = targetFieldsMap.get(correspondingTargetField);
|
|
31 |
String targetFieldValue = targetRecordMap.get(correspondingTargetField);
|
|
42 | 32 |
double fieldsSimilarity = compare(sourceField.getValue(), targetFieldValue); |
43 | 33 |
System.out.println(sourceField + "\n" + targetFieldValue + "\n : field similarity: " + fieldsSimilarity + "\n-----------------------------------------"); |
44 | 34 |
recordSimilarity += fieldsSimilarity; |
... | ... | |
66 | 56 |
} |
67 | 57 |
} |
68 | 58 |
// System.out.println("Similar chars " + similarChars); |
69 |
return (sourceValue.length() >= targetValue.length() ? similarChars / (double) sourceValue.length() : similarChars / (double)targetValue.length());
|
|
59 |
return (sourceValue.length() >= targetValue.length() ? similarChars / (double) sourceValue.length() : similarChars / (double) targetValue.length());
|
|
70 | 60 |
} |
71 | 61 |
|
72 |
|
|
62 |
private static Map<String, String> getRecordsFiledMap(String source) { |
|
63 |
String sourceRecord = source.substring(source.indexOf(FIELD_DELIM) + 1).trim(); |
|
64 |
String[] sourceTriples = sourceRecord.split(LINE_SEPERATOR); |
|
65 |
Map<String, String> sourceFieldsMap = new HashMap<>(); |
|
66 |
for (String sourceTriple : sourceTriples) { |
|
67 |
String[] split = sourceTriple.split(FIELD_DELIM); |
|
68 |
sourceFieldsMap.put(split[0], split[1]); |
|
69 |
} |
|
70 |
return sourceFieldsMap; |
|
71 |
} |
|
73 | 72 |
} |
Also available in: Unified diff
'updates'