Revision 48805
Added by Eri Katsari almost 7 years ago
MyComparator.java | ||
---|---|---|
1 | 1 |
|
2 | 2 |
package eu.dnetlib.data.mapreduce.hbase.lodExport.utils.compators; |
3 | 3 |
|
4 |
import org.datanucleus.util.StringUtils; |
|
5 |
import org.hsqldb.lib.StringUtil; |
|
6 |
|
|
7 | 4 |
import java.util.HashMap; |
8 |
import java.util.Iterator; |
|
9 | 5 |
import java.util.Map; |
10 | 6 |
|
11 | 7 |
|
12 | 8 |
public class MyComparator { |
13 |
private static double FIELDS_SIMILARITY_THRESHOLD = 0.7D; |
|
14 | 9 |
private static final String LINE_SEPERATOR = "\t.\t"; |
15 | 10 |
private static final String FIELD_DELIM = "\t"; |
16 | 11 |
private static Map<String, String> sourceRecordMappings = new HashMap<>(); |
17 | 12 |
|
18 |
public static double findMatchingPair(String source, String target, boolean useLevDistance) { |
|
13 |
public static double computeSimilarity(DistanceAlgorithms algorithm, String source, String target) { |
|
14 |
switch (algorithm) { |
|
15 |
case LEVENSHTEIN: |
|
16 |
return computeLevenshteinSimilarity(source, target); |
|
17 |
case HAMMING: |
|
18 |
default: |
|
19 |
return computeDistanceSimilarity(source, target); |
|
20 |
} |
|
21 |
|
|
22 |
} |
|
23 |
|
|
24 |
private static double computeDistanceSimilarity(String source, String target) { |
|
19 | 25 |
Map<String, String> sourceRecordMap = getRecordsFiledMap(source); |
20 | 26 |
Map<String, String> targetRecordMap = getRecordsFiledMap(target); |
21 |
int matchedFields = 0; |
|
22 |
double totalFields = (double) sourceRecordMap.size() > (double) targetRecordMap.size() ? (double) sourceRecordMap.size() : (double) targetRecordMap.size(); |
|
27 |
double totalFields = getTotalFields(sourceRecordMap, targetRecordMap); |
|
23 | 28 |
double recordSimilarity = 0.0D; |
24 | 29 |
int maxFieldLength = 1; |
25 | 30 |
|
26 | 31 |
for (Map.Entry<String, String> sourceField : sourceRecordMap.entrySet()) { |
27 | 32 |
String correspondingTargetField = sourceRecordMappings.get(sourceField.getKey()); |
28 | 33 |
String targetFieldValue = targetRecordMap.get(correspondingTargetField); |
29 |
if (StringUtils.isEmpty(targetFieldValue)) { |
|
30 |
break; |
|
31 |
} |
|
34 |
if (targetFieldValue != null) { |
|
35 |
if (sourceField.getValue().length() > maxFieldLength) { |
|
36 |
maxFieldLength = sourceField.getValue().length(); |
|
37 |
} |
|
38 |
if (targetFieldValue.length() > maxFieldLength) { |
|
39 |
maxFieldLength = targetFieldValue.length(); |
|
40 |
} |
|
32 | 41 |
|
33 |
if (sourceField.getValue().length() > maxFieldLength) {
|
|
34 |
maxFieldLength = sourceField.getValue().length();
|
|
42 |
double fieldsSimilarity = DistanceCalculator.getSimpleDistance(sourceField.getValue(), targetFieldValue);
|
|
43 |
recordSimilarity += fieldsSimilarity;
|
|
35 | 44 |
} |
36 |
if (targetFieldValue.length() > maxFieldLength) { |
|
37 |
maxFieldLength = targetFieldValue.length(); |
|
38 |
} |
|
45 |
} |
|
39 | 46 |
|
47 |
return recordSimilarity / totalFields; |
|
48 |
} |
|
40 | 49 |
|
41 |
double fieldsSimilarity; |
|
42 |
if (useLevDistance) { |
|
43 |
fieldsSimilarity = DistanceCalculator.getLevenshteinDistance(sourceField.getValue(), targetFieldValue); |
|
44 |
} else { |
|
45 |
fieldsSimilarity = DistanceCalculator.getSimpleDistance(sourceField.getValue(), targetFieldValue); |
|
46 |
} |
|
50 |
private static double getTotalFields(Map<String, String> sourceRecordMap, Map<String, String> targetRecordMap) { |
|
51 |
return (double) (sourceRecordMap.size() > targetRecordMap.size() ? sourceRecordMap.size() : targetRecordMap.size()); |
|
52 |
} |
|
47 | 53 |
|
48 |
recordSimilarity += fieldsSimilarity; |
|
49 |
if (fieldsSimilarity >= FIELDS_SIMILARITY_THRESHOLD) { |
|
50 |
matchedFields++; |
|
54 |
private static double computeLevenshteinSimilarity(String source, String target) { |
|
55 |
Map<String, String> sourceRecordMap = getRecordsFiledMap(source); |
|
56 |
Map<String, String> targetRecordMap = getRecordsFiledMap(target); |
|
57 |
double recordSimilarity = 0.0D; |
|
58 |
int maxFieldLength = 1; |
|
59 |
double totalFields = getTotalFields(sourceRecordMap, targetRecordMap); |
|
60 |
|
|
61 |
for (Map.Entry<String, String> sourceField : sourceRecordMap.entrySet()) { |
|
62 |
String correspondingTargetField = sourceRecordMappings.get(sourceField.getKey()); |
|
63 |
String targetFieldValue = targetRecordMap.get(correspondingTargetField); |
|
64 |
if (targetFieldValue != null) { |
|
65 |
if (sourceField.getValue().length() > maxFieldLength) { |
|
66 |
maxFieldLength = sourceField.getValue().length(); |
|
67 |
} |
|
68 |
if (targetFieldValue.length() > maxFieldLength) { |
|
69 |
maxFieldLength = targetFieldValue.length(); |
|
70 |
} |
|
71 |
|
|
72 |
double levenshteinDistance = DistanceCalculator.getLevenshteinDistance(sourceField.getValue(), targetFieldValue); |
|
73 |
double longerStringSize = sourceField.getValue().length() > targetFieldValue.length() ? sourceField.getValue().length() : targetFieldValue.length(); |
|
74 |
double fieldsSimilarity = (longerStringSize - levenshteinDistance) / longerStringSize; |
|
75 |
recordSimilarity += fieldsSimilarity; |
|
51 | 76 |
} |
52 | 77 |
} |
53 | 78 |
|
54 |
if (useLevDistance) { |
|
55 |
return 1 - (recordSimilarity / maxFieldLength); |
|
56 |
} |
|
57 | 79 |
return recordSimilarity / totalFields; |
58 | 80 |
} |
59 | 81 |
|
... | ... | |
61 | 83 |
private static Map<String, String> getRecordsFiledMap(String source) { |
62 | 84 |
String sourceRecord = source.substring(source.indexOf(FIELD_DELIM) + 1, source.length()).trim(); |
63 | 85 |
String[] sourceTriples = sourceRecord.split(LINE_SEPERATOR); |
64 |
Map<String, String> sourceFieldsMap = new HashMap(); |
|
86 |
Map<String, String> sourceFieldsMap = new HashMap<>();
|
|
65 | 87 |
|
66 | 88 |
|
67 | 89 |
for (int i = 0; i < sourceTriples.length - 1; i++) { |
... | ... | |
87 | 109 |
|
88 | 110 |
String target = "target_conf/clef/LarsonNJ08\tconf/clef/LarsonNJ08\t<http://www.w3.org/2000/01/rdf-schema#label>\tOverview of VideoCLEF 2008: Automatic Generation of Topic-Based Feeds for Dual Language Audio-Visual Content.\t.\t,"; |
89 | 111 |
String source = "source_<http://lod.openaire.eu/data/result/od_______119::60f21cae791a925a78d0844ad00cea5a>\t<http://lod.openaire.eu/data/result/od_______119::60f21cae791a925a78d0844ad00cea5a>\t<http://purl.org/dc/terms/identifier>\t\"od_______119::60f21cae791a925a78d0844ad00cea5a\"\t.\t<http://lod.openaire.eu/data/result/od_______119::60f21cae791a925a78d0844ad00cea5a>\t<http://purl.org/dc/terms/identifier>\t\"oai:doras.dcu.ie:16187\"\t.\t<http://lod.openaire.eu/data/result/od_______119::60f21cae791a925a78d0844ad00cea5a>\t<http://www.eurocris.org/ontologies/cerif/1.3#name>\t\"Overview of VideoCLEF 2008: Automatic generation of topic based feeds for dual language audio visual content\"\t.\t,"; |
90 |
System.out.println(MyComparator.findMatchingPair(source, target, true)); |
|
112 |
/* |
|
113 |
System.out.println("Hamming :" + MyComparator.computeSimilarity(DistanceAlgorithms.HAMMING, source, target)); |
|
114 |
System.out.println("Lev :" + MyComparator.computeSimilarity(DistanceAlgorithms.LEVENSHTEIN, source, target)); |
|
115 |
*/ |
|
91 | 116 |
|
117 |
target = "target_conf/clef/LarsonNJ08\tconf/clef/LarsonNJ08\t<http://www.w3.org/2000/01/rdf-schema#label>\tOverview of VideoCLEF 2008: Automatic Generation of Topic-Based Feeds for Dual Language Audio-Visual Content.\t.\t,"; |
|
118 |
source = "source_<http://lod.openaire.eu/data/result/od_______119::60f21cae791a925a78d0844ad00cea5a>\t<http://lod.openaire.eu/data/result/od_______119::60f21cae791a925a78d0844ad00cea5a>\t<http://purl.org/dc/terms/identifier>\t\"od_______119::60f21cae791a925a78d0844ad00cea5a\"\t.\t<http://lod.openaire.eu/data/result/od_______119::60f21cae791a925a78d0844ad00cea5a>\t<http://purl.org/dc/terms/identifier>\t\"oai:doras.dcu.ie:16187\"\t.\t<http://lod.openaire.eu/data/result/od_______119::60f21cae791a925a78d0844ad00cea5a>\t<http://www.eurocris.org/ontologies/cerif/1.3#name>\t\"Overview of VideoCLEF 2008: Automatic generation of topic based feeds for visual content\"\t.\t,"; |
|
119 |
|
|
120 |
System.out.println("Hamming :" + MyComparator.computeSimilarity(DistanceAlgorithms.HAMMING, source, target)); |
|
121 |
System.out.println("Lev :" + MyComparator.computeSimilarity(DistanceAlgorithms.LEVENSHTEIN, source, target)); |
|
122 |
|
|
123 |
|
|
92 | 124 |
} |
93 | 125 |
|
94 | 126 |
} |
Also available in: Unified diff
Final update - fixed distance algorithms, added props in wf