Project

General

Profile

« Previous | Next » 

Revision 48805

Added by Eri Katsari almost 7 years ago

Final update - fixed distance algs, added props in wf

View differences:

MyComparator.java
1 1

  
2 2
package eu.dnetlib.data.mapreduce.hbase.lodExport.utils.compators;
3 3

  
4
import org.datanucleus.util.StringUtils;
5
import org.hsqldb.lib.StringUtil;
6

  
7 4
import java.util.HashMap;
8
import java.util.Iterator;
9 5
import java.util.Map;
10 6

  
11 7

  
12 8
public class MyComparator {
13
    private static double FIELDS_SIMILARITY_THRESHOLD = 0.7D;
14 9
    private static final String LINE_SEPERATOR = "\t.\t";
15 10
    private static final String FIELD_DELIM = "\t";
16 11
    private static Map<String, String> sourceRecordMappings = new HashMap<>();
17 12

  
18
    public static double findMatchingPair(String source, String target, boolean useLevDistance) {
13
    /**
     * Computes a similarity score for two records using the requested algorithm.
     *
     * <p>{@code LEVENSHTEIN} is routed to {@code computeLevenshteinSimilarity}; {@code HAMMING}
     * and every other value are routed to {@code computeDistanceSimilarity}.
     *
     * @param algorithm selects which similarity routine is run
     * @param source    source record string, handed unchanged to the selected routine
     * @param target    target record string, handed unchanged to the selected routine
     * @return the value produced by the selected routine
     */
    public static double computeSimilarity(DistanceAlgorithms algorithm, String source, String target) {
14
        switch (algorithm) {
15
            case LEVENSHTEIN:
16
                return computeLevenshteinSimilarity(source, target);
17
            // HAMMING deliberately shares the default branch below.
            case HAMMING:
18
            default:
19
                return computeDistanceSimilarity(source, target);
20
        }
21

  
22
    }
23

  
24
    private static double computeDistanceSimilarity(String source, String target) {
19 25
        Map<String, String> sourceRecordMap = getRecordsFiledMap(source);
20 26
        Map<String, String> targetRecordMap = getRecordsFiledMap(target);
21
        int matchedFields = 0;
22
        double totalFields = (double) sourceRecordMap.size() > (double) targetRecordMap.size() ? (double) sourceRecordMap.size() : (double) targetRecordMap.size();
27
        double totalFields = getTotalFields(sourceRecordMap, targetRecordMap);
23 28
        double recordSimilarity = 0.0D;
24 29
        int maxFieldLength = 1;
25 30

  
26 31
        for (Map.Entry<String, String> sourceField : sourceRecordMap.entrySet()) {
27 32
            String correspondingTargetField = sourceRecordMappings.get(sourceField.getKey());
28 33
            String targetFieldValue = targetRecordMap.get(correspondingTargetField);
29
            if (StringUtils.isEmpty(targetFieldValue)) {
30
                break;
31
            }
34
            if (targetFieldValue != null) {
35
                if (sourceField.getValue().length() > maxFieldLength) {
36
                    maxFieldLength = sourceField.getValue().length();
37
                }
38
                if (targetFieldValue.length() > maxFieldLength) {
39
                    maxFieldLength = targetFieldValue.length();
40
                }
32 41

  
33
            if (sourceField.getValue().length() > maxFieldLength) {
34
                maxFieldLength = sourceField.getValue().length();
42
                double fieldsSimilarity = DistanceCalculator.getSimpleDistance(sourceField.getValue(), targetFieldValue);
43
                recordSimilarity += fieldsSimilarity;
35 44
            }
36
            if (targetFieldValue.length() > maxFieldLength) {
37
                maxFieldLength = targetFieldValue.length();
38
            }
45
        }
39 46

  
47
        return recordSimilarity / totalFields;
48
    }
40 49

  
41
            double fieldsSimilarity;
42
            if (useLevDistance) {
43
                fieldsSimilarity = DistanceCalculator.getLevenshteinDistance(sourceField.getValue(), targetFieldValue);
44
            } else {
45
                fieldsSimilarity = DistanceCalculator.getSimpleDistance(sourceField.getValue(), targetFieldValue);
46
            }
50
    /**
     * Returns the larger of the two maps' entry counts, widened to {@code double}.
     *
     * @param sourceRecordMap field map of the source record
     * @param targetRecordMap field map of the target record
     * @return {@code sourceRecordMap.size()} when it exceeds {@code targetRecordMap.size()},
     *         otherwise {@code targetRecordMap.size()}
     */
    private static double getTotalFields(Map<String, String> sourceRecordMap, Map<String, String> targetRecordMap) {
51
        return (double) (sourceRecordMap.size() > targetRecordMap.size() ? sourceRecordMap.size() : targetRecordMap.size());
52
    }
47 53

  
48
            recordSimilarity += fieldsSimilarity;
49
            if (fieldsSimilarity >= FIELDS_SIMILARITY_THRESHOLD) {
50
                matchedFields++;
54
    private static double computeLevenshteinSimilarity(String source, String target) {
55
        Map<String, String> sourceRecordMap = getRecordsFiledMap(source);
56
        Map<String, String> targetRecordMap = getRecordsFiledMap(target);
57
        double recordSimilarity = 0.0D;
58
        int maxFieldLength = 1;
59
        double totalFields = getTotalFields(sourceRecordMap, targetRecordMap);
60

  
61
        for (Map.Entry<String, String> sourceField : sourceRecordMap.entrySet()) {
62
            String correspondingTargetField = sourceRecordMappings.get(sourceField.getKey());
63
            String targetFieldValue = targetRecordMap.get(correspondingTargetField);
64
            if (targetFieldValue != null) {
65
                if (sourceField.getValue().length() > maxFieldLength) {
66
                    maxFieldLength = sourceField.getValue().length();
67
                }
68
                if (targetFieldValue.length() > maxFieldLength) {
69
                    maxFieldLength = targetFieldValue.length();
70
                }
71

  
72
                double levenshteinDistance = DistanceCalculator.getLevenshteinDistance(sourceField.getValue(), targetFieldValue);
73
                double longerStringSize = sourceField.getValue().length() > targetFieldValue.length() ? sourceField.getValue().length() : targetFieldValue.length();
74
                double fieldsSimilarity = (longerStringSize - levenshteinDistance) / longerStringSize;
75
                recordSimilarity += fieldsSimilarity;
51 76
            }
52 77
        }
53 78

  
54
        if (useLevDistance) {
55
            return 1 - (recordSimilarity / maxFieldLength);
56
        }
57 79
        return recordSimilarity / totalFields;
58 80
    }
59 81

  
......
61 83
    private static Map<String, String> getRecordsFiledMap(String source) {
62 84
        String sourceRecord = source.substring(source.indexOf(FIELD_DELIM) + 1, source.length()).trim();
63 85
        String[] sourceTriples = sourceRecord.split(LINE_SEPERATOR);
64
        Map<String, String> sourceFieldsMap = new HashMap();
86
        Map<String, String> sourceFieldsMap = new HashMap<>();
65 87

  
66 88

  
67 89
        for (int i = 0; i < sourceTriples.length - 1; i++) {
......
87 109

  
88 110
        String target = "target_conf/clef/LarsonNJ08\tconf/clef/LarsonNJ08\t<http://www.w3.org/2000/01/rdf-schema#label>\tOverview of VideoCLEF 2008: Automatic Generation of Topic-Based Feeds for Dual Language Audio-Visual Content.\t.\t,";
89 111
        String source = "source_<http://lod.openaire.eu/data/result/od_______119::60f21cae791a925a78d0844ad00cea5a>\t<http://lod.openaire.eu/data/result/od_______119::60f21cae791a925a78d0844ad00cea5a>\t<http://purl.org/dc/terms/identifier>\t\"od_______119::60f21cae791a925a78d0844ad00cea5a\"\t.\t<http://lod.openaire.eu/data/result/od_______119::60f21cae791a925a78d0844ad00cea5a>\t<http://purl.org/dc/terms/identifier>\t\"oai:doras.dcu.ie:16187\"\t.\t<http://lod.openaire.eu/data/result/od_______119::60f21cae791a925a78d0844ad00cea5a>\t<http://www.eurocris.org/ontologies/cerif/1.3#name>\t\"Overview of VideoCLEF 2008: Automatic generation of topic based feeds for dual language audio visual content\"\t.\t,";
90
        System.out.println(MyComparator.findMatchingPair(source, target, true));
112
        /*
113
        System.out.println("Hamming :" + MyComparator.computeSimilarity(DistanceAlgorithms.HAMMING, source, target));
114
        System.out.println("Lev :" + MyComparator.computeSimilarity(DistanceAlgorithms.LEVENSHTEIN, source, target));
115
        */
91 116

  
117
        target = "target_conf/clef/LarsonNJ08\tconf/clef/LarsonNJ08\t<http://www.w3.org/2000/01/rdf-schema#label>\tOverview of VideoCLEF 2008: Automatic Generation of Topic-Based Feeds for Dual Language Audio-Visual Content.\t.\t,";
118
        source = "source_<http://lod.openaire.eu/data/result/od_______119::60f21cae791a925a78d0844ad00cea5a>\t<http://lod.openaire.eu/data/result/od_______119::60f21cae791a925a78d0844ad00cea5a>\t<http://purl.org/dc/terms/identifier>\t\"od_______119::60f21cae791a925a78d0844ad00cea5a\"\t.\t<http://lod.openaire.eu/data/result/od_______119::60f21cae791a925a78d0844ad00cea5a>\t<http://purl.org/dc/terms/identifier>\t\"oai:doras.dcu.ie:16187\"\t.\t<http://lod.openaire.eu/data/result/od_______119::60f21cae791a925a78d0844ad00cea5a>\t<http://www.eurocris.org/ontologies/cerif/1.3#name>\t\"Overview of VideoCLEF 2008: Automatic generation of topic based feeds for  visual content\"\t.\t,";
119

  
120
        System.out.println("Hamming :" + MyComparator.computeSimilarity(DistanceAlgorithms.HAMMING, source, target));
121
        System.out.println("Lev :" + MyComparator.computeSimilarity(DistanceAlgorithms.LEVENSHTEIN, source, target));
122

  
123

  
92 124
    }
93 125

  
94 126
}

Also available in: Unified diff