Project

General

Profile

« Previous | Next » 

Revision 48805

Added by Eri Katsari almost 7 years ago

Final update - fixed distance algs, added props in wf

View differences:

modules/dnet-openaire-lodinterlinking/branches/cacheOptimized/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/build/BlockNoCacheReducer.java
42 42
    protected void reduce(final Text key, final Iterable<Text> values, final Context context) throws IOException, InterruptedException {
43 43
        Iterator<Text> it = values.iterator();
44 44
        StringBuilder field = new StringBuilder();
45
        Map<String, String> records = new HashMap();
45
        Map<String, String> records = new HashMap<>();
46 46

  
47 47
        try {
48 48
            boolean hasSource = false;
modules/dnet-openaire-lodinterlinking/branches/cacheOptimized/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/build/SourceBuildNoCacheMapper.java
2 2

  
3 3
import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.blocking.Blocking;
4 4
import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.configuration.LodConfiguration;
5
import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.configuration.Properties;
5 6
import org.apache.hadoop.io.LongWritable;
6 7
import org.apache.hadoop.io.Text;
7 8
import org.apache.hadoop.mapreduce.Mapper;
8 9
import org.apache.log4j.Logger;
9 10

  
10 11
import java.io.IOException;
11
import java.util.*;
12
import java.util.ArrayList;
13
import java.util.Arrays;
14
import java.util.Collections;
15
import java.util.HashSet;
16
import java.util.List;
17
import java.util.Set;
12 18

  
13 19
/**
14 20
 * Mapper Class that reads HBASE contents and prepares them for the StatsDB
......
28 34
    private Logger log = Logger.getLogger(this.getClass());
29 35
    private LodConfiguration lodConfiguration = new LodConfiguration();
30 36
    private String lastExecutionDate;
31
    private static final String PREFIX = "source_";
32 37

  
33 38
    private HashSet<String> stopWordsMap = new HashSet<>();
34
    private static final String FIELD_DELIM = "\t";
35
    private static final String LINE_DELIM = "\t.\t";
36 39

  
37 40
    private boolean createCompositekey;
38 41
    private Set<String> usedProperties = new HashSet<>();
......
47 50
    @Override
48 51
    protected void setup(Context context) throws IOException, InterruptedException {
49 52
        try {
50
            lodConfiguration.load(context.getConfiguration().get("lod.sourceMappings"));
51
            stopWordsMap.addAll(Arrays.asList(context.getConfiguration().get("lod.stopwords").split(",")));
52
            createCompositekey = context.getConfiguration().getBoolean("createCompositekey", true);
53
            lodConfiguration.load(context.getConfiguration().get(Properties.LOD_SOURCE_MAPPINGS));
54
            stopWordsMap.addAll(Arrays.asList(context.getConfiguration().get(Properties.LOD_STOPWORDS).split(",")));
55
            createCompositekey = context.getConfiguration().getBoolean(eu.dnetlib.data.mapreduce.hbase.lodExport.utils.configuration.Properties.LOD_CREATE_COMPOSITEKEY, true);
53 56
            usedProperties.add("<http://www.eurocris.org/ontologies/cerif/1.3#name>");
54 57
            usedProperties.add("<http://lod.openaire.eu/vocab/year>");
55

  
56

  
57 58
        } catch (Exception ex) {
58 59
            log.error("An error occured during Mapper Setup " + ex.toString(), ex);
59 60
            System.out.println(ex.getCause().toString());
......
67 68

  
68 69
            StringBuilder id = new StringBuilder();
69 70
            context.getCounter(SOURCE_BUILD_COUNTERS.INPUT_SOURCE_RECORDS).increment(1);
70
            String[] triples = result.toString().split(LINE_DELIM);
71
            String[] triples = result.toString().split(Properties.LINE_DELIM);
71 72

  
72 73

  
73 74
            List<String> tokenList = parseFieldsAndGenerateTokens(id, triples);
......
89 90
        List<String> tokenList = new ArrayList<>();
90 91

  
91 92
        for (String triple : triples) {
92
            String[] fields = triple.split(FIELD_DELIM);
93
            String[] fields = triple.split(Properties.FIELD_DELIM);
93 94
            if (fields.length == 3) {
94 95
                if (id.length() < 1) {
95
                    id.append(PREFIX).append(fields[0]);
96
                    id.append(Properties.SOURCE_PREFIX).append(fields[0]);
96 97
                }
97 98
                String property = fields[1];
98 99
                String value = fields[2];
......
109 110
    private void writeSingleKeysAndValues(Text result, Context context, List<String> tokenList, StringBuilder id) throws IOException, InterruptedException {
110 111
        for (String token : tokenList) {
111 112
            //Write BlockingKey, RecordID to output
112
            context.write(new Text(token), new Text(id + FIELD_DELIM + result.toString()));
113
            context.write(new Text(token), new Text(id + Properties.FIELD_DELIM + result.toString()));
113 114
            context.getCounter(SOURCE_BUILD_COUNTERS.BLOCKING_KEYS).increment(1);
114 115
        }
115 116
    }
116 117

  
117 118
    private void writeCompositeKeyAndValue(Text result, Context context, List<String> tokenList, StringBuilder id) throws IOException, InterruptedException {
118 119
        String compositeKey = createCompositeKey(tokenList);
119

  
120
        context.write(new Text(compositeKey), new Text(id + FIELD_DELIM + result.toString()));
120
        context.write(new Text(compositeKey), new Text(id + Properties.FIELD_DELIM + result.toString()));
121 121
        context.getCounter(SOURCE_BUILD_COUNTERS.BLOCKING_KEYS).increment(1);
122 122
    }
123 123

  
modules/dnet-openaire-lodinterlinking/branches/cacheOptimized/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/build/TargetBuildNoCacheMapper.java
2 2

  
3 3
import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.blocking.Blocking;
4 4
import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.configuration.LodConfiguration;
5
import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.configuration.Properties;
5 6
import org.apache.hadoop.io.LongWritable;
6 7
import org.apache.hadoop.io.Text;
7 8
import org.apache.hadoop.mapreduce.Mapper;
8 9
import org.apache.log4j.Logger;
9 10

  
10 11
import java.io.IOException;
11
import java.util.*;
12
import java.util.ArrayList;
13
import java.util.Arrays;
14
import java.util.Collections;
15
import java.util.HashSet;
16
import java.util.List;
17
import java.util.Set;
12 18

  
13 19

  
14 20
/*-> Parse LOD dump files
......
22 28
*/
23 29

  
24 30
public class TargetBuildNoCacheMapper extends Mapper<LongWritable, Text, Text, Text> {
25
    private static final String PREFIX = "target_";
26 31
    private Logger log = Logger.getLogger(this.getClass());
27 32
    private LodConfiguration lodConfiguration = new LodConfiguration();
28 33
    private HashSet<String> stopWordsMap = new HashSet<>();
29
    private static final String FIELD_DELIM = "\t";
30 34
    private boolean createCompositekey;
31 35
    private Set<String> usedProperties = new HashSet<>();
32
    private static final String LINE_DELIM = "\t.\t";
33

  
36
    
34 37
    public static enum TARGET_BUILD_COUNTERS {
35 38
        BLOCKING_KEYS,
36 39
        INPUT_TARGET_RECORDS
......
39 42

  
40 43
    @Override
41 44
    protected void setup(Context context) throws IOException, InterruptedException {
42

  
43 45
        try {
44
            lodConfiguration.load(context.getConfiguration().get("lod.targetMappings"));
45
            stopWordsMap.addAll(Arrays.asList(context.getConfiguration().get("lod.stopwords").split(",")));
46
            createCompositekey = context.getConfiguration().getBoolean("createCompositekey", true);
46
            lodConfiguration.load(context.getConfiguration().get(Properties.LOD_TARGET_MAPPINGS));
47
            stopWordsMap.addAll(Arrays.asList(context.getConfiguration().get(Properties.LOD_STOPWORDS).split(",")));
48
            createCompositekey = context.getConfiguration().getBoolean(Properties.LOD_CREATE_COMPOSITEKEY, true);
47 49
            usedProperties.add("<http://purl.org/dc/terms/issued>");
48 50
            usedProperties.add("<http://www.w3.org/2000/01/rdf-schema#label>");
49

  
50 51
        } catch (Exception ex) {
51 52
            log.error("An error occured during Mapper Setup " + ex.toString(), ex);
52 53
        }
......
61 62
            StringBuilder id = new StringBuilder();
62 63

  
63 64
            context.getCounter(TARGET_BUILD_COUNTERS.INPUT_TARGET_RECORDS).increment(1);
64
            String[] triples = result.toString().split(LINE_DELIM);
65
            String[] triples = result.toString().split(Properties.LINE_DELIM);
65 66

  
66 67
            List<String> tokenList = parseFieldsAndGenerateTokens(id, triples);
67 68

  
......
84 85
        List<String> tokenList = new ArrayList<>();
85 86

  
86 87
        for (String triple : triples) {
87
            String[] fields = triple.split(FIELD_DELIM);
88
            String[] fields = triple.split(Properties.FIELD_DELIM);
88 89
            if (fields.length == 3) {
89 90
                if (id.length() < 1) {
90
                    id.append(PREFIX).append(fields[0]);
91
                    id.append(Properties.TARGET_PREFIX).append(fields[0]);
91 92
                }
92 93
                String property = fields[1];
93 94
                String value = fields[2];
......
104 105
    private void writeSingleKeysAndValues(Text result, Context context, List<String> tokenList, StringBuilder id) throws IOException, InterruptedException {
105 106
        for (String token : tokenList) {
106 107
            //Write BlockingKey, RecordID to output
107
            context.write(new Text(token), new Text(id + FIELD_DELIM + result.toString()));
108
            context.write(new Text(token), new Text(id + Properties.FIELD_DELIM + result.toString()));
108 109
            context.getCounter(TARGET_BUILD_COUNTERS.BLOCKING_KEYS).increment(1);
109 110
        }
110 111
    }
111 112

  
112 113
    private void writeCompositeKeyAndValue(Text result, Context context, List<String> tokenList, StringBuilder id) throws IOException, InterruptedException {
113 114
        String compositeKey = createCompositeKey(tokenList);
114
        context.write(new Text(compositeKey), new Text(id + FIELD_DELIM + result.toString()));
115
        context.write(new Text(compositeKey), new Text(id + Properties.FIELD_DELIM + result.toString()));
115 116
        context.getCounter(TARGET_BUILD_COUNTERS.BLOCKING_KEYS).increment(1);
116 117
    }
117 118

  
......
123 124
        }
124 125
        return tokenString.toString();
125 126
    }
127

  
126 128
    @Override
127 129
    protected void cleanup(Context context) throws IOException, InterruptedException {
128 130
        super.cleanup(context);
modules/dnet-openaire-lodinterlinking/branches/cacheOptimized/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/utils/compators/DistanceAlgorithms.java
1
package eu.dnetlib.data.mapreduce.hbase.lodExport.utils.compators;
2

  
3
public enum DistanceAlgorithms {
4
    LEVENSHTEIN,
5
    HAMMING
6
}
modules/dnet-openaire-lodinterlinking/branches/cacheOptimized/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/utils/compators/DistanceCalculator.java
2 2

  
3 3
/*Found in https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Java*/
4 4

  
5
import org.datanucleus.util.StringUtils;
6
import org.hsqldb.lib.StringUtil;
7

  
8 5
public class DistanceCalculator {
9 6

  
10 7
    public static int getLevenshteinDistance(String s1, String s2) {
11
        if (StringUtils.isEmpty(s1) || StringUtils.isEmpty(s2)) {
8
        if (s1 == null || s2 == null) {
12 9
            return 0;
13 10
        }
14 11

  
......
59 56
        if (sourceValue != null && targetValue != null) {
60 57
            sourceValue = sourceValue.replaceAll("[^A-Za-z0-9]", "").toLowerCase();
61 58
            targetValue = targetValue.replaceAll("[^A-Za-z0-9]", "").toLowerCase();
62
            int similarChars = 0;
59
            double similarChars = 0.0;
63 60
            int end = sourceValue.length() <= targetValue.length() ? sourceValue.length() : targetValue.length();
64 61

  
65 62
            for (int i = 0; i < end; ++i) {
66 63
                if (sourceValue.charAt(i) == targetValue.charAt(i)) {
67
                    ++similarChars;
64
                    similarChars++;
68 65
                }
69 66
            }
70
            return sourceValue.length() >= targetValue.length() ? (double) similarChars / (double) sourceValue.length() : (double) similarChars / (double) targetValue.length();
67
            return sourceValue.length() >= targetValue.length() ? similarChars / (double) sourceValue.length() : similarChars / (double) targetValue.length();
71 68
        } else {
72 69
            return 0.0D;
73 70
        }
modules/dnet-openaire-lodinterlinking/branches/cacheOptimized/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/utils/compators/MyComparator.java
1 1

  
2 2
package eu.dnetlib.data.mapreduce.hbase.lodExport.utils.compators;
3 3

  
4
import org.datanucleus.util.StringUtils;
5
import org.hsqldb.lib.StringUtil;
6

  
7 4
import java.util.HashMap;
8
import java.util.Iterator;
9 5
import java.util.Map;
10 6

  
11 7

  
12 8
public class MyComparator {
13
    private static double FIELDS_SIMILARITY_THRESHOLD = 0.7D;
14 9
    private static final String LINE_SEPERATOR = "\t.\t";
15 10
    private static final String FIELD_DELIM = "\t";
16 11
    private static Map<String, String> sourceRecordMappings = new HashMap<>();
17 12

  
18
    public static double findMatchingPair(String source, String target, boolean useLevDistance) {
13
    public static double computeSimilarity(DistanceAlgorithms algorithm, String source, String target) {
14
        switch (algorithm) {
15
            case LEVENSHTEIN:
16
                return computeLevenshteinSimilarity(source, target);
17
            case HAMMING:
18
            default:
19
                return computeDistanceSimilarity(source, target);
20
        }
21

  
22
    }
23

  
24
    private static double computeDistanceSimilarity(String source, String target) {
19 25
        Map<String, String> sourceRecordMap = getRecordsFiledMap(source);
20 26
        Map<String, String> targetRecordMap = getRecordsFiledMap(target);
21
        int matchedFields = 0;
22
        double totalFields = (double) sourceRecordMap.size() > (double) targetRecordMap.size() ? (double) sourceRecordMap.size() : (double) targetRecordMap.size();
27
        double totalFields = getTotalFields(sourceRecordMap, targetRecordMap);
23 28
        double recordSimilarity = 0.0D;
24 29
        int maxFieldLength = 1;
25 30

  
26 31
        for (Map.Entry<String, String> sourceField : sourceRecordMap.entrySet()) {
27 32
            String correspondingTargetField = sourceRecordMappings.get(sourceField.getKey());
28 33
            String targetFieldValue = targetRecordMap.get(correspondingTargetField);
29
            if (StringUtils.isEmpty(targetFieldValue)) {
30
                break;
31
            }
34
            if (targetFieldValue != null) {
35
                if (sourceField.getValue().length() > maxFieldLength) {
36
                    maxFieldLength = sourceField.getValue().length();
37
                }
38
                if (targetFieldValue.length() > maxFieldLength) {
39
                    maxFieldLength = targetFieldValue.length();
40
                }
32 41

  
33
            if (sourceField.getValue().length() > maxFieldLength) {
34
                maxFieldLength = sourceField.getValue().length();
42
                double fieldsSimilarity = DistanceCalculator.getSimpleDistance(sourceField.getValue(), targetFieldValue);
43
                recordSimilarity += fieldsSimilarity;
35 44
            }
36
            if (targetFieldValue.length() > maxFieldLength) {
37
                maxFieldLength = targetFieldValue.length();
38
            }
45
        }
39 46

  
47
        return recordSimilarity / totalFields;
48
    }
40 49

  
41
            double fieldsSimilarity;
42
            if (useLevDistance) {
43
                fieldsSimilarity = DistanceCalculator.getLevenshteinDistance(sourceField.getValue(), targetFieldValue);
44
            } else {
45
                fieldsSimilarity = DistanceCalculator.getSimpleDistance(sourceField.getValue(), targetFieldValue);
46
            }
50
    private static double getTotalFields(Map<String, String> sourceRecordMap, Map<String, String> targetRecordMap) {
51
        return (double) (sourceRecordMap.size() > targetRecordMap.size() ? sourceRecordMap.size() : targetRecordMap.size());
52
    }
47 53

  
48
            recordSimilarity += fieldsSimilarity;
49
            if (fieldsSimilarity >= FIELDS_SIMILARITY_THRESHOLD) {
50
                matchedFields++;
54
    private static double computeLevenshteinSimilarity(String source, String target) {
55
        Map<String, String> sourceRecordMap = getRecordsFiledMap(source);
56
        Map<String, String> targetRecordMap = getRecordsFiledMap(target);
57
        double recordSimilarity = 0.0D;
58
        int maxFieldLength = 1;
59
        double totalFields = getTotalFields(sourceRecordMap, targetRecordMap);
60

  
61
        for (Map.Entry<String, String> sourceField : sourceRecordMap.entrySet()) {
62
            String correspondingTargetField = sourceRecordMappings.get(sourceField.getKey());
63
            String targetFieldValue = targetRecordMap.get(correspondingTargetField);
64
            if (targetFieldValue != null) {
65
                if (sourceField.getValue().length() > maxFieldLength) {
66
                    maxFieldLength = sourceField.getValue().length();
67
                }
68
                if (targetFieldValue.length() > maxFieldLength) {
69
                    maxFieldLength = targetFieldValue.length();
70
                }
71

  
72
                double levenshteinDistance = DistanceCalculator.getLevenshteinDistance(sourceField.getValue(), targetFieldValue);
73
                double longerStringSize = sourceField.getValue().length() > targetFieldValue.length() ? sourceField.getValue().length() : targetFieldValue.length();
74
                double fieldsSimilarity = (longerStringSize - levenshteinDistance) / longerStringSize;
75
                recordSimilarity += fieldsSimilarity;
51 76
            }
52 77
        }
53 78

  
54
        if (useLevDistance) {
55
            return 1 - (recordSimilarity / maxFieldLength);
56
        }
57 79
        return recordSimilarity / totalFields;
58 80
    }
59 81

  
......
61 83
    private static Map<String, String> getRecordsFiledMap(String source) {
62 84
        String sourceRecord = source.substring(source.indexOf(FIELD_DELIM) + 1, source.length()).trim();
63 85
        String[] sourceTriples = sourceRecord.split(LINE_SEPERATOR);
64
        Map<String, String> sourceFieldsMap = new HashMap();
86
        Map<String, String> sourceFieldsMap = new HashMap<>();
65 87

  
66 88

  
67 89
        for (int i = 0; i < sourceTriples.length - 1; i++) {
......
87 109

  
88 110
        String target = "target_conf/clef/LarsonNJ08\tconf/clef/LarsonNJ08\t<http://www.w3.org/2000/01/rdf-schema#label>\tOverview of VideoCLEF 2008: Automatic Generation of Topic-Based Feeds for Dual Language Audio-Visual Content.\t.\t,";
89 111
        String source = "source_<http://lod.openaire.eu/data/result/od_______119::60f21cae791a925a78d0844ad00cea5a>\t<http://lod.openaire.eu/data/result/od_______119::60f21cae791a925a78d0844ad00cea5a>\t<http://purl.org/dc/terms/identifier>\t\"od_______119::60f21cae791a925a78d0844ad00cea5a\"\t.\t<http://lod.openaire.eu/data/result/od_______119::60f21cae791a925a78d0844ad00cea5a>\t<http://purl.org/dc/terms/identifier>\t\"oai:doras.dcu.ie:16187\"\t.\t<http://lod.openaire.eu/data/result/od_______119::60f21cae791a925a78d0844ad00cea5a>\t<http://www.eurocris.org/ontologies/cerif/1.3#name>\t\"Overview of VideoCLEF 2008: Automatic generation of topic based feeds for dual language audio visual content\"\t.\t,";
90
        System.out.println(MyComparator.findMatchingPair(source, target, true));
112
        /*
113
        System.out.println("Hamming :" + MyComparator.computeSimilarity(DistanceAlgorithms.HAMMING, source, target));
114
        System.out.println("Lev :" + MyComparator.computeSimilarity(DistanceAlgorithms.LEVENSHTEIN, source, target));
115
        */
91 116

  
117
        target = "target_conf/clef/LarsonNJ08\tconf/clef/LarsonNJ08\t<http://www.w3.org/2000/01/rdf-schema#label>\tOverview of VideoCLEF 2008: Automatic Generation of Topic-Based Feeds for Dual Language Audio-Visual Content.\t.\t,";
118
        source = "source_<http://lod.openaire.eu/data/result/od_______119::60f21cae791a925a78d0844ad00cea5a>\t<http://lod.openaire.eu/data/result/od_______119::60f21cae791a925a78d0844ad00cea5a>\t<http://purl.org/dc/terms/identifier>\t\"od_______119::60f21cae791a925a78d0844ad00cea5a\"\t.\t<http://lod.openaire.eu/data/result/od_______119::60f21cae791a925a78d0844ad00cea5a>\t<http://purl.org/dc/terms/identifier>\t\"oai:doras.dcu.ie:16187\"\t.\t<http://lod.openaire.eu/data/result/od_______119::60f21cae791a925a78d0844ad00cea5a>\t<http://www.eurocris.org/ontologies/cerif/1.3#name>\t\"Overview of VideoCLEF 2008: Automatic generation of topic based feeds for  visual content\"\t.\t,";
119

  
120
        System.out.println("Hamming :" + MyComparator.computeSimilarity(DistanceAlgorithms.HAMMING, source, target));
121
        System.out.println("Lev :" + MyComparator.computeSimilarity(DistanceAlgorithms.LEVENSHTEIN, source, target));
122

  
123

  
92 124
    }
93 125

  
94 126
}
modules/dnet-openaire-lodinterlinking/branches/cacheOptimized/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/utils/configuration/Properties.java
1
package eu.dnetlib.data.mapreduce.hbase.lodExport.utils.configuration;
2

  
3
public class Properties {
4

  
5

  
6
    public static final String LOD_DISTANCE_ALGORITHM = "lod.distanceAlgorithm";
7
    public static final String LOD_SIMILARITY_THRESHOLD = "lod.similarityThreshold";
8
    public static final String LOD_CREATE_COMPOSITEKEY = "lod.createCompositekey";
9
    public static final String LOD_SOURCE_MAPPINGS = "lod.sourceMappings";
10
    public static final String LOD_STOPWORDS = "lod.stopwords";
11
    public static final String SOURCE_PREFIX = "source_";
12
    public static final String LOD_TARGET_MAPPINGS = "lod.targetMappings";
13
    public static final String TARGET_PREFIX = "target_";
14
    public static final String FIELD_DELIM = "\t";
15
    public static final String LINE_DELIM = "\t.\t";
16
    public static final String OPTIMAL_BLOCKS = "optimalBlockSize";
17
}
modules/dnet-openaire-lodinterlinking/branches/cacheOptimized/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/utils/blocking/Blocking.java
72 72
        return tokens.toString();
73 73
    }
74 74

  
75

  
76
    public static void main(String[] args) {
77

  
78
        List<String> tokenList = new ArrayList<>();
79

  
80
        HashSet<String> stopwordsMap = new HashSet<>();
81
        stopwordsMap.add("and");
82

  
83
        String field = "A test string";
84
        String tokens = tokenBlocking(field, stopwordsMap);
85
        System.out.println(tokens);
86
        tokenList.add(tokens);
87

  
88
        field = "1990";
89
        tokens = tokenBlocking(field, stopwordsMap);
90
        System.out.println(tokens);
91
        tokenList.add(tokens);
92

  
93
        Collections.sort(tokenList);
94
        StringBuilder tokenString = new StringBuilder();
95

  
96
        for (String  t : tokenList) {
97
            tokenString.append(t).append(" ");
98
        }
99

  
100
        System.out.println(tokenString.toString());
101

  
102

  
103
    }
75 104
}
modules/dnet-openaire-lodinterlinking/branches/cacheOptimized/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/preprocessing/DatasetReducer.java
12 12

  
13 13
public class DatasetReducer extends Reducer<Text, Text, Text, Text> {
14 14
    public static enum ENTITIES_COUNTER {
15

  
16 15
        TOTAL_ENTITIES,
17 16
    }
18 17

  
......
30 29
    protected void reduce(final Text key, final Iterable<Text> values, final Context context) throws IOException, InterruptedException {
31 30
        Iterator<Text> it = values.iterator();
32 31
        StringBuffer value = new StringBuffer();
33

  
34 32
        String[] tmp = key.toString().split(",");
35 33
        String dataset = tmp[0].replaceAll("[^a-zA-Z0-9]", "");
36 34
        String entityType = tmp[1].replaceAll("[^a-zA-Z0-9]", "");
modules/dnet-openaire-lodinterlinking/branches/cacheOptimized/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/preprocessing/SourceMapper.java
1 1
package eu.dnetlib.data.mapreduce.hbase.lodExport.preprocessing;
2 2

  
3
import java.io.IOException;
4

  
3
import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.configuration.LodConfiguration;
4
import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.configuration.Properties;
5 5
import org.apache.commons.lang.StringUtils;
6 6
import org.apache.hadoop.io.LongWritable;
7 7
import org.apache.hadoop.io.Text;
8 8
import org.apache.hadoop.mapreduce.Mapper;
9 9
import org.apache.log4j.Logger;
10 10

  
11
import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.configuration.LodConfiguration;
11
import java.io.IOException;
12 12

  
13 13
/**
14 14
 * Mapper Class that reads HBASE contents and prepares them for the StatsDB
......
29 29
<http://lod.openaire.eu/data/result/doajarticles::89217af00809a91acc15a416e56b3782> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.eurocris.org/ontologies/cerif/1.3#ResultEntity> .
30 30
<http://lod.openaire.eu/data/result/doajarticles::89217af00809a91acc15a416e56b3782> <http://www.eurocris.org/ontologies/cerif/1.3#name> "Une nouvelle anomalie a allure h r ditaire chez des agneaux  it khouzistans /it" .
31 31
*/
32

  
33 32
public class SourceMapper extends Mapper<LongWritable, Text, Text, Text> {
34 33

  
35 34
    private Logger log = Logger.getLogger(SourceMapper.class);
36

  
37 35
    private LodConfiguration lodConfiguration;
38 36

  
39 37
    public static enum SOURCE_COUNTERS {
......
44 42
    @Override
45 43
    protected void setup(Context context) throws IOException, InterruptedException {
46 44
        lodConfiguration = new LodConfiguration();
47
        lodConfiguration.load(context.getConfiguration().get("lod.sourceMappings"));
48
        log.info("file loaded!" + context.getConfiguration().get("lod.sourceMappings"));
49

  
45
        lodConfiguration.load(context.getConfiguration().get(Properties.LOD_SOURCE_MAPPINGS));
46
        log.info("file loaded!" + context.getConfiguration().get(Properties.LOD_SOURCE_MAPPINGS));
50 47
    }
51 48

  
52

  
53 49
    @Override
54 50
    protected void map(final LongWritable keyIn, final Text result, final Context context) throws IOException {
55 51

  
56 52
        try {
57

  
58 53
            context.getCounter(SOURCE_COUNTERS.TOTAL_ENTITIES).increment(1);
59

  
60 54
            //get ID - output source_recordID so we can group by id and get all props of a record
61 55

  
62 56
            StringBuilder value = new StringBuilder();
......
122 116
        super.cleanup(context);
123 117
    }
124 118

  
125

  
126 119
}
modules/dnet-openaire-lodinterlinking/branches/cacheOptimized/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/preprocessing/TargetMapper.java
1 1
package eu.dnetlib.data.mapreduce.hbase.lodExport.preprocessing;
2 2

  
3 3
import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.configuration.LodConfiguration;
4
import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.configuration.Properties;
4 5
import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.parsing.DblpCsvParser;
5 6
import org.apache.hadoop.io.LongWritable;
6 7
import org.apache.hadoop.io.Text;
......
15 16
 * export
16 17
 */
17 18
public class TargetMapper extends Mapper<LongWritable, Text, Text, Text> {
18

  
19 19
    private Logger log = Logger.getLogger(TargetMapper.class);
20 20

  
21 21
    private LodConfiguration lodConfiguration;
......
27 27
    @Override
28 28
    protected void setup(Context context) throws IOException, InterruptedException {
29 29
        lodConfiguration = new LodConfiguration();
30
        lodConfiguration.load(context.getConfiguration().get("lod.targetMappings"));
31
        log.info("file loaded!" + context.getConfiguration().get("lod.targetMappings"));
30
        lodConfiguration.load(context.getConfiguration().get(Properties.LOD_TARGET_MAPPINGS));
31
        log.info("file loaded!" + context.getConfiguration().get(Properties.LOD_TARGET_MAPPINGS));
32 32
    }
33 33

  
34 34

  
......
38 38
        try {
39 39

  
40 40
            context.getCounter(TARGET_COUNTERS.TOTAL_ENTITIES).increment(1);
41

  
42 41
            Text[] pair = DblpCsvParser.parseTriples(result, lodConfiguration);
43 42
            context.write(pair[0], pair[1]);
44 43
            context.getCounter(TARGET_COUNTERS.TARGET_ENTITIES).increment(1);
modules/dnet-openaire-lodinterlinking/branches/cacheOptimized/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/linkage/LinkCustomReducer.java
1 1
package eu.dnetlib.data.mapreduce.hbase.lodExport.linkage;
2 2

  
3 3
import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.caching.RedisUtils;
4
import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.compators.DistanceAlgorithms;
4 5
import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.compators.MyComparator;
6
import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.configuration.Properties;
5 7
import org.apache.hadoop.io.Text;
6 8
import org.apache.hadoop.mapreduce.Reducer;
7 9
import org.apache.log4j.Logger;
......
14 16
    private Logger log = Logger.getLogger(LinkCustomReducer.class);
15 17
    private RedisUtils redisUtils;
16 18
    private static final String SEPERATOR = ",";
17
    private static double RECORD_SIMILARITY_THRESHOLD = 0.8;
18
    private boolean useLevDistance;
19
    private double similarityThreshold;
20
    private String distanceAlgorithm;
19 21

  
20 22
    public static enum LINK_RECUDER_COUNTERS {
21 23
        TARGET_RECORDS_EXAMINED,
......
29 31

  
30 32
    @Override
31 33
    protected void setup(Context context) throws IOException, InterruptedException {
32

  
33 34
        try {
34
            useLevDistance = context.getConfiguration().getBoolean("lod.useLevDistance", false);
35
            similarityThreshold = context.getConfiguration().getDouble(Properties.LOD_SIMILARITY_THRESHOLD, 0.8);
36
            log.info("SIMILARITY THRESHOLD " + similarityThreshold);
37
            distanceAlgorithm = context.getConfiguration().get(Properties.LOD_DISTANCE_ALGORITHM);
35 38
            redisUtils = new RedisUtils(context.getConfiguration());
36 39
        } catch (Exception e) {
37 40
            log.error("Error connecting to Redis " + e.toString());
38 41
            throw new RuntimeException(e);
39 42
        }
40

  
41 43
    }
42 44

  
43 45
    @Override
......
76 78
                //store target records to have less I/O ops per record
77 79
                //test also version where we get each record again in each loop
78 80

  
79

  
80 81
                //TODO
81 82
                //String targetRecord = redisUtils.getValue(targetId);}
82 83
                String targetRecord = targetRecords.get(targetId);
83
                double similarity = MyComparator.findMatchingPair(sourceRecord, targetRecord, useLevDistance);
84

  
84
                double similarity = MyComparator.computeSimilarity(DistanceAlgorithms.valueOf(distanceAlgorithm), sourceRecord, targetRecord);
85 85
                context.getCounter(LINK_RECUDER_COUNTERS.RECORD_COMPARISONS).increment(1);
86 86

  
87
                if (similarity >= RECORD_SIMILARITY_THRESHOLD) {
87
                if (similarity >= similarityThreshold) {
88 88
                    try {
89
                        context.write(new Text(sourceId),
90
                                new Text(targetId + SEPERATOR + similarity + SEPERATOR + sourceRecord.replaceAll(sourceId, "")
89
                        context.write(new Text(sourceId), new Text(targetId + SEPERATOR + similarity + SEPERATOR + sourceRecord.replaceAll(sourceId, "")
91 90
                                        + SEPERATOR + targetRecord.replaceAll(targetId, "")));
92 91
                        context.getCounter(LINK_RECUDER_COUNTERS.MATCHING_PAIRS).increment(1);
93 92
                    } catch (Exception ex) {
modules/dnet-openaire-lodinterlinking/branches/cacheOptimized/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/linkage/LinkMapper.java
1 1
package eu.dnetlib.data.mapreduce.hbase.lodExport.linkage;
2 2

  
3 3
import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.caching.RedisUtils;
4
import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.configuration.Properties;
4 5
import org.apache.hadoop.io.Text;
5 6
import org.apache.hadoop.mapreduce.Mapper;
6 7
import org.apache.log4j.Logger;
......
13 14
    private RedisUtils redisUtils;
14 15
    private int optimalBlockSize;
15 16
    private static final char SEPERATOR = ',';
16
    private static final String OPTIMAL_BLOCKS = "optimalBlockSize";
17 17

  
18
    public static enum TEST_COUNTERS {
18
    public static enum LINK_MAPPER_COUNTERS {
19 19
        OPTIMAL_BLOCK_SIZE,
20 20
        ERROR_COUNTERS,
21 21
        DISCARDED_BLOCKS,
......
29 29
    protected void setup(Context context) throws IOException, InterruptedException {
30 30
        try {
31 31
            redisUtils = new RedisUtils(context.getConfiguration());
32
            optimalBlockSize = Integer.valueOf(redisUtils.getValue(OPTIMAL_BLOCKS));
33
            context.getCounter(TEST_COUNTERS.OPTIMAL_BLOCK_SIZE).setValue(optimalBlockSize);
32
            optimalBlockSize = Integer.valueOf(redisUtils.getValue(Properties.OPTIMAL_BLOCKS));
33
            context.getCounter(LINK_MAPPER_COUNTERS.OPTIMAL_BLOCK_SIZE).setValue(optimalBlockSize);
34 34
            log.info("optimalBlockSize " + optimalBlockSize);
35 35
        } catch (Exception e) {
36 36
            log.error("ERROR CONNECTING TO REDIS ", e);
......
46 46
            int recordsNumber = countRecords(result);
47 47
            //purge blocks that contain only source or target entities
48 48
            //how many comparisons we have purged
49
            context.getCounter(TEST_COUNTERS.TOTAL_COMPARISONS).increment(recordsNumber * (recordsNumber - 1));
49
            context.getCounter(LINK_MAPPER_COUNTERS.TOTAL_COMPARISONS).increment(recordsNumber * (recordsNumber - 1));
50 50
            if (recordsNumber >= optimalBlockSize) {
51
                context.getCounter(TEST_COUNTERS.DISCARDED_BLOCKS).increment(1);
51
                context.getCounter(LINK_MAPPER_COUNTERS.DISCARDED_BLOCKS).increment(1);
52 52
                //TODO count source / target records and get actual comparison number
53
                context.getCounter(TEST_COUNTERS.DISCARDED_COMPARISONS).increment(recordsNumber * (recordsNumber - 1));
53
                context.getCounter(LINK_MAPPER_COUNTERS.DISCARDED_COMPARISONS).increment(recordsNumber * (recordsNumber - 1));
54 54
            } else {
55 55
                context.write(keyIn, result);
56
                context.getCounter(TEST_COUNTERS.PASSED_BLOCKS).increment(1);
56
                context.getCounter(LINK_MAPPER_COUNTERS.PASSED_BLOCKS).increment(1);
57 57
            }
58
            context.getCounter(TEST_COUNTERS.TOTAL_BLOCKS).increment(1);
58
            context.getCounter(LINK_MAPPER_COUNTERS.TOTAL_BLOCKS).increment(1);
59 59

  
60 60
        } catch (Exception e) {
61 61
            log.error("key causing error is " + keyIn);
62 62
            log.error("ERROR ", e);
63
            context.getCounter(TEST_COUNTERS.ERROR_COUNTERS).increment(1);
63
            context.getCounter(LINK_MAPPER_COUNTERS.ERROR_COUNTERS).increment(1);
64 64
            throw new IOException(e.toString(), e);
65 65
        }
66 66
    }
modules/dnet-openaire-lod-interlinking-wf/src/main/resources/eu/dnetlib/iis/core/javamapreduce/lodexport/job.properties
60 60
stopwordsReducers=1
61 61
lodBlocksInput=/tmp/lod_dump/
62 62
lodBlocksOutput=/tmp/lod_dump/stopwords
63
lod_createCompositekey=true
64
lod_distanceAlgorithm=HAMMING
65
lod_similarityThreshold=0.8
63 66
lodPrefix=http://lod.openaire.eu/data/result/
64 67
lodStopwords=a,able,about,above,abst,accordance,according,accordingly,across,act,actually,added,adj,affected,affecting,affects,after,afterwards,again,against,ah,all,almost,alone,along,already,also,although,always,am,among,amongst,an,and,announce,another,any,anybody,anyhow,anymore,anyone,anything,anyway,anyways,anywhere,apparently,approximately,are,aren,arent,arise,around,as,aside,ask,asking,at,auth,available,away,awfully,b,back,be,became,because,become,becomes,becoming,been,before,beforehand,begin,beginning,beginnings,begins,behind,being,believe,below,beside,besides,between,beyond,biol,both,brief,briefly,but,by,c,ca,came,can,cannot,cant,cause,causes,certain,certainly,co,com,come,comes,contain,containing,contains,could,couldnt,d,date,did,didnt,different,do,does,doesnt,doing,done,dont,down,downwards,due,during,e,each,ed,edu,effect,eg,eight,eighty,either,else,elsewhere,end,ending,enough,especially,et,et-al,etc,even,ever,every,everybody,everyone,everything,everywhere,ex,except,f,far,few,ff,fifth,first,five,fix,followed,following,follows,for,former,formerly,forth,found,four,from,further,furthermore,g,gave,get,gets,getting,give,given,gives,giving,go,goes,gone,got,gotten,h,had,happens,hardly,has,hasnt,have,havent,having,he,hed,hence,her,here,hereafter,hereby,herein,heres,hereupon,hers,herself,hes,hi,hid,him,himself,his,hither,home,how,howbeit,however,hundred,i,id,ie,if,ill,im,immediate,immediately,importance,important,in,inc,indeed,index,information,instead,into,invention,inward,is,isnt,it,itd,itll,its,itself,ive,j,just,k,keep,	
keeps,kept,kg,km,know,known,knows,l,largely,last,lately,later,latter,latterly,least,less,lest,let,lets,like,liked,likely,line,little,ll,look,looking,looks,ltd,m,made,mainly,make,makes,many,may,maybe,me,mean,means,meantime,meanwhile,merely,mg,might,million,miss,ml,more,moreover,most,mostly,mr,mrs,much,mug,must,my,myself,n,na,name,namely,nay,nd,near,nearly,necessarily,necessary,need,needs,neither,never,nevertheless,new,next,nine,ninety,no,nobody,non,none,nonetheless,noone,nor,normally,nos,not,noted,nothing,now,nowhere,o,obtain,obtained,obviously,of,off,often,oh,ok,okay,old,omitted,on,once,one,ones,only,onto,or,ord,other,others,otherwise,ought,our,ours,ourselves,out,outside,over,overall,owing,own,p,page,pages,part,particular,particularly,past,per,perhaps,placed,please,plus,poorly,possible,possibly,potentially,pp,predominantly,present,previously,primarily,probably,promptly,proud,provides,put,q,que,quickly,quite,qv,r,ran,rather,rd,re,readily,really,recent,recently,ref,refs,regarding,regardless,regards,related,relatively,research,respectively,resulted,resulting,results,right,run,s,said,same,saw,say,saying,says,sec,section,see,seeing,seem,seemed,seeming,seems,seen,self,selves,sent,seven,several,shall,she,shed,shell,shes,should,shouldnt,show,showed,shown,showns,shows,significant,significantly,similar,similarly,since,six,slightly,so,some,somebody,somehow,someone,somethan,something,sometime,sometimes,somewhat,somewhere,soon,sorry,specifically,specified,specify,specifying,still,stop,strongly,sub,substantially,successfully,such,sufficiently,suggest,sup,sure,t,take,taken,taking,tell,tends,th,than,thank,thanks,thanx,that,thatll,thats,thatve,the,their,theirs,them,themselves,then,thence,there,thereafter,thereby,thered,therefore,therein,therell,thereof,therere,theres,thereto,thereupon,thereve,these,they,theyd,theyll,theyre,theyve,think,this,those,thou,though,thoughh,thousand,throug,through,throughout,thru,thus,til,tip,to,together,too,took,toward,towards,tried,tries,truly,try,trying,
ts,twice,two,u,un,under,unfortunately,unless,unlike,unlikely,until,unto,up,upon,ups,us,use,used,useful,usefully,usefulness,uses,using,usually,v,value,various,ve,very,via,viz,vol,vols,vs,w,want,wants,was,wasnt,way,we,wed,welcome,well,went,were,werent,weve,what,whatever,whatll,whats,when,whence,whenever,where,whereafter,whereas,whereby,wherein,wheres,whereupon,wherever,whether,which,while,whim,whither,who,whod,whoever,whole,wholl,whom,whomever,whos,whose,why,widely,willing,wish,with,within,without,wont,words,world,would,wouldnt,www,x,y,yes,yet,you,youd,youll,your,youre,yours,yourself,yourselves,youve,z,zero
65 68
lod_configXML=<?xml version="1.0" encoding="UTF-8"?> <!DOCTYPE LIMES SYSTEM "limes.dtd"> <LIMES> <PREFIX> <NAMESPACE>http://www.w3.org/1999/02/22-rdf-syntax-ns#</NAMESPACE> <LABEL>rdf</LABEL> </PREFIX> <PREFIX> <NAMESPACE>http://www.w3.org/2000/01/rdf-schema#</NAMESPACE> <LABEL>rdfs</LABEL> </PREFIX> <SOURCE> <ID>source1</ID> <ENDPOINT>/user/kanakakis/groundTruth/sourceNT</ENDPOINT> <VAR>?x</VAR> <PAGESIZE>100</PAGESIZE> <RESTRICTION>?x rdf:type http://www.eurocris.org/ontologies/cerif/1.3#ResultEntity</RESTRICTION> <PROPERTY>http://lod.openaire.eu/vocab/year RENAME Year</PROPERTY> <PROPERTY>http://www.w3.org/1999/02/22-rdf-syntax-ns# RENAME type</PROPERTY> <PROPERTY>http://purl.org/dc/terms/identifier RENAME id</PROPERTY> <PROPERTY>http://www.eurocris.org/ontologies/cerif/1.3#name AS lowercase->regexreplace("[^A-Za-z0-9]"," ") RENAME publicationName</PROPERTY> </SOURCE> <TARGET> <ID>source2</ID> <ENDPOINT>/user/kanakakis/groundTruth/targetNT</ENDPOINT> <VAR>?y</VAR> <PAGESIZE>100</PAGESIZE> <RESTRICTION>?y rdf:type http://swrc.ontoware.org/ontology#Article</RESTRICTION> <PROPERTY>http://www.w3.org/1999/02/22-rdf-syntax-ns# RENAME type</PROPERTY> <PROPERTY>http://purl.org/dc/terms/issued RENAME Year</PROPERTY> <PROPERTY>http://purl.org/dc/terms/identifier RENAME id</PROPERTY> <PROPERTY>http://www.w3.org/2000/01/rdf-schema#label AS lowercase->regexreplace("[^A-Za-z0-9]"," ") RENAME articleName</PROPERTY> </TARGET> <METRIC>AND(jaro(x.publicationName,y.articleName)|0.8,jaro(x.Year,y.Year)|1.0)</METRIC> <!-- <METRIC>jaro(x.publicatioName,y.articleName)|0.7</METRIC> --> <ACCEPTANCE> <THRESHOLD>0.8</THRESHOLD> <FILE>/user/kanakakis/groundTruth/accepted_links_0.8_no_purge</FILE> <RELATION>owl:sameAs</RELATION> </ACCEPTANCE> <REVIEW> <THRESHOLD>0.8</THRESHOLD> <FILE>/user/kanakakis/groundTruth/verified_links_0.8</FILE> <RELATION>owl:sameAs</RELATION> </REVIEW> <EXECUTION>Default</EXECUTION> <OUTPUT>TTL</OUTPUT> </LIMES>
modules/dnet-openaire-lod-interlinking-wf/src/main/resources/eu/dnetlib/iis/core/javamapreduce/lodexport/oozie_app/workflow.xml
17 17
        </configuration>
18 18
    </global>
19 19

  
20
    <start to='linkage'/>
20
    <start to='build'/>
21 21
    <action name="preProcessing">
22 22
        <map-reduce>
23 23
            <configuration>
......
63 63

  
64 64
                    <name>mapred.input.dir.formats</name>
65 65
                    <value>
66
                        ${nameNode}${sourceInput};org.apache.hadoop.mapreduce.lib.input.TextInputFormat,${nameNode}${targetInput};org.apache.hadoop.mapreduce.lib.input.TextInputFormat
66
                        ${nameNode}${sourceInput};org.apache.hadoop.mapreduce.lib.input.TextInputFormat,${nameNode}${targetInput};eu.dnetlib.data.mapreduce.hbase.lodExport.preprocessing.XmlInputFormat
67

  
67 68
                    </value>
68 69
                </property>
69 70

  
......
255 256
                </property>
256 257

  
257 258

  
259
                <property>
260
                    <name>lod.createCompositekey</name>
261
                    <value>${lod_createCompositekey}</value>
262
                </property>
263

  
264
                <property>
265
                    <name>lod.createCompositekey</name>
266
                    <value>${lod_createCompositekey}</value>
267
                </property>
268

  
269

  
258 270
                <!-- ## Workflow node parameters -->
259 271
                <property>
260 272
                    <name>mapred.reduce.tasks</name>
......
644 656
                </property>
645 657

  
646 658

  
659
                <property>
660
                    <name>lod.distanceAlgorithm</name>
661
                    <value>${lod_distanceAlgorithm}</value>
662
                </property>
663
                <property>
664
                    <name>lod.similarityThreshold</name>
665
                    <value>${lod_similarityThreshold}</value>
666
                </property>
667

  
668

  
669

  
647 670
                <!-- ## Workflow node parameters -->
648 671
                <property>
649 672
                    <name>mapred.reduce.tasks</name>

Also available in: Unified diff