Revision 48805
Added by Eri Katsari almost 7 years ago
modules/dnet-openaire-lodinterlinking/branches/cacheOptimized/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/build/BlockNoCacheReducer.java | ||
---|---|---|
42 | 42 |
protected void reduce(final Text key, final Iterable<Text> values, final Context context) throws IOException, InterruptedException { |
43 | 43 |
Iterator<Text> it = values.iterator(); |
44 | 44 |
StringBuilder field = new StringBuilder(); |
45 |
Map<String, String> records = new HashMap(); |
|
45 |
Map<String, String> records = new HashMap<>();
|
|
46 | 46 |
|
47 | 47 |
try { |
48 | 48 |
boolean hasSource = false; |
modules/dnet-openaire-lodinterlinking/branches/cacheOptimized/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/build/SourceBuildNoCacheMapper.java | ||
---|---|---|
2 | 2 |
|
3 | 3 |
import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.blocking.Blocking; |
4 | 4 |
import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.configuration.LodConfiguration; |
5 |
import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.configuration.Properties; |
|
5 | 6 |
import org.apache.hadoop.io.LongWritable; |
6 | 7 |
import org.apache.hadoop.io.Text; |
7 | 8 |
import org.apache.hadoop.mapreduce.Mapper; |
8 | 9 |
import org.apache.log4j.Logger; |
9 | 10 |
|
10 | 11 |
import java.io.IOException; |
11 |
import java.util.*; |
|
12 |
import java.util.ArrayList; |
|
13 |
import java.util.Arrays; |
|
14 |
import java.util.Collections; |
|
15 |
import java.util.HashSet; |
|
16 |
import java.util.List; |
|
17 |
import java.util.Set; |
|
12 | 18 |
|
13 | 19 |
/** |
14 | 20 |
* Mapper Class that reads HBASE contents and prepares them for the StatsDB |
... | ... | |
28 | 34 |
private Logger log = Logger.getLogger(this.getClass()); |
29 | 35 |
private LodConfiguration lodConfiguration = new LodConfiguration(); |
30 | 36 |
private String lastExecutionDate; |
31 |
private static final String PREFIX = "source_"; |
|
32 | 37 |
|
33 | 38 |
private HashSet<String> stopWordsMap = new HashSet<>(); |
34 |
private static final String FIELD_DELIM = "\t"; |
|
35 |
private static final String LINE_DELIM = "\t.\t"; |
|
36 | 39 |
|
37 | 40 |
private boolean createCompositekey; |
38 | 41 |
private Set<String> usedProperties = new HashSet<>(); |
... | ... | |
47 | 50 |
@Override |
48 | 51 |
protected void setup(Context context) throws IOException, InterruptedException { |
49 | 52 |
try { |
50 |
lodConfiguration.load(context.getConfiguration().get("lod.sourceMappings"));
|
|
51 |
stopWordsMap.addAll(Arrays.asList(context.getConfiguration().get("lod.stopwords").split(",")));
|
|
52 |
createCompositekey = context.getConfiguration().getBoolean("createCompositekey", true);
|
|
53 |
lodConfiguration.load(context.getConfiguration().get(Properties.LOD_SOURCE_MAPPINGS));
|
|
54 |
stopWordsMap.addAll(Arrays.asList(context.getConfiguration().get(Properties.LOD_STOPWORDS).split(",")));
|
|
55 |
createCompositekey = context.getConfiguration().getBoolean(eu.dnetlib.data.mapreduce.hbase.lodExport.utils.configuration.Properties.LOD_CREATE_COMPOSITEKEY, true);
|
|
53 | 56 |
usedProperties.add("<http://www.eurocris.org/ontologies/cerif/1.3#name>"); |
54 | 57 |
usedProperties.add("<http://lod.openaire.eu/vocab/year>"); |
55 |
|
|
56 |
|
|
57 | 58 |
} catch (Exception ex) { |
58 | 59 |
log.error("An error occured during Mapper Setup " + ex.toString(), ex); |
59 | 60 |
System.out.println(ex.getCause().toString()); |
... | ... | |
67 | 68 |
|
68 | 69 |
StringBuilder id = new StringBuilder(); |
69 | 70 |
context.getCounter(SOURCE_BUILD_COUNTERS.INPUT_SOURCE_RECORDS).increment(1); |
70 |
String[] triples = result.toString().split(LINE_DELIM); |
|
71 |
String[] triples = result.toString().split(Properties.LINE_DELIM);
|
|
71 | 72 |
|
72 | 73 |
|
73 | 74 |
List<String> tokenList = parseFieldsAndGenerateTokens(id, triples); |
... | ... | |
89 | 90 |
List<String> tokenList = new ArrayList<>(); |
90 | 91 |
|
91 | 92 |
for (String triple : triples) { |
92 |
String[] fields = triple.split(FIELD_DELIM); |
|
93 |
String[] fields = triple.split(Properties.FIELD_DELIM);
|
|
93 | 94 |
if (fields.length == 3) { |
94 | 95 |
if (id.length() < 1) { |
95 |
id.append(PREFIX).append(fields[0]); |
|
96 |
id.append(Properties.SOURCE_PREFIX).append(fields[0]);
|
|
96 | 97 |
} |
97 | 98 |
String property = fields[1]; |
98 | 99 |
String value = fields[2]; |
... | ... | |
109 | 110 |
private void writeSingleKeysAndValues(Text result, Context context, List<String> tokenList, StringBuilder id) throws IOException, InterruptedException { |
110 | 111 |
for (String token : tokenList) { |
111 | 112 |
//Write BlockingKey, RecordID to output |
112 |
context.write(new Text(token), new Text(id + FIELD_DELIM + result.toString())); |
|
113 |
context.write(new Text(token), new Text(id + Properties.FIELD_DELIM + result.toString()));
|
|
113 | 114 |
context.getCounter(SOURCE_BUILD_COUNTERS.BLOCKING_KEYS).increment(1); |
114 | 115 |
} |
115 | 116 |
} |
116 | 117 |
|
117 | 118 |
private void writeCompositeKeyAndValue(Text result, Context context, List<String> tokenList, StringBuilder id) throws IOException, InterruptedException { |
118 | 119 |
String compositeKey = createCompositeKey(tokenList); |
119 |
|
|
120 |
context.write(new Text(compositeKey), new Text(id + FIELD_DELIM + result.toString())); |
|
120 |
context.write(new Text(compositeKey), new Text(id + Properties.FIELD_DELIM + result.toString())); |
|
121 | 121 |
context.getCounter(SOURCE_BUILD_COUNTERS.BLOCKING_KEYS).increment(1); |
122 | 122 |
} |
123 | 123 |
|
modules/dnet-openaire-lodinterlinking/branches/cacheOptimized/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/build/TargetBuildNoCacheMapper.java | ||
---|---|---|
2 | 2 |
|
3 | 3 |
import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.blocking.Blocking; |
4 | 4 |
import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.configuration.LodConfiguration; |
5 |
import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.configuration.Properties; |
|
5 | 6 |
import org.apache.hadoop.io.LongWritable; |
6 | 7 |
import org.apache.hadoop.io.Text; |
7 | 8 |
import org.apache.hadoop.mapreduce.Mapper; |
8 | 9 |
import org.apache.log4j.Logger; |
9 | 10 |
|
10 | 11 |
import java.io.IOException; |
11 |
import java.util.*; |
|
12 |
import java.util.ArrayList; |
|
13 |
import java.util.Arrays; |
|
14 |
import java.util.Collections; |
|
15 |
import java.util.HashSet; |
|
16 |
import java.util.List; |
|
17 |
import java.util.Set; |
|
12 | 18 |
|
13 | 19 |
|
14 | 20 |
/*-> Parse LOD dump files |
... | ... | |
22 | 28 |
*/ |
23 | 29 |
|
24 | 30 |
public class TargetBuildNoCacheMapper extends Mapper<LongWritable, Text, Text, Text> { |
25 |
private static final String PREFIX = "target_"; |
|
26 | 31 |
private Logger log = Logger.getLogger(this.getClass()); |
27 | 32 |
private LodConfiguration lodConfiguration = new LodConfiguration(); |
28 | 33 |
private HashSet<String> stopWordsMap = new HashSet<>(); |
29 |
private static final String FIELD_DELIM = "\t"; |
|
30 | 34 |
private boolean createCompositekey; |
31 | 35 |
private Set<String> usedProperties = new HashSet<>(); |
32 |
private static final String LINE_DELIM = "\t.\t"; |
|
33 |
|
|
36 |
|
|
34 | 37 |
public static enum TARGET_BUILD_COUNTERS { |
35 | 38 |
BLOCKING_KEYS, |
36 | 39 |
INPUT_TARGET_RECORDS |
... | ... | |
39 | 42 |
|
40 | 43 |
@Override |
41 | 44 |
protected void setup(Context context) throws IOException, InterruptedException { |
42 |
|
|
43 | 45 |
try { |
44 |
lodConfiguration.load(context.getConfiguration().get("lod.targetMappings"));
|
|
45 |
stopWordsMap.addAll(Arrays.asList(context.getConfiguration().get("lod.stopwords").split(",")));
|
|
46 |
createCompositekey = context.getConfiguration().getBoolean("createCompositekey", true);
|
|
46 |
lodConfiguration.load(context.getConfiguration().get(Properties.LOD_TARGET_MAPPINGS));
|
|
47 |
stopWordsMap.addAll(Arrays.asList(context.getConfiguration().get(Properties.LOD_STOPWORDS).split(",")));
|
|
48 |
createCompositekey = context.getConfiguration().getBoolean(Properties.LOD_CREATE_COMPOSITEKEY, true);
|
|
47 | 49 |
usedProperties.add("<http://purl.org/dc/terms/issued>"); |
48 | 50 |
usedProperties.add("<http://www.w3.org/2000/01/rdf-schema#label>"); |
49 |
|
|
50 | 51 |
} catch (Exception ex) { |
51 | 52 |
log.error("An error occured during Mapper Setup " + ex.toString(), ex); |
52 | 53 |
} |
... | ... | |
61 | 62 |
StringBuilder id = new StringBuilder(); |
62 | 63 |
|
63 | 64 |
context.getCounter(TARGET_BUILD_COUNTERS.INPUT_TARGET_RECORDS).increment(1); |
64 |
String[] triples = result.toString().split(LINE_DELIM); |
|
65 |
String[] triples = result.toString().split(Properties.LINE_DELIM);
|
|
65 | 66 |
|
66 | 67 |
List<String> tokenList = parseFieldsAndGenerateTokens(id, triples); |
67 | 68 |
|
... | ... | |
84 | 85 |
List<String> tokenList = new ArrayList<>(); |
85 | 86 |
|
86 | 87 |
for (String triple : triples) { |
87 |
String[] fields = triple.split(FIELD_DELIM); |
|
88 |
String[] fields = triple.split(Properties.FIELD_DELIM);
|
|
88 | 89 |
if (fields.length == 3) { |
89 | 90 |
if (id.length() < 1) { |
90 |
id.append(PREFIX).append(fields[0]); |
|
91 |
id.append(Properties.TARGET_PREFIX).append(fields[0]);
|
|
91 | 92 |
} |
92 | 93 |
String property = fields[1]; |
93 | 94 |
String value = fields[2]; |
... | ... | |
104 | 105 |
private void writeSingleKeysAndValues(Text result, Context context, List<String> tokenList, StringBuilder id) throws IOException, InterruptedException { |
105 | 106 |
for (String token : tokenList) { |
106 | 107 |
//Write BlockingKey, RecordID to output |
107 |
context.write(new Text(token), new Text(id + FIELD_DELIM + result.toString())); |
|
108 |
context.write(new Text(token), new Text(id + Properties.FIELD_DELIM + result.toString()));
|
|
108 | 109 |
context.getCounter(TARGET_BUILD_COUNTERS.BLOCKING_KEYS).increment(1); |
109 | 110 |
} |
110 | 111 |
} |
111 | 112 |
|
112 | 113 |
private void writeCompositeKeyAndValue(Text result, Context context, List<String> tokenList, StringBuilder id) throws IOException, InterruptedException { |
113 | 114 |
String compositeKey = createCompositeKey(tokenList); |
114 |
context.write(new Text(compositeKey), new Text(id + FIELD_DELIM + result.toString())); |
|
115 |
context.write(new Text(compositeKey), new Text(id + Properties.FIELD_DELIM + result.toString()));
|
|
115 | 116 |
context.getCounter(TARGET_BUILD_COUNTERS.BLOCKING_KEYS).increment(1); |
116 | 117 |
} |
117 | 118 |
|
... | ... | |
123 | 124 |
} |
124 | 125 |
return tokenString.toString(); |
125 | 126 |
} |
127 |
|
|
126 | 128 |
@Override |
127 | 129 |
protected void cleanup(Context context) throws IOException, InterruptedException { |
128 | 130 |
super.cleanup(context); |
modules/dnet-openaire-lodinterlinking/branches/cacheOptimized/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/utils/compators/DistanceAlgorithms.java | ||
---|---|---|
1 |
package eu.dnetlib.data.mapreduce.hbase.lodExport.utils.compators; |
|
2 |
|
|
3 |
public enum DistanceAlgorithms { |
|
4 |
LEVENSHTEIN, |
|
5 |
HAMMING |
|
6 |
} |
modules/dnet-openaire-lodinterlinking/branches/cacheOptimized/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/utils/compators/DistanceCalculator.java | ||
---|---|---|
2 | 2 |
|
3 | 3 |
/*Found in https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Java*/ |
4 | 4 |
|
5 |
import org.datanucleus.util.StringUtils; |
|
6 |
import org.hsqldb.lib.StringUtil; |
|
7 |
|
|
8 | 5 |
public class DistanceCalculator { |
9 | 6 |
|
10 | 7 |
public static int getLevenshteinDistance(String s1, String s2) { |
11 |
if (StringUtils.isEmpty(s1) || StringUtils.isEmpty(s2)) {
|
|
8 |
if (s1 == null || s2 == null) {
|
|
12 | 9 |
return 0; |
13 | 10 |
} |
14 | 11 |
|
... | ... | |
59 | 56 |
if (sourceValue != null && targetValue != null) { |
60 | 57 |
sourceValue = sourceValue.replaceAll("[^A-Za-z0-9]", "").toLowerCase(); |
61 | 58 |
targetValue = targetValue.replaceAll("[^A-Za-z0-9]", "").toLowerCase(); |
62 |
int similarChars = 0;
|
|
59 |
double similarChars = 0.0;
|
|
63 | 60 |
int end = sourceValue.length() <= targetValue.length() ? sourceValue.length() : targetValue.length(); |
64 | 61 |
|
65 | 62 |
for (int i = 0; i < end; ++i) { |
66 | 63 |
if (sourceValue.charAt(i) == targetValue.charAt(i)) { |
67 |
++similarChars;
|
|
64 |
similarChars++;
|
|
68 | 65 |
} |
69 | 66 |
} |
70 |
return sourceValue.length() >= targetValue.length() ? (double) similarChars / (double) sourceValue.length() : (double) similarChars / (double) targetValue.length();
|
|
67 |
return sourceValue.length() >= targetValue.length() ? similarChars / (double) sourceValue.length() : similarChars / (double) targetValue.length();
|
|
71 | 68 |
} else { |
72 | 69 |
return 0.0D; |
73 | 70 |
} |
modules/dnet-openaire-lodinterlinking/branches/cacheOptimized/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/utils/compators/MyComparator.java | ||
---|---|---|
1 | 1 |
|
2 | 2 |
package eu.dnetlib.data.mapreduce.hbase.lodExport.utils.compators; |
3 | 3 |
|
4 |
import org.datanucleus.util.StringUtils; |
|
5 |
import org.hsqldb.lib.StringUtil; |
|
6 |
|
|
7 | 4 |
import java.util.HashMap; |
8 |
import java.util.Iterator; |
|
9 | 5 |
import java.util.Map; |
10 | 6 |
|
11 | 7 |
|
12 | 8 |
public class MyComparator { |
13 |
private static double FIELDS_SIMILARITY_THRESHOLD = 0.7D; |
|
14 | 9 |
private static final String LINE_SEPERATOR = "\t.\t"; |
15 | 10 |
private static final String FIELD_DELIM = "\t"; |
16 | 11 |
private static Map<String, String> sourceRecordMappings = new HashMap<>(); |
17 | 12 |
|
18 |
public static double findMatchingPair(String source, String target, boolean useLevDistance) { |
|
13 |
public static double computeSimilarity(DistanceAlgorithms algorithm, String source, String target) { |
|
14 |
switch (algorithm) { |
|
15 |
case LEVENSHTEIN: |
|
16 |
return computeLevenshteinSimilarity(source, target); |
|
17 |
case HAMMING: |
|
18 |
default: |
|
19 |
return computeDistanceSimilarity(source, target); |
|
20 |
} |
|
21 |
|
|
22 |
} |
|
23 |
|
|
24 |
private static double computeDistanceSimilarity(String source, String target) { |
|
19 | 25 |
Map<String, String> sourceRecordMap = getRecordsFiledMap(source); |
20 | 26 |
Map<String, String> targetRecordMap = getRecordsFiledMap(target); |
21 |
int matchedFields = 0; |
|
22 |
double totalFields = (double) sourceRecordMap.size() > (double) targetRecordMap.size() ? (double) sourceRecordMap.size() : (double) targetRecordMap.size(); |
|
27 |
double totalFields = getTotalFields(sourceRecordMap, targetRecordMap); |
|
23 | 28 |
double recordSimilarity = 0.0D; |
24 | 29 |
int maxFieldLength = 1; |
25 | 30 |
|
26 | 31 |
for (Map.Entry<String, String> sourceField : sourceRecordMap.entrySet()) { |
27 | 32 |
String correspondingTargetField = sourceRecordMappings.get(sourceField.getKey()); |
28 | 33 |
String targetFieldValue = targetRecordMap.get(correspondingTargetField); |
29 |
if (StringUtils.isEmpty(targetFieldValue)) { |
|
30 |
break; |
|
31 |
} |
|
34 |
if (targetFieldValue != null) { |
|
35 |
if (sourceField.getValue().length() > maxFieldLength) { |
|
36 |
maxFieldLength = sourceField.getValue().length(); |
|
37 |
} |
|
38 |
if (targetFieldValue.length() > maxFieldLength) { |
|
39 |
maxFieldLength = targetFieldValue.length(); |
|
40 |
} |
|
32 | 41 |
|
33 |
if (sourceField.getValue().length() > maxFieldLength) {
|
|
34 |
maxFieldLength = sourceField.getValue().length();
|
|
42 |
double fieldsSimilarity = DistanceCalculator.getSimpleDistance(sourceField.getValue(), targetFieldValue);
|
|
43 |
recordSimilarity += fieldsSimilarity;
|
|
35 | 44 |
} |
36 |
if (targetFieldValue.length() > maxFieldLength) { |
|
37 |
maxFieldLength = targetFieldValue.length(); |
|
38 |
} |
|
45 |
} |
|
39 | 46 |
|
47 |
return recordSimilarity / totalFields; |
|
48 |
} |
|
40 | 49 |
|
41 |
double fieldsSimilarity; |
|
42 |
if (useLevDistance) { |
|
43 |
fieldsSimilarity = DistanceCalculator.getLevenshteinDistance(sourceField.getValue(), targetFieldValue); |
|
44 |
} else { |
|
45 |
fieldsSimilarity = DistanceCalculator.getSimpleDistance(sourceField.getValue(), targetFieldValue); |
|
46 |
} |
|
50 |
private static double getTotalFields(Map<String, String> sourceRecordMap, Map<String, String> targetRecordMap) { |
|
51 |
return (double) (sourceRecordMap.size() > targetRecordMap.size() ? sourceRecordMap.size() : targetRecordMap.size()); |
|
52 |
} |
|
47 | 53 |
|
48 |
recordSimilarity += fieldsSimilarity; |
|
49 |
if (fieldsSimilarity >= FIELDS_SIMILARITY_THRESHOLD) { |
|
50 |
matchedFields++; |
|
54 |
private static double computeLevenshteinSimilarity(String source, String target) { |
|
55 |
Map<String, String> sourceRecordMap = getRecordsFiledMap(source); |
|
56 |
Map<String, String> targetRecordMap = getRecordsFiledMap(target); |
|
57 |
double recordSimilarity = 0.0D; |
|
58 |
int maxFieldLength = 1; |
|
59 |
double totalFields = getTotalFields(sourceRecordMap, targetRecordMap); |
|
60 |
|
|
61 |
for (Map.Entry<String, String> sourceField : sourceRecordMap.entrySet()) { |
|
62 |
String correspondingTargetField = sourceRecordMappings.get(sourceField.getKey()); |
|
63 |
String targetFieldValue = targetRecordMap.get(correspondingTargetField); |
|
64 |
if (targetFieldValue != null) { |
|
65 |
if (sourceField.getValue().length() > maxFieldLength) { |
|
66 |
maxFieldLength = sourceField.getValue().length(); |
|
67 |
} |
|
68 |
if (targetFieldValue.length() > maxFieldLength) { |
|
69 |
maxFieldLength = targetFieldValue.length(); |
|
70 |
} |
|
71 |
|
|
72 |
double levenshteinDistance = DistanceCalculator.getLevenshteinDistance(sourceField.getValue(), targetFieldValue); |
|
73 |
double longerStringSize = sourceField.getValue().length() > targetFieldValue.length() ? sourceField.getValue().length() : targetFieldValue.length(); |
|
74 |
double fieldsSimilarity = (longerStringSize - levenshteinDistance) / longerStringSize; |
|
75 |
recordSimilarity += fieldsSimilarity; |
|
51 | 76 |
} |
52 | 77 |
} |
53 | 78 |
|
54 |
if (useLevDistance) { |
|
55 |
return 1 - (recordSimilarity / maxFieldLength); |
|
56 |
} |
|
57 | 79 |
return recordSimilarity / totalFields; |
58 | 80 |
} |
59 | 81 |
|
... | ... | |
61 | 83 |
private static Map<String, String> getRecordsFiledMap(String source) { |
62 | 84 |
String sourceRecord = source.substring(source.indexOf(FIELD_DELIM) + 1, source.length()).trim(); |
63 | 85 |
String[] sourceTriples = sourceRecord.split(LINE_SEPERATOR); |
64 |
Map<String, String> sourceFieldsMap = new HashMap(); |
|
86 |
Map<String, String> sourceFieldsMap = new HashMap<>();
|
|
65 | 87 |
|
66 | 88 |
|
67 | 89 |
for (int i = 0; i < sourceTriples.length - 1; i++) { |
... | ... | |
87 | 109 |
|
88 | 110 |
String target = "target_conf/clef/LarsonNJ08\tconf/clef/LarsonNJ08\t<http://www.w3.org/2000/01/rdf-schema#label>\tOverview of VideoCLEF 2008: Automatic Generation of Topic-Based Feeds for Dual Language Audio-Visual Content.\t.\t,"; |
89 | 111 |
String source = "source_<http://lod.openaire.eu/data/result/od_______119::60f21cae791a925a78d0844ad00cea5a>\t<http://lod.openaire.eu/data/result/od_______119::60f21cae791a925a78d0844ad00cea5a>\t<http://purl.org/dc/terms/identifier>\t\"od_______119::60f21cae791a925a78d0844ad00cea5a\"\t.\t<http://lod.openaire.eu/data/result/od_______119::60f21cae791a925a78d0844ad00cea5a>\t<http://purl.org/dc/terms/identifier>\t\"oai:doras.dcu.ie:16187\"\t.\t<http://lod.openaire.eu/data/result/od_______119::60f21cae791a925a78d0844ad00cea5a>\t<http://www.eurocris.org/ontologies/cerif/1.3#name>\t\"Overview of VideoCLEF 2008: Automatic generation of topic based feeds for dual language audio visual content\"\t.\t,"; |
90 |
System.out.println(MyComparator.findMatchingPair(source, target, true)); |
|
112 |
/* |
|
113 |
System.out.println("Hamming :" + MyComparator.computeSimilarity(DistanceAlgorithms.HAMMING, source, target)); |
|
114 |
System.out.println("Lev :" + MyComparator.computeSimilarity(DistanceAlgorithms.LEVENSHTEIN, source, target)); |
|
115 |
*/ |
|
91 | 116 |
|
117 |
target = "target_conf/clef/LarsonNJ08\tconf/clef/LarsonNJ08\t<http://www.w3.org/2000/01/rdf-schema#label>\tOverview of VideoCLEF 2008: Automatic Generation of Topic-Based Feeds for Dual Language Audio-Visual Content.\t.\t,"; |
|
118 |
source = "source_<http://lod.openaire.eu/data/result/od_______119::60f21cae791a925a78d0844ad00cea5a>\t<http://lod.openaire.eu/data/result/od_______119::60f21cae791a925a78d0844ad00cea5a>\t<http://purl.org/dc/terms/identifier>\t\"od_______119::60f21cae791a925a78d0844ad00cea5a\"\t.\t<http://lod.openaire.eu/data/result/od_______119::60f21cae791a925a78d0844ad00cea5a>\t<http://purl.org/dc/terms/identifier>\t\"oai:doras.dcu.ie:16187\"\t.\t<http://lod.openaire.eu/data/result/od_______119::60f21cae791a925a78d0844ad00cea5a>\t<http://www.eurocris.org/ontologies/cerif/1.3#name>\t\"Overview of VideoCLEF 2008: Automatic generation of topic based feeds for visual content\"\t.\t,"; |
|
119 |
|
|
120 |
System.out.println("Hamming :" + MyComparator.computeSimilarity(DistanceAlgorithms.HAMMING, source, target)); |
|
121 |
System.out.println("Lev :" + MyComparator.computeSimilarity(DistanceAlgorithms.LEVENSHTEIN, source, target)); |
|
122 |
|
|
123 |
|
|
92 | 124 |
} |
93 | 125 |
|
94 | 126 |
} |
modules/dnet-openaire-lodinterlinking/branches/cacheOptimized/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/utils/configuration/Properties.java | ||
---|---|---|
1 |
package eu.dnetlib.data.mapreduce.hbase.lodExport.utils.configuration; |
|
2 |
|
|
3 |
public class Properties { |
|
4 |
|
|
5 |
|
|
6 |
public static final String LOD_DISTANCE_ALGORITHM = "lod.distanceAlgorithm"; |
|
7 |
public static final String LOD_SIMILARITY_THRESHOLD = "lod.similarityThreshold"; |
|
8 |
public static final String LOD_CREATE_COMPOSITEKEY = "lod.createCompositekey"; |
|
9 |
public static final String LOD_SOURCE_MAPPINGS = "lod.sourceMappings"; |
|
10 |
public static final String LOD_STOPWORDS = "lod.stopwords"; |
|
11 |
public static final String SOURCE_PREFIX = "source_"; |
|
12 |
public static final String LOD_TARGET_MAPPINGS = "lod.targetMappings"; |
|
13 |
public static final String TARGET_PREFIX = "target_"; |
|
14 |
public static final String FIELD_DELIM = "\t"; |
|
15 |
public static final String LINE_DELIM = "\t.\t"; |
|
16 |
public static final String OPTIMAL_BLOCKS = "optimalBlockSize"; |
|
17 |
} |
modules/dnet-openaire-lodinterlinking/branches/cacheOptimized/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/utils/blocking/Blocking.java | ||
---|---|---|
72 | 72 |
return tokens.toString(); |
73 | 73 |
} |
74 | 74 |
|
75 |
|
|
76 |
public static void main(String[] args) { |
|
77 |
|
|
78 |
List<String> tokenList = new ArrayList<>(); |
|
79 |
|
|
80 |
HashSet<String> stopwordsMap = new HashSet<>(); |
|
81 |
stopwordsMap.add("and"); |
|
82 |
|
|
83 |
String field = "A test string"; |
|
84 |
String tokens = tokenBlocking(field, stopwordsMap); |
|
85 |
System.out.println(tokens); |
|
86 |
tokenList.add(tokens); |
|
87 |
|
|
88 |
field = "1990"; |
|
89 |
tokens = tokenBlocking(field, stopwordsMap); |
|
90 |
System.out.println(tokens); |
|
91 |
tokenList.add(tokens); |
|
92 |
|
|
93 |
Collections.sort(tokenList); |
|
94 |
StringBuilder tokenString = new StringBuilder(); |
|
95 |
|
|
96 |
for (String t : tokenList) { |
|
97 |
tokenString.append(t).append(" "); |
|
98 |
} |
|
99 |
|
|
100 |
System.out.println(tokenString.toString()); |
|
101 |
|
|
102 |
|
|
103 |
} |
|
75 | 104 |
} |
modules/dnet-openaire-lodinterlinking/branches/cacheOptimized/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/preprocessing/DatasetReducer.java | ||
---|---|---|
12 | 12 |
|
13 | 13 |
public class DatasetReducer extends Reducer<Text, Text, Text, Text> { |
14 | 14 |
public static enum ENTITIES_COUNTER { |
15 |
|
|
16 | 15 |
TOTAL_ENTITIES, |
17 | 16 |
} |
18 | 17 |
|
... | ... | |
30 | 29 |
protected void reduce(final Text key, final Iterable<Text> values, final Context context) throws IOException, InterruptedException { |
31 | 30 |
Iterator<Text> it = values.iterator(); |
32 | 31 |
StringBuffer value = new StringBuffer(); |
33 |
|
|
34 | 32 |
String[] tmp = key.toString().split(","); |
35 | 33 |
String dataset = tmp[0].replaceAll("[^a-zA-Z0-9]", ""); |
36 | 34 |
String entityType = tmp[1].replaceAll("[^a-zA-Z0-9]", ""); |
modules/dnet-openaire-lodinterlinking/branches/cacheOptimized/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/preprocessing/SourceMapper.java | ||
---|---|---|
1 | 1 |
package eu.dnetlib.data.mapreduce.hbase.lodExport.preprocessing; |
2 | 2 |
|
3 |
import java.io.IOException;
|
|
4 |
|
|
3 |
import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.configuration.LodConfiguration;
|
|
4 |
import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.configuration.Properties; |
|
5 | 5 |
import org.apache.commons.lang.StringUtils; |
6 | 6 |
import org.apache.hadoop.io.LongWritable; |
7 | 7 |
import org.apache.hadoop.io.Text; |
8 | 8 |
import org.apache.hadoop.mapreduce.Mapper; |
9 | 9 |
import org.apache.log4j.Logger; |
10 | 10 |
|
11 |
import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.configuration.LodConfiguration;
|
|
11 |
import java.io.IOException;
|
|
12 | 12 |
|
13 | 13 |
/** |
14 | 14 |
* Mapper Class that reads HBASE contents and prepares them for the StatsDB |
... | ... | |
29 | 29 |
<http://lod.openaire.eu/data/result/doajarticles::89217af00809a91acc15a416e56b3782> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.eurocris.org/ontologies/cerif/1.3#ResultEntity> . |
30 | 30 |
<http://lod.openaire.eu/data/result/doajarticles::89217af00809a91acc15a416e56b3782> <http://www.eurocris.org/ontologies/cerif/1.3#name> "Une nouvelle anomalie a allure h r ditaire chez des agneaux it khouzistans /it" . |
31 | 31 |
*/ |
32 |
|
|
33 | 32 |
public class SourceMapper extends Mapper<LongWritable, Text, Text, Text> { |
34 | 33 |
|
35 | 34 |
private Logger log = Logger.getLogger(SourceMapper.class); |
36 |
|
|
37 | 35 |
private LodConfiguration lodConfiguration; |
38 | 36 |
|
39 | 37 |
public static enum SOURCE_COUNTERS { |
... | ... | |
44 | 42 |
@Override |
45 | 43 |
protected void setup(Context context) throws IOException, InterruptedException { |
46 | 44 |
lodConfiguration = new LodConfiguration(); |
47 |
lodConfiguration.load(context.getConfiguration().get("lod.sourceMappings")); |
|
48 |
log.info("file loaded!" + context.getConfiguration().get("lod.sourceMappings")); |
|
49 |
|
|
45 |
lodConfiguration.load(context.getConfiguration().get(Properties.LOD_SOURCE_MAPPINGS)); |
|
46 |
log.info("file loaded!" + context.getConfiguration().get(Properties.LOD_SOURCE_MAPPINGS)); |
|
50 | 47 |
} |
51 | 48 |
|
52 |
|
|
53 | 49 |
@Override |
54 | 50 |
protected void map(final LongWritable keyIn, final Text result, final Context context) throws IOException { |
55 | 51 |
|
56 | 52 |
try { |
57 |
|
|
58 | 53 |
context.getCounter(SOURCE_COUNTERS.TOTAL_ENTITIES).increment(1); |
59 |
|
|
60 | 54 |
//get ID - output source_recordID so we can group by id and get all props of a record |
61 | 55 |
|
62 | 56 |
StringBuilder value = new StringBuilder(); |
... | ... | |
122 | 116 |
super.cleanup(context); |
123 | 117 |
} |
124 | 118 |
|
125 |
|
|
126 | 119 |
} |
modules/dnet-openaire-lodinterlinking/branches/cacheOptimized/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/preprocessing/TargetMapper.java | ||
---|---|---|
1 | 1 |
package eu.dnetlib.data.mapreduce.hbase.lodExport.preprocessing; |
2 | 2 |
|
3 | 3 |
import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.configuration.LodConfiguration; |
4 |
import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.configuration.Properties; |
|
4 | 5 |
import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.parsing.DblpCsvParser; |
5 | 6 |
import org.apache.hadoop.io.LongWritable; |
6 | 7 |
import org.apache.hadoop.io.Text; |
... | ... | |
15 | 16 |
* export |
16 | 17 |
*/ |
17 | 18 |
public class TargetMapper extends Mapper<LongWritable, Text, Text, Text> { |
18 |
|
|
19 | 19 |
private Logger log = Logger.getLogger(TargetMapper.class); |
20 | 20 |
|
21 | 21 |
private LodConfiguration lodConfiguration; |
... | ... | |
27 | 27 |
@Override |
28 | 28 |
protected void setup(Context context) throws IOException, InterruptedException { |
29 | 29 |
lodConfiguration = new LodConfiguration(); |
30 |
lodConfiguration.load(context.getConfiguration().get("lod.targetMappings"));
|
|
31 |
log.info("file loaded!" + context.getConfiguration().get("lod.targetMappings"));
|
|
30 |
lodConfiguration.load(context.getConfiguration().get(Properties.LOD_TARGET_MAPPINGS));
|
|
31 |
log.info("file loaded!" + context.getConfiguration().get(Properties.LOD_TARGET_MAPPINGS));
|
|
32 | 32 |
} |
33 | 33 |
|
34 | 34 |
|
... | ... | |
38 | 38 |
try { |
39 | 39 |
|
40 | 40 |
context.getCounter(TARGET_COUNTERS.TOTAL_ENTITIES).increment(1); |
41 |
|
|
42 | 41 |
Text[] pair = DblpCsvParser.parseTriples(result, lodConfiguration); |
43 | 42 |
context.write(pair[0], pair[1]); |
44 | 43 |
context.getCounter(TARGET_COUNTERS.TARGET_ENTITIES).increment(1); |
modules/dnet-openaire-lodinterlinking/branches/cacheOptimized/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/linkage/LinkCustomReducer.java | ||
---|---|---|
1 | 1 |
package eu.dnetlib.data.mapreduce.hbase.lodExport.linkage; |
2 | 2 |
|
3 | 3 |
import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.caching.RedisUtils; |
4 |
import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.compators.DistanceAlgorithms; |
|
4 | 5 |
import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.compators.MyComparator; |
6 |
import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.configuration.Properties; |
|
5 | 7 |
import org.apache.hadoop.io.Text; |
6 | 8 |
import org.apache.hadoop.mapreduce.Reducer; |
7 | 9 |
import org.apache.log4j.Logger; |
... | ... | |
14 | 16 |
private Logger log = Logger.getLogger(LinkCustomReducer.class); |
15 | 17 |
private RedisUtils redisUtils; |
16 | 18 |
private static final String SEPERATOR = ","; |
17 |
private static double RECORD_SIMILARITY_THRESHOLD = 0.8;
|
|
18 |
private boolean useLevDistance;
|
|
19 |
private double similarityThreshold;
|
|
20 |
private String distanceAlgorithm;
|
|
19 | 21 |
|
20 | 22 |
public static enum LINK_RECUDER_COUNTERS { |
21 | 23 |
TARGET_RECORDS_EXAMINED, |
... | ... | |
29 | 31 |
|
30 | 32 |
@Override |
31 | 33 |
protected void setup(Context context) throws IOException, InterruptedException { |
32 |
|
|
33 | 34 |
try { |
34 |
useLevDistance = context.getConfiguration().getBoolean("lod.useLevDistance", false); |
|
35 |
similarityThreshold = context.getConfiguration().getDouble(Properties.LOD_SIMILARITY_THRESHOLD, 0.8); |
|
36 |
log.info("SIMILARITY THRESHOLD " + similarityThreshold); |
|
37 |
distanceAlgorithm = context.getConfiguration().get(Properties.LOD_DISTANCE_ALGORITHM); |
|
35 | 38 |
redisUtils = new RedisUtils(context.getConfiguration()); |
36 | 39 |
} catch (Exception e) { |
37 | 40 |
log.error("Error connecting to Redis " + e.toString()); |
38 | 41 |
throw new RuntimeException(e); |
39 | 42 |
} |
40 |
|
|
41 | 43 |
} |
42 | 44 |
|
43 | 45 |
@Override |
... | ... | |
76 | 78 |
//store target records to have less I/O ops per record |
77 | 79 |
//test also version where we get each record again in each loop |
78 | 80 |
|
79 |
|
|
80 | 81 |
//TODO |
81 | 82 |
//String targetRecord = redisUtils.getValue(targetId);} |
82 | 83 |
String targetRecord = targetRecords.get(targetId); |
83 |
double similarity = MyComparator.findMatchingPair(sourceRecord, targetRecord, useLevDistance); |
|
84 |
|
|
84 |
double similarity = MyComparator.computeSimilarity(DistanceAlgorithms.valueOf(distanceAlgorithm), sourceRecord, targetRecord); |
|
85 | 85 |
context.getCounter(LINK_RECUDER_COUNTERS.RECORD_COMPARISONS).increment(1); |
86 | 86 |
|
87 |
if (similarity >= RECORD_SIMILARITY_THRESHOLD) {
|
|
87 |
if (similarity >= similarityThreshold) {
|
|
88 | 88 |
try { |
89 |
context.write(new Text(sourceId), |
|
90 |
new Text(targetId + SEPERATOR + similarity + SEPERATOR + sourceRecord.replaceAll(sourceId, "") |
|
89 |
context.write(new Text(sourceId), new Text(targetId + SEPERATOR + similarity + SEPERATOR + sourceRecord.replaceAll(sourceId, "") |
|
91 | 90 |
+ SEPERATOR + targetRecord.replaceAll(targetId, ""))); |
92 | 91 |
context.getCounter(LINK_RECUDER_COUNTERS.MATCHING_PAIRS).increment(1); |
93 | 92 |
} catch (Exception ex) { |
modules/dnet-openaire-lodinterlinking/branches/cacheOptimized/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/linkage/LinkMapper.java | ||
---|---|---|
1 | 1 |
package eu.dnetlib.data.mapreduce.hbase.lodExport.linkage; |
2 | 2 |
|
3 | 3 |
import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.caching.RedisUtils; |
4 |
import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.configuration.Properties; |
|
4 | 5 |
import org.apache.hadoop.io.Text; |
5 | 6 |
import org.apache.hadoop.mapreduce.Mapper; |
6 | 7 |
import org.apache.log4j.Logger; |
... | ... | |
13 | 14 |
private RedisUtils redisUtils; |
14 | 15 |
private int optimalBlockSize; |
15 | 16 |
private static final char SEPERATOR = ','; |
16 |
private static final String OPTIMAL_BLOCKS = "optimalBlockSize"; |
|
17 | 17 |
|
18 |
public static enum TEST_COUNTERS {
|
|
18 |
public static enum LINK_MAPPER_COUNTERS {
|
|
19 | 19 |
OPTIMAL_BLOCK_SIZE, |
20 | 20 |
ERROR_COUNTERS, |
21 | 21 |
DISCARDED_BLOCKS, |
... | ... | |
29 | 29 |
protected void setup(Context context) throws IOException, InterruptedException { |
30 | 30 |
try { |
31 | 31 |
redisUtils = new RedisUtils(context.getConfiguration()); |
32 |
optimalBlockSize = Integer.valueOf(redisUtils.getValue(OPTIMAL_BLOCKS)); |
|
33 |
context.getCounter(TEST_COUNTERS.OPTIMAL_BLOCK_SIZE).setValue(optimalBlockSize);
|
|
32 |
optimalBlockSize = Integer.valueOf(redisUtils.getValue(Properties.OPTIMAL_BLOCKS));
|
|
33 |
context.getCounter(LINK_MAPPER_COUNTERS.OPTIMAL_BLOCK_SIZE).setValue(optimalBlockSize);
|
|
34 | 34 |
log.info("optimalBlockSize " + optimalBlockSize); |
35 | 35 |
} catch (Exception e) { |
36 | 36 |
log.error("ERROR CONNECTING TO REDIS ", e); |
... | ... | |
46 | 46 |
int recordsNumber = countRecords(result); |
47 | 47 |
//purge blocks that contain only source or target entities |
48 | 48 |
//how many comparisons we have purged |
49 |
context.getCounter(TEST_COUNTERS.TOTAL_COMPARISONS).increment(recordsNumber * (recordsNumber - 1));
|
|
49 |
context.getCounter(LINK_MAPPER_COUNTERS.TOTAL_COMPARISONS).increment(recordsNumber * (recordsNumber - 1));
|
|
50 | 50 |
if (recordsNumber >= optimalBlockSize) { |
51 |
context.getCounter(TEST_COUNTERS.DISCARDED_BLOCKS).increment(1);
|
|
51 |
context.getCounter(LINK_MAPPER_COUNTERS.DISCARDED_BLOCKS).increment(1);
|
|
52 | 52 |
//TODO count source / target records and get actual comparison number |
53 |
context.getCounter(TEST_COUNTERS.DISCARDED_COMPARISONS).increment(recordsNumber * (recordsNumber - 1));
|
|
53 |
context.getCounter(LINK_MAPPER_COUNTERS.DISCARDED_COMPARISONS).increment(recordsNumber * (recordsNumber - 1));
|
|
54 | 54 |
} else { |
55 | 55 |
context.write(keyIn, result); |
56 |
context.getCounter(TEST_COUNTERS.PASSED_BLOCKS).increment(1);
|
|
56 |
context.getCounter(LINK_MAPPER_COUNTERS.PASSED_BLOCKS).increment(1);
|
|
57 | 57 |
} |
58 |
context.getCounter(TEST_COUNTERS.TOTAL_BLOCKS).increment(1);
|
|
58 |
context.getCounter(LINK_MAPPER_COUNTERS.TOTAL_BLOCKS).increment(1);
|
|
59 | 59 |
|
60 | 60 |
} catch (Exception e) { |
61 | 61 |
log.error("key causing error is " + keyIn); |
62 | 62 |
log.error("ERROR ", e); |
63 |
context.getCounter(TEST_COUNTERS.ERROR_COUNTERS).increment(1);
|
|
63 |
context.getCounter(LINK_MAPPER_COUNTERS.ERROR_COUNTERS).increment(1);
|
|
64 | 64 |
throw new IOException(e.toString(), e); |
65 | 65 |
} |
66 | 66 |
} |
modules/dnet-openaire-lod-interlinking-wf/src/main/resources/eu/dnetlib/iis/core/javamapreduce/lodexport/job.properties | ||
---|---|---|
60 | 60 |
stopwordsReducers=1 |
61 | 61 |
lodBlocksInput=/tmp/lod_dump/ |
62 | 62 |
lodBlocksOutput=/tmp/lod_dump/stopwords |
63 |
lod_createCompositekey=true |
|
64 |
lod_distanceAlgorithm=HAMMING |
|
65 |
lod_similarityThreshold=0.8 |
|
63 | 66 |
lodPrefix=http://lod.openaire.eu/data/result/ |
64 | 67 |
lodStopwords=a,able,about,above,abst,accordance,according,accordingly,across,act,actually,added,adj,affected,affecting,affects,after,afterwards,again,against,ah,all,almost,alone,along,already,also,although,always,am,among,amongst,an,and,announce,another,any,anybody,anyhow,anymore,anyone,anything,anyway,anyways,anywhere,apparently,approximately,are,aren,arent,arise,around,as,aside,ask,asking,at,auth,available,away,awfully,b,back,be,became,because,become,becomes,becoming,been,before,beforehand,begin,beginning,beginnings,begins,behind,being,believe,below,beside,besides,between,beyond,biol,both,brief,briefly,but,by,c,ca,came,can,cannot,cant,cause,causes,certain,certainly,co,com,come,comes,contain,containing,contains,could,couldnt,d,date,did,didnt,different,do,does,doesnt,doing,done,dont,down,downwards,due,during,e,each,ed,edu,effect,eg,eight,eighty,either,else,elsewhere,end,ending,enough,especially,et,et-al,etc,even,ever,every,everybody,everyone,everything,everywhere,ex,except,f,far,few,ff,fifth,first,five,fix,followed,following,follows,for,former,formerly,forth,found,four,from,further,furthermore,g,gave,get,gets,getting,give,given,gives,giving,go,goes,gone,got,gotten,h,had,happens,hardly,has,hasnt,have,havent,having,he,hed,hence,her,here,hereafter,hereby,herein,heres,hereupon,hers,herself,hes,hi,hid,him,himself,his,hither,home,how,howbeit,however,hundred,i,id,ie,if,ill,im,immediate,immediately,importance,important,in,inc,indeed,index,information,instead,into,invention,inward,is,isnt,it,itd,itll,its,itself,ive,j,just,k,keep, 
keeps,kept,kg,km,know,known,knows,l,largely,last,lately,later,latter,latterly,least,less,lest,let,lets,like,liked,likely,line,little,ll,look,looking,looks,ltd,m,made,mainly,make,makes,many,may,maybe,me,mean,means,meantime,meanwhile,merely,mg,might,million,miss,ml,more,moreover,most,mostly,mr,mrs,much,mug,must,my,myself,n,na,name,namely,nay,nd,near,nearly,necessarily,necessary,need,needs,neither,never,nevertheless,new,next,nine,ninety,no,nobody,non,none,nonetheless,noone,nor,normally,nos,not,noted,nothing,now,nowhere,o,obtain,obtained,obviously,of,off,often,oh,ok,okay,old,omitted,on,once,one,ones,only,onto,or,ord,other,others,otherwise,ought,our,ours,ourselves,out,outside,over,overall,owing,own,p,page,pages,part,particular,particularly,past,per,perhaps,placed,please,plus,poorly,possible,possibly,potentially,pp,predominantly,present,previously,primarily,probably,promptly,proud,provides,put,q,que,quickly,quite,qv,r,ran,rather,rd,re,readily,really,recent,recently,ref,refs,regarding,regardless,regards,related,relatively,research,respectively,resulted,resulting,results,right,run,s,said,same,saw,say,saying,says,sec,section,see,seeing,seem,seemed,seeming,seems,seen,self,selves,sent,seven,several,shall,she,shed,shell,shes,should,shouldnt,show,showed,shown,showns,shows,significant,significantly,similar,similarly,since,six,slightly,so,some,somebody,somehow,someone,somethan,something,sometime,sometimes,somewhat,somewhere,soon,sorry,specifically,specified,specify,specifying,still,stop,strongly,sub,substantially,successfully,such,sufficiently,suggest,sup,sure,t,take,taken,taking,tell,tends,th,than,thank,thanks,thanx,that,thatll,thats,thatve,the,their,theirs,them,themselves,then,thence,there,thereafter,thereby,thered,therefore,therein,therell,thereof,therere,theres,thereto,thereupon,thereve,these,they,theyd,theyll,theyre,theyve,think,this,those,thou,though,thoughh,thousand,throug,through,throughout,thru,thus,til,tip,to,together,too,took,toward,towards,tried,tries,truly,try,trying,
ts,twice,two,u,un,under,unfortunately,unless,unlike,unlikely,until,unto,up,upon,ups,us,use,used,useful,usefully,usefulness,uses,using,usually,v,value,various,ve,very,via,viz,vol,vols,vs,w,want,wants,was,wasnt,way,we,wed,welcome,well,went,were,werent,weve,what,whatever,whatll,whats,when,whence,whenever,where,whereafter,whereas,whereby,wherein,wheres,whereupon,wherever,whether,which,while,whim,whither,who,whod,whoever,whole,wholl,whom,whomever,whos,whose,why,widely,willing,wish,with,within,without,wont,words,world,would,wouldnt,www,x,y,yes,yet,you,youd,youll,your,youre,yours,yourself,yourselves,youve,z,zero |
65 | 68 |
lod_configXML=<?xml version="1.0" encoding="UTF-8"?> <!DOCTYPE LIMES SYSTEM "limes.dtd"> <LIMES> <PREFIX> <NAMESPACE>http://www.w3.org/1999/02/22-rdf-syntax-ns#</NAMESPACE> <LABEL>rdf</LABEL> </PREFIX> <PREFIX> <NAMESPACE>http://www.w3.org/2000/01/rdf-schema#</NAMESPACE> <LABEL>rdfs</LABEL> </PREFIX> <SOURCE> <ID>source1</ID> <ENDPOINT>/user/kanakakis/groundTruth/sourceNT</ENDPOINT> <VAR>?x</VAR> <PAGESIZE>100</PAGESIZE> <RESTRICTION>?x rdf:type http://www.eurocris.org/ontologies/cerif/1.3#ResultEntity</RESTRICTION> <PROPERTY>http://lod.openaire.eu/vocab/year RENAME Year</PROPERTY> <PROPERTY>http://www.w3.org/1999/02/22-rdf-syntax-ns# RENAME type</PROPERTY> <PROPERTY>http://purl.org/dc/terms/identifier RENAME id</PROPERTY> <PROPERTY>http://www.eurocris.org/ontologies/cerif/1.3#name AS lowercase->regexreplace("[^A-Za-z0-9]"," ") RENAME publicationName</PROPERTY> </SOURCE> <TARGET> <ID>source2</ID> <ENDPOINT>/user/kanakakis/groundTruth/targetNT</ENDPOINT> <VAR>?y</VAR> <PAGESIZE>100</PAGESIZE> <RESTRICTION>?y rdf:type http://swrc.ontoware.org/ontology#Article</RESTRICTION> <PROPERTY>http://www.w3.org/1999/02/22-rdf-syntax-ns# RENAME type</PROPERTY> <PROPERTY>http://purl.org/dc/terms/issued RENAME Year</PROPERTY> <PROPERTY>http://purl.org/dc/terms/identifier RENAME id</PROPERTY> <PROPERTY>http://www.w3.org/2000/01/rdf-schema#label AS lowercase->regexreplace("[^A-Za-z0-9]"," ") RENAME articleName</PROPERTY> </TARGET> <METRIC>AND(jaro(x.publicationName,y.articleName)|0.8,jaro(x.Year,y.Year)|1.0)</METRIC> <!-- <METRIC>jaro(x.publicatioName,y.articleName)|0.7</METRIC> --> <ACCEPTANCE> <THRESHOLD>0.8</THRESHOLD> <FILE>/user/kanakakis/groundTruth/accepted_links_0.8_no_purge</FILE> <RELATION>owl:sameAs</RELATION> </ACCEPTANCE> <REVIEW> <THRESHOLD>0.8</THRESHOLD> <FILE>/user/kanakakis/groundTruth/verified_links_0.8</FILE> <RELATION>owl:sameAs</RELATION> </REVIEW> <EXECUTION>Default</EXECUTION> <OUTPUT>TTL</OUTPUT> </LIMES> |
modules/dnet-openaire-lod-interlinking-wf/src/main/resources/eu/dnetlib/iis/core/javamapreduce/lodexport/oozie_app/workflow.xml | ||
---|---|---|
17 | 17 |
</configuration> |
18 | 18 |
</global> |
19 | 19 |
|
20 |
<start to='linkage'/>
|
|
20 |
<start to='build'/>
|
|
21 | 21 |
<action name="preProcessing"> |
22 | 22 |
<map-reduce> |
23 | 23 |
<configuration> |
... | ... | |
63 | 63 |
|
64 | 64 |
<name>mapred.input.dir.formats</name> |
65 | 65 |
<value> |
66 |
${nameNode}${sourceInput};org.apache.hadoop.mapreduce.lib.input.TextInputFormat,${nameNode}${targetInput};org.apache.hadoop.mapreduce.lib.input.TextInputFormat |
|
66 |
${nameNode}${sourceInput};org.apache.hadoop.mapreduce.lib.input.TextInputFormat,${nameNode}${targetInput};eu.dnetlib.data.mapreduce.hbase.lodExport.preprocessing.XmlInputFormat |
|
67 |
|
|
67 | 68 |
</value> |
68 | 69 |
</property> |
69 | 70 |
|
... | ... | |
255 | 256 |
</property> |
256 | 257 |
|
257 | 258 |
|
259 |
<property> |
|
260 |
<name>lod.createCompositekey</name> |
|
261 |
<value>${lod_createCompositekey}</value> |
|
262 |
</property> |
|
263 |
|
|
264 |
<property> |
|
265 |
<name>lod.createCompositekey</name> |
|
266 |
<value>${lod_createCompositekey}</value> |
|
267 |
</property> |
|
268 |
|
|
269 |
|
|
258 | 270 |
<!-- ## Workflow node parameters --> |
259 | 271 |
<property> |
260 | 272 |
<name>mapred.reduce.tasks</name> |
... | ... | |
644 | 656 |
</property> |
645 | 657 |
|
646 | 658 |
|
659 |
<property> |
|
660 |
<name>lod.distanceAlgorithm</name> |
|
661 |
<value>${lod_distanceAlgorithm}</value> |
|
662 |
</property> |
|
663 |
<property> |
|
664 |
<name>lod.similarityThreshold</name> |
|
665 |
<value>${lod_similarityThreshold}</value> |
|
666 |
</property> |
|
667 |
|
|
668 |
|
|
669 |
|
|
647 | 670 |
<!-- ## Workflow node parameters --> |
648 | 671 |
<property> |
649 | 672 |
<name>mapred.reduce.tasks</name> |
Also available in: Unified diff
Final update - fixed distance algorithms, added properties in workflow