1
|
package eu.dnetlib.data.mapreduce.hbase.lodExport.utils;
|
2
|
|
3
|
import org.apache.log4j.Logger;
|
4
|
|
5
|
import java.util.ArrayList;
|
6
|
import java.util.Arrays;
|
7
|
import java.util.HashMap;
|
8
|
import java.util.Iterator;
|
9
|
import java.util.List;
|
10
|
import java.util.Map;
|
11
|
|
12
|
public class Blocking {
|
13
|
private static Logger log = Logger.getLogger(Blocking.class);
|
14
|
|
15
|
public static List<String> tokenBlocking(String[] record, Map<String, Integer> stopwordsMap) {
|
16
|
|
17
|
List<String> blockingKeys = new ArrayList<>();
|
18
|
Map<String, Integer> blockingKeysMap = new HashMap<>();
|
19
|
String value = null;
|
20
|
try {
|
21
|
for (int i = 1; i < record.length; i++) {
|
22
|
value = record[i].substring(record[i].indexOf("\t") + 1).replaceAll("_", " ");
|
23
|
|
24
|
String[] tokens = value.toLowerCase().split("[\\W_]");
|
25
|
|
26
|
for (int j = 0; j < tokens.length; j++) {
|
27
|
String currentToken = tokens[j];
|
28
|
if (currentToken.length() > 0 && !stopwordsMap.containsKey(currentToken)) {
|
29
|
// blockingKeys.add(currentToken);
|
30
|
if (blockingKeysMap.containsKey(currentToken)) {
|
31
|
blockingKeysMap.put(currentToken, blockingKeysMap.get(currentToken) + 1);
|
32
|
} else {
|
33
|
blockingKeysMap.put(currentToken, 1);
|
34
|
}
|
35
|
}
|
36
|
}
|
37
|
StringBuilder tokenBuilder = new StringBuilder();
|
38
|
Iterator<Map.Entry<String, Integer>> mapIterator = blockingKeysMap.entrySet().iterator();
|
39
|
//throw away frequent words - tokens that occur more than once within a record field
|
40
|
|
41
|
while (mapIterator.hasNext()) {
|
42
|
Map.Entry<String, Integer> entry = mapIterator.next();
|
43
|
|
44
|
/* if (entry.getValue() > 1) {
|
45
|
log.debug("Removed token : " + entry.getKey() + " with " + entry.getValue() + "occurences");
|
46
|
blockingKeysMap.remove(entry.getKey());
|
47
|
}
|
48
|
*/
|
49
|
if (entry.getValue() == 1) {
|
50
|
tokenBuilder.append(entry).append(" ");
|
51
|
}
|
52
|
}
|
53
|
blockingKeys.add(tokenBuilder.toString());
|
54
|
}
|
55
|
} catch (Exception e) {
|
56
|
log.error(e);
|
57
|
}
|
58
|
|
59
|
return blockingKeys;
|
60
|
}
|
61
|
|
62
|
}
|