package eu.dnetlib.data.mapreduce.hbase.lodExport.utils.blocking;

import java.util.*;
import java.util.regex.Pattern;

/**
 * Token-blocking helpers that derive blocking keys from textual field values.
 *
 * <p>A blocking key is a space-terminated, alphabetically ordered list of the
 * tokens that occur exactly once in a value, after lower-casing, removing
 * double quotes, splitting on non-word characters and underscores, and
 * discarding one-character tokens and stopwords.
 */
public class Blocking {

    /** Literal separator between serialized triples: TAB, dot, TAB. */
    private static final String LINE_DELIM = "\t.\t";

    /** Literal separator between the fields of a single triple. */
    private static final String FIELD_DELIM = "\t";

    /**
     * Splits a record on the literal {@link #LINE_DELIM}. The LITERAL flag matters:
     * as a plain regex the dot would match any character, so a one-character field
     * surrounded by tabs would be mistaken for a record separator.
     */
    private static final Pattern LINE_SPLITTER = Pattern.compile(LINE_DELIM, Pattern.LITERAL);

    /** Splits a triple on the literal {@link #FIELD_DELIM}. */
    private static final Pattern FIELD_SPLITTER = Pattern.compile(FIELD_DELIM, Pattern.LITERAL);

    /**
     * Token separator: any non-word character or underscore. Note that {@code \W} is
     * ASCII-based by default, so non-ASCII letters also act as separators.
     */
    private static final Pattern TOKEN_SPLITTER = Pattern.compile("[\\W_]");

    /**
     * Builds the blocking key of a single field value.
     *
     * @param field        the raw field value; {@code null} yields an empty key
     * @param stopwordsMap tokens to ignore; must not be {@code null}
     * @return the tokens that occur exactly once in {@code field}, in natural
     *         (alphabetical) order, each followed by a single space; the empty
     *         string if no token qualifies
     */
    public static String tokenBlocking(String field, HashSet<String> stopwordsMap) {
        return uniqueTokens(field, stopwordsMap);
    }

    /**
     * Builds the blocking keys of a serialized record made of
     * {@code subject TAB property TAB value} triples separated by {@code TAB . TAB}.
     *
     * <p>Only triples with exactly three fields and a property contained in
     * {@code usedProperties} contribute a key. Triples that produce no token are
     * skipped, so the result never contains the empty string.
     *
     * @param result         the serialized record; {@code null} yields an empty set
     * @param stopwordsMap   tokens to ignore; must not be {@code null}
     * @param usedProperties properties whose values are tokenized; must not be {@code null}
     * @return a sorted set with one blocking key per contributing triple
     */
    public static Set<String> multipleTokenBlocking(String result, HashSet<String> stopwordsMap, Set<String> usedProperties) {
        Set<String> tokenList = new TreeSet<>();
        if (result == null) {
            return tokenList;
        }

        for (String triple : LINE_SPLITTER.split(result)) {
            String[] fields = FIELD_SPLITTER.split(triple);
            if (fields.length == 3) {
                String key = createToken(stopwordsMap, usedProperties, fields[1], fields[2]);
                // An empty key carries no blocking information; keep it out of the set.
                if (!key.isEmpty()) {
                    tokenList.add(key);
                }
            }
        }

        return tokenList;
    }

    /**
     * Returns the blocking key of {@code value} when {@code property} is one of the
     * used properties, and the empty string otherwise.
     */
    private static String createToken(HashSet<String> stopwordsMap, Set<String> usedProperties, String property, String value) {
        if (!usedProperties.contains(property)) {
            return "";
        }
        return uniqueTokens(value, stopwordsMap);
    }

    /**
     * Tokenizes {@code text} and joins the tokens that occur exactly once.
     *
     * @param text      the raw text; {@code null} yields an empty key
     * @param stopwords tokens to ignore
     * @return the qualifying tokens in natural order, each followed by a space
     */
    private static String uniqueTokens(String text, Set<String> stopwords) {
        if (text == null) {
            return "";
        }

        // Locale.ROOT keeps the keys independent of the JVM default locale
        // (e.g. a Turkish locale would lower-case 'I' to a dotless 'i').
        String normalized = text.toLowerCase(Locale.ROOT).replace("\"", "");

        // TreeMap keeps the tokens sorted so that equal token sets give equal keys.
        Map<String, Integer> occurrences = new TreeMap<>();
        for (String candidate : TOKEN_SPLITTER.split(normalized)) {
            if (candidate.length() > 1 && !stopwords.contains(candidate)) {
                Integer seen = occurrences.get(candidate);
                occurrences.put(candidate, seen == null ? 1 : seen + 1);
            }
        }

        // Throw away frequent words: tokens that occur more than once within the text.
        StringBuilder key = new StringBuilder();
        for (Map.Entry<String, Integer> entry : occurrences.entrySet()) {
            if (entry.getValue() == 1) {
                key.append(entry.getKey()).append(" ");
            }
        }
        return key.toString();
    }

    /**
     * Small manual demo: prints the blocking keys of two sample values and then
     * the sorted keys joined by spaces.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        List<String> tokenList = new ArrayList<>();

        HashSet<String> stopwordsMap = new HashSet<>();
        stopwordsMap.add("and");

        String field = "A test string";
        String tokens = tokenBlocking(field, stopwordsMap);
        System.out.println(tokens);
        tokenList.add(tokens);

        field = "1990";
        tokens = tokenBlocking(field, stopwordsMap);
        System.out.println(tokens);
        tokenList.add(tokens);

        Collections.sort(tokenList);
        StringBuilder tokenString = new StringBuilder();

        for (String t : tokenList) {
            tokenString.append(t).append(" ");
        }

        System.out.println(tokenString.toString());
    }
}