1 |
43053
|
giorgos.al
|
package eu.dnetlib.data.mapreduce.hbase.lodExport.utils;
|
2 |
|
|
|
3 |
44296
|
eri.katsar
|
import org.apache.log4j.Logger;
|
4 |
|
|
|
5 |
43053
|
giorgos.al
|
import java.util.ArrayList;
|
6 |
45054
|
eri.katsar
|
import java.util.Arrays;
|
7 |
|
|
import java.util.HashMap;
|
8 |
|
|
import java.util.Iterator;
|
9 |
|
|
import java.util.List;
|
10 |
44368
|
eri.katsar
|
import java.util.Map;
|
11 |
43053
|
giorgos.al
|
|
12 |
|
|
public class Blocking {
|
13 |
44295
|
eri.katsar
|
private static Logger log = Logger.getLogger(Blocking.class);
|
14 |
43053
|
giorgos.al
|
|
15 |
45054
|
eri.katsar
|
public static List<String> tokenBlocking(String[] record, Map<String, Integer> stopwordsMap) {
|
16 |
45056
|
eri.katsar
|
|
17 |
|
|
List<String> blockingKeys = new ArrayList<>();
|
18 |
45054
|
eri.katsar
|
Map<String, Integer> blockingKeysMap = new HashMap<>();
|
19 |
44295
|
eri.katsar
|
String value = null;
|
20 |
|
|
try {
|
21 |
|
|
for (int i = 1; i < record.length; i++) {
|
22 |
45050
|
eri.katsar
|
value = record[i].substring(record[i].indexOf("\t") + 1).replaceAll("_", " ");
|
23 |
45054
|
eri.katsar
|
|
24 |
45050
|
eri.katsar
|
String[] tokens = value.toLowerCase().split("[\\W_]");
|
25 |
45054
|
eri.katsar
|
|
26 |
44295
|
eri.katsar
|
for (int j = 0; j < tokens.length; j++) {
|
27 |
45050
|
eri.katsar
|
String currentToken = tokens[j];
|
28 |
|
|
if (currentToken.length() > 0 && !stopwordsMap.containsKey(currentToken)) {
|
29 |
45054
|
eri.katsar
|
// blockingKeys.add(currentToken);
|
30 |
|
|
if (blockingKeysMap.containsKey(currentToken)) {
|
31 |
|
|
blockingKeysMap.put(currentToken, blockingKeysMap.get(currentToken) + 1);
|
32 |
|
|
} else {
|
33 |
|
|
blockingKeysMap.put(currentToken, 1);
|
34 |
|
|
}
|
35 |
44295
|
eri.katsar
|
}
|
36 |
|
|
}
|
37 |
45056
|
eri.katsar
|
StringBuilder tokenBuilder = new StringBuilder();
|
38 |
|
|
Iterator<Map.Entry<String, Integer>> mapIterator = blockingKeysMap.entrySet().iterator();
|
39 |
|
|
//throw away frequent words - tokens that occur more than once within a record field
|
40 |
45054
|
eri.katsar
|
|
41 |
45056
|
eri.katsar
|
while (mapIterator.hasNext()) {
|
42 |
|
|
Map.Entry<String, Integer> entry = mapIterator.next();
|
43 |
45054
|
eri.katsar
|
|
44 |
45056
|
eri.katsar
|
/* if (entry.getValue() > 1) {
|
45 |
45054
|
eri.katsar
|
log.debug("Removed token : " + entry.getKey() + " with " + entry.getValue() + "occurences");
|
46 |
|
|
blockingKeysMap.remove(entry.getKey());
|
47 |
|
|
}
|
48 |
45056
|
eri.katsar
|
*/
|
49 |
|
|
if (entry.getValue() == 1) {
|
50 |
|
|
tokenBuilder.append(entry).append(" ");
|
51 |
|
|
}
|
52 |
|
|
}
|
53 |
|
|
blockingKeys.add(tokenBuilder.toString());
|
54 |
|
|
}
|
55 |
|
|
} catch (Exception e) {
|
56 |
45622
|
eri.katsar
|
log.error(e);
|
57 |
45054
|
eri.katsar
|
}
|
58 |
|
|
|
59 |
45056
|
eri.katsar
|
return blockingKeys;
|
60 |
43897
|
eri.katsar
|
}
|
61 |
45056
|
eri.katsar
|
|
62 |
43053
|
giorgos.al
|
}
|