Project

General

Profile

1
package eu.dnetlib.data.mapreduce.hbase.lodExport.utils.blocking;
2

    
3
import java.util.*;
import java.util.regex.Pattern;
4

    
5
/**
 * Static helpers that derive token-based blocking keys from text values.
 *
 * <p>A blocking key is built by lower-casing a value, stripping double quotes, splitting it on
 * every character that is not an ASCII letter or digit, discarding one-character tokens and
 * stop words, and keeping only the tokens that occur exactly once in the value. The surviving
 * tokens are emitted in natural {@link String} order, each followed by a single space.
 */
public class Blocking {
    /** Separator between two serialized triples of a record: tab, literal dot, tab. */
    private static final String LINE_DELIM = "\t.\t";
    /** Separator between the fields of a single triple. */
    private static final String FIELD_DELIM = "\t";

    /**
     * {@link #LINE_DELIM} compiled as a literal. The delimiter contains a '.', which as a plain
     * regex would match any character and break records holding one-character fields.
     */
    private static final Pattern LINE_SPLITTER = Pattern.compile(Pattern.quote(LINE_DELIM));
    /** Token separator: any non-word character or underscore; compiled once instead of per call. */
    private static final Pattern TOKEN_SPLITTER = Pattern.compile("[\\W_]");

    /**
     * Builds the blocking key of a single field value.
     *
     * @param field        the raw value to tokenize; must not be {@code null}
     * @param stopwordsMap lower-case tokens to ignore; must not be {@code null}
     * @return the tokens that are longer than one character, are not stop words and occur
     *         exactly once in {@code field}, in natural String order, each followed by a space;
     *         the empty string when no token qualifies
     */
    public static String tokenBlocking(String field, HashSet<String> stopwordsMap) {
        // Locale.ROOT keeps the keys independent of the JVM default locale
        // (e.g. a Turkish locale would lower-case 'I' to a dotless 'ı').
        String normalized = field.toLowerCase(Locale.ROOT).replace("\"", "");

        // TreeMap so the resulting key lists its tokens in a deterministic, sorted order.
        Map<String, Integer> occurrences = new TreeMap<>();
        for (String candidate : TOKEN_SPLITTER.split(normalized)) {
            if (candidate.length() > 1 && !stopwordsMap.contains(candidate)) {
                Integer seen = occurrences.get(candidate);
                occurrences.put(candidate, seen == null ? 1 : seen + 1);
            }
        }

        // Throw away frequent words: tokens that occur more than once within the field.
        StringBuilder key = new StringBuilder();
        for (Map.Entry<String, Integer> entry : occurrences.entrySet()) {
            if (entry.getValue() == 1) {
                key.append(entry.getKey()).append(' ');
            }
        }
        return key.toString();
    }

    /**
     * Builds one blocking key per triple of a serialized record.
     *
     * <p>The record is split into triples on the literal delimiter tab-dot-tab, and every triple
     * into fields on tab. Triples that do not have exactly three fields are skipped; for the
     * others the second field is taken as the property and the third as the value.
     *
     * @param result         the serialized record; must not be {@code null}
     * @param stopwordsMap   lower-case tokens to ignore
     * @param usedProperties properties whose values contribute tokens
     * @return the sorted set of keys; it contains the empty string whenever a well-formed triple
     *         has a property outside {@code usedProperties} or a value without qualifying tokens
     */
    public static Set<String> multipleTokenBlocking(String result, HashSet<String> stopwordsMap, Set<String> usedProperties) {
        Set<String> tokenList = new TreeSet<>();

        for (String triple : LINE_SPLITTER.split(result)) {
            String[] fields = triple.split(FIELD_DELIM);
            if (fields.length == 3) {
                tokenList.add(createToken(stopwordsMap, usedProperties, fields[1], fields[2]));
            }
        }

        return tokenList;
    }

    /**
     * Returns the blocking key of {@code value} when {@code property} is one of the used
     * properties, and the empty string otherwise.
     */
    private static String createToken(HashSet<String> stopwordsMap, Set<String> usedProperties, String property, String value) {
        if (!usedProperties.contains(property)) {
            return "";
        }
        return tokenBlocking(value, stopwordsMap);
    }

    /**
     * Small manual demo: prints the keys of two sample values and then the sorted keys joined by
     * spaces.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        List<String> tokenList = new ArrayList<>();

        HashSet<String> stopwordsMap = new HashSet<>();
        stopwordsMap.add("and");

        String field = "A test string";
        String tokens = tokenBlocking(field, stopwordsMap);
        System.out.println(tokens);
        tokenList.add(tokens);

        field = "1990";
        tokens = tokenBlocking(field, stopwordsMap);
        System.out.println(tokens);
        tokenList.add(tokens);

        Collections.sort(tokenList);
        StringBuilder tokenString = new StringBuilder();

        for (String t : tokenList) {
            tokenString.append(t).append(" ");
        }

        System.out.println(tokenString.toString());
    }
}
(1-1/2)