Project

General

Profile

1
package eu.dnetlib.data.mapreduce.hbase.lodExport.utils;
2

    
3
import org.apache.log4j.Logger;
4

    
5
import java.util.ArrayList;
6
import java.util.Arrays;
7
import java.util.HashMap;
8
import java.util.Iterator;
9
import java.util.List;
10
import java.util.Map;
11

    
12
public class Blocking {
13
    private static Logger log = Logger.getLogger(Blocking.class);
14

    
15
    public static List<String> tokenBlocking(String[] record, Map<String, Integer> stopwordsMap) {
16

    
17
        List<String> blockingKeys = new ArrayList<>();
18
        Map<String, Integer> blockingKeysMap = new HashMap<>();
19
        String value = null;
20
        try {
21
            for (int i = 1; i < record.length; i++) {
22
                value = record[i].substring(record[i].indexOf("\t") + 1).replaceAll("_", " ");
23

    
24
                String[] tokens = value.toLowerCase().split("[\\W_]");
25

    
26
                for (int j = 0; j < tokens.length; j++) {
27
                    String currentToken = tokens[j];
28
                    if (currentToken.length() > 0 && !stopwordsMap.containsKey(currentToken)) {
29
                        //        blockingKeys.add(currentToken);
30
                        if (blockingKeysMap.containsKey(currentToken)) {
31
                            blockingKeysMap.put(currentToken, blockingKeysMap.get(currentToken) + 1);
32
                        } else {
33
                            blockingKeysMap.put(currentToken, 1);
34
                        }
35
                    }
36
                }
37
                StringBuilder tokenBuilder = new StringBuilder();
38
                Iterator<Map.Entry<String, Integer>> mapIterator = blockingKeysMap.entrySet().iterator();
39
                //throw away frequent words - tokens that occur more than once within a record field
40

    
41
                while (mapIterator.hasNext()) {
42
                    Map.Entry<String, Integer> entry = mapIterator.next();
43

    
44
           /* if (entry.getValue() > 1) {
45
                log.debug("Removed token : " + entry.getKey() + " with " + entry.getValue() + "occurences");
46
                blockingKeysMap.remove(entry.getKey());
47
            }
48
           */
49
                    if (entry.getValue() == 1) {
50
                        tokenBuilder.append(entry).append(" ");
51
                    }
52
                }
53
                blockingKeys.add(tokenBuilder.toString());
54
            }
55
        } catch (Exception e) {
56
            log.error(e);
57
        }
58

    
59
        return blockingKeys;
60
    }
61

    
62
}
(2-2/12)