Project

General

Profile

1 43053 giorgos.al
package eu.dnetlib.data.mapreduce.hbase.lodExport.utils;
2
3 44296 eri.katsar
import org.apache.log4j.Logger;
4
5 43053 giorgos.al
import java.util.ArrayList;
6 45054 eri.katsar
import java.util.Arrays;
7
import java.util.HashMap;
8
import java.util.Iterator;
9
import java.util.List;
10 44368 eri.katsar
import java.util.Map;
11 43053 giorgos.al
12
public class Blocking {
13 44295 eri.katsar
    private static Logger log = Logger.getLogger(Blocking.class);
14 43053 giorgos.al
15 45054 eri.katsar
    public static List<String> tokenBlocking(String[] record, Map<String, Integer> stopwordsMap) {
16 45056 eri.katsar
17
        List<String> blockingKeys = new ArrayList<>();
18 45054 eri.katsar
        Map<String, Integer> blockingKeysMap = new HashMap<>();
19 44295 eri.katsar
        String value = null;
20
        try {
21
            for (int i = 1; i < record.length; i++) {
22 45050 eri.katsar
                value = record[i].substring(record[i].indexOf("\t") + 1).replaceAll("_", " ");
23 45054 eri.katsar
24 45050 eri.katsar
                String[] tokens = value.toLowerCase().split("[\\W_]");
25 45054 eri.katsar
26 44295 eri.katsar
                for (int j = 0; j < tokens.length; j++) {
27 45050 eri.katsar
                    String currentToken = tokens[j];
28
                    if (currentToken.length() > 0 && !stopwordsMap.containsKey(currentToken)) {
29 45054 eri.katsar
                        //        blockingKeys.add(currentToken);
30
                        if (blockingKeysMap.containsKey(currentToken)) {
31
                            blockingKeysMap.put(currentToken, blockingKeysMap.get(currentToken) + 1);
32
                        } else {
33
                            blockingKeysMap.put(currentToken, 1);
34
                        }
35 44295 eri.katsar
                    }
36
                }
37 45056 eri.katsar
                StringBuilder tokenBuilder = new StringBuilder();
38
                Iterator<Map.Entry<String, Integer>> mapIterator = blockingKeysMap.entrySet().iterator();
39
                //throw away frequent words - tokens that occur more than once within a record field
40 45054 eri.katsar
41 45056 eri.katsar
                while (mapIterator.hasNext()) {
42
                    Map.Entry<String, Integer> entry = mapIterator.next();
43 45054 eri.katsar
44 45056 eri.katsar
           /* if (entry.getValue() > 1) {
45 45054 eri.katsar
                log.debug("Removed token : " + entry.getKey() + " with " + entry.getValue() + "occurences");
46
                blockingKeysMap.remove(entry.getKey());
47
            }
48 45056 eri.katsar
           */
49
                    if (entry.getValue() == 1) {
50
                        tokenBuilder.append(entry).append(" ");
51
                    }
52
                }
53
                blockingKeys.add(tokenBuilder.toString());
54
            }
55
        } catch (Exception e) {
56 45622 eri.katsar
            log.error(e);
57 45054 eri.katsar
        }
58
59 45056 eri.katsar
        return blockingKeys;
60 43897 eri.katsar
    }
61 45056 eri.katsar
62 43053 giorgos.al
}