Revision 43424
Added by Giorgos Alexiou about 8 years ago
modules/dnet-openaire-lodinterlinking/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/utils/FrequencyCounter.java | ||
---|---|---|
42 | 42 |
List filenames = getFiles(fs, path); |
43 | 43 |
Map<Integer, Integer> frequencyMap = getFrequencyMap(fs, filenames); |
44 | 44 |
Map<Integer, Integer> sortedMap = sortMapByValue(frequencyMap, ORDERING_TYPE.ASC); |
45 |
Map<BigInteger,Double> statistics = getStatistics(sortedMap); |
|
46 |
|
|
45 | 47 |
System.out.println("Sorted Map is " + sortedMap.entrySet()); |
46 | 48 |
writeMap(fs, sortedMap, outputPath); |
47 | 49 |
fs.close(); |
... | ... | |
66 | 68 |
return fileNames; |
67 | 69 |
} |
68 | 70 |
|
69 |
private static Map getFrequencyMap(FileSystem fs, List<String> fileNames) throws Exception { |
|
71 |
private static Map<Integer,Integer> getFrequencyMap(FileSystem fs, List<String> fileNames) throws Exception {
|
|
70 | 72 |
String line = null; |
71 | 73 |
; |
72 | 74 |
try { |
... | ... | |
101 | 103 |
|
102 | 104 |
} |
103 | 105 |
|
106 |
private static Map<BigInteger,Double> getStatistics(Map<Integer,Integer> sortedMap){ |
|
107 |
Map<BigInteger,Double> statistics= new HashMap<BigInteger,Double>(); |
|
108 |
double CC = 0d; |
|
109 |
CC = 0d; |
|
110 |
int lastBlockSize = 2; |
|
111 |
int f; |
|
112 |
BigInteger totalSizeOfBlocks = BigInteger.ZERO; |
|
113 |
BigInteger numberOfComparisons = BigInteger.ZERO; |
|
114 |
|
|
115 |
Set<Integer> keys = sortedMap.keySet(); |
|
116 |
for(Integer key : keys){ |
|
117 |
BigInteger blockSize = BigInteger.valueOf(key.intValue()); |
|
118 |
f = sortedMap.get(key); |
|
119 |
totalSizeOfBlocks = totalSizeOfBlocks.add(BigInteger.valueOf(f).multiply(blockSize)); |
|
120 |
numberOfComparisons = numberOfComparisons.add(BigInteger.valueOf(f).multiply(blockSize.multiply(blockSize.subtract(BigInteger.ONE)).shiftLeft(1))); |
|
121 |
CC = totalSizeOfBlocks.doubleValue()/numberOfComparisons.doubleValue(); |
|
122 |
statistics.put(blockSize, CC); |
|
123 |
} |
|
124 |
|
|
125 |
|
|
126 |
return statistics; |
|
127 |
} |
|
128 |
|
|
129 |
|
|
130 |
private static int optimalBlockSize(Map<BigInteger,Double> statistics){ |
|
131 |
int optimalBlockSize = statistics.; |
|
132 |
|
|
133 |
for(int i = statistics.size() -1; i >= 1; i--){ |
|
134 |
if(Math.abs(statistics.get(i)._2 - statistics.get(i-1)._2) < eps){ |
|
135 |
eps = Math.abs(statistics.get(i)._2 - statistics.get(i-1)._2); |
|
136 |
optimalBlockSize = statistics.get(i)._1; |
|
137 |
} |
|
138 |
} |
|
104 | 139 |
|
105 |
public static Map sortMapByValue(Map map, ORDERING_TYPE ordering) { |
|
106 |
Ordering valueComparator; |
|
140 |
|
|
141 |
|
|
142 |
return optimalBlockSize; |
|
143 |
} |
|
144 |
|
|
145 |
public static Map<Integer,Integer> sortMapByValue(Map map, ORDERING_TYPE ordering) { |
|
146 |
Ordering<Integer> valueComparator; |
|
107 | 147 |
if (ordering == ORDERING_TYPE.DESC) { |
108 | 148 |
valueComparator = Ordering.natural().onResultOf(Functions.forMap(map)).compound(Ordering.natural()).reverse(); |
109 | 149 |
} else { |
Also available in: Unified diff
More Methods added