9 |
9 |
import eu.dnetlib.data.proto.TypeProtos;
|
10 |
10 |
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
11 |
11 |
import eu.dnetlib.dhp.schema.oaf.Relation;
|
|
12 |
import org.apache.commons.lang.StringUtils;
|
12 |
13 |
import org.apache.hadoop.hbase.client.Result;
|
13 |
14 |
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
|
14 |
15 |
import org.apache.hadoop.hbase.mapreduce.TableMapper;
|
... | ... | |
20 |
21 |
import java.util.Map;
|
21 |
22 |
import java.util.Map.Entry;
|
22 |
23 |
import java.util.NavigableMap;
|
|
24 |
import java.util.stream.Collectors;
|
23 |
25 |
|
24 |
26 |
/**
|
25 |
27 |
* Exports Oaf objects as their json serialization.
|
... | ... | |
66 |
68 |
}
|
67 |
69 |
|
68 |
70 |
final Map<byte[], NavigableMap<byte[], byte[]>> row = value.getNoVersionMap();
|
69 |
|
|
|
71 |
|
70 |
72 |
for (byte[] cf : row.keySet()) {
|
71 |
73 |
|
72 |
|
for (Entry<byte[], byte[]> q : row.get(cf).entrySet()) {
|
73 |
|
final String key = new String(q.getKey());
|
74 |
|
if(key.startsWith("update") || DedupUtils.BODY_S.equals(key)) {
|
75 |
|
continue;
|
|
74 |
for (Entry<byte[], byte[]> q : value.getFamilyMap(cf).entrySet().stream().filter(e -> {
|
|
75 |
final String key = new String(e.getKey());
|
|
76 |
boolean skip = key.startsWith("update") || key.equals(DedupUtils.BODY_S);
|
|
77 |
if (skip) {
|
|
78 |
context.getCounter("export", String.format("skipped %s", StringUtils.substring(key, 0, 6))).increment(1);
|
76 |
79 |
}
|
|
80 |
return !skip;
|
|
81 |
}).collect(Collectors.toList())) {
|
77 |
82 |
if (new String(q.getValue()).equals("")) {
|
78 |
|
context.getCounter("export", "skipped relation " + new String(cf)).increment(1);
|
|
83 |
context.getCounter("export", "skipped " + new String(cf)).increment(1);
|
79 |
84 |
} else {
|
80 |
85 |
emit(context, OafProtos.Oaf.parseFrom(q.getValue()));
|
81 |
86 |
}
|
fixed infospace export procedure, avoid to emit the same result more than once