Revision 57517
Added by Michele Artini about 4 years ago
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/openorgs/GenerateSimilaritiesReducer.java | ||
---|---|---|
32 | 32 |
|
33 | 33 |
if (list.size() < 2) { return; } |
34 | 34 |
|
35 |
if (reduceUsingId(OpenOrgsCommon.OPENORGS_MAIN_PREFIX, list, context) |
|
36 |
|| reduceUsingId(OpenOrgsCommon.OPENORGS_CORDA_FP7_PREFIX, list, context) |
|
37 |
|| reduceUsingId(OpenOrgsCommon.OPENORGS_CORDA_H2020_PREFIX, list, context) |
|
38 |
|| reduceUsingId("20|", list, context)) { |
|
39 |
// NOHING TODO |
|
35 |
final String mainId = findMainId(OpenOrgsCommon.OPENORGS_MAIN_PREFIX, list); |
|
36 |
|
|
37 |
if (StringUtils.isNotBlank(mainId)) { |
|
38 |
for (final OafEntity o : list) { |
|
39 |
if (!o.getOriginalIdList().contains(mainId)) { |
|
40 |
context.getCounter("organization", "relations to " + OpenOrgsCommon.OPENORGS_MAIN_PREFIX + "*").increment(1); |
|
41 |
emit(newSimilarity(mainId, o), context); |
|
42 |
} |
|
43 |
} |
|
40 | 44 |
} |
41 | 45 |
} catch (final InvalidProtocolBufferException e) { |
42 | 46 |
e.printStackTrace(); |
... | ... | |
44 | 48 |
} |
45 | 49 |
} |
46 | 50 |
|
47 |
private boolean reduceUsingId(final String idPrefix, final List<OafEntity> list, final Context context) { |
|
48 |
final String mainId = findMainId(idPrefix, list); |
|
49 |
|
|
50 |
if (StringUtils.isNotBlank(mainId)) { |
|
51 |
for (final OafEntity o : list) { |
|
52 |
if (!o.getId().equals(mainId)) { |
|
53 |
context.getCounter("organization", "relations to " + idPrefix + "*").increment(1); |
|
54 |
emit(newSimilarity(mainId, o), context); |
|
55 |
} |
|
56 |
} |
|
57 |
|
|
58 |
return true; |
|
59 |
} |
|
60 |
return false; |
|
61 |
} |
|
62 |
|
|
63 | 51 |
private String findMainId(final String idPrefix, final List<OafEntity> list) { |
64 | 52 |
final List<String> valids = new ArrayList<>(); |
65 | 53 |
|
66 | 54 |
for (final OafEntity e : list) { |
67 |
if (e.getId().startsWith(idPrefix)) { |
|
68 |
valids.add(e.getId()); |
|
55 |
for (final String id : e.getOriginalIdList()) { |
|
56 |
if (id.startsWith(idPrefix)) { |
|
57 |
valids.add(id); |
|
58 |
} |
|
69 | 59 |
} |
70 | 60 |
} |
71 | 61 |
if (valids.isEmpty()) { return null; } |
... | ... | |
77 | 67 |
|
78 | 68 |
private void emit(final Similarity simrel, final Context context) { |
79 | 69 |
try { |
80 |
valueOut.set(simrel.toJsonBytes());
|
|
70 |
valueOut.set(simrel.toTsv());
|
|
81 | 71 |
context.getCounter("organization", "relations (total)").increment(1); |
82 | 72 |
context.write(NullWritable.get(), valueOut); |
83 | 73 |
} catch (IOException | InterruptedException e) { |
... | ... | |
88 | 78 |
private Similarity newSimilarity(final String openOrgsId, final OafEntity oafEntity) { |
89 | 79 |
final Similarity s = new Similarity(); |
90 | 80 |
s.setOpenOrgID(openOrgsId); |
91 |
s.setOpenaireId(oafEntity.getId()); |
|
92 | 81 |
s.setOpenaireOriginalId(oafEntity.getOriginalId(0)); |
93 | 82 |
s.setName(oafEntity.getOrganization().getMetadata().getLegalname().getValue()); |
94 | 83 |
s.setAcronym(oafEntity.getOrganization().getMetadata().getLegalshortname().getValue()); |
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/openorgs/OpenOrgsCommon.java | ||
---|---|---|
6 | 6 |
|
7 | 7 |
public class OpenOrgsCommon { |
8 | 8 |
|
9 |
public static final String OPENORGS_MESH_PREFIX = "20|openorgsmesh::";
|
|
10 |
public static final String OPENORGS_MAIN_PREFIX = "20|openorgs____::";
|
|
11 |
public static final String OPENORGS_CORDA_FP7_PREFIX = "20|corda_______::";
|
|
12 |
public static final String OPENORGS_CORDA_H2020_PREFIX = "20|corda__h2020::";
|
|
9 |
public static final String OPENORGS_MESH_PREFIX = "openorgsmesh::"; |
|
10 |
public static final String OPENORGS_MAIN_PREFIX = "openorgs____::"; |
|
11 |
public static final String OPENORGS_CORDA_FP7_PREFIX = "corda_______::"; |
|
12 |
public static final String OPENORGS_CORDA_H2020_PREFIX = "corda__h2020::"; |
|
13 | 13 |
|
14 | 14 |
public static boolean isOpenOrgsMesh(final ImmutableBytesWritable k) { |
15 |
return new String(k.copyBytes(), Charset.forName("UTF-8")).startsWith(OPENORGS_MESH_PREFIX); |
|
15 |
return new String(k.copyBytes(), Charset.forName("UTF-8")).startsWith("20|" + OPENORGS_MESH_PREFIX);
|
|
16 | 16 |
} |
17 | 17 |
|
18 | 18 |
public static boolean isOpenOrgs(final ImmutableBytesWritable k) { |
19 |
return new String(k.copyBytes(), Charset.forName("UTF-8")).startsWith(OPENORGS_MAIN_PREFIX); |
|
19 |
return new String(k.copyBytes(), Charset.forName("UTF-8")).startsWith("20|" + OPENORGS_MAIN_PREFIX);
|
|
20 | 20 |
} |
21 | 21 |
|
22 | 22 |
public static boolean isCordaFp7(final ImmutableBytesWritable k) { |
23 |
return new String(k.copyBytes(), Charset.forName("UTF-8")).startsWith(OPENORGS_CORDA_FP7_PREFIX); |
|
23 |
return new String(k.copyBytes(), Charset.forName("UTF-8")).startsWith("20|" + OPENORGS_CORDA_FP7_PREFIX);
|
|
24 | 24 |
} |
25 | 25 |
|
26 | 26 |
public static boolean isCordaH2020(final ImmutableBytesWritable k) { |
27 |
return new String(k.copyBytes(), Charset.forName("UTF-8")).startsWith(OPENORGS_CORDA_H2020_PREFIX); |
|
27 |
return new String(k.copyBytes(), Charset.forName("UTF-8")).startsWith("20|" + OPENORGS_CORDA_H2020_PREFIX);
|
|
28 | 28 |
} |
29 | 29 |
|
30 | 30 |
} |
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/openorgs/Similarity.java | ||
---|---|---|
1 | 1 |
package eu.dnetlib.data.mapreduce.hbase.openorgs; |
2 | 2 |
|
3 |
import com.fasterxml.jackson.annotation.JsonProperty; |
|
4 |
import com.fasterxml.jackson.core.JsonProcessingException; |
|
5 |
import com.fasterxml.jackson.databind.ObjectMapper; |
|
6 |
|
|
7 | 3 |
public class Similarity { |
8 | 4 |
|
9 |
private static final ObjectMapper objectMapper = new ObjectMapper(); |
|
10 |
|
|
11 |
@JsonProperty("local_id") |
|
12 | 5 |
private String openOrgID; |
13 | 6 |
|
14 |
@JsonProperty("oa_id") |
|
15 |
private String openaireId; |
|
16 |
|
|
17 |
@JsonProperty("oa_original_id") |
|
18 | 7 |
private String openaireOriginalId; |
19 | 8 |
|
20 |
@JsonProperty("oa_name") |
|
21 | 9 |
private String name; |
22 | 10 |
|
23 |
@JsonProperty("oa_acronym") |
|
24 | 11 |
private String acronym; |
25 | 12 |
|
26 |
@JsonProperty("oa_country") |
|
27 | 13 |
private String country; |
28 | 14 |
|
29 |
@JsonProperty("oa_url") |
|
30 | 15 |
private String url; |
31 | 16 |
|
32 |
@JsonProperty("oa_collectedfrom") |
|
33 | 17 |
private String collectedFrom; |
34 | 18 |
|
35 | 19 |
public String getOpenOrgID() { |
... | ... | |
40 | 24 |
this.openOrgID = openOrgID; |
41 | 25 |
} |
42 | 26 |
|
43 |
public String getOpenaireId() { |
|
44 |
return openaireId; |
|
45 |
} |
|
46 |
|
|
47 |
public void setOpenaireId(final String openaireId) { |
|
48 |
this.openaireId = openaireId; |
|
49 |
} |
|
50 |
|
|
51 | 27 |
public String getOpenaireOriginalId() { |
52 | 28 |
return openaireOriginalId; |
53 | 29 |
} |
... | ... | |
96 | 72 |
this.collectedFrom = collectedFrom; |
97 | 73 |
} |
98 | 74 |
|
99 |
public String toJson() throws JsonProcessingException {
|
|
100 |
return objectMapper.writeValueAsString(this);
|
|
75 |
public String toTsv() {
|
|
76 |
return String.format("%s\t%s\t%s\t%s\t%s\t%s\t%s", openOrgID, openaireOriginalId, name, acronym, country, url, collectedFrom);
|
|
101 | 77 |
} |
102 |
|
|
103 |
public byte[] toJsonBytes() throws JsonProcessingException { |
|
104 |
return objectMapper.writeValueAsBytes(this); |
|
105 |
} |
|
106 | 78 |
} |
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/openorgs/GenerateOrganizationsReducer.java | ||
---|---|---|
47 | 47 |
emit(e3, context); |
48 | 48 |
return; |
49 | 49 |
} |
50 |
final OafEntity e4 = findOrgToEmit("20|", list);
|
|
50 |
final OafEntity e4 = findOrgToEmit("", list); |
|
51 | 51 |
if (e4 != null) { |
52 | 52 |
context.getCounter("organization", "new (from other sources)").increment(1); |
53 | 53 |
emit(e4, context); |
... | ... | |
63 | 63 |
final List<OafEntity> valids = new ArrayList<>(); |
64 | 64 |
|
65 | 65 |
for (final OafEntity e : list) { |
66 |
if (e.getId().startsWith(idPrefix)) { |
|
66 |
if (e.getId().startsWith("20|" + idPrefix)) {
|
|
67 | 67 |
valids.add(e); |
68 | 68 |
} |
69 | 69 |
} |
Also available in: Unified diff
OpenOrgs DB: use of tsv for rels