Project

General

Profile

« Previous | Next » 

Revision 57517

OpenOrgs DB: use of tsv for rels

View differences:

modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/openorgs/GenerateSimilaritiesReducer.java
32 32

  
33 33
			if (list.size() < 2) { return; }
34 34

  
35
			if (reduceUsingId(OpenOrgsCommon.OPENORGS_MAIN_PREFIX, list, context)
36
					|| reduceUsingId(OpenOrgsCommon.OPENORGS_CORDA_FP7_PREFIX, list, context)
37
					|| reduceUsingId(OpenOrgsCommon.OPENORGS_CORDA_H2020_PREFIX, list, context)
38
					|| reduceUsingId("20|", list, context)) {
39
				// NOHING TODO
35
			final String mainId = findMainId(OpenOrgsCommon.OPENORGS_MAIN_PREFIX, list);
36

  
37
			if (StringUtils.isNotBlank(mainId)) {
38
				for (final OafEntity o : list) {
39
					if (!o.getOriginalIdList().contains(mainId)) {
40
						context.getCounter("organization", "relations to " + OpenOrgsCommon.OPENORGS_MAIN_PREFIX + "*").increment(1);
41
						emit(newSimilarity(mainId, o), context);
42
					}
43
				}
40 44
			}
41 45
		} catch (final InvalidProtocolBufferException e) {
42 46
			e.printStackTrace();
......
44 48
		}
45 49
	}
46 50

  
47
	private boolean reduceUsingId(final String idPrefix, final List<OafEntity> list, final Context context) {
48
		final String mainId = findMainId(idPrefix, list);
49

  
50
		if (StringUtils.isNotBlank(mainId)) {
51
			for (final OafEntity o : list) {
52
				if (!o.getId().equals(mainId)) {
53
					context.getCounter("organization", "relations to " + idPrefix + "*").increment(1);
54
					emit(newSimilarity(mainId, o), context);
55
				}
56
			}
57

  
58
			return true;
59
		}
60
		return false;
61
	}
62

  
63 51
	private String findMainId(final String idPrefix, final List<OafEntity> list) {
64 52
		final List<String> valids = new ArrayList<>();
65 53

  
66 54
		for (final OafEntity e : list) {
67
			if (e.getId().startsWith(idPrefix)) {
68
				valids.add(e.getId());
55
			for (final String id : e.getOriginalIdList()) {
56
				if (id.startsWith(idPrefix)) {
57
					valids.add(id);
58
				}
69 59
			}
70 60
		}
71 61
		if (valids.isEmpty()) { return null; }
......
77 67

  
78 68
	private void emit(final Similarity simrel, final Context context) {
79 69
		try {
80
			valueOut.set(simrel.toJsonBytes());
70
			valueOut.set(simrel.toTsv());
81 71
			context.getCounter("organization", "relations (total)").increment(1);
82 72
			context.write(NullWritable.get(), valueOut);
83 73
		} catch (IOException | InterruptedException e) {
......
88 78
	private Similarity newSimilarity(final String openOrgsId, final OafEntity oafEntity) {
89 79
		final Similarity s = new Similarity();
90 80
		s.setOpenOrgID(openOrgsId);
91
		s.setOpenaireId(oafEntity.getId());
92 81
		s.setOpenaireOriginalId(oafEntity.getOriginalId(0));
93 82
		s.setName(oafEntity.getOrganization().getMetadata().getLegalname().getValue());
94 83
		s.setAcronym(oafEntity.getOrganization().getMetadata().getLegalshortname().getValue());
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/openorgs/OpenOrgsCommon.java
6 6

  
7 7
public class OpenOrgsCommon {
8 8

  
9
	public static final String OPENORGS_MESH_PREFIX = "20|openorgsmesh::";
10
	public static final String OPENORGS_MAIN_PREFIX = "20|openorgs____::";
11
	public static final String OPENORGS_CORDA_FP7_PREFIX = "20|corda_______::";
12
	public static final String OPENORGS_CORDA_H2020_PREFIX = "20|corda__h2020::";
9
	public static final String OPENORGS_MESH_PREFIX = "openorgsmesh::";
10
	public static final String OPENORGS_MAIN_PREFIX = "openorgs____::";
11
	public static final String OPENORGS_CORDA_FP7_PREFIX = "corda_______::";
12
	public static final String OPENORGS_CORDA_H2020_PREFIX = "corda__h2020::";
13 13

  
14 14
	public static boolean isOpenOrgsMesh(final ImmutableBytesWritable k) {
15
		return new String(k.copyBytes(), Charset.forName("UTF-8")).startsWith(OPENORGS_MESH_PREFIX);
15
		return new String(k.copyBytes(), Charset.forName("UTF-8")).startsWith("20|" + OPENORGS_MESH_PREFIX);
16 16
	}
17 17

  
18 18
	public static boolean isOpenOrgs(final ImmutableBytesWritable k) {
19
		return new String(k.copyBytes(), Charset.forName("UTF-8")).startsWith(OPENORGS_MAIN_PREFIX);
19
		return new String(k.copyBytes(), Charset.forName("UTF-8")).startsWith("20|" + OPENORGS_MAIN_PREFIX);
20 20
	}
21 21

  
22 22
	public static boolean isCordaFp7(final ImmutableBytesWritable k) {
23
		return new String(k.copyBytes(), Charset.forName("UTF-8")).startsWith(OPENORGS_CORDA_FP7_PREFIX);
23
		return new String(k.copyBytes(), Charset.forName("UTF-8")).startsWith("20|" + OPENORGS_CORDA_FP7_PREFIX);
24 24
	}
25 25

  
26 26
	public static boolean isCordaH2020(final ImmutableBytesWritable k) {
27
		return new String(k.copyBytes(), Charset.forName("UTF-8")).startsWith(OPENORGS_CORDA_H2020_PREFIX);
27
		return new String(k.copyBytes(), Charset.forName("UTF-8")).startsWith("20|" + OPENORGS_CORDA_H2020_PREFIX);
28 28
	}
29 29

  
30 30
}
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/openorgs/Similarity.java
1 1
package eu.dnetlib.data.mapreduce.hbase.openorgs;
2 2

  
3
import com.fasterxml.jackson.annotation.JsonProperty;
4
import com.fasterxml.jackson.core.JsonProcessingException;
5
import com.fasterxml.jackson.databind.ObjectMapper;
6

  
7 3
public class Similarity {
8 4

  
9
	private static final ObjectMapper objectMapper = new ObjectMapper();
10

  
11
	@JsonProperty("local_id")
12 5
	private String openOrgID;
13 6

  
14
	@JsonProperty("oa_id")
15
	private String openaireId;
16

  
17
	@JsonProperty("oa_original_id")
18 7
	private String openaireOriginalId;
19 8

  
20
	@JsonProperty("oa_name")
21 9
	private String name;
22 10

  
23
	@JsonProperty("oa_acronym")
24 11
	private String acronym;
25 12

  
26
	@JsonProperty("oa_country")
27 13
	private String country;
28 14

  
29
	@JsonProperty("oa_url")
30 15
	private String url;
31 16

  
32
	@JsonProperty("oa_collectedfrom")
33 17
	private String collectedFrom;
34 18

  
35 19
	public String getOpenOrgID() {
......
40 24
		this.openOrgID = openOrgID;
41 25
	}
42 26

  
43
	public String getOpenaireId() {
44
		return openaireId;
45
	}
46

  
47
	public void setOpenaireId(final String openaireId) {
48
		this.openaireId = openaireId;
49
	}
50

  
51 27
	public String getOpenaireOriginalId() {
52 28
		return openaireOriginalId;
53 29
	}
......
96 72
		this.collectedFrom = collectedFrom;
97 73
	}
98 74

  
99
	public String toJson() throws JsonProcessingException {
100
		return objectMapper.writeValueAsString(this);
75
	public String toTsv() {
76
		return String.format("%s\t%s\t%s\t%s\t%s\t%s\t%s", openOrgID, openaireOriginalId, name, acronym, country, url, collectedFrom);
101 77
	}
102

  
103
	public byte[] toJsonBytes() throws JsonProcessingException {
104
		return objectMapper.writeValueAsBytes(this);
105
	}
106 78
}
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/openorgs/GenerateOrganizationsReducer.java
47 47
				emit(e3, context);
48 48
				return;
49 49
			}
50
			final OafEntity e4 = findOrgToEmit("20|", list);
50
			final OafEntity e4 = findOrgToEmit("", list);
51 51
			if (e4 != null) {
52 52
				context.getCounter("organization", "new (from other sources)").increment(1);
53 53
				emit(e4, context);
......
63 63
		final List<OafEntity> valids = new ArrayList<>();
64 64

  
65 65
		for (final OafEntity e : list) {
66
			if (e.getId().startsWith(idPrefix)) {
66
			if (e.getId().startsWith("20|" + idPrefix)) {
67 67
				valids.add(e);
68 68
			}
69 69
		}

Also available in: Unified diff