Revision 57655
Added by Michele De Bonis about 5 years ago
modules/dnet-mapreduce-jobs/branches/tree-dedup/src/test/java/eu/dnetlib/data/mapreduce/hbase/propagation/orcidtoresult/OrcidToResultTest.java | ||
---|---|---|
1 |
package eu.dnetlib.data.mapreduce.hbase.propagation.orcidtoresult; |
|
2 |
|
|
3 |
import com.google.gson.Gson; |
|
4 |
import com.google.protobuf.InvalidProtocolBufferException; |
|
5 |
import com.googlecode.protobuf.format.JsonFormat; |
|
6 |
import eu.dnetlib.data.mapreduce.hbase.propagation.NotValidResultSequenceException; |
|
7 |
import eu.dnetlib.data.mapreduce.hbase.propagation.PropagationConstants; |
|
8 |
import eu.dnetlib.data.mapreduce.hbase.propagation.Value; |
|
9 |
import eu.dnetlib.data.mapreduce.hbase.propagation.ValueList; |
|
10 |
import eu.dnetlib.data.mapreduce.hbase.propagation.communitythroughorganization.OrganizationMap; |
|
11 |
import eu.dnetlib.data.mapreduce.hbase.propagation.orcidthroughproducts.Emit; |
|
12 |
import eu.dnetlib.data.mapreduce.hbase.propagation.orcidthroughproducts.ResultOrcidIterator; |
|
13 |
import eu.dnetlib.data.proto.FieldTypeProtos; |
|
14 |
import org.apache.hadoop.io.Text; |
|
15 |
import org.elasticsearch.hadoop.util.Assert; |
|
16 |
import org.junit.Before; |
|
17 |
import org.junit.Ignore; |
|
18 |
import org.junit.Test; |
|
19 |
|
|
20 |
import java.io.IOException; |
|
21 |
import java.util.ArrayList; |
|
22 |
import java.util.List; |
|
23 |
|
|
24 |
public class OrcidToResultTest { |
|
25 |
|
|
26 |
final static String json1 = "{\"fullname\": \"Matteucci, F.\",\"rank\": 1 }"; |
|
27 |
final static String json2 = "{\"fullname\": \"Romano, D.\",\"rank\": 2}"; |
|
28 |
final static String json3 = "{\"pid\": [{\"value\": \"0000-0002-0571-4163\",\"key\": \"ORCID\"}],\"fullname\": \"Paolo Molaro\",\"surname\": \"Molaro\",\"name\": \"Paolo\",\"rank\": 3}"; |
|
29 |
|
|
30 |
Value v; |
|
31 |
@Ignore |
|
32 |
@Before |
|
33 |
public void setUp() throws InvalidProtocolBufferException, JsonFormat.ParseException { |
|
34 |
List<String> authors = new ArrayList<>(); |
|
35 |
FieldTypeProtos.Author.Builder author_builder = FieldTypeProtos.Author.newBuilder(); |
|
36 |
JsonFormat.merge(json1, author_builder); |
|
37 |
authors.add(JsonFormat.printToString(author_builder.build())); |
|
38 |
author_builder = FieldTypeProtos.Author.newBuilder(); |
|
39 |
JsonFormat.merge(json2, author_builder); |
|
40 |
authors.add(JsonFormat.printToString(author_builder.build())); |
|
41 |
author_builder = FieldTypeProtos.Author.newBuilder(); |
|
42 |
JsonFormat.merge(json3, author_builder); |
|
43 |
authors.add(JsonFormat.printToString(author_builder.build())); |
|
44 |
|
|
45 |
|
|
46 |
Emit e = new Emit(); |
|
47 |
e.setId("id1"); |
|
48 |
e.setAuthor_list(authors); |
|
49 |
v = Value.newInstance(new Gson().toJson(e,Emit.class),"0.85", PropagationConstants.Type.fromresult); |
|
50 |
|
|
51 |
System.out.println(v.toJson()); |
|
52 |
} |
|
53 |
|
|
54 |
@Test |
|
55 |
public void testBefore() throws NotValidResultSequenceException, IOException { |
|
56 |
|
|
57 |
ValueList vl = new ValueList("orcid2result.json"); |
|
58 |
List<Text> tmp = vl.getValueToText(); |
|
59 |
ResultOrcidIterator roi = new ResultOrcidIterator(tmp,"orcid_______::00001032f8cbf9091d8fade5bfa1700c"); |
|
60 |
|
|
61 |
System.out.println(roi.hasNext()); |
|
62 |
|
|
63 |
while(roi.hasNext()){ |
|
64 |
System.out.println(roi.next()); |
|
65 |
} |
|
66 |
} |
|
67 |
} |
modules/dnet-mapreduce-jobs/branches/tree-dedup/src/test/java/eu/dnetlib/data/mapreduce/hbase/propagation/communitythroughorganization/OrganizationToCommunityTest.java | ||
---|---|---|
1 |
package eu.dnetlib.data.mapreduce.hbase.propagation.communitythroughorganization; |
|
2 |
|
|
3 |
import com.google.gson.Gson; |
|
4 |
import com.google.protobuf.InvalidProtocolBufferException; |
|
5 |
import org.elasticsearch.hadoop.util.Assert; |
|
6 |
import org.junit.Before; |
|
7 |
import org.junit.Test; |
|
8 |
|
|
9 |
|
|
10 |
public class OrganizationToCommunityTest { |
|
11 |
|
|
12 |
OrganizationMap organizationMap; |
|
13 |
|
|
14 |
|
|
15 |
final String jsonmap = "{'coorda__h2020::3fb05a9524c3f790391261347852f638':['mes'], " + |
|
16 |
"'corda__h2020::e8dbe14cca9bf6fce09d468872f813f8':['mes'], " + |
|
17 |
"'snsf________::9b253f265e3bef5cae6d881fdf61aceb':['mes'], " + |
|
18 |
"'rcuk________::e054eea0a47665af8c3656b5785ccf76':['mes'], " + |
|
19 |
"'corda__h2020::edc18d67c9b11fb616ca9f6e1db1b151':['mes'], " + |
|
20 |
"'rcuk________::d5736d9da90521ddcdc7828a05a85e9a':['mes'], " + |
|
21 |
"'corda__h2020::f5d418d3aa1cf817ddefcc3fdc039f27':['mes'], " + |
|
22 |
"'snsf________::8fa091f8f25a846779acb4ea97b50aef':['mes'], " + |
|
23 |
"'corda__h2020::81e020977211c2c40fae2e1a50bffd71':['mes'], " + |
|
24 |
"'corda_______::81e020977211c2c40fae2e1a50bffd71':['mes'], " + |
|
25 |
"'snsf________::31d0a100e54e3cdb3c6f52d91e638c78':['mes'], " + |
|
26 |
"'corda__h2020::ea379ef91b8cc86f9ac5edc4169292db':['mes'], " + |
|
27 |
"'corda__h2020::f75ee2ee48e5cb0ec8c8d30aaa8fef70':['mes'], " + |
|
28 |
"'rcuk________::e16010089551a1a9182a94604fc0ea59':['mes'], " + |
|
29 |
"'corda__h2020::38531a2cce7c5c347ffc439b07c1f43b':['mes'], " + |
|
30 |
"'corda_______::38531a2cce7c5c347ffc439b07c1f43b':['mes'], " + |
|
31 |
"'grid________::b2cbbf5eadbbf87d534b022bad3191d7':['mes'], " + |
|
32 |
"'snsf________::74730ef1439d7f7636a8be58a6b471b8':['mes'], " + |
|
33 |
"'nsf_________::ad72e19043a5a467e35f9b444d11563e':['mes'], " + |
|
34 |
"'rcuk________::0fc3e92500290902a2d38ec2445e74c3':['mes'], " + |
|
35 |
"'grid________::ad2c29905da0eb3c06b3fa80cacd89ea':['mes']," + |
|
36 |
"'corda__h2020::30b53e4d63d3724f00acb9cbaca40860':['mes']," + |
|
37 |
"'corda__h2020::f60f84bee14ad93f0db0e49af1d5c317':['mes']," + |
|
38 |
"'corda__h2020::7bf251ac3765b5e89d82270a1763d09f':['mes']," + |
|
39 |
"'corda__h2020::65531bd11be9935948c7f2f4db1c1832':['mes']," + |
|
40 |
"'corda__h2020::e0e98f86bbc76638bbb72a8fe2302946':['mes']," + |
|
41 |
"'snsf________::3eb43582ac27601459a8d8b3e195724b':['mes']," + |
|
42 |
"'corda__h2020::af2481dab65d06c8ea0ae02b5517b9b6':['mes']," + |
|
43 |
"'corda__h2020::c19d05cfde69a50d3ebc89bd0ee49929':['mes']," + |
|
44 |
"'corda__h2020::af0bfd9fc09f80d9488f56d71a9832f0':['mes']}"; |
|
45 |
|
|
46 |
|
|
47 |
@Before |
|
48 |
public void setUp() throws InvalidProtocolBufferException { |
|
49 |
organizationMap = new Gson().fromJson(jsonmap,OrganizationMap.class); |
|
50 |
|
|
51 |
} |
|
52 |
|
|
53 |
@Test |
|
54 |
public void provaMapLoading(){ |
|
55 |
Assert.isTrue(organizationMap.get("pippo").size()==0); |
|
56 |
|
|
57 |
Assert.isTrue(organizationMap.get("corda__h2020::af0bfd9fc09f80d9488f56d71a9832f0").size()==1); |
|
58 |
|
|
59 |
Assert.isTrue((organizationMap.size()==30)); |
|
60 |
} |
|
61 |
|
|
62 |
@Test |
|
63 |
public void provaOrderedString(){ |
|
64 |
String s1 = "Molaro, P."; |
|
65 |
String s2 = "Paolo Molaro"; |
|
66 |
|
|
67 |
String[] ns_a1 = s1.trim().split(" "); |
|
68 |
String[] ns_a2 = s2.trim().split(" "); |
|
69 |
|
|
70 |
|
|
71 |
|
|
72 |
|
|
73 |
if (ns_a1[0].endsWith(".") || ns_a1[0].endsWith(",")){ |
|
74 |
ns_a1[0] = ns_a1[0].substring(0,ns_a1[0].length()-1); |
|
75 |
} |
|
76 |
if (ns_a1[1].endsWith(".") || ns_a1[1].endsWith(",")){ |
|
77 |
ns_a1[1] = ns_a1[1].substring(0,ns_a1[1].length()-1); |
|
78 |
} |
|
79 |
|
|
80 |
if (ns_a2[0].endsWith(".") || ns_a2[0].endsWith(",")){ |
|
81 |
ns_a2[0] = ns_a2[0].substring(0,ns_a2[0].length()-1); |
|
82 |
} |
|
83 |
if (ns_a2[1].endsWith(".") || ns_a2[1].endsWith(",")){ |
|
84 |
ns_a2[1] = ns_a2[1].substring(0,ns_a2[1].length()-1); |
|
85 |
} |
|
86 |
if(ns_a1[0].compareTo(ns_a1[1]) < 0){ |
|
87 |
String tmp = ns_a1[0]; |
|
88 |
ns_a1[0] = ns_a1[1]; |
|
89 |
ns_a1[1] = tmp; |
|
90 |
} |
|
91 |
|
|
92 |
if(ns_a2[0].compareTo(ns_a2[1]) < 0){ |
|
93 |
String tmp = ns_a2[0]; |
|
94 |
ns_a2[0] = ns_a2[1]; |
|
95 |
ns_a2[1] = tmp; |
|
96 |
|
|
97 |
} |
|
98 |
System.out.println(ns_a1[1].compareTo(ns_a1[0])); |
|
99 |
System.out.println(ns_a2[1].compareTo(ns_a2[0])); |
|
100 |
if(ns_a1[0].equalsIgnoreCase(ns_a2[0])){ |
|
101 |
if(ns_a1[1].equalsIgnoreCase(ns_a2[1])){//same name and surname |
|
102 |
System.out.println("equals"); |
|
103 |
} |
|
104 |
if(ns_a1[1].length() == 1 || ns_a2[1].length() == 1){ |
|
105 |
System.out.println(ns_a1[1].charAt(0) == ns_a2[1].charAt(0)); |
|
106 |
} |
|
107 |
System.out.println("different"); |
|
108 |
}else{ |
|
109 |
if(ns_a1[1].equalsIgnoreCase(ns_a2[1])){ |
|
110 |
if(ns_a1[0].length() == 1 || ns_a2[0].length()==1) |
|
111 |
System.out.println( ns_a1[0].charAt(0) == ns_a2[0].charAt(0)); |
|
112 |
else |
|
113 |
System.out.println("different"); |
|
114 |
} |
|
115 |
} |
|
116 |
} |
|
117 |
|
|
118 |
} |
modules/dnet-mapreduce-jobs/branches/tree-dedup/src/test/resources/eu/dnetlib/data/mapreduce/hbase/propagation/orcid2result.json | ||
---|---|---|
1 |
{"valueList":[{"type":"fromresult", |
|
2 |
"value":"{\"id\":\"id1\",\"author_list\":[\"{\\\"fullname\\\": \\\"Matteucci, F.\\\",\\\"rank\\\": 1}\",\"{\\\"fullname\\\": \\\"Donatella Romano\\\",\\\"name\\\": \\\"Donatella\\\",\\\"surname\\\": \\\"Romano\\\",\\\"rank\\\": 2,\\\"pid\\\": [{\\\"key\\\": \\\"ORCID\\\",\\\"value\\\": \\\"0000-0002-0845-6171\\\"}]}\",\"{\\\"fullname\\\": \\\"Paolo Molaro\\\",\\\"name\\\": \\\"Paolo\\\",\\\"surname\\\": \\\"Molaro\\\",\\\"rank\\\": 3,\\\"pid\\\": [{\\\"key\\\": \\\"ORCID\\\",\\\"value\\\": \\\"0000-0002-0571-4163\\\"}]}\"]}", |
|
3 |
"trust":"0.9"}, |
|
4 |
{"type": "fromsemrel", |
|
5 |
"trust": "0.9", |
|
6 |
"value": "{\"id\":\"id2\",\"author_list\":[\"{\\\"fullname\\\": \\\"Matteucci, F.\\\",\\\"rank\\\": 1}\",\"{\\\"fullname\\\": \\\"Molaro, P.\\\",\\\"rank\\\": 3,\\\"name\\\": \\\"P.\\\",\\\"surname\\\": \\\"Molaro\\\"}\",\"{\\\"fullname\\\": \\\"Donatella Romano\\\",\\\"name\\\": \\\"Donatella\\\",\\\"surname\\\": \\\"Romano\\\",\\\"rank\\\": 2,\\\"pid\\\": [{\\\"key\\\": \\\"ORCID\\\",\\\"value\\\": \\\"0000-0002-0845-6171\\\"}]}\"]}" |
|
7 |
}, |
|
8 |
{"type":"fromsemrel", |
|
9 |
"value":"{\"id\":\"id1\",\"author_list\":[\"{\\\"fullname\\\": \\\"Matteucci, F.\\\",\\\"rank\\\": 1}\",\"{\\\"fullname\\\": \\\"Donatella Romano\\\",\\\"name\\\": \\\"Donatella\\\",\\\"surname\\\": \\\"Romano\\\",\\\"rank\\\": 2,\\\"pid\\\": [{\\\"key\\\": \\\"ORCID\\\",\\\"value\\\": \\\"0000-0002-0845-6171\\\"}]}\",\"{\\\"fullname\\\": \\\"Paolo Molaro\\\",\\\"rank\\\": 3,\\\"pid\\\": [{\\\"key\\\": \\\"ORCID\\\",\\\"value\\\": \\\"0000-0002-0571-4163\\\"}]}\"]}", |
|
10 |
"trust":"0.9"} |
|
11 |
|
|
12 |
|
|
13 |
]} |
|
14 |
|
modules/dnet-mapreduce-jobs/branches/tree-dedup/src/main/java/eu/dnetlib/data/mapreduce/hbase/dedup/DedupReducer.java | ||
---|---|---|
1 | 1 |
package eu.dnetlib.data.mapreduce.hbase.dedup; |
2 | 2 |
|
3 |
import java.io.IOException; |
|
4 |
import java.util.*; |
|
5 |
|
|
6 | 3 |
import com.google.common.base.Function; |
7 | 4 |
import com.google.common.collect.Iterables; |
8 |
import com.google.common.collect.Lists; |
|
9 |
import com.google.protobuf.InvalidProtocolBufferException; |
|
10 | 5 |
import eu.dnetlib.data.mapreduce.JobParams; |
11 | 6 |
import eu.dnetlib.data.mapreduce.util.DedupUtils; |
12 |
import eu.dnetlib.data.mapreduce.util.StreamUtils; |
|
13 |
import eu.dnetlib.data.proto.OafProtos; |
|
14 |
import eu.dnetlib.data.proto.RelTypeProtos.SubRelType; |
|
15 |
import eu.dnetlib.data.proto.TypeProtos.Type; |
|
16 |
import eu.dnetlib.pace.clustering.NGramUtils; |
|
17 | 7 |
import eu.dnetlib.pace.config.DedupConfig; |
18 |
import eu.dnetlib.pace.config.WfConfig; |
|
19 |
import eu.dnetlib.pace.distance.PaceDocumentDistance; |
|
20 |
import eu.dnetlib.pace.distance.eval.ScoreResult; |
|
21 |
import eu.dnetlib.pace.model.*; |
|
8 |
import eu.dnetlib.pace.model.MapDocument; |
|
9 |
import eu.dnetlib.pace.model.MapDocumentSerializer; |
|
22 | 10 |
import eu.dnetlib.pace.util.BlockProcessor; |
23 | 11 |
import eu.dnetlib.pace.util.Reporter; |
24 |
import org.apache.commons.lang.StringUtils; |
|
25 | 12 |
import org.apache.commons.logging.Log; |
26 | 13 |
import org.apache.commons.logging.LogFactory; |
27 | 14 |
import org.apache.hadoop.hbase.client.Durability; |
... | ... | |
32 | 19 |
import org.apache.hadoop.io.Text; |
33 | 20 |
|
34 | 21 |
import javax.annotation.Nullable; |
22 |
import java.io.IOException; |
|
35 | 23 |
|
36 | 24 |
public class DedupReducer extends TableReducer<Text, ImmutableBytesWritable, ImmutableBytesWritable> { |
37 | 25 |
|
modules/dnet-mapreduce-jobs/branches/tree-dedup/src/main/java/eu/dnetlib/data/mapreduce/hbase/propagation/PropagationConstants.java | ||
---|---|---|
18 | 18 |
|
19 | 19 |
public final static String ORGANIZATION_COMMUNITY_TRUST = "0.85"; |
20 | 20 |
public static final String ORCID_RESULT_TRUST = "0.85"; |
21 |
public static final String PRODUCT_TO_ORGANIZATION = "0.85"; |
|
21 | 22 |
|
22 | 23 |
public static final String ZERO = "0"; |
23 | 24 |
public static final String ONE = "1"; |
... | ... | |
29 | 30 |
public final static String SCHEMA_ID = "dnet:provenanceActions"; |
30 | 31 |
|
31 | 32 |
public final static String DNET_COUNTRY_SCHEMA = "dnet:countries"; |
32 |
public final static String DNET_RELATION_SCHEMA = "dnet:result_project_relations"; |
|
33 |
public final static String DNET_RELATION_SCHEMA_PROJECTS = "dnet:result_project_relations"; |
|
34 |
public final static String DNET_RELATION_SCHEMA_ORGANIZATION = "dnet:result_organization_relation"; |
|
33 | 35 |
//public final static String DNET_COMMUNITY_RELATION_SCHEMA = "dnet:result_result_relations"; |
34 | 36 |
|
35 | 37 |
public final static String CLASS_PROJECT_ID = "propagation:project:semrel"; |
... | ... | |
45 | 47 |
public final static String CLASS_ORGANIZATION_NAME = "Propagation of community result through organization association"; |
46 | 48 |
|
47 | 49 |
public final static String CLASS_ORCID_ID = "propagation:orcid:result"; |
48 |
public static final String CLASS_ORCID_NAME = "Propagation of ORCID through result linked by isSupplementedBy of isSupplementOf semantic relations";
|
|
50 |
public static final String CLASS_ORCID_NAME = "Propagation of ORCID through result linked by isSupplementedBy or isSupplementTo semantic relations";
|
|
49 | 51 |
|
52 |
public static final String CLASS_ORGANIZATION_RESULT_ID = "propagation:result:organization"; |
|
53 |
public static final String CLASS_ORGANIZATION_RESULT_NAME = "Propagation of result linked to organization through result linked by isSupplementTo or isSupplementedBy"; |
|
54 |
|
|
50 | 55 |
public final static int PROJECT = TypeProtos.Type.project.getNumber();//40 |
51 | 56 |
public final static int DATASOURCE = TypeProtos.Type.datasource.getNumber();//10 |
52 | 57 |
public final static int ORGANIZATION = TypeProtos.Type.organization.getNumber();//20 |
53 | 58 |
public final static int PUBLICATION = TypeProtos.Type.result.getNumber();//50 |
54 | 59 |
|
55 |
public final static RelTypeProtos.RelType REL_TYPE = RelTypeProtos.RelType.resultProject; |
|
56 |
public final static RelTypeProtos.SubRelType SUBREL_TYPE = RelTypeProtos.SubRelType.outcome; |
|
60 |
public final static RelTypeProtos.RelType REL_TYPE_PROJECT = RelTypeProtos.RelType.resultProject;
|
|
61 |
public final static RelTypeProtos.SubRelType SUBREL_TYPE_PROJECT = RelTypeProtos.SubRelType.outcome;
|
|
57 | 62 |
public static final String REL_PROJECT_RESULT = "produces"; |
58 | 63 |
public static final String REL_RESULT_PROJECT = "isProducedBy"; |
59 |
public static final String RELATION = REL_TYPE + "_" + SUBREL_TYPE + "_";
|
|
64 |
public static final String RELATION = REL_TYPE_PROJECT + "_" + SUBREL_TYPE_PROJECT + "_";
|
|
60 | 65 |
public static final String OUTCOME_PRODUCEDBY = RELATION + REL_RESULT_PROJECT; |
66 |
|
|
61 | 67 |
public static final String[] DEFAULT_PROJECT_RELATION_SET = new String[]{"resultResult_supplement_isSupplementedBy","resultResult_supplement_isSupplementTo"}; |
62 | 68 |
public static final String[] DEFAULT_COMMUNITY_RELATION_SET = new String[]{"resultResult_supplement_isSupplementedBy","resultResult_supplement_isSupplementTo"}; |
69 |
public static final String[] DEFAULT_ORGANIZATION_RESULT_RELATION_SET = new String[]{"resultResult_supplement_isSupplementedBy","resultResult_supplement_isSupplementTo"}; |
|
63 | 70 |
|
64 | 71 |
public static final String[] DEFAULT_RESULT_RELATION_SET = new String[]{"resultResult_supplement_isSupplementedBy","resultResult_supplement_isSupplementTo"}; |
65 | 72 |
|
modules/dnet-mapreduce-jobs/branches/tree-dedup/src/main/java/eu/dnetlib/data/mapreduce/hbase/propagation/orcidthroughproducts/PropagationOrcidToResultFileReducer.java | ||
---|---|---|
1 |
package eu.dnetlib.data.mapreduce.hbase.propagation.orcidthroughproducts; |
|
2 |
|
|
3 |
import com.googlecode.protobuf.format.JsonFormat; |
|
4 |
import eu.dnetlib.data.mapreduce.hbase.propagation.NotValidResultSequenceException; |
|
5 |
import eu.dnetlib.data.mapreduce.hbase.propagation.ResultIterator; |
|
6 |
import eu.dnetlib.data.mapreduce.hbase.propagation.projecttoresult.PropagationProjectToResultReducer; |
|
7 |
import eu.dnetlib.data.mapreduce.hbase.propagation.projecttoresult.ResultProjectIterator; |
|
8 |
import eu.dnetlib.data.proto.OafProtos; |
|
9 |
import org.apache.commons.logging.Log; |
|
10 |
import org.apache.commons.logging.LogFactory; |
|
11 |
import org.apache.hadoop.hbase.io.ImmutableBytesWritable; |
|
12 |
import org.apache.hadoop.hbase.util.Bytes; |
|
13 |
import org.apache.hadoop.io.Text; |
|
14 |
import org.apache.hadoop.mapreduce.Reducer; |
|
15 |
|
|
16 |
import java.io.IOException; |
|
17 |
import java.util.List; |
|
18 |
|
|
19 |
import static eu.dnetlib.data.mapreduce.hbase.propagation.PropagationConstants.COUNTER_PROPAGATION; |
|
20 |
|
|
21 |
public class PropagationOrcidToResultFileReducer extends Reducer<ImmutableBytesWritable, Text, Text, Text> { |
|
22 |
private static final Log log = LogFactory.getLog(PropagationOrcidToResultFileReducer.class); // NOPMD by marko on 11/24/08 5:02 PM |
|
23 |
|
|
24 |
private Text keyOut; |
|
25 |
private Text outValue; |
|
26 |
|
|
27 |
|
|
28 |
@Override |
|
29 |
protected void setup(final Context context) throws IOException, InterruptedException { |
|
30 |
super.setup(context); |
|
31 |
keyOut = new Text(""); |
|
32 |
outValue = new Text(); |
|
33 |
} |
|
34 |
|
|
35 |
|
|
36 |
@Override |
|
37 |
protected void reduce(ImmutableBytesWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException { |
|
38 |
ResultIterator rh = null; |
|
39 |
try { |
|
40 |
rh = new ResultOrcidIterator(values, Bytes.toString(key.copyBytes())); |
|
41 |
} catch (NotValidResultSequenceException e) { |
|
42 |
context.getCounter(COUNTER_PROPAGATION, e.getMessage()).increment(1); |
|
43 |
return; |
|
44 |
} |
|
45 |
while (rh.hasNext()) { |
|
46 |
List<OafProtos.Oaf> oaf_list = rh.next(); |
|
47 |
if(oaf_list != null){ |
|
48 |
for (OafProtos.Oaf oaf : oaf_list) { |
|
49 |
keyOut.set(oaf.getEntity().getId()); |
|
50 |
outValue.set(JsonFormat.printToString(oaf).getBytes()); |
|
51 |
context.write(keyOut, outValue); |
|
52 |
context.getCounter(COUNTER_PROPAGATION, "Added orcid to result").increment(1); |
|
53 |
} |
|
54 |
|
|
55 |
} |
|
56 |
|
|
57 |
} |
|
58 |
|
|
59 |
} |
|
60 |
} |
modules/dnet-mapreduce-jobs/branches/tree-dedup/src/main/java/eu/dnetlib/data/mapreduce/hbase/propagation/orcidthroughproducts/PropagationOrcidToResultReducer.java | ||
---|---|---|
1 |
package eu.dnetlib.data.mapreduce.hbase.propagation.orcidthroughproducts; |
|
2 |
|
|
3 |
import com.googlecode.protobuf.format.JsonFormat; |
|
4 |
import eu.dnetlib.data.mapreduce.hbase.propagation.NotValidResultSequenceException; |
|
5 |
import eu.dnetlib.data.mapreduce.hbase.propagation.ResultIterator; |
|
6 |
import eu.dnetlib.data.mapreduce.hbase.propagation.projecttoresult.PropagationProjectToResultReducer; |
|
7 |
import eu.dnetlib.data.mapreduce.hbase.propagation.projecttoresult.ResultProjectIterator; |
|
8 |
import eu.dnetlib.data.mapreduce.util.OafRowKeyDecoder; |
|
9 |
import eu.dnetlib.data.proto.OafProtos; |
|
10 |
import org.apache.commons.logging.Log; |
|
11 |
import org.apache.commons.logging.LogFactory; |
|
12 |
import org.apache.hadoop.hbase.client.Put; |
|
13 |
import org.apache.hadoop.hbase.io.ImmutableBytesWritable; |
|
14 |
import org.apache.hadoop.hbase.mapreduce.TableReducer; |
|
15 |
import org.apache.hadoop.hbase.util.Bytes; |
|
16 |
import org.apache.hadoop.io.Text; |
|
17 |
|
|
18 |
import java.io.IOException; |
|
19 |
import java.util.List; |
|
20 |
|
|
21 |
import static eu.dnetlib.data.mapreduce.hbase.propagation.PropagationConstants.COUNTER_PROPAGATION; |
|
22 |
import static eu.dnetlib.data.mapreduce.hbase.propagation.PropagationConstants.RELATION; |
|
23 |
|
|
24 |
public class PropagationOrcidToResultReducer extends TableReducer<ImmutableBytesWritable, Text, ImmutableBytesWritable> { |
|
25 |
private static final Log log = LogFactory.getLog(PropagationOrcidToResultReducer.class); // NOPMD by marko on 11/24/08 5:02 PM |
|
26 |
private ImmutableBytesWritable keyOut; |
|
27 |
|
|
28 |
|
|
29 |
|
|
30 |
@Override |
|
31 |
protected void setup(final Context context) throws IOException, InterruptedException { |
|
32 |
super.setup(context); |
|
33 |
keyOut = new ImmutableBytesWritable(); |
|
34 |
} |
|
35 |
|
|
36 |
|
|
37 |
@Override |
|
38 |
protected void reduce(ImmutableBytesWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException { |
|
39 |
ResultIterator rh = null; |
|
40 |
try { |
|
41 |
rh = new ResultOrcidIterator(values, Bytes.toString(key.copyBytes())); |
|
42 |
} catch (NotValidResultSequenceException e) { |
|
43 |
context.getCounter(COUNTER_PROPAGATION, e.getMessage()).increment(1); |
|
44 |
return; |
|
45 |
} |
|
46 |
|
|
47 |
while (rh.hasNext()) { |
|
48 |
List<OafProtos.Oaf> oaf_list = rh.next(); |
|
49 |
if(oaf_list != null){ |
|
50 |
for (OafProtos.Oaf oaf : oaf_list) { |
|
51 |
byte[] targetRowKey = Bytes.toBytes(oaf.getEntity().getId()); |
|
52 |
final Put put = new Put(targetRowKey).add(Bytes.toBytes("result"), Bytes.toBytes("update_" + System.nanoTime()), oaf.toByteArray()); |
|
53 |
keyOut.set(targetRowKey); |
|
54 |
context.write(keyOut, put); |
|
55 |
context.getCounter(COUNTER_PROPAGATION, "added orcid to product").increment(1); |
|
56 |
|
|
57 |
} |
|
58 |
|
|
59 |
} |
|
60 |
|
|
61 |
} |
|
62 |
|
|
63 |
|
|
64 |
|
|
65 |
} |
|
66 |
|
|
67 |
|
|
68 |
|
|
69 |
} |
modules/dnet-mapreduce-jobs/branches/tree-dedup/src/main/java/eu/dnetlib/data/mapreduce/hbase/propagation/orcidthroughproducts/ResultOrcidIterator.java | ||
---|---|---|
1 |
package eu.dnetlib.data.mapreduce.hbase.propagation.orcidthroughproducts; |
|
2 |
|
|
3 |
import com.googlecode.protobuf.format.JsonFormat; |
|
4 |
import eu.dnetlib.data.mapreduce.hbase.propagation.*; |
|
5 |
import eu.dnetlib.data.proto.*; |
|
6 |
import org.apache.hadoop.io.Text; |
|
7 |
|
|
8 |
import java.util.ArrayList; |
|
9 |
import java.util.Arrays; |
|
10 |
import java.util.Iterator; |
|
11 |
import java.util.List; |
|
12 |
import java.util.stream.Collectors; |
|
13 |
|
|
14 |
public class ResultOrcidIterator extends ResultIterator { |
|
15 |
|
|
16 |
private Iterator<String> author_iterator; |
|
17 |
private List<FieldTypeProtos.Author> autoritative_authors ; |
|
18 |
private List<String> relatedResult ; |
|
19 |
|
|
20 |
|
|
21 |
public ResultOrcidIterator(final Iterable<Text> values, final String key) throws NotValidResultSequenceException { |
|
22 |
super(values,key); |
|
23 |
} |
|
24 |
|
|
25 |
@Override |
|
26 |
protected void checkSequence() throws NotValidResultSequenceException { |
|
27 |
if(!it.hasNext()){ |
|
28 |
throw new NotValidResultSequenceException("Empty information for key"); |
|
29 |
} |
|
30 |
|
|
31 |
try { |
|
32 |
autoritative_authors = new ArrayList<>(); |
|
33 |
relatedResult = new ArrayList<>(); |
|
34 |
analizeValueList(); |
|
35 |
|
|
36 |
}catch(JsonFormat.ParseException e){ |
|
37 |
throw new NotValidResultSequenceException("Problems recreating the author list from serialization"); |
|
38 |
} |
|
39 |
|
|
40 |
List<FieldTypeProtos.Author> authors_with_orcid = autoritative_authors.stream() |
|
41 |
.map(a -> { |
|
42 |
if (a.getPidList() == null || a.getPidList().isEmpty()) |
|
43 |
return null; |
|
44 |
return a; |
|
45 |
}) |
|
46 |
.filter(a -> a!= null) |
|
47 |
.filter(a -> containsOrcid(a.getPidList())) |
|
48 |
.collect(Collectors.toList()); |
|
49 |
|
|
50 |
|
|
51 |
if(authors_with_orcid.size() == 0 || relatedResult.size() == 0){ |
|
52 |
resultId = TERMINATOR; |
|
53 |
return; |
|
54 |
} |
|
55 |
|
|
56 |
|
|
57 |
author_iterator = relatedResult.iterator(); |
|
58 |
autoritative_authors = authors_with_orcid; |
|
59 |
getNext(); |
|
60 |
|
|
61 |
} |
|
62 |
|
|
63 |
private boolean containsOrcid(List<FieldTypeProtos.KeyValue> pidList){ |
|
64 |
if(pidList == null) |
|
65 |
return false; |
|
66 |
return pidList |
|
67 |
.stream() |
|
68 |
.filter(kv -> kv.getKey().equals(PropagationConstants.AUTHOR_PID)) |
|
69 |
.collect(Collectors.toList()).size() > 0; |
|
70 |
} |
|
71 |
|
|
72 |
private void getNext(){ |
|
73 |
if (author_iterator.hasNext()) |
|
74 |
resultId = author_iterator.next(); |
|
75 |
else |
|
76 |
resultId = TERMINATOR; |
|
77 |
} |
|
78 |
|
|
79 |
@Override |
|
80 |
public List<OafProtos.Oaf> next() { |
|
81 |
//get the next merged author list |
|
82 |
try { |
|
83 |
//list of authors in the related result |
|
84 |
Emit e = Emit.fromJson(resultId); |
|
85 |
List<FieldTypeProtos.Author> author_list = getAuthorList(e); |
|
86 |
|
|
87 |
ResultProtos.Result.Metadata.Builder metadata = searchMatch(author_list); |
|
88 |
|
|
89 |
if (metadata != null){ |
|
90 |
ArrayList<OafProtos.Oaf> ret = new ArrayList<OafProtos.Oaf>(Arrays.asList(getUpdate(metadata, e.getId()))); |
|
91 |
getNext(); |
|
92 |
return ret; |
|
93 |
} |
|
94 |
|
|
95 |
|
|
96 |
}catch(JsonFormat.ParseException e){ |
|
97 |
|
|
98 |
} |
|
99 |
getNext(); |
|
100 |
return null; |
|
101 |
} |
|
102 |
|
|
103 |
private ResultProtos.Result.Metadata.Builder searchMatch(List<FieldTypeProtos.Author> author_list){ |
|
104 |
ResultProtos.Result.Metadata.Builder metadataBuilder = ResultProtos.Result.Metadata.newBuilder(); |
|
105 |
boolean updated = false; |
|
106 |
// for (FieldTypeProtos.Author a: autoritative_authors){ |
|
107 |
// searchAuthor(a,author_list); |
|
108 |
// } |
|
109 |
|
|
110 |
for (FieldTypeProtos.Author a: author_list){ |
|
111 |
FieldTypeProtos.Author.Builder author = searchAuthor(a, autoritative_authors); |
|
112 |
if(author != null){ |
|
113 |
updated = true; |
|
114 |
metadataBuilder.addAuthor(author); |
|
115 |
}else{ |
|
116 |
metadataBuilder.addAuthor(FieldTypeProtos.Author.newBuilder(a)); |
|
117 |
} |
|
118 |
} |
|
119 |
if(updated) |
|
120 |
return metadataBuilder; |
|
121 |
return null; |
|
122 |
} |
|
123 |
|
|
124 |
|
|
125 |
private boolean equals(FieldTypeProtos.Author a1, FieldTypeProtos.Author a2){ |
|
126 |
if(a1.hasSurname()){ |
|
127 |
if(a2.hasSurname()){ |
|
128 |
if(!a1.getSurname().trim().equalsIgnoreCase(a2.getSurname().trim())){ |
|
129 |
return false; |
|
130 |
} |
|
131 |
//have the same surname. Check the name |
|
132 |
if(a1.hasName()){ |
|
133 |
if (a2.hasName()){ |
|
134 |
if (a1.getName().trim().equalsIgnoreCase(a2.getName().trim())){ |
|
135 |
return true; //same name and same surname in a related research result |
|
136 |
} |
|
137 |
//they could be differently written (i.e. only the initials of the name in one of the two |
|
138 |
return (a1.getName().trim().substring(0,0).equalsIgnoreCase(a2.getName().trim().substring(0,0))); |
|
139 |
} |
|
140 |
} |
|
141 |
} |
|
142 |
} |
|
143 |
// if(a1.hasFullname()){ |
|
144 |
// if (a2.hasFullname()){ |
|
145 |
// if (a1.getFullname().trim().equalsIgnoreCase(a2.getFullname().trim())){ |
|
146 |
// return true; |
|
147 |
// } |
|
148 |
// //split string containing name and surname |
|
149 |
// String[] ns_a1 = a1.getFullname().trim().split(" "); |
|
150 |
// String[] ns_a2 = a2.getFullname().trim().split(" "); |
|
151 |
// |
|
152 |
// |
|
153 |
// if (ns_a1[0].endsWith(".") || ns_a1[0].endsWith(",")){ |
|
154 |
// ns_a1[0] = ns_a1[0].substring(0,ns_a1[0].length()-1); |
|
155 |
// } |
|
156 |
// if (ns_a1[1].endsWith(".") || ns_a1[1].endsWith(",")){ |
|
157 |
// ns_a1[1] = ns_a1[1].substring(0,ns_a1[1].length()-1); |
|
158 |
// } |
|
159 |
// |
|
160 |
// if (ns_a2[0].endsWith(".") || ns_a2[0].endsWith(",")){ |
|
161 |
// ns_a2[0] = ns_a2[0].substring(0,ns_a2[0].length()-1); |
|
162 |
// } |
|
163 |
// if (ns_a2[1].endsWith(".") || ns_a2[1].endsWith(",")){ |
|
164 |
// ns_a2[1] = ns_a2[1].substring(0,ns_a2[1].length()-1); |
|
165 |
// } |
|
166 |
// |
|
167 |
// if(ns_a1[0].compareTo(ns_a1[1]) < 0){ |
|
168 |
// String tmp = ns_a1[0]; |
|
169 |
// ns_a1[0] = ns_a1[1]; |
|
170 |
// ns_a1[1] = tmp; |
|
171 |
// } |
|
172 |
// |
|
173 |
// if(ns_a2[0].compareTo(ns_a2[1]) < 0){ |
|
174 |
// String tmp = ns_a2[0]; |
|
175 |
// ns_a2[0] = ns_a2[1]; |
|
176 |
// ns_a2[1] = tmp; |
|
177 |
// |
|
178 |
// } |
|
179 |
// |
|
180 |
// if(ns_a1[0].equalsIgnoreCase(ns_a2[0])){ |
|
181 |
// if(ns_a1[1].equalsIgnoreCase(ns_a2[1])){//same name and surname |
|
182 |
// return true; |
|
183 |
// } |
|
184 |
// if(ns_a1[1].length() == 1 || ns_a2[1].length() == 1){ |
|
185 |
// return ns_a1[1].charAt(0) == ns_a2[1].charAt(0);//same surname and initial of the name |
|
186 |
// } |
|
187 |
// return false; |
|
188 |
// |
|
189 |
// }else{ |
|
190 |
// if(ns_a1[1].equalsIgnoreCase(ns_a2[1])){ |
|
191 |
// if(ns_a1[0].length() == 1 || ns_a2[0].length()==1) |
|
192 |
// return ns_a1[0].charAt(0) == ns_a2[0].charAt(0); |
|
193 |
// else |
|
194 |
// return false; |
|
195 |
// } |
|
196 |
// } |
|
197 |
// |
|
198 |
// |
|
199 |
// |
|
200 |
// } |
|
201 |
// return false; |
|
202 |
// } |
|
203 |
return false; |
|
204 |
|
|
205 |
} |
|
206 |
|
|
207 |
private FieldTypeProtos.Author.Builder searchAuthor(FieldTypeProtos.Author a, List<FieldTypeProtos.Author> author_list){ |
|
208 |
if(containsOrcid(a.getPidList())) |
|
209 |
return null; |
|
210 |
for(FieldTypeProtos.Author autoritative_author : author_list) { |
|
211 |
if (equals(autoritative_author, a)) { |
|
212 |
if(!containsOrcid(a.getPidList())) |
|
213 |
return update(a, autoritative_author); |
|
214 |
} |
|
215 |
} |
|
216 |
return null; |
|
217 |
|
|
218 |
} |
|
219 |
|
|
220 |
private void analizeValueList() throws JsonFormat.ParseException { |
|
221 |
while(it.hasNext()){ |
|
222 |
Value v = Value.fromJson(it.next().toString()); |
|
223 |
|
|
224 |
if(v.getType().equals(PropagationConstants.Type.fromresult)){ |
|
225 |
autoritative_authors.addAll(getAuthorList(Emit.fromJson(v.getValue ()))); |
|
226 |
} |
|
227 |
if(v.getType().equals(PropagationConstants.Type.fromsemrel)){ |
|
228 |
relatedResult.add(v.getValue()); |
|
229 |
} |
|
230 |
} |
|
231 |
|
|
232 |
} |
|
233 |
private FieldTypeProtos.Author.Builder update(FieldTypeProtos.Author related_author, FieldTypeProtos.Author autoritative_autor ){ |
|
234 |
|
|
235 |
FieldTypeProtos.Author.Builder res = FieldTypeProtos.Author.newBuilder(related_author); |
|
236 |
List<FieldTypeProtos.KeyValue> apid_list = autoritative_autor.getPidList(); |
|
237 |
FieldTypeProtos.KeyValue akv = apid_list.stream().filter(kv -> kv.getKey().equals(PropagationConstants.AUTHOR_PID)).collect(Collectors.toList()).get(0); |
|
238 |
FieldTypeProtos.KeyValue.Builder kvb = FieldTypeProtos.KeyValue.newBuilder(); |
|
239 |
kvb.setKey(akv.getKey()).setValue(akv.getValue()); |
|
240 |
kvb.setDataInfo(Utils.getDataInfo( |
|
241 |
PropagationConstants.ORCID_RESULT_TRUST, |
|
242 |
PropagationConstants.CLASS_ORCID_ID, |
|
243 |
PropagationConstants.SCHEMA_ID, |
|
244 |
PropagationConstants.SCHEMA_NAME, |
|
245 |
PropagationConstants.DATA_INFO_TYPE, |
|
246 |
PropagationConstants.CLASS_ORCID_NAME) |
|
247 |
); |
|
248 |
return res.addPid(kvb); |
|
249 |
|
|
250 |
|
|
251 |
} |
|
252 |
|
|
253 |
private List<FieldTypeProtos.Author> getAuthorList(Emit e) throws JsonFormat.ParseException { |
|
254 |
|
|
255 |
List<FieldTypeProtos.Author> authors = new ArrayList<>(); |
|
256 |
for (String author : e.getAuthor_list()) { |
|
257 |
FieldTypeProtos.Author.Builder author_builder = FieldTypeProtos.Author.newBuilder(); |
|
258 |
JsonFormat.merge(author, author_builder); |
|
259 |
authors.add(author_builder.build()); |
|
260 |
} |
|
261 |
|
|
262 |
return authors; |
|
263 |
|
|
264 |
} |
|
265 |
|
|
266 |
|
|
267 |
public static OafProtos.Oaf getUpdate(ResultProtos.Result.Metadata.Builder metadata, String resultId) { |
|
268 |
final ResultProtos.Result.Builder result = ResultProtos.Result.newBuilder().setMetadata(metadata); |
|
269 |
final OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder() |
|
270 |
.setType(TypeProtos.Type.result) |
|
271 |
.setId(resultId) |
|
272 |
.setResult(result); |
|
273 |
|
|
274 |
return OafProtos.Oaf.newBuilder() |
|
275 |
.setKind(KindProtos.Kind.entity) |
|
276 |
.setEntity(entity) |
|
277 |
.build(); |
|
278 |
} |
|
279 |
} |
modules/dnet-mapreduce-jobs/branches/tree-dedup/src/main/java/eu/dnetlib/data/mapreduce/hbase/propagation/orcidthroughproducts/PropagationOrcidToResultMapper.java | ||
---|---|---|
1 |
package eu.dnetlib.data.mapreduce.hbase.propagation.orcidthroughproducts; |
|
2 |
|
|
3 |
import com.google.gson.Gson; |
|
4 |
import com.googlecode.protobuf.format.JsonFormat; |
|
5 |
import eu.dnetlib.data.mapreduce.hbase.dedup.fixrelation.Key; |
|
6 |
import eu.dnetlib.data.mapreduce.hbase.propagation.Value; |
|
7 |
import eu.dnetlib.data.mapreduce.hbase.propagation.communitythroughorganization.DedupedList; |
|
8 |
import eu.dnetlib.data.mapreduce.hbase.propagation.communitythroughorganization.OrganizationMap; |
|
9 |
import eu.dnetlib.data.mapreduce.hbase.propagation.projecttoresult.PropagationProjectToResultReducer; |
|
10 |
import eu.dnetlib.data.mapreduce.util.OafRowKeyDecoder; |
|
11 |
import eu.dnetlib.data.proto.FieldTypeProtos; |
|
12 |
import eu.dnetlib.data.proto.OafProtos; |
|
13 |
import eu.dnetlib.data.proto.TypeProtos; |
|
14 |
import org.apache.avro.generic.GenericData; |
|
15 |
import org.apache.commons.lang3.StringUtils; |
|
16 |
import org.apache.commons.logging.Log; |
|
17 |
import org.apache.commons.logging.LogFactory; |
|
18 |
import org.apache.hadoop.hbase.client.Result; |
|
19 |
import org.apache.hadoop.hbase.io.ImmutableBytesWritable; |
|
20 |
import org.apache.hadoop.hbase.mapreduce.TableMapper; |
|
21 |
import org.apache.hadoop.hbase.util.Bytes; |
|
22 |
import org.apache.hadoop.io.Text; |
|
23 |
|
|
24 |
import java.io.IOException; |
|
25 |
import java.util.ArrayList; |
|
26 |
import java.util.HashSet; |
|
27 |
import java.util.List; |
|
28 |
import java.util.Set; |
|
29 |
import java.util.stream.Collectors; |
|
30 |
|
|
31 |
import static eu.dnetlib.data.mapreduce.hbase.propagation.PropagationConstants.*; |
|
32 |
import static eu.dnetlib.data.mapreduce.hbase.propagation.PropagationConstants.COUNTER_PROPAGATION; |
|
33 |
import static eu.dnetlib.data.mapreduce.hbase.propagation.Utils.getEntity; |
|
34 |
import static eu.dnetlib.data.mapreduce.hbase.propagation.Utils.getRelationTarget; |
|
35 |
|
|
36 |
public class PropagationOrcidToResultMapper extends TableMapper<ImmutableBytesWritable, Text> { |
|
37 |
private static final Log log = LogFactory.getLog(PropagationOrcidToResultMapper.class); // NOPMD by marko on 11/24/08 5:02 PM |
|
38 |
private Text valueOut; |
|
39 |
private ImmutableBytesWritable keyOut; |
|
40 |
private String[] sem_rels; |
|
41 |
private String trust; |
|
42 |
|
|
43 |
@Override |
|
44 |
protected void setup(final Context context) throws IOException, InterruptedException { |
|
45 |
super.setup(context); |
|
46 |
valueOut = new Text(); |
|
47 |
keyOut = new ImmutableBytesWritable(); |
|
48 |
|
|
49 |
sem_rels = context.getConfiguration().getStrings("propagatetoorcid.semanticrelations", DEFAULT_RESULT_RELATION_SET); |
|
50 |
trust = context.getConfiguration().get("propagatetoorcid.trust","0.85"); |
|
51 |
|
|
52 |
} |
|
53 |
|
|
54 |
@Override |
|
55 |
protected void map(final ImmutableBytesWritable keyIn, final Result value, final Context context) throws IOException, InterruptedException { |
|
56 |
final TypeProtos.Type type = OafRowKeyDecoder.decode(keyIn.copyBytes()).getType(); |
|
57 |
final OafProtos.OafEntity entity = getEntity(value, type);//getEntity already verified that it is not delByInference |
|
58 |
|
|
59 |
|
|
60 |
if (entity != null) { |
|
61 |
|
|
62 |
if (type == TypeProtos.Type.result){ |
|
63 |
Set<String> result_result = new HashSet<>(); |
|
64 |
//verifico se il risultato ha una relazione semantica verso uno o piu' risultati. |
|
65 |
//per ogni risultato linkato con issupplementto o issupplementedby emetto: |
|
66 |
// id risultato linkato come chiave, |
|
67 |
// id risultato oggetto del mapping e lista degli autori del risultato oggetto del mapper come value |
|
68 |
for(String sem : sem_rels){ |
|
69 |
result_result.addAll(getRelationTarget(value, sem, context, COUNTER_PROPAGATION)); |
|
70 |
} |
|
71 |
if(!result_result.isEmpty()){ |
|
72 |
List<String> authorlist = getAuthorList(entity.getResult().getMetadata().getAuthorList()); |
|
73 |
Emit e = new Emit(); |
|
74 |
e.setId(Bytes.toString(keyIn.get())); |
|
75 |
e.setAuthor_list(authorlist); |
|
76 |
valueOut.set(Value.newInstance(new Gson().toJson(e, Emit.class), |
|
77 |
trust, |
|
78 |
Type.fromsemrel).toJson()); |
|
79 |
for (String result: result_result){ |
|
80 |
keyOut.set(Bytes.toBytes(result)); |
|
81 |
context.write(keyOut,valueOut); |
|
82 |
context.getCounter(COUNTER_PROPAGATION,"emit for sem_rel").increment(1); |
|
83 |
} |
|
84 |
|
|
85 |
//emetto anche id dell'oggetto del mapper come chiave e lista degli autori come valore |
|
86 |
e.setId(keyIn.toString()); |
|
87 |
e.setAuthor_list(authorlist); |
|
88 |
valueOut.set(Value.newInstance(new Gson().toJson(e, Emit.class), trust, Type.fromresult).toJson()); |
|
89 |
context.write(keyIn, valueOut); |
|
90 |
context.getCounter(COUNTER_PROPAGATION,"emit for result with orcid").increment(1); |
|
91 |
|
|
92 |
} |
|
93 |
} |
|
94 |
|
|
95 |
} |
|
96 |
} |
|
97 |
|
|
98 |
private List<String> getAuthorList(List<FieldTypeProtos.Author> author_list){ |
|
99 |
|
|
100 |
return author_list.stream().map(a -> new JsonFormat().printToString(a)).collect(Collectors.toList()); |
|
101 |
|
|
102 |
} |
|
103 |
|
|
104 |
|
|
105 |
|
|
106 |
} |
modules/dnet-mapreduce-jobs/branches/tree-dedup/src/main/java/eu/dnetlib/data/mapreduce/hbase/propagation/orcidthroughproducts/Emit.java | ||
---|---|---|
1 |
package eu.dnetlib.data.mapreduce.hbase.propagation.orcidthroughproducts; |
|
2 |
|
|
3 |
import com.google.gson.Gson; |
|
4 |
import eu.dnetlib.data.mapreduce.hbase.propagation.communitythroughorganization.DedupedList; |
|
5 |
import eu.dnetlib.data.proto.FieldTypeProtos; |
|
6 |
|
|
7 |
import java.io.Serializable; |
|
8 |
import java.util.List; |
|
9 |
|
|
10 |
public class Emit implements Serializable { |
|
11 |
|
|
12 |
private String id; |
|
13 |
private List<String> author_list; |
|
14 |
|
|
15 |
public String getId() { |
|
16 |
return id; |
|
17 |
} |
|
18 |
|
|
19 |
public void setId(String id) { |
|
20 |
this.id = id; |
|
21 |
} |
|
22 |
|
|
23 |
public List<String> getAuthor_list() { |
|
24 |
return author_list; |
|
25 |
} |
|
26 |
|
|
27 |
public void setAuthor_list(List<String> author_list) { |
|
28 |
this.author_list = author_list; |
|
29 |
} |
|
30 |
|
|
31 |
public static Emit fromJson(String value) { |
|
32 |
return new Gson().fromJson(value, Emit.class); |
|
33 |
} |
|
34 |
} |
modules/dnet-mapreduce-jobs/branches/tree-dedup/src/main/java/eu/dnetlib/data/mapreduce/hbase/propagation/projecttoresult/ResultProjectIterator.java | ||
---|---|---|
88 | 88 |
.setRelMetadata( |
89 | 89 |
RelMetadataProtos.RelMetadata.newBuilder() |
90 | 90 |
.setSemantics( |
91 |
getQualifier(semantics,DNET_RELATION_SCHEMA,semantics,DNET_RELATION_SCHEMA)
|
|
91 |
getQualifier(semantics, DNET_RELATION_SCHEMA_PROJECTS, semantics, DNET_RELATION_SCHEMA_PROJECTS)
|
|
92 | 92 |
) |
93 | 93 |
) |
94 | 94 |
); |
95 | 95 |
|
96 | 96 |
final OafProtos.OafRel.Builder relation = OafProtos.OafRel.newBuilder() |
97 | 97 |
.setChild(false) |
98 |
.setSubRelType(SUBREL_TYPE) |
|
99 |
.setRelType(REL_TYPE) |
|
98 |
.setSubRelType(SUBREL_TYPE_PROJECT)
|
|
99 |
.setRelType(REL_TYPE_PROJECT)
|
|
100 | 100 |
.setRelClass(semantics) |
101 | 101 |
.setTarget(target) |
102 | 102 |
.setSource(source) |
modules/dnet-mapreduce-jobs/branches/tree-dedup/src/main/java/eu/dnetlib/data/mapreduce/hbase/propagation/communitythroughorganization/PropagationCommunityThroughOrganizationFileReducer.java | ||
---|---|---|
58 | 58 |
|
59 | 59 |
if(communities.size() > 0){ |
60 | 60 |
final ResultProtos.Result.Metadata.Builder metadata = ResultProtos.Result.Metadata.newBuilder(); |
61 |
communities.stream().forEach(community->metadata.addContext(Utils.getContext(community, ORGANIZATION_COMMUNITY_TRUST, CLASS_ORGANIZATION_ID, DATA_INFO_TYPE,CLASS_ORGANIZATION_NAME))); |
|
61 |
communities.stream().forEach(community-> |
|
62 |
{metadata.addContext(Utils.getContext(community, ORGANIZATION_COMMUNITY_TRUST, CLASS_ORGANIZATION_ID, DATA_INFO_TYPE,CLASS_ORGANIZATION_NAME)); |
|
63 |
context.getCounter(COUNTER_PROPAGATION, "added result to community " + community).increment(resultIds.size());}); |
|
64 |
|
|
62 | 65 |
for(String result: resultIds){ |
63 | 66 |
keyOut.set(result); |
64 | 67 |
outValue.set(JsonFormat.printToString(Utils.getUpdate(metadata, result)).getBytes()); |
... | ... | |
66 | 69 |
context.getCounter(COUNTER_PROPAGATION, "added community to result").increment(communities.size()); |
67 | 70 |
} |
68 | 71 |
|
72 |
|
|
73 |
|
|
69 | 74 |
} |
70 | 75 |
|
71 | 76 |
|
modules/dnet-mapreduce-jobs/branches/tree-dedup/src/main/java/eu/dnetlib/data/mapreduce/hbase/propagation/communitythroughorganization/PropagationCommunityThroughOrganizationReducer.java | ||
---|---|---|
37 | 37 |
DedupedList communities = new DedupedList(); |
38 | 38 |
Set<String> resultIds = new HashSet<>(); |
39 | 39 |
|
40 |
while(it.hasNext()){
|
|
40 |
while (it.hasNext()) {
|
|
41 | 41 |
Value v = Value.fromJson(it.next().toString()); |
42 |
switch (v.getType()){ |
|
42 |
switch (v.getType()) {
|
|
43 | 43 |
case fromorganization: |
44 | 44 |
communities.addAll(DedupedList.fromJson(v.getValue())); |
45 | 45 |
break; |
... | ... | |
51 | 51 |
|
52 | 52 |
} |
53 | 53 |
|
54 |
if(communities.size() > 0){
|
|
54 |
if (communities.size() > 0) {
|
|
55 | 55 |
final ResultProtos.Result.Metadata.Builder metadata = ResultProtos.Result.Metadata.newBuilder(); |
56 |
communities.stream().forEach(community->metadata.addContext(Utils.getContext(community, ORGANIZATION_COMMUNITY_TRUST, CLASS_ORGANIZATION_ID, DATA_INFO_TYPE,CLASS_ORGANIZATION_NAME))); |
|
57 |
for(String result: resultIds){ |
|
56 |
communities.stream().forEach(community -> { |
|
57 |
metadata.addContext(Utils.getContext(community, ORGANIZATION_COMMUNITY_TRUST, CLASS_ORGANIZATION_ID, DATA_INFO_TYPE, CLASS_ORGANIZATION_NAME)); |
|
58 |
context.getCounter(COUNTER_PROPAGATION, "added result to community " + community).increment(resultIds.size()); |
|
59 |
}); |
|
60 |
for (String result : resultIds) { |
|
58 | 61 |
final Put put = new Put(Bytes.toBytes(result)).add(Bytes.toBytes("result"), Bytes.toBytes("update_" + System.nanoTime()), Utils.getUpdate(metadata, result).toByteArray()); |
59 | 62 |
keyOut.set(Bytes.toBytes(result)); |
60 | 63 |
context.write(keyOut, put); |
... | ... | |
65 | 68 |
|
66 | 69 |
} |
67 | 70 |
|
68 |
|
|
69 | 71 |
} |
modules/dnet-mapreduce-jobs/branches/tree-dedup/src/main/java/eu/dnetlib/data/mapreduce/hbase/broker/enrich/AbstractEnrichmentReducer.java | ||
---|---|---|
17 | 17 |
import eu.dnetlib.data.mapreduce.hbase.broker.model.EventWrapper; |
18 | 18 |
import eu.dnetlib.data.proto.OafProtos.Oaf; |
19 | 19 |
import eu.dnetlib.pace.config.DedupConfig; |
20 |
import eu.dnetlib.pace.distance.PaceDocumentDistance; |
|
21 |
import eu.dnetlib.pace.distance.eval.ScoreResult; |
|
22 | 20 |
import eu.dnetlib.pace.model.MapDocument; |
23 | 21 |
import eu.dnetlib.pace.model.ProtoDocumentBuilder; |
22 |
import eu.dnetlib.pace.tree.support.TreeProcessor; |
|
24 | 23 |
import org.apache.commons.lang.StringUtils; |
25 | 24 |
import org.apache.commons.math.util.MathUtils; |
26 | 25 |
import org.apache.hadoop.hbase.client.HTable; |
... | ... | |
160 | 159 |
final MapDocument a = ProtoDocumentBuilder.newInstance(oa.getEntity().getId(), oa.getEntity(), dedupConf.getPace().getModel()); |
161 | 160 |
final MapDocument b = ProtoDocumentBuilder.newInstance(ob.getEntity().getId(), ob.getEntity(), dedupConf.getPace().getModel()); |
162 | 161 |
|
163 |
final ScoreResult sr = new PaceDocumentDistance().between(a, b, dedupConf);
|
|
164 |
return sr.getScore();
|
|
162 |
TreeProcessor tree = new TreeProcessor(dedupConf);
|
|
163 |
return tree.computeScore(a, b);
|
|
165 | 164 |
} |
166 | 165 |
|
167 | 166 |
protected float scale(final double d) { |
modules/dnet-mapreduce-jobs/branches/tree-dedup/src/main/java/eu/dnetlib/data/mapreduce/hbase/broker/OrcidEventFactory.java | ||
---|---|---|
19 | 19 |
import eu.dnetlib.data.proto.FieldTypeProtos; |
20 | 20 |
import eu.dnetlib.data.proto.OafProtos.Oaf; |
21 | 21 |
import eu.dnetlib.miscutils.collections.Pair; |
22 |
import eu.dnetlib.pace.distance.algo.JaroWinkler;
|
|
22 |
import eu.dnetlib.pace.tree.JaroWinkler;
|
|
23 | 23 |
|
24 | 24 |
public class OrcidEventFactory { |
25 | 25 |
|
modules/dnet-mapreduce-jobs/branches/tree-dedup/src/main/java/eu/dnetlib/data/mapreduce/hbase/dataimport/DOIBoostToActions.java | ||
---|---|---|
10 | 10 |
import java.util.stream.Stream; |
11 | 11 |
import java.util.zip.Inflater; |
12 | 12 |
|
13 |
import com.google.common.collect.Lists; |
|
13 | 14 |
import com.google.gson.Gson; |
14 | 15 |
import com.google.gson.JsonElement; |
15 | 16 |
import com.google.gson.JsonObject; |
... | ... | |
44 | 45 |
public static final String SEPARATOR = "::"; |
45 | 46 |
public static final String DNET_LANGUAGES = "dnet:languages"; |
46 | 47 |
|
48 |
private static final List<String> DATE_TYPES = Lists.newArrayList("issued", "accepted", "published-online", "published-print"); |
|
49 |
|
|
50 |
|
|
51 |
|
|
47 | 52 |
private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {{ |
48 | 53 |
put(MAG.toLowerCase(), new Pair<>("Microsoft Academic Graph", OPENAIRE_PREFIX + SEPARATOR + "microsoft")); |
49 | 54 |
put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid")); |
... | ... | |
391 | 396 |
.setQualifier(getQualifier("main title", "dnet:dataCite_title")) |
392 | 397 |
.build())); |
393 | 398 |
|
394 |
settingRelevantDate(rootElement, metadata, "issued", "issued", true); |
|
399 |
|
|
400 |
final String firstValidDate = getFirstValidDate(rootElement); |
|
401 |
if (StringUtils.isNotBlank(firstValidDate)) { |
|
402 |
setDate(metadata, "issued", firstValidDate, true); |
|
403 |
} else { |
|
404 |
context.incrementCounter("filtered", "missing_date", 1); |
|
405 |
return null; |
|
406 |
} |
|
395 | 407 |
settingRelevantDate(rootElement, metadata, "accepted", "accepted", false); |
396 | 408 |
settingRelevantDate(rootElement, metadata, "published-online", "published-online", false); |
397 | 409 |
settingRelevantDate(rootElement, metadata, "published-print", "published-print", false); |
... | ... | |
547 | 559 |
return root.has(key) && root.get(key).isJsonArray(); |
548 | 560 |
} |
549 | 561 |
|
562 |
private static String getFirstValidDate(final JsonObject root) { |
|
563 |
return DATE_TYPES.stream() |
|
564 |
.map(type -> getStringValue(root, type)) |
|
565 |
.filter(Objects::nonNull) |
|
566 |
.filter(DumpToActionsUtility::isValidDate) |
|
567 |
.findFirst() |
|
568 |
.orElseGet(null); |
|
569 |
} |
|
570 |
|
|
571 |
private static void setDate(ResultProtos.Result.Metadata.Builder metadata, |
|
572 |
final String dictionaryKey, |
|
573 |
final String date, |
|
574 |
final boolean addToDateOfAcceptance) { |
|
575 |
if (date == null) |
|
576 |
return; |
|
577 |
if (addToDateOfAcceptance) { |
|
578 |
metadata.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(date).build()); |
|
579 |
} else { |
|
580 |
metadata.addRelevantdate(FieldTypeProtos.StructuredProperty.newBuilder() |
|
581 |
.setValue(date) |
|
582 |
.setQualifier(getQualifier(dictionaryKey, "dnet:dataCite_date")) |
|
583 |
.build()); |
|
584 |
} |
|
585 |
} |
|
586 |
|
|
550 | 587 |
private static void settingRelevantDate(JsonObject rootElement, |
551 | 588 |
ResultProtos.Result.Metadata.Builder metadata, |
552 | 589 |
final String jsonKey, |
modules/dnet-mapreduce-jobs/branches/tree-dedup/src/main/java/eu/dnetlib/data/mapreduce/hbase/dataimport/DumpToActionsUtility.java | ||
---|---|---|
47 | 47 |
} |
48 | 48 |
|
49 | 49 |
public static boolean isValidDate(final String date) { |
50 |
return date.matches("\\d{4}-\\d{2}-\\d{2}");
|
|
50 |
return date.matches("\\d{4}-\\d{1,2}-\\d{1,2}");
|
|
51 | 51 |
} |
52 | 52 |
|
53 | 53 |
public static FieldTypeProtos.StructuredProperty getPid(final JsonObject localIdentifier, final Map<String, ScholExplorerConfiguration> conf) { |
modules/dnet-mapreduce-jobs/branches/tree-dedup/src/main/java/eu/dnetlib/data/mapreduce/hbase/dataexport/ProtoConverter.java | ||
---|---|---|
476 | 476 |
entity.setRank(author.getRank()); |
477 | 477 |
entity.setPid(author.getPidList() |
478 | 478 |
.stream() |
479 |
.map(ProtoConverter::mapKV) |
|
479 |
.map(kv -> { |
|
480 |
final StructuredProperty sp = new StructuredProperty(); |
|
481 |
sp.setValue(kv.getValue()); |
|
482 |
final Qualifier q = new Qualifier(); |
|
483 |
q.setClassid(kv.getKey()); |
|
484 |
q.setClassname(kv.getKey()); |
|
485 |
sp.setQualifier(q); |
|
486 |
return sp; |
|
487 |
}) |
|
480 | 488 |
.collect(Collectors.toList())); |
481 | 489 |
entity.setAffiliation(author.getAffiliationList() |
482 | 490 |
.stream() |
modules/dnet-mapreduce-jobs/branches/tree-dedup/pom.xml | ||
---|---|---|
9 | 9 |
<modelVersion>4.0.0</modelVersion> |
10 | 10 |
<groupId>eu.dnetlib</groupId> |
11 | 11 |
<artifactId>dnet-mapreduce-jobs</artifactId> |
12 |
<version>1.2.1-SNAPSHOT</version>
|
|
12 |
<version>1.2.1-TREEDEDUP</version>
|
|
13 | 13 |
<packaging>jar</packaging> |
14 | 14 |
<scm> |
15 | 15 |
<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet45/modules/dnet-mapreduce-jobs/trunk</developerConnection> |
... | ... | |
195 | 195 |
<dependency> |
196 | 196 |
<groupId>eu.dnetlib</groupId> |
197 | 197 |
<artifactId>dnet-openaireplus-mapping-utils</artifactId> |
198 |
<version>[6.3.25,7.0.0)</version>
|
|
198 |
<version>6.3.39-TREEDEDUP</version>
|
|
199 | 199 |
</dependency> |
200 | 200 |
<dependency> |
201 | 201 |
<groupId>org.antlr</groupId> |
Also available in: Unified diff
Modifications to fit the tree-dedup branch.