Project

General

Profile

« Previous | Next » 

Revision 57655

modification to fit with the tree-dedup

View differences:

modules/dnet-mapreduce-jobs/branches/tree-dedup/src/test/java/eu/dnetlib/data/mapreduce/hbase/propagation/orcidtoresult/OrcidToResultTest.java
1
package eu.dnetlib.data.mapreduce.hbase.propagation.orcidtoresult;
2

  
3
import com.google.gson.Gson;
4
import com.google.protobuf.InvalidProtocolBufferException;
5
import com.googlecode.protobuf.format.JsonFormat;
6
import eu.dnetlib.data.mapreduce.hbase.propagation.NotValidResultSequenceException;
7
import eu.dnetlib.data.mapreduce.hbase.propagation.PropagationConstants;
8
import eu.dnetlib.data.mapreduce.hbase.propagation.Value;
9
import eu.dnetlib.data.mapreduce.hbase.propagation.ValueList;
10
import eu.dnetlib.data.mapreduce.hbase.propagation.communitythroughorganization.OrganizationMap;
11
import eu.dnetlib.data.mapreduce.hbase.propagation.orcidthroughproducts.Emit;
12
import eu.dnetlib.data.mapreduce.hbase.propagation.orcidthroughproducts.ResultOrcidIterator;
13
import eu.dnetlib.data.proto.FieldTypeProtos;
14
import org.apache.hadoop.io.Text;
15
import org.elasticsearch.hadoop.util.Assert;
16
import org.junit.Before;
17
import org.junit.Ignore;
18
import org.junit.Test;
19

  
20
import java.io.IOException;
21
import java.util.ArrayList;
22
import java.util.List;
23

  
24
public class OrcidToResultTest {
25

  
26
    final static String json1 = "{\"fullname\": \"Matteucci, F.\",\"rank\": 1 }";
27
    final static String json2 = "{\"fullname\": \"Romano, D.\",\"rank\": 2}";
28
    final static String json3 = "{\"pid\": [{\"value\": \"0000-0002-0571-4163\",\"key\": \"ORCID\"}],\"fullname\": \"Paolo Molaro\",\"surname\": \"Molaro\",\"name\": \"Paolo\",\"rank\": 3}";
29

  
30
    Value v;
31
    @Ignore
32
    @Before
33
    public void setUp() throws InvalidProtocolBufferException, JsonFormat.ParseException {
34
        List<String> authors = new ArrayList<>();
35
        FieldTypeProtos.Author.Builder author_builder = FieldTypeProtos.Author.newBuilder();
36
        JsonFormat.merge(json1, author_builder);
37
        authors.add(JsonFormat.printToString(author_builder.build()));
38
        author_builder = FieldTypeProtos.Author.newBuilder();
39
        JsonFormat.merge(json2, author_builder);
40
        authors.add(JsonFormat.printToString(author_builder.build()));
41
        author_builder = FieldTypeProtos.Author.newBuilder();
42
        JsonFormat.merge(json3, author_builder);
43
        authors.add(JsonFormat.printToString(author_builder.build()));
44

  
45

  
46
        Emit e = new Emit();
47
        e.setId("id1");
48
        e.setAuthor_list(authors);
49
        v = Value.newInstance(new Gson().toJson(e,Emit.class),"0.85", PropagationConstants.Type.fromresult);
50

  
51
        System.out.println(v.toJson());
52
    }
53

  
54
    @Test
55
    public void testBefore() throws NotValidResultSequenceException, IOException {
56

  
57
        ValueList vl = new ValueList("orcid2result.json");
58
        List<Text> tmp = vl.getValueToText();
59
        ResultOrcidIterator roi = new ResultOrcidIterator(tmp,"orcid_______::00001032f8cbf9091d8fade5bfa1700c");
60

  
61
        System.out.println(roi.hasNext());
62

  
63
        while(roi.hasNext()){
64
            System.out.println(roi.next());
65
        }
66
    }
67
}
modules/dnet-mapreduce-jobs/branches/tree-dedup/src/test/java/eu/dnetlib/data/mapreduce/hbase/propagation/communitythroughorganization/OrganizationToCommunityTest.java
1
package eu.dnetlib.data.mapreduce.hbase.propagation.communitythroughorganization;
2

  
3
import com.google.gson.Gson;
4
import com.google.protobuf.InvalidProtocolBufferException;
5
import org.elasticsearch.hadoop.util.Assert;
6
import org.junit.Before;
7
import org.junit.Test;
8

  
9

  
10
public class OrganizationToCommunityTest {
11

  
12
    OrganizationMap organizationMap;
13

  
14

  
15
    final String jsonmap = "{'coorda__h2020::3fb05a9524c3f790391261347852f638':['mes'], " +
16
            "'corda__h2020::e8dbe14cca9bf6fce09d468872f813f8':['mes'], " +
17
            "'snsf________::9b253f265e3bef5cae6d881fdf61aceb':['mes'], " +
18
            "'rcuk________::e054eea0a47665af8c3656b5785ccf76':['mes'], " +
19
            "'corda__h2020::edc18d67c9b11fb616ca9f6e1db1b151':['mes'], " +
20
            "'rcuk________::d5736d9da90521ddcdc7828a05a85e9a':['mes'], " +
21
            "'corda__h2020::f5d418d3aa1cf817ddefcc3fdc039f27':['mes'], " +
22
            "'snsf________::8fa091f8f25a846779acb4ea97b50aef':['mes'], " +
23
            "'corda__h2020::81e020977211c2c40fae2e1a50bffd71':['mes'], " +
24
            "'corda_______::81e020977211c2c40fae2e1a50bffd71':['mes'], " +
25
            "'snsf________::31d0a100e54e3cdb3c6f52d91e638c78':['mes'], " +
26
            "'corda__h2020::ea379ef91b8cc86f9ac5edc4169292db':['mes'], " +
27
            "'corda__h2020::f75ee2ee48e5cb0ec8c8d30aaa8fef70':['mes'], " +
28
            "'rcuk________::e16010089551a1a9182a94604fc0ea59':['mes'], " +
29
            "'corda__h2020::38531a2cce7c5c347ffc439b07c1f43b':['mes'], " +
30
            "'corda_______::38531a2cce7c5c347ffc439b07c1f43b':['mes'], " +
31
            "'grid________::b2cbbf5eadbbf87d534b022bad3191d7':['mes'], " +
32
            "'snsf________::74730ef1439d7f7636a8be58a6b471b8':['mes'], " +
33
            "'nsf_________::ad72e19043a5a467e35f9b444d11563e':['mes'], " +
34
            "'rcuk________::0fc3e92500290902a2d38ec2445e74c3':['mes'], " +
35
            "'grid________::ad2c29905da0eb3c06b3fa80cacd89ea':['mes']," +
36
            "'corda__h2020::30b53e4d63d3724f00acb9cbaca40860':['mes']," +
37
            "'corda__h2020::f60f84bee14ad93f0db0e49af1d5c317':['mes']," +
38
            "'corda__h2020::7bf251ac3765b5e89d82270a1763d09f':['mes']," +
39
            "'corda__h2020::65531bd11be9935948c7f2f4db1c1832':['mes']," +
40
            "'corda__h2020::e0e98f86bbc76638bbb72a8fe2302946':['mes']," +
41
            "'snsf________::3eb43582ac27601459a8d8b3e195724b':['mes']," +
42
            "'corda__h2020::af2481dab65d06c8ea0ae02b5517b9b6':['mes']," +
43
            "'corda__h2020::c19d05cfde69a50d3ebc89bd0ee49929':['mes']," +
44
            "'corda__h2020::af0bfd9fc09f80d9488f56d71a9832f0':['mes']}";
45

  
46

  
47
    @Before
48
    public void setUp() throws InvalidProtocolBufferException {
49
        organizationMap = new Gson().fromJson(jsonmap,OrganizationMap.class);
50

  
51
    }
52

  
53
    @Test
54
    public void provaMapLoading(){
55
        Assert.isTrue(organizationMap.get("pippo").size()==0);
56

  
57
        Assert.isTrue(organizationMap.get("corda__h2020::af0bfd9fc09f80d9488f56d71a9832f0").size()==1);
58

  
59
        Assert.isTrue((organizationMap.size()==30));
60
    }
61

  
62
    @Test
63
    public void provaOrderedString(){
64
        String s1 = "Molaro, P.";
65
        String s2 = "Paolo Molaro";
66

  
67
        String[] ns_a1 = s1.trim().split(" ");
68
        String[] ns_a2 = s2.trim().split(" ");
69

  
70

  
71

  
72

  
73
        if (ns_a1[0].endsWith(".") || ns_a1[0].endsWith(",")){
74
            ns_a1[0] = ns_a1[0].substring(0,ns_a1[0].length()-1);
75
        }
76
        if (ns_a1[1].endsWith(".") || ns_a1[1].endsWith(",")){
77
            ns_a1[1] = ns_a1[1].substring(0,ns_a1[1].length()-1);
78
        }
79

  
80
        if (ns_a2[0].endsWith(".") || ns_a2[0].endsWith(",")){
81
            ns_a2[0] = ns_a2[0].substring(0,ns_a2[0].length()-1);
82
        }
83
        if (ns_a2[1].endsWith(".") || ns_a2[1].endsWith(",")){
84
            ns_a2[1] = ns_a2[1].substring(0,ns_a2[1].length()-1);
85
        }
86
        if(ns_a1[0].compareTo(ns_a1[1]) < 0){
87
            String tmp = ns_a1[0];
88
            ns_a1[0] = ns_a1[1];
89
            ns_a1[1] = tmp;
90
        }
91

  
92
        if(ns_a2[0].compareTo(ns_a2[1]) < 0){
93
            String tmp = ns_a2[0];
94
            ns_a2[0] = ns_a2[1];
95
            ns_a2[1] = tmp;
96

  
97
        }
98
        System.out.println(ns_a1[1].compareTo(ns_a1[0]));
99
        System.out.println(ns_a2[1].compareTo(ns_a2[0]));
100
        if(ns_a1[0].equalsIgnoreCase(ns_a2[0])){
101
            if(ns_a1[1].equalsIgnoreCase(ns_a2[1])){//same name and surname
102
                System.out.println("equals");
103
            }
104
            if(ns_a1[1].length() == 1 || ns_a2[1].length() == 1){
105
                System.out.println(ns_a1[1].charAt(0) == ns_a2[1].charAt(0));
106
            }
107
            System.out.println("different");
108
        }else{
109
            if(ns_a1[1].equalsIgnoreCase(ns_a2[1])){
110
                if(ns_a1[0].length() == 1 || ns_a2[0].length()==1)
111
                    System.out.println( ns_a1[0].charAt(0) == ns_a2[0].charAt(0));
112
                else
113
                    System.out.println("different");
114
            }
115
        }
116
    }
117

  
118
}
modules/dnet-mapreduce-jobs/branches/tree-dedup/src/test/resources/eu/dnetlib/data/mapreduce/hbase/propagation/orcid2result.json
1
{"valueList":[{"type":"fromresult",
2
  "value":"{\"id\":\"id1\",\"author_list\":[\"{\\\"fullname\\\": \\\"Matteucci, F.\\\",\\\"rank\\\": 1}\",\"{\\\"fullname\\\": \\\"Donatella Romano\\\",\\\"name\\\": \\\"Donatella\\\",\\\"surname\\\": \\\"Romano\\\",\\\"rank\\\": 2,\\\"pid\\\": [{\\\"key\\\": \\\"ORCID\\\",\\\"value\\\": \\\"0000-0002-0845-6171\\\"}]}\",\"{\\\"fullname\\\": \\\"Paolo Molaro\\\",\\\"name\\\": \\\"Paolo\\\",\\\"surname\\\": \\\"Molaro\\\",\\\"rank\\\": 3,\\\"pid\\\": [{\\\"key\\\": \\\"ORCID\\\",\\\"value\\\": \\\"0000-0002-0571-4163\\\"}]}\"]}",
3
  "trust":"0.9"},
4
  {"type": "fromsemrel",
5
  "trust": "0.9",
6
  "value": "{\"id\":\"id2\",\"author_list\":[\"{\\\"fullname\\\": \\\"Matteucci, F.\\\",\\\"rank\\\": 1}\",\"{\\\"fullname\\\": \\\"Molaro, P.\\\",\\\"rank\\\": 3,\\\"name\\\": \\\"P.\\\",\\\"surname\\\": \\\"Molaro\\\"}\",\"{\\\"fullname\\\": \\\"Donatella Romano\\\",\\\"name\\\": \\\"Donatella\\\",\\\"surname\\\": \\\"Romano\\\",\\\"rank\\\": 2,\\\"pid\\\": [{\\\"key\\\": \\\"ORCID\\\",\\\"value\\\": \\\"0000-0002-0845-6171\\\"}]}\"]}"
7
  },
8
  {"type":"fromsemrel",
9
    "value":"{\"id\":\"id1\",\"author_list\":[\"{\\\"fullname\\\": \\\"Matteucci, F.\\\",\\\"rank\\\": 1}\",\"{\\\"fullname\\\": \\\"Donatella Romano\\\",\\\"name\\\": \\\"Donatella\\\",\\\"surname\\\": \\\"Romano\\\",\\\"rank\\\": 2,\\\"pid\\\": [{\\\"key\\\": \\\"ORCID\\\",\\\"value\\\": \\\"0000-0002-0845-6171\\\"}]}\",\"{\\\"fullname\\\": \\\"Paolo Molaro\\\",\\\"rank\\\": 3,\\\"pid\\\": [{\\\"key\\\": \\\"ORCID\\\",\\\"value\\\": \\\"0000-0002-0571-4163\\\"}]}\"]}",
10
    "trust":"0.9"}
11

  
12

  
13
]}
14

  
modules/dnet-mapreduce-jobs/branches/tree-dedup/src/main/java/eu/dnetlib/data/mapreduce/hbase/dedup/DedupReducer.java
1 1
package eu.dnetlib.data.mapreduce.hbase.dedup;
2 2

  
3
import java.io.IOException;
4
import java.util.*;
5

  
6 3
import com.google.common.base.Function;
7 4
import com.google.common.collect.Iterables;
8
import com.google.common.collect.Lists;
9
import com.google.protobuf.InvalidProtocolBufferException;
10 5
import eu.dnetlib.data.mapreduce.JobParams;
11 6
import eu.dnetlib.data.mapreduce.util.DedupUtils;
12
import eu.dnetlib.data.mapreduce.util.StreamUtils;
13
import eu.dnetlib.data.proto.OafProtos;
14
import eu.dnetlib.data.proto.RelTypeProtos.SubRelType;
15
import eu.dnetlib.data.proto.TypeProtos.Type;
16
import eu.dnetlib.pace.clustering.NGramUtils;
17 7
import eu.dnetlib.pace.config.DedupConfig;
18
import eu.dnetlib.pace.config.WfConfig;
19
import eu.dnetlib.pace.distance.PaceDocumentDistance;
20
import eu.dnetlib.pace.distance.eval.ScoreResult;
21
import eu.dnetlib.pace.model.*;
8
import eu.dnetlib.pace.model.MapDocument;
9
import eu.dnetlib.pace.model.MapDocumentSerializer;
22 10
import eu.dnetlib.pace.util.BlockProcessor;
23 11
import eu.dnetlib.pace.util.Reporter;
24
import org.apache.commons.lang.StringUtils;
25 12
import org.apache.commons.logging.Log;
26 13
import org.apache.commons.logging.LogFactory;
27 14
import org.apache.hadoop.hbase.client.Durability;
......
32 19
import org.apache.hadoop.io.Text;
33 20

  
34 21
import javax.annotation.Nullable;
22
import java.io.IOException;
35 23

  
36 24
public class DedupReducer extends TableReducer<Text, ImmutableBytesWritable, ImmutableBytesWritable> {
37 25

  
modules/dnet-mapreduce-jobs/branches/tree-dedup/src/main/java/eu/dnetlib/data/mapreduce/hbase/propagation/PropagationConstants.java
18 18

  
19 19
    public final static String ORGANIZATION_COMMUNITY_TRUST = "0.85";
20 20
    public static final String ORCID_RESULT_TRUST = "0.85";
21
    public static final String PRODUCT_TO_ORGANIZATION = "0.85";
21 22

  
22 23
    public static final String ZERO = "0";
23 24
    public static final String ONE = "1";
......
29 30
    public final static String SCHEMA_ID = "dnet:provenanceActions";
30 31

  
31 32
    public final static String DNET_COUNTRY_SCHEMA = "dnet:countries";
32
    public final static String DNET_RELATION_SCHEMA = "dnet:result_project_relations";
33
    public final static String DNET_RELATION_SCHEMA_PROJECTS = "dnet:result_project_relations";
34
    public final static String DNET_RELATION_SCHEMA_ORGANIZATION = "dnet:result_organization_relation";
33 35
    //public final static String DNET_COMMUNITY_RELATION_SCHEMA = "dnet:result_result_relations";
34 36

  
35 37
    public final static String CLASS_PROJECT_ID = "propagation:project:semrel";
......
45 47
    public final static String CLASS_ORGANIZATION_NAME = "Propagation of community result through organization association";
46 48

  
47 49
    public final static String CLASS_ORCID_ID = "propagation:orcid:result";
48
    public static final String CLASS_ORCID_NAME = "Propagation of ORCID through result linked by isSupplementedBy of isSupplementOf semantic relations";
50
    public static final String CLASS_ORCID_NAME = "Propagation of ORCID through result linked by isSupplementedBy or isSupplementTo semantic relations";
49 51

  
52
    public static final String CLASS_ORGANIZATION_RESULT_ID = "propagation:result:organization";
53
    public static final String CLASS_ORGANIZATION_RESULT_NAME = "Propagation of result linked to organization through result linked by isSupplementTo or isSupplementedBy";
54

  
50 55
    public final static int PROJECT = TypeProtos.Type.project.getNumber();//40
51 56
    public final static int DATASOURCE = TypeProtos.Type.datasource.getNumber();//10
52 57
    public final static int ORGANIZATION = TypeProtos.Type.organization.getNumber();//20
53 58
    public final static int PUBLICATION = TypeProtos.Type.result.getNumber();//50
54 59

  
55
    public final static RelTypeProtos.RelType REL_TYPE = RelTypeProtos.RelType.resultProject;
56
    public final static RelTypeProtos.SubRelType SUBREL_TYPE = RelTypeProtos.SubRelType.outcome;
60
    public final static RelTypeProtos.RelType REL_TYPE_PROJECT = RelTypeProtos.RelType.resultProject;
61
    public final static RelTypeProtos.SubRelType SUBREL_TYPE_PROJECT = RelTypeProtos.SubRelType.outcome;
57 62
    public static final String REL_PROJECT_RESULT = "produces";
58 63
    public static final String REL_RESULT_PROJECT = "isProducedBy";
59
    public static final String RELATION = REL_TYPE + "_" + SUBREL_TYPE + "_";
64
    public static final String RELATION = REL_TYPE_PROJECT + "_" + SUBREL_TYPE_PROJECT + "_";
60 65
    public static final String OUTCOME_PRODUCEDBY = RELATION + REL_RESULT_PROJECT;
66

  
61 67
    public static final String[] DEFAULT_PROJECT_RELATION_SET = new String[]{"resultResult_supplement_isSupplementedBy","resultResult_supplement_isSupplementTo"};
62 68
    public static final String[] DEFAULT_COMMUNITY_RELATION_SET = new String[]{"resultResult_supplement_isSupplementedBy","resultResult_supplement_isSupplementTo"};
69
    public static final String[] DEFAULT_ORGANIZATION_RESULT_RELATION_SET = new String[]{"resultResult_supplement_isSupplementedBy","resultResult_supplement_isSupplementTo"};
63 70

  
64 71
    public static final String[] DEFAULT_RESULT_RELATION_SET = new String[]{"resultResult_supplement_isSupplementedBy","resultResult_supplement_isSupplementTo"};
65 72

  
modules/dnet-mapreduce-jobs/branches/tree-dedup/src/main/java/eu/dnetlib/data/mapreduce/hbase/propagation/orcidthroughproducts/PropagationOrcidToResultFileReducer.java
1
package eu.dnetlib.data.mapreduce.hbase.propagation.orcidthroughproducts;
2

  
3
import com.googlecode.protobuf.format.JsonFormat;
4
import eu.dnetlib.data.mapreduce.hbase.propagation.NotValidResultSequenceException;
5
import eu.dnetlib.data.mapreduce.hbase.propagation.ResultIterator;
6
import eu.dnetlib.data.mapreduce.hbase.propagation.projecttoresult.PropagationProjectToResultReducer;
7
import eu.dnetlib.data.mapreduce.hbase.propagation.projecttoresult.ResultProjectIterator;
8
import eu.dnetlib.data.proto.OafProtos;
9
import org.apache.commons.logging.Log;
10
import org.apache.commons.logging.LogFactory;
11
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
12
import org.apache.hadoop.hbase.util.Bytes;
13
import org.apache.hadoop.io.Text;
14
import org.apache.hadoop.mapreduce.Reducer;
15

  
16
import java.io.IOException;
17
import java.util.List;
18

  
19
import static eu.dnetlib.data.mapreduce.hbase.propagation.PropagationConstants.COUNTER_PROPAGATION;
20

  
21
public class PropagationOrcidToResultFileReducer extends Reducer<ImmutableBytesWritable, Text, Text, Text> {
22
    private static final Log log = LogFactory.getLog(PropagationOrcidToResultFileReducer.class); // NOPMD by marko on 11/24/08 5:02 PM
23

  
24
    private Text keyOut;
25
    private Text outValue;
26

  
27

  
28
    @Override
29
    protected void setup(final Context context) throws IOException, InterruptedException {
30
        super.setup(context);
31
        keyOut = new Text("");
32
        outValue = new Text();
33
    }
34

  
35

  
36
    @Override
37
    protected void reduce(ImmutableBytesWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
38
        ResultIterator rh = null;
39
        try {
40
            rh = new ResultOrcidIterator(values, Bytes.toString(key.copyBytes()));
41
        } catch (NotValidResultSequenceException e) {
42
            context.getCounter(COUNTER_PROPAGATION, e.getMessage()).increment(1);
43
            return;
44
        }
45
        while (rh.hasNext()) {
46
            List<OafProtos.Oaf> oaf_list = rh.next();
47
            if(oaf_list != null){
48
                for (OafProtos.Oaf oaf : oaf_list) {
49
                    keyOut.set(oaf.getEntity().getId());
50
                    outValue.set(JsonFormat.printToString(oaf).getBytes());
51
                    context.write(keyOut, outValue);
52
                    context.getCounter(COUNTER_PROPAGATION, "Added orcid to result").increment(1);
53
                }
54

  
55
            }
56

  
57
        }
58

  
59
    }
60
}
modules/dnet-mapreduce-jobs/branches/tree-dedup/src/main/java/eu/dnetlib/data/mapreduce/hbase/propagation/orcidthroughproducts/PropagationOrcidToResultReducer.java
1
package eu.dnetlib.data.mapreduce.hbase.propagation.orcidthroughproducts;
2

  
3
import com.googlecode.protobuf.format.JsonFormat;
4
import eu.dnetlib.data.mapreduce.hbase.propagation.NotValidResultSequenceException;
5
import eu.dnetlib.data.mapreduce.hbase.propagation.ResultIterator;
6
import eu.dnetlib.data.mapreduce.hbase.propagation.projecttoresult.PropagationProjectToResultReducer;
7
import eu.dnetlib.data.mapreduce.hbase.propagation.projecttoresult.ResultProjectIterator;
8
import eu.dnetlib.data.mapreduce.util.OafRowKeyDecoder;
9
import eu.dnetlib.data.proto.OafProtos;
10
import org.apache.commons.logging.Log;
11
import org.apache.commons.logging.LogFactory;
12
import org.apache.hadoop.hbase.client.Put;
13
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
14
import org.apache.hadoop.hbase.mapreduce.TableReducer;
15
import org.apache.hadoop.hbase.util.Bytes;
16
import org.apache.hadoop.io.Text;
17

  
18
import java.io.IOException;
19
import java.util.List;
20

  
21
import static eu.dnetlib.data.mapreduce.hbase.propagation.PropagationConstants.COUNTER_PROPAGATION;
22
import static eu.dnetlib.data.mapreduce.hbase.propagation.PropagationConstants.RELATION;
23

  
24
public class PropagationOrcidToResultReducer  extends TableReducer<ImmutableBytesWritable, Text, ImmutableBytesWritable> {
25
    private static final Log log = LogFactory.getLog(PropagationOrcidToResultReducer.class); // NOPMD by marko on 11/24/08 5:02 PM
26
    private ImmutableBytesWritable keyOut;
27

  
28

  
29

  
30
    @Override
31
    protected void setup(final Context context) throws IOException, InterruptedException {
32
        super.setup(context);
33
        keyOut = new ImmutableBytesWritable();
34
    }
35

  
36

  
37
    @Override
38
    protected void reduce(ImmutableBytesWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
39
        ResultIterator rh = null;
40
        try {
41
            rh = new ResultOrcidIterator(values, Bytes.toString(key.copyBytes()));
42
        } catch (NotValidResultSequenceException e) {
43
            context.getCounter(COUNTER_PROPAGATION, e.getMessage()).increment(1);
44
            return;
45
        }
46

  
47
        while (rh.hasNext()) {
48
            List<OafProtos.Oaf> oaf_list = rh.next();
49
            if(oaf_list != null){
50
                for (OafProtos.Oaf oaf : oaf_list) {
51
                    byte[] targetRowKey = Bytes.toBytes(oaf.getEntity().getId());
52
                    final Put put = new Put(targetRowKey).add(Bytes.toBytes("result"), Bytes.toBytes("update_" + System.nanoTime()), oaf.toByteArray());
53
                    keyOut.set(targetRowKey);
54
                    context.write(keyOut, put);
55
                    context.getCounter(COUNTER_PROPAGATION, "added orcid to product").increment(1);
56

  
57
                }
58

  
59
            }
60

  
61
        }
62

  
63

  
64

  
65
    }
66

  
67

  
68

  
69
}
modules/dnet-mapreduce-jobs/branches/tree-dedup/src/main/java/eu/dnetlib/data/mapreduce/hbase/propagation/orcidthroughproducts/ResultOrcidIterator.java
1
package eu.dnetlib.data.mapreduce.hbase.propagation.orcidthroughproducts;
2

  
3
import com.googlecode.protobuf.format.JsonFormat;
4
import eu.dnetlib.data.mapreduce.hbase.propagation.*;
5
import eu.dnetlib.data.proto.*;
6
import org.apache.hadoop.io.Text;
7

  
8
import java.util.ArrayList;
9
import java.util.Arrays;
10
import java.util.Iterator;
11
import java.util.List;
12
import java.util.stream.Collectors;
13

  
14
public class ResultOrcidIterator extends ResultIterator {
15

  
16
    private Iterator<String> author_iterator;
17
    private List<FieldTypeProtos.Author> autoritative_authors ;
18
    private List<String> relatedResult ;
19

  
20

  
21
    /**
     * Creates an iterator over the values grouped under {@code key}; all the work is delegated to
     * the superclass constructor.
     *
     * NOTE(review): presumably the superclass constructor invokes {@code checkSequence()}, which
     * is why the fields of this class are initialised there rather than at declaration - confirm
     * against ResultIterator.
     *
     * @param values serialized values associated with {@code key}
     * @param key    identifier of the entity the values belong to
     * @throws NotValidResultSequenceException propagated from the superclass constructor
     */
    public ResultOrcidIterator(final Iterable<Text> values, final String key) throws NotValidResultSequenceException {
        super(values, key);
    }
24

  
25
    @Override
26
    protected void checkSequence() throws NotValidResultSequenceException {
27
        if(!it.hasNext()){
28
            throw new NotValidResultSequenceException("Empty information for key");
29
        }
30

  
31
        try {
32
            autoritative_authors = new ArrayList<>();
33
            relatedResult = new ArrayList<>();
34
            analizeValueList();
35

  
36
        }catch(JsonFormat.ParseException e){
37
            throw new NotValidResultSequenceException("Problems recreating the author list from serialization");
38
        }
39

  
40
        List<FieldTypeProtos.Author> authors_with_orcid = autoritative_authors.stream()
41
                .map(a -> {
42
                    if (a.getPidList() == null || a.getPidList().isEmpty())
43
                        return null;
44
                    return a;
45
                })
46
                .filter(a -> a!= null)
47
                .filter(a -> containsOrcid(a.getPidList()))
48
                .collect(Collectors.toList());
49

  
50

  
51
        if(authors_with_orcid.size() == 0 || relatedResult.size() == 0){
52
            resultId = TERMINATOR;
53
            return;
54
        }
55

  
56

  
57
        author_iterator = relatedResult.iterator();
58
        autoritative_authors = authors_with_orcid;
59
        getNext();
60

  
61
    }
62

  
63
    private boolean containsOrcid(List<FieldTypeProtos.KeyValue> pidList){
64
        if(pidList == null)
65
            return false;
66
        return pidList
67
                .stream()
68
                .filter(kv -> kv.getKey().equals(PropagationConstants.AUTHOR_PID))
69
                .collect(Collectors.toList()).size() > 0;
70
    }
71

  
72
    private void getNext(){
73
        if (author_iterator.hasNext())
74
            resultId = author_iterator.next();
75
        else
76
            resultId = TERMINATOR;
77
    }
78

  
79
    @Override
80
    public List<OafProtos.Oaf> next() {
81
        //get the next merged author list
82
        try {
83
            //list of authors in the related result
84
            Emit e = Emit.fromJson(resultId);
85
            List<FieldTypeProtos.Author> author_list = getAuthorList(e);
86

  
87
            ResultProtos.Result.Metadata.Builder metadata = searchMatch(author_list);
88

  
89
            if (metadata != null){
90
                ArrayList<OafProtos.Oaf> ret = new ArrayList<OafProtos.Oaf>(Arrays.asList(getUpdate(metadata, e.getId())));
91
                getNext();
92
                return ret;
93
            }
94

  
95

  
96
        }catch(JsonFormat.ParseException e){
97

  
98
        }
99
        getNext();
100
        return null;
101
    }
102

  
103
    private ResultProtos.Result.Metadata.Builder searchMatch(List<FieldTypeProtos.Author> author_list){
104
        ResultProtos.Result.Metadata.Builder metadataBuilder = ResultProtos.Result.Metadata.newBuilder();
105
        boolean updated = false;
106
//        for (FieldTypeProtos.Author a: autoritative_authors){
107
//            searchAuthor(a,author_list);
108
//        }
109

  
110
        for (FieldTypeProtos.Author a: author_list){
111
            FieldTypeProtos.Author.Builder author = searchAuthor(a, autoritative_authors);
112
            if(author != null){
113
                updated = true;
114
                metadataBuilder.addAuthor(author);
115
            }else{
116
                metadataBuilder.addAuthor(FieldTypeProtos.Author.newBuilder(a));
117
            }
118
        }
119
        if(updated)
120
            return metadataBuilder;
121
        return null;
122
    }
123

  
124

  
125
    private boolean equals(FieldTypeProtos.Author a1, FieldTypeProtos.Author a2){
126
        if(a1.hasSurname()){
127
            if(a2.hasSurname()){
128
                if(!a1.getSurname().trim().equalsIgnoreCase(a2.getSurname().trim())){
129
                    return false;
130
                }
131
                //have the same surname. Check the name
132
                if(a1.hasName()){
133
                    if (a2.hasName()){
134
                        if (a1.getName().trim().equalsIgnoreCase(a2.getName().trim())){
135
                            return true; //same name and same surname in a related research result
136
                        }
137
                        //they could be differently written (i.e. only the initials of the name in one of the two
138
                        return (a1.getName().trim().substring(0,0).equalsIgnoreCase(a2.getName().trim().substring(0,0)));
139
                    }
140
                }
141
            }
142
        }
143
//        if(a1.hasFullname()){
144
//            if (a2.hasFullname()){
145
//                if (a1.getFullname().trim().equalsIgnoreCase(a2.getFullname().trim())){
146
//                    return true;
147
//                }
148
//                //split string containing name and surname
149
//                String[] ns_a1 = a1.getFullname().trim().split(" ");
150
//                String[] ns_a2 = a2.getFullname().trim().split(" ");
151
//
152
//
153
//                if (ns_a1[0].endsWith(".") || ns_a1[0].endsWith(",")){
154
//                    ns_a1[0] = ns_a1[0].substring(0,ns_a1[0].length()-1);
155
//                }
156
//                if (ns_a1[1].endsWith(".") || ns_a1[1].endsWith(",")){
157
//                    ns_a1[1] = ns_a1[1].substring(0,ns_a1[1].length()-1);
158
//                }
159
//
160
//                if (ns_a2[0].endsWith(".") || ns_a2[0].endsWith(",")){
161
//                    ns_a2[0] = ns_a2[0].substring(0,ns_a2[0].length()-1);
162
//                }
163
//                if (ns_a2[1].endsWith(".") || ns_a2[1].endsWith(",")){
164
//                    ns_a2[1] = ns_a2[1].substring(0,ns_a2[1].length()-1);
165
//                }
166
//
167
//                if(ns_a1[0].compareTo(ns_a1[1]) < 0){
168
//                    String tmp = ns_a1[0];
169
//                    ns_a1[0] = ns_a1[1];
170
//                    ns_a1[1] = tmp;
171
//                }
172
//
173
//                if(ns_a2[0].compareTo(ns_a2[1]) < 0){
174
//                    String tmp = ns_a2[0];
175
//                    ns_a2[0] = ns_a2[1];
176
//                    ns_a2[1] = tmp;
177
//
178
//                }
179
//
180
//                if(ns_a1[0].equalsIgnoreCase(ns_a2[0])){
181
//                    if(ns_a1[1].equalsIgnoreCase(ns_a2[1])){//same name and surname
182
//                        return true;
183
//                    }
184
//                    if(ns_a1[1].length() == 1 || ns_a2[1].length() == 1){
185
//                        return ns_a1[1].charAt(0) == ns_a2[1].charAt(0);//same surname and initial of the name
186
//                    }
187
//                    return false;
188
//
189
//                }else{
190
//                    if(ns_a1[1].equalsIgnoreCase(ns_a2[1])){
191
//                        if(ns_a1[0].length() == 1 || ns_a2[0].length()==1)
192
//                            return ns_a1[0].charAt(0) == ns_a2[0].charAt(0);
193
//                        else
194
//                            return false;
195
//                    }
196
//                }
197
//
198
//
199
//
200
//        }
201
//        return false;
202
//    }
203
        return false;
204

  
205
    }
206

  
207
    private FieldTypeProtos.Author.Builder searchAuthor(FieldTypeProtos.Author a, List<FieldTypeProtos.Author> author_list){
208
        if(containsOrcid(a.getPidList()))
209
            return null;
210
        for(FieldTypeProtos.Author autoritative_author : author_list) {
211
                if (equals(autoritative_author, a)) {
212
                    if(!containsOrcid(a.getPidList()))
213
                        return update(a, autoritative_author);
214
                }
215
        }
216
        return  null;
217

  
218
    }
219

  
220
    private void analizeValueList() throws JsonFormat.ParseException {
221
        while(it.hasNext()){
222
            Value v = Value.fromJson(it.next().toString());
223

  
224
            if(v.getType().equals(PropagationConstants.Type.fromresult)){
225
                autoritative_authors.addAll(getAuthorList(Emit.fromJson(v.getValue ())));
226
            }
227
            if(v.getType().equals(PropagationConstants.Type.fromsemrel)){
228
                relatedResult.add(v.getValue());
229
            }
230
        }
231

  
232
    }
233
    private FieldTypeProtos.Author.Builder update(FieldTypeProtos.Author related_author, FieldTypeProtos.Author autoritative_autor ){
234

  
235
        FieldTypeProtos.Author.Builder res = FieldTypeProtos.Author.newBuilder(related_author);
236
        List<FieldTypeProtos.KeyValue> apid_list = autoritative_autor.getPidList();
237
        FieldTypeProtos.KeyValue akv = apid_list.stream().filter(kv -> kv.getKey().equals(PropagationConstants.AUTHOR_PID)).collect(Collectors.toList()).get(0);
238
        FieldTypeProtos.KeyValue.Builder kvb = FieldTypeProtos.KeyValue.newBuilder();
239
        kvb.setKey(akv.getKey()).setValue(akv.getValue());
240
        kvb.setDataInfo(Utils.getDataInfo(
241
                PropagationConstants.ORCID_RESULT_TRUST,
242
                PropagationConstants.CLASS_ORCID_ID,
243
                PropagationConstants.SCHEMA_ID,
244
                PropagationConstants.SCHEMA_NAME,
245
                PropagationConstants.DATA_INFO_TYPE,
246
                PropagationConstants.CLASS_ORCID_NAME)
247
        );
248
        return res.addPid(kvb);
249

  
250

  
251
    }
252

  
253
    private List<FieldTypeProtos.Author> getAuthorList(Emit e) throws JsonFormat.ParseException {
254

  
255
        List<FieldTypeProtos.Author> authors = new ArrayList<>();
256
        for (String author : e.getAuthor_list()) {
257
            FieldTypeProtos.Author.Builder author_builder = FieldTypeProtos.Author.newBuilder();
258
            JsonFormat.merge(author, author_builder);
259
            authors.add(author_builder.build());
260
        }
261

  
262
        return authors;
263

  
264
    }
265

  
266

  
267
    public static OafProtos.Oaf getUpdate(ResultProtos.Result.Metadata.Builder metadata, String resultId) {
268
        final ResultProtos.Result.Builder result = ResultProtos.Result.newBuilder().setMetadata(metadata);
269
        final OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder()
270
                .setType(TypeProtos.Type.result)
271
                .setId(resultId)
272
                .setResult(result);
273

  
274
        return OafProtos.Oaf.newBuilder()
275
                .setKind(KindProtos.Kind.entity)
276
                .setEntity(entity)
277
                .build();
278
    }
279
}
modules/dnet-mapreduce-jobs/branches/tree-dedup/src/main/java/eu/dnetlib/data/mapreduce/hbase/propagation/orcidthroughproducts/PropagationOrcidToResultMapper.java
1
package eu.dnetlib.data.mapreduce.hbase.propagation.orcidthroughproducts;
2

  
3
import com.google.gson.Gson;
4
import com.googlecode.protobuf.format.JsonFormat;
5
import eu.dnetlib.data.mapreduce.hbase.dedup.fixrelation.Key;
6
import eu.dnetlib.data.mapreduce.hbase.propagation.Value;
7
import eu.dnetlib.data.mapreduce.hbase.propagation.communitythroughorganization.DedupedList;
8
import eu.dnetlib.data.mapreduce.hbase.propagation.communitythroughorganization.OrganizationMap;
9
import eu.dnetlib.data.mapreduce.hbase.propagation.projecttoresult.PropagationProjectToResultReducer;
10
import eu.dnetlib.data.mapreduce.util.OafRowKeyDecoder;
11
import eu.dnetlib.data.proto.FieldTypeProtos;
12
import eu.dnetlib.data.proto.OafProtos;
13
import eu.dnetlib.data.proto.TypeProtos;
14
import org.apache.avro.generic.GenericData;
15
import org.apache.commons.lang3.StringUtils;
16
import org.apache.commons.logging.Log;
17
import org.apache.commons.logging.LogFactory;
18
import org.apache.hadoop.hbase.client.Result;
19
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
20
import org.apache.hadoop.hbase.mapreduce.TableMapper;
21
import org.apache.hadoop.hbase.util.Bytes;
22
import org.apache.hadoop.io.Text;
23

  
24
import java.io.IOException;
25
import java.util.ArrayList;
26
import java.util.HashSet;
27
import java.util.List;
28
import java.util.Set;
29
import java.util.stream.Collectors;
30

  
31
import static eu.dnetlib.data.mapreduce.hbase.propagation.PropagationConstants.*;
32
import static eu.dnetlib.data.mapreduce.hbase.propagation.PropagationConstants.COUNTER_PROPAGATION;
33
import static eu.dnetlib.data.mapreduce.hbase.propagation.Utils.getEntity;
34
import static eu.dnetlib.data.mapreduce.hbase.propagation.Utils.getRelationTarget;
35

  
36
public class PropagationOrcidToResultMapper extends TableMapper<ImmutableBytesWritable, Text> {
37
    private static final Log log = LogFactory.getLog(PropagationOrcidToResultMapper.class); // NOPMD by marko on 11/24/08 5:02 PM
38
    private Text valueOut;
39
    private ImmutableBytesWritable keyOut;
40
    private String[] sem_rels;
41
    private String trust;
42

  
43
    @Override
44
    protected void setup(final Context context) throws IOException, InterruptedException {
45
        super.setup(context);
46
        valueOut = new Text();
47
        keyOut = new ImmutableBytesWritable();
48

  
49
        sem_rels = context.getConfiguration().getStrings("propagatetoorcid.semanticrelations", DEFAULT_RESULT_RELATION_SET);
50
        trust = context.getConfiguration().get("propagatetoorcid.trust","0.85");
51

  
52
    }
53

  
54
    @Override
55
    protected void map(final ImmutableBytesWritable keyIn, final Result value, final Context context) throws IOException, InterruptedException {
56
        final TypeProtos.Type type = OafRowKeyDecoder.decode(keyIn.copyBytes()).getType();
57
        final OafProtos.OafEntity entity = getEntity(value, type);//getEntity already verified that it is not delByInference
58

  
59

  
60
        if (entity != null) {
61

  
62
            if (type == TypeProtos.Type.result){
63
                Set<String> result_result = new HashSet<>();
64
                //verifico se il risultato ha una relazione semantica verso uno o piu' risultati.
65
                //per ogni risultato linkato con issupplementto o issupplementedby emetto:
66
                // id risultato linkato come chiave,
67
                // id risultato oggetto del mapping e lista degli autori del risultato oggetto del mapper come value
68
                for(String sem : sem_rels){
69
                     result_result.addAll(getRelationTarget(value, sem, context, COUNTER_PROPAGATION));
70
                }
71
                if(!result_result.isEmpty()){
72
                    List<String> authorlist = getAuthorList(entity.getResult().getMetadata().getAuthorList());
73
                    Emit e = new Emit();
74
                    e.setId(Bytes.toString(keyIn.get()));
75
                    e.setAuthor_list(authorlist);
76
                    valueOut.set(Value.newInstance(new Gson().toJson(e, Emit.class),
77
                            trust,
78
                            Type.fromsemrel).toJson());
79
                    for (String result: result_result){
80
                        keyOut.set(Bytes.toBytes(result));
81
                        context.write(keyOut,valueOut);
82
                        context.getCounter(COUNTER_PROPAGATION,"emit for sem_rel").increment(1);
83
                    }
84

  
85
                    //emetto anche id dell'oggetto del mapper come chiave e lista degli autori come valore
86
                        e.setId(keyIn.toString());
87
                        e.setAuthor_list(authorlist);
88
                        valueOut.set(Value.newInstance(new Gson().toJson(e, Emit.class), trust, Type.fromresult).toJson());
89
                        context.write(keyIn, valueOut);
90
                        context.getCounter(COUNTER_PROPAGATION,"emit for result with orcid").increment(1);
91

  
92
                }
93
            }
94

  
95
        }
96
    }
97

  
98
    private List<String> getAuthorList(List<FieldTypeProtos.Author> author_list){
99

  
100
        return author_list.stream().map(a -> new JsonFormat().printToString(a)).collect(Collectors.toList());
101

  
102
    }
103

  
104

  
105

  
106
}
modules/dnet-mapreduce-jobs/branches/tree-dedup/src/main/java/eu/dnetlib/data/mapreduce/hbase/propagation/orcidthroughproducts/Emit.java
1
package eu.dnetlib.data.mapreduce.hbase.propagation.orcidthroughproducts;
2

  
3
import com.google.gson.Gson;
4
import eu.dnetlib.data.mapreduce.hbase.propagation.communitythroughorganization.DedupedList;
5
import eu.dnetlib.data.proto.FieldTypeProtos;
6

  
7
import java.io.Serializable;
8
import java.util.List;
9

  
10
public class Emit implements Serializable {
11

  
12
    private String id;
13
    private List<String> author_list;
14

  
15
    public String getId() {
16
        return id;
17
    }
18

  
19
    public void setId(String id) {
20
        this.id = id;
21
    }
22

  
23
    public List<String> getAuthor_list() {
24
        return author_list;
25
    }
26

  
27
    public void setAuthor_list(List<String> author_list) {
28
        this.author_list = author_list;
29
    }
30

  
31
    public static Emit fromJson(String value) {
32
        return new Gson().fromJson(value, Emit.class);
33
    }
34
}
modules/dnet-mapreduce-jobs/branches/tree-dedup/src/main/java/eu/dnetlib/data/mapreduce/hbase/propagation/projecttoresult/ResultProjectIterator.java
88 88
                                .setRelMetadata(
89 89
                                        RelMetadataProtos.RelMetadata.newBuilder()
90 90
                                                .setSemantics(
91
                                                        getQualifier(semantics,DNET_RELATION_SCHEMA,semantics,DNET_RELATION_SCHEMA)
91
                                                        getQualifier(semantics, DNET_RELATION_SCHEMA_PROJECTS, semantics, DNET_RELATION_SCHEMA_PROJECTS)
92 92
                                                )
93 93
                                )
94 94
                );
95 95

  
96 96
        final OafProtos.OafRel.Builder relation = OafProtos.OafRel.newBuilder()
97 97
                .setChild(false)
98
                .setSubRelType(SUBREL_TYPE)
99
                .setRelType(REL_TYPE)
98
                .setSubRelType(SUBREL_TYPE_PROJECT)
99
                .setRelType(REL_TYPE_PROJECT)
100 100
                .setRelClass(semantics)
101 101
                .setTarget(target)
102 102
                .setSource(source)
modules/dnet-mapreduce-jobs/branches/tree-dedup/src/main/java/eu/dnetlib/data/mapreduce/hbase/propagation/communitythroughorganization/PropagationCommunityThroughOrganizationFileReducer.java
58 58

  
59 59
        if(communities.size() > 0){
60 60
            final ResultProtos.Result.Metadata.Builder metadata = ResultProtos.Result.Metadata.newBuilder();
61
            communities.stream().forEach(community->metadata.addContext(Utils.getContext(community, ORGANIZATION_COMMUNITY_TRUST, CLASS_ORGANIZATION_ID, DATA_INFO_TYPE,CLASS_ORGANIZATION_NAME)));
61
            communities.stream().forEach(community->
62
                                            {metadata.addContext(Utils.getContext(community, ORGANIZATION_COMMUNITY_TRUST, CLASS_ORGANIZATION_ID, DATA_INFO_TYPE,CLASS_ORGANIZATION_NAME));
63
                                            context.getCounter(COUNTER_PROPAGATION, "added result to community " + community).increment(resultIds.size());});
64

  
62 65
            for(String result: resultIds){
63 66
                keyOut.set(result);
64 67
                outValue.set(JsonFormat.printToString(Utils.getUpdate(metadata, result)).getBytes());
......
66 69
                context.getCounter(COUNTER_PROPAGATION, "added community to result").increment(communities.size());
67 70
            }
68 71

  
72

  
73

  
69 74
        }
70 75

  
71 76

  
modules/dnet-mapreduce-jobs/branches/tree-dedup/src/main/java/eu/dnetlib/data/mapreduce/hbase/propagation/communitythroughorganization/PropagationCommunityThroughOrganizationReducer.java
37 37
        DedupedList communities = new DedupedList();
38 38
        Set<String> resultIds = new HashSet<>();
39 39

  
40
        while(it.hasNext()){
40
        while (it.hasNext()) {
41 41
            Value v = Value.fromJson(it.next().toString());
42
            switch (v.getType()){
42
            switch (v.getType()) {
43 43
                case fromorganization:
44 44
                    communities.addAll(DedupedList.fromJson(v.getValue()));
45 45
                    break;
......
51 51

  
52 52
        }
53 53

  
54
        if(communities.size() > 0){
54
        if (communities.size() > 0) {
55 55
            final ResultProtos.Result.Metadata.Builder metadata = ResultProtos.Result.Metadata.newBuilder();
56
            communities.stream().forEach(community->metadata.addContext(Utils.getContext(community, ORGANIZATION_COMMUNITY_TRUST, CLASS_ORGANIZATION_ID, DATA_INFO_TYPE,CLASS_ORGANIZATION_NAME)));
57
            for(String result: resultIds){
56
            communities.stream().forEach(community -> {
57
                metadata.addContext(Utils.getContext(community, ORGANIZATION_COMMUNITY_TRUST, CLASS_ORGANIZATION_ID, DATA_INFO_TYPE, CLASS_ORGANIZATION_NAME));
58
                context.getCounter(COUNTER_PROPAGATION, "added result to community " + community).increment(resultIds.size());
59
            });
60
            for (String result : resultIds) {
58 61
                final Put put = new Put(Bytes.toBytes(result)).add(Bytes.toBytes("result"), Bytes.toBytes("update_" + System.nanoTime()), Utils.getUpdate(metadata, result).toByteArray());
59 62
                keyOut.set(Bytes.toBytes(result));
60 63
                context.write(keyOut, put);
......
65 68

  
66 69
    }
67 70

  
68

  
69 71
}
modules/dnet-mapreduce-jobs/branches/tree-dedup/src/main/java/eu/dnetlib/data/mapreduce/hbase/broker/enrich/AbstractEnrichmentReducer.java
17 17
import eu.dnetlib.data.mapreduce.hbase.broker.model.EventWrapper;
18 18
import eu.dnetlib.data.proto.OafProtos.Oaf;
19 19
import eu.dnetlib.pace.config.DedupConfig;
20
import eu.dnetlib.pace.distance.PaceDocumentDistance;
21
import eu.dnetlib.pace.distance.eval.ScoreResult;
22 20
import eu.dnetlib.pace.model.MapDocument;
23 21
import eu.dnetlib.pace.model.ProtoDocumentBuilder;
22
import eu.dnetlib.pace.tree.support.TreeProcessor;
24 23
import org.apache.commons.lang.StringUtils;
25 24
import org.apache.commons.math.util.MathUtils;
26 25
import org.apache.hadoop.hbase.client.HTable;
......
160 159
		final MapDocument a = ProtoDocumentBuilder.newInstance(oa.getEntity().getId(), oa.getEntity(), dedupConf.getPace().getModel());
161 160
		final MapDocument b = ProtoDocumentBuilder.newInstance(ob.getEntity().getId(), ob.getEntity(), dedupConf.getPace().getModel());
162 161

  
163
		final ScoreResult sr =  new PaceDocumentDistance().between(a, b, dedupConf);
164
		return sr.getScore();
162
		TreeProcessor tree = new TreeProcessor(dedupConf);
163
		return tree.computeScore(a, b);
165 164
	}
166 165

  
167 166
	protected float scale(final double d) {
modules/dnet-mapreduce-jobs/branches/tree-dedup/src/main/java/eu/dnetlib/data/mapreduce/hbase/broker/OrcidEventFactory.java
19 19
import eu.dnetlib.data.proto.FieldTypeProtos;
20 20
import eu.dnetlib.data.proto.OafProtos.Oaf;
21 21
import eu.dnetlib.miscutils.collections.Pair;
22
import eu.dnetlib.pace.distance.algo.JaroWinkler;
22
import eu.dnetlib.pace.tree.JaroWinkler;
23 23

  
24 24
public class OrcidEventFactory {
25 25

  
modules/dnet-mapreduce-jobs/branches/tree-dedup/src/main/java/eu/dnetlib/data/mapreduce/hbase/dataimport/DOIBoostToActions.java
10 10
import java.util.stream.Stream;
11 11
import java.util.zip.Inflater;
12 12

  
13
import com.google.common.collect.Lists;
13 14
import com.google.gson.Gson;
14 15
import com.google.gson.JsonElement;
15 16
import com.google.gson.JsonObject;
......
44 45
	public static final String SEPARATOR = "::";
45 46
	public static final String DNET_LANGUAGES = "dnet:languages";
46 47

  
48
	private static final List<String> DATE_TYPES = Lists.newArrayList("issued", "accepted", "published-online", "published-print");
49

  
50

  
51

  
47 52
	private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {{
48 53
		put(MAG.toLowerCase(), new Pair<>("Microsoft Academic Graph", OPENAIRE_PREFIX + SEPARATOR + "microsoft"));
49 54
		put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
......
391 396
						.setQualifier(getQualifier("main title", "dnet:dataCite_title"))
392 397
						.build()));
393 398

  
394
		settingRelevantDate(rootElement, metadata, "issued", "issued", true);
399

  
400
		final String firstValidDate = getFirstValidDate(rootElement);
401
		if (StringUtils.isNotBlank(firstValidDate)) {
402
			setDate(metadata, "issued", firstValidDate, true);
403
		} else {
404
			context.incrementCounter("filtered", "missing_date", 1);
405
			return null;
406
		}
395 407
		settingRelevantDate(rootElement, metadata, "accepted", "accepted", false);
396 408
		settingRelevantDate(rootElement, metadata, "published-online", "published-online", false);
397 409
		settingRelevantDate(rootElement, metadata, "published-print", "published-print", false);
......
547 559
		return root.has(key) && root.get(key).isJsonArray();
548 560
	}
549 561

  
562
	private static String getFirstValidDate(final JsonObject root) {
563
		return DATE_TYPES.stream()
564
			.map(type -> getStringValue(root, type))
565
			.filter(Objects::nonNull)
566
			.filter(DumpToActionsUtility::isValidDate)
567
			.findFirst()
568
			.orElseGet(null);
569
	}
570

  
571
	private static void setDate(ResultProtos.Result.Metadata.Builder metadata,
572
											final String dictionaryKey,
573
											final String date,
574
											final boolean addToDateOfAcceptance) {
575
		if (date == null)
576
			return;
577
		if (addToDateOfAcceptance) {
578
			metadata.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(date).build());
579
		} else {
580
			metadata.addRelevantdate(FieldTypeProtos.StructuredProperty.newBuilder()
581
					.setValue(date)
582
					.setQualifier(getQualifier(dictionaryKey, "dnet:dataCite_date"))
583
					.build());
584
		}
585
	}
586

  
550 587
	private static void settingRelevantDate(JsonObject rootElement,
551 588
			ResultProtos.Result.Metadata.Builder metadata,
552 589
			final String jsonKey,
modules/dnet-mapreduce-jobs/branches/tree-dedup/src/main/java/eu/dnetlib/data/mapreduce/hbase/dataimport/DumpToActionsUtility.java
47 47
    }
48 48

  
49 49
    public static boolean isValidDate(final String date) {
50
        return date.matches("\\d{4}-\\d{2}-\\d{2}");
50
        return date.matches("\\d{4}-\\d{1,2}-\\d{1,2}");
51 51
    }
52 52

  
53 53
    public static FieldTypeProtos.StructuredProperty getPid(final JsonObject localIdentifier, final Map<String, ScholExplorerConfiguration> conf) {
modules/dnet-mapreduce-jobs/branches/tree-dedup/src/main/java/eu/dnetlib/data/mapreduce/hbase/dataexport/ProtoConverter.java
476 476
        entity.setRank(author.getRank());
477 477
        entity.setPid(author.getPidList()
478 478
                .stream()
479
                .map(ProtoConverter::mapKV)
479
                .map(kv -> {
480
                    final StructuredProperty sp = new StructuredProperty();
481
                    sp.setValue(kv.getValue());
482
                    final Qualifier q = new Qualifier();
483
                    q.setClassid(kv.getKey());
484
                    q.setClassname(kv.getKey());
485
                    sp.setQualifier(q);
486
                    return sp;
487
                })
480 488
                .collect(Collectors.toList()));
481 489
        entity.setAffiliation(author.getAffiliationList()
482 490
                .stream()
modules/dnet-mapreduce-jobs/branches/tree-dedup/pom.xml
9 9
	<modelVersion>4.0.0</modelVersion>
10 10
	<groupId>eu.dnetlib</groupId>
11 11
	<artifactId>dnet-mapreduce-jobs</artifactId>
12
	<version>1.2.1-SNAPSHOT</version>
12
	<version>1.2.1-TREEDEDUP</version>
13 13
	<packaging>jar</packaging>
14 14
	<scm>
15 15
		<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet45/modules/dnet-mapreduce-jobs/trunk</developerConnection>
......
195 195
		<dependency>
196 196
			<groupId>eu.dnetlib</groupId>
197 197
			<artifactId>dnet-openaireplus-mapping-utils</artifactId>
198
			<version>[6.3.25,7.0.0)</version>
198
			<version>6.3.39-TREEDEDUP</version>
199 199
		</dependency>
200 200
		<dependency>
201 201
			<groupId>org.antlr</groupId>

Also available in: Unified diff