Revision 51225

backported from dnet50

View differences:

modules/dnet-graph-domain/trunk/src/main/java/eu/dnetlib/data/graph/model/DNGFUtils.java
1
package eu.dnetlib.data.graph.model;
2

  
3
import com.google.common.base.Function;
4
import com.google.common.base.Predicate;
5
import com.google.common.collect.Iterables;
6
import com.google.common.collect.Lists;
7
import com.google.common.collect.Sets;
8
import com.google.protobuf.Descriptors;
9
import com.google.protobuf.GeneratedMessage;
10
import eu.dnetlib.data.proto.*;
11

  
12
import java.util.Arrays;
13
import java.util.Set;
14

  
15
/**
16
 * Created by sandro on 12/13/16.
17
 */
18
public class DNGFUtils {
19

  
20
    public static Set<String> entities() {
21
        return Sets.newHashSet(Iterables.transform(Lists.newArrayList(TypeProtos.Type.values()), t -> t.toString()));
22
    }
23

  
24
    public static Predicate<DNGFProtos.DNGF> relationFilter() {
25
        return dngf -> dngf.getKind().equals(KindProtos.Kind.relation);
26
    }
27

  
28
    public static Predicate<DNGFProtos.DNGF> entityFilter() {
29
        return dngf -> dngf.getKind().equals(KindProtos.Kind.entity);
30
    }
31

  
32
    public static Function<DNGFDecoder, String> idDecoder() {
33
        return input -> input.getEntityId();
34
    }
35

  
36
    public static Predicate<FieldTypeProtos.StructuredProperty> mainTitleFilter() {
37
        return sp -> (sp.getQualifier() != null) && sp.getQualifier().getClassname().equals("main title");
38
    }
39

  
40
    public static Set<String> getFieldNames(final Descriptors.Descriptor d, final Integer... tag) {
41
        return Sets.newHashSet(Iterables.transform(Arrays.asList(tag), i -> {
42
            final Descriptors.FieldDescriptor fd = d.findFieldByNumber(i);
43
            if (fd == null) throw new IllegalArgumentException("undefined tag: " + i + " for type: " + d.getFullName());
44
            return fd.getName();
45
        }));
46
    }
47

  
48
    public static FieldTypeProtos.StringField sf(final String value) {
49
        return FieldTypeProtos.StringField.newBuilder().setValue(value).build();
50
    }
51

  
52
    public static FieldTypeProtos.StructuredProperty sp(final String value, final String classid, final String schemeid) {
53
        return sp(value, classid, schemeid, di());
54
    }
55

  
56
    public static FieldTypeProtos.StructuredProperty sp(final String value, final String classid, final String schemeid, final FieldTypeProtos.DataInfo di) {
57
        return FieldTypeProtos.StructuredProperty.newBuilder().setValue(value).setQualifier(simpleQualifier(classid, schemeid)).setDataInfo(di).build();
58
    }
59

  
60
    public static FieldTypeProtos.KeyValue kv(final String k, final String v) {
61
        return FieldTypeProtos.KeyValue.newBuilder().setKey(k).setValue(v).build();
62
    }
63

  
64
    public static FieldTypeProtos.Qualifier simpleQualifier(final String classid, final String schemeid) {
65
        return q(classid, classid, schemeid, schemeid);
66
    }
67

  
68
    public static FieldTypeProtos.Qualifier q(final String classid, final String classname, final String schemeid, final String schemename) {
69
        return FieldTypeProtos.Qualifier.newBuilder()
70
                .setClassid(classid)
71
                .setClassname(classname)
72
                .setSchemename(schemename)
73
                .setSchemeid(schemeid).build();
74
    }
75

  
76
    public static FieldTypeProtos.DataInfo di(final String trust) {
77
        return di(false, "", false, simpleQualifier("", ""), trust);
78
    }
79

  
80
    public static FieldTypeProtos.DataInfo di() {
81
        return di("");
82
    }
83

  
84
    public static FieldTypeProtos.DataInfo di(Boolean deletedByInference, String inferenceProvenance, Boolean inferred, FieldTypeProtos.Qualifier provenanceAction, String trust) {
85
        return FieldTypeProtos.DataInfo.newBuilder()
86
                .setDeletedbyinference(deletedByInference)
87
                .setInferenceprovenance(inferenceProvenance)
88
                .setInferred(inferred)
89
                .setProvenanceaction(provenanceAction)
90
                .setTrust(trust).build();
91
    }
92

  
93
    public static FieldTypeProtos.Context context(final String id) {
94
        return FieldTypeProtos.Context.newBuilder().setId(id).build();
95
    }
96

  
97

  
98
    public static DNGFDecoder embed(final GeneratedMessage msg,
99
                                    final KindProtos.Kind kind,
100
                                    final boolean deletedByInference,
101
                                    final boolean inferred,
102
                                    final String provenance,
103
                                    final String action) {
104

  
105
        final DNGFProtos.DNGF.Builder DNGF = DNGFProtos.DNGF
106
                .newBuilder()
107
                .setKind(kind)
108
                .setLastupdatetimestamp(System.currentTimeMillis())
109
                .setDataInfo(di(deletedByInference, provenance, inferred, simpleQualifier(action, action), "0.5"));
110
        switch (kind) {
111
            case entity:
112
                DNGF.setEntity((DNGFProtos.DNGFEntity) msg);
113
                break;
114
            case relation:
115
                DNGF.setRel((DNGFProtos.DNGFRel) msg);
116
                break;
117
            default:
118
                break;
119
        }
120

  
121
        return DNGFDecoder.decode(DNGF.build());
122
    }
123

  
124
    public static DNGFDecoder embed(final GeneratedMessage msg, final KindProtos.Kind kind) {
125
        return embed(msg, kind, false, false, "inference_provenance", "provenance_action");
126
    }
127

  
128

  
129
}
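
The helpers above are thin wrappers around the DNGF protobuf builders. A minimal usage sketch, assuming the generated proto classes referenced in this changeset (the literal values are illustrative only):

    // a StringField and a StructuredProperty with a default DataInfo
    FieldTypeProtos.StringField label = DNGFUtils.sf("example value");
    FieldTypeProtos.StructuredProperty title = DNGFUtils.sp("An example title", "main title", "dnet:dataCite_title");

    // wrap an entity message and decode it again
    // DNGFProtos.DNGFEntity entity = ...;                               // built elsewhere
    // DNGFDecoder decoder = DNGFUtils.embed(entity, KindProtos.Kind.entity);
    // String id = DNGFUtils.idDecoder().apply(decoder);                 // delegates to DNGFDecoder.getEntityId()
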
modules/dnet-graph-domain/trunk/src/main/java/eu/dnetlib/data/graph/utils/RelDescriptor.java
1
package eu.dnetlib.data.graph.utils;
2

  
3
import java.util.regex.Matcher;
4
import java.util.regex.Pattern;
5

  
6
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
7
import org.apache.commons.lang3.StringUtils;
8

  
9
public class RelDescriptor {
10

  
11
    public static final String ONTOLOGY_SEPARATOR = "_";
12

  
13
	public static final String QUALIFIER_SEPARATOR = ":";
14

  
15
	public static final String RELTYPE_REGEX = "(\\w+_\\w+)_(\\w+)";
16
	public static final String ID_REGEX = "([0-9][0-9]\\|.{12}::[a-zA-Z0-9]{32})";
17

  
18
	public static final Pattern QUALIFIER_PATTERN = Pattern.compile("^" + RELTYPE_REGEX + "$");
19
	public static final Pattern FULL_QUALIFIER_PATTERN = Pattern.compile("^" + RELTYPE_REGEX + QUALIFIER_SEPARATOR + ID_REGEX + "$");
20

  
21
    private String code;
22

  
23
	private String ontologyCode;
24

  
25
	private String termCode;
26

  
27
	private String targetId = null;
28

  
29
    public RelDescriptor(final String ontologyCode, final String termCode) {
30
        this.ontologyCode = ontologyCode;
31
        this.termCode = termCode;
32
        this.code = ontologyCode + ONTOLOGY_SEPARATOR + termCode;
33
    }
34

  
35
	public RelDescriptor(final String ontologyCode, final String termCode, final String targetId) {
36
		this(ontologyCode, termCode);
37
		this.targetId = targetId;
38
	}
39

  
40
	public RelDescriptor(final String value) {
41
		if (StringUtils.isBlank(value)) {
42
			throw new IllegalArgumentException("empty value is not permitted");
43
		}
44

  
45
		final Matcher matcher = QUALIFIER_PATTERN.matcher(value);
46
		if (matcher.matches()) {
47
			this.code = value;
48
			this.ontologyCode = matcher.group(1);
49
			this.termCode = matcher.group(2);
50
		} else {
51
			final Matcher fullMatcher = FULL_QUALIFIER_PATTERN.matcher(value);
52
			if (fullMatcher.matches()) {
53
				this.code = fullMatcher.group(1) + ONTOLOGY_SEPARATOR + fullMatcher.group(2);
54
				this.ontologyCode = fullMatcher.group(1);
55
				this.termCode = fullMatcher.group(2);
56
				this.targetId = fullMatcher.group(3);
57
			} else {
58
				throw new IllegalArgumentException(String.format("invalid qualifier format: '%s'", value));
59
			}
60
		}
61
    }
62

  
63
    public String shortQualifier() {
64
    	return getOntologyCode() + ONTOLOGY_SEPARATOR + getTermCode();
65
    }
66

  
67
    public String qualifier() {
68
		return qualifier(getTargetId());
69
    }
70

  
71
	public String qualifier(final String targetId) {
72
		if (StringUtils.isBlank(targetId)) {
73
			throw new IllegalStateException("incomplete qualifier");
74
		}
75

  
76
		return shortQualifier() + QUALIFIER_SEPARATOR + targetId;
77
	}
78

  
79
	public static String asString(final Qualifier qualifier) {
80
    	if (qualifier == null) {
81
		    throw new IllegalArgumentException("invalid qualifier");
82
	    }
83
    	return qualifier.getSchemeid() + ONTOLOGY_SEPARATOR + qualifier.getClassid();
84
	}
85

  
86
	public Qualifier asQualifier() {
87
		return Qualifier.newBuilder()
88
				.setClassid(getTermCode()).setClassname(getTermCode())
89
				.setSchemeid(getOntologyCode()).setSchemename(getOntologyCode()).build();
90
	}
91

  
92
	public String getCode() {
93
		return code;
94
	}
95

  
96
	public String getOntologyCode() {
97
		return ontologyCode;
98
	}
99

  
100
	public String getTermCode() {
101
		return termCode;
102
	}
103

  
104
	@Override
105
	public String toString() {
106
		return getCode();
107
	}
108

  
109
	@Override
110
	public int hashCode() {
111
		final int prime = 31;
112
		int result = 1;
113
		result = (prime * result) + ((code == null) ? 0 : code.hashCode());
114
		return result;
115
	}
116

  
117
	@Override
118
	public boolean equals(final Object obj) {
119
		if (this == obj) return true;
120
		if (obj == null) return false;
121
		if (getClass() != obj.getClass()) return false;
122
		RelDescriptor other = (RelDescriptor) obj;
123
		if (code == null) {
124
			if (other.code != null) return false;
125
		} else if (!code.equals(other.code)) return false;
126
		return true;
127
	}
128

  
129
	public String getTargetId() {
130
		return targetId;
131
	}
132

  
133
	public void setTargetId(final String targetId) {
134
		this.targetId = targetId;
135
	}
136
}
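
As a quick check of the regular expressions above, a short sketch with made-up values (the relation type and the identifier are illustrative, chosen only to satisfy RELTYPE_REGEX and ID_REGEX):

    // short form: <ontology>_<term>
    RelDescriptor rd = new RelDescriptor("resultResult_publicationDataset_isRelatedTo");
    // rd.getOntologyCode() -> "resultResult_publicationDataset", rd.getTermCode() -> "isRelatedTo"

    // full form: <ontology>_<term>:<targetId>
    RelDescriptor full = new RelDescriptor("resultResult_publicationDataset_isRelatedTo:50|od______1234::0123456789abcdef0123456789abcdef");
    // full.getTargetId() -> "50|od______1234::0123456789abcdef0123456789abcdef"
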
modules/dnet-graph-domain/trunk/src/main/java/eu/dnetlib/pace/model/gt/GTAuthorMapper.java
1
package eu.dnetlib.pace.model.gt;
2

  
3
import com.google.common.base.Function;
4
import com.google.common.collect.Iterables;
5
import eu.dnetlib.data.proto.FieldTypeProtos.DataInfo;
6
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
7
import eu.dnetlib.data.proto.FieldTypeProtos.StringField;
8
import eu.dnetlib.data.proto.KindProtos.Kind;
9
import eu.dnetlib.data.proto.DNGFProtos.DNGF;
10
import eu.dnetlib.data.proto.DNGFProtos.DNGFEntity;
11
import eu.dnetlib.data.proto.PersonProtos.Person;
12
import eu.dnetlib.data.proto.PersonProtos.Person.CoAuthor;
13
import eu.dnetlib.data.proto.PersonProtos.Person.MergedPerson;
14
import eu.dnetlib.data.proto.PersonProtos.Person.Metadata;
15
import eu.dnetlib.data.proto.TypeProtos.Type;
16
import org.apache.commons.lang3.StringUtils;
17

  
18
public class GTAuthorMapper {
19

  
20
	public DNGF map(final GTAuthor gta) {
21

  
22
		final DNGF.Builder oaf = DNGF.newBuilder();
23

  
24
		oaf.setDataInfo(getDataInfo());
25
		oaf.setLastupdatetimestamp(System.currentTimeMillis());
26
		oaf.setKind(Kind.entity);
27
		oaf.setEntity(getDNGFEntity(gta));
28

  
29
		return oaf.build();
30
	}
31

  
32
	private DNGFEntity getDNGFEntity(final GTAuthor gta) {
33
		final DNGFEntity.Builder entity = DNGFEntity.newBuilder();
34
		entity.setType(Type.person);
35
		entity.setId(gta.getId());
36
		entity.setPerson(getPerson(gta));
37
		return entity.build();
38
	}
39

  
40
	private Person getPerson(final GTAuthor gta) {
41
		final Person.Builder person = Person.newBuilder();
42

  
43
		if (gta.getAuthor() != null) {
44
			final Person.Metadata.Builder m = Person.Metadata.newBuilder();
45

  
46
			if (StringUtils.isNotBlank(gta.getAuthor().getFullname())) {
47
				m.setFullname(sf(gta.getAuthor().getFullname()));
48
			}
49
			if (StringUtils.isNotBlank(gta.getAuthor().getFirstname())) {
50
				m.setFirstname(sf(gta.getAuthor().getFirstname()));
51
			}
52
			if (StringUtils.isNotBlank(gta.getAuthor().getSecondnames())) {
53
				m.addSecondnames(sf(gta.getAuthor().getSecondnames()));
54
			}
55

  
56
			person.setMetadata(m.build());
57
		}
58

  
59
		person.setAnchor(true);
60
		person.addAllMergedperson(Iterables.transform(gta.getMerged(), new Function<Author, MergedPerson>() {
61

  
62
			@Override
63
			public MergedPerson apply(final Author a) {
64
				final MergedPerson.Builder mp = MergedPerson.newBuilder();
65
				mp.setId(a.getId());
66
				mp.setMetadata(getMetadata(a));
67
				return mp.build();
68
			}
69

  
70
		}));
71

  
72
		person.addAllCoauthor(Iterables.transform(gta.getCoAuthors(), new Function<eu.dnetlib.pace.model.gt.CoAuthor, CoAuthor>() {
73

  
74
			@Override
75
			public CoAuthor apply(final eu.dnetlib.pace.model.gt.CoAuthor ca) {
76
				final CoAuthor.Builder coAuthor = CoAuthor.newBuilder();
77
				coAuthor.setId(ca.getId());
78
				if (StringUtils.isNotBlank(ca.getAnchorId())) {
79
					coAuthor.setAnchorId(ca.getAnchorId());
80
				}
81
				coAuthor.setMetadata(getMetadata(ca));
82
				return coAuthor.build();
83
			}
84
		}));
85

  
86
		return person.build();
87
	}
88

  
89
	private Metadata getMetadata(final Author a) {
90
		final Metadata.Builder m = Metadata.newBuilder();
91
		m.setFullname(sf(a.getFullname()));
92
		if (a.isWellFormed()) {
93
			m.setFirstname(sf(a.getFirstname()));
94
			m.addSecondnames(sf(a.getSecondnames()));
95
		}
96
		return m.build();
97
	}
98

  
99
	private DataInfo getDataInfo() {
100
		final DataInfo.Builder d = DataInfo.newBuilder();
101
		d.setDeletedbyinference(false);
102
		d.setInferred(true);
103
		d.setTrust("0.5");
104
		d.setInferenceprovenance("dedup-person-groundtruth");
105
		d.setProvenanceaction(Qualifier.newBuilder().setClassid("").setClassname("").setSchemeid("").setSchemename(""));
106
		return d.build();
107
	}
108

  
109
	private StringField sf(final String s) {
110
		final StringField.Builder sf = StringField.newBuilder();
111
		sf.setValue(s);
112
		return sf.build();
113
	}
114

  
115
}
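
The mapper above turns a ground-truth author into a person entity. GTAuthor itself is not part of this changeset, so the following sketch only assumes it is populated elsewhere:

    // GTAuthor gta = ...;                                    // loaded from the ground-truth store
    // DNGF person = new GTAuthorMapper().map(gta);
    // person.getKind() == Kind.entity, person.getEntity().getType() == Type.person
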
modules/dnet-graph-domain/trunk/src/main/resources/eu/dnetlib/bootstrap/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/elasticsearchFeedWDSDataJob.xml
1
<RESOURCE_PROFILE>
2
    <HEADER>
3
        <RESOURCE_IDENTIFIER
4
                value="4bf4bdc4-3668-4dc5-977c-a98862ad7a24_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/>
5
        <RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/>
6
        <RESOURCE_KIND value="HadoopJobConfigurationDSResources"/>
7
        <RESOURCE_URI value=""/>
8
        <DATE_OF_CREATION value="2001-12-31T12:00:00"/>
9
    </HEADER>
10
    <BODY>
11
        <HADOOP_JOB name="elasticsearchFeedWDSDataJob" type="mapreduce">
12
            <DESCRIPTION>map only job that builds WDS records and indexes them on ES</DESCRIPTION>
13
            <STATIC_CONFIGURATION>
14

  
15
                <!-- I/O FORMAT -->
16
                <PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat"/>
17
                <PROPERTY key="mapreduce.outputformat.class" value="org.elasticsearch.hadoop.mr.EsOutputFormat"/>
18

  
19
                <!-- MAPPER -->
20
                <PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.wds.IndexWDSMapper"/>
21
                <PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.io.Text"/>
22
                <PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.io.Text"/>
23

  
24
                <!-- MISC -->
25
                <PROPERTY key="es.nodes"
26
                          value="ip-90-147-167-137.ct1.garrservices.it:9200, ip-90-147-167-126.ct1.garrservices.it:9200, ip-90-147-167-13.ct1.garrservices.it:9200, ip-90-147-167-125.ct1.garrservices.it:9200"/>
27
                <PROPERTY key="es.nodes.resolve.hostname" value="false"/>
28
                <PROPERTY key="es.nodes.wan.only" value="true"/>
29

  
30
                <PROPERTY key="es.resource" value="wds/item"/>
31
                <PROPERTY key="es.mapping.id" value="id"/>
32
                <PROPERTY key="es.input.json" value="yes"/>
33

  
34
                <PROPERTY key="mapred.reduce.tasks" value="0"/>
35
                <PROPERTY key="mapred.reduce.tasks.speculative.execution" value="false"/>
36
                <PROPERTY key="mapred.map.tasks.speculative.execution" value="false"/>
37

  
38
            </STATIC_CONFIGURATION>
39
            <JOB_INTERFACE>
40
                <PARAM name="hbase.mapred.inputtable" required="true" description="source hbase table"/>
41
                <PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table"/>
42
            </JOB_INTERFACE>
43
            <SCAN>
44
                <FILTERS/>
45
                <FAMILIES/>
46
            </SCAN>
47
        </HADOOP_JOB>
48
        <STATUS>
49
            <LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/>
50
            <RUNNING_INSTANCES value="0"/>
51
            <CUMULATIVE_RUN value="0"/>
52
        </STATUS>
53
        <SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
54
    </BODY>
55
</RESOURCE_PROFILE>
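
The STATIC_CONFIGURATION block above is a flat key/value map that the submission machinery copies into the Hadoop job configuration; that machinery is not part of this changeset, so the sketch below only illustrates the idea with the standard Hadoop API (the properties variable is a hypothetical map parsed from the profile):

    import org.apache.hadoop.conf.Configuration;
    import java.util.Map;

    Configuration conf = new Configuration();
    for (Map.Entry<String, String> p : properties.entrySet()) {
        conf.set(p.getKey(), p.getValue());           // e.g. "es.nodes", "es.resource", "mapred.reduce.tasks"
    }
    // the JOB_INTERFACE params (hbase.mapred.inputtable, hbase.mapreduce.inputtable) are set per submission
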
modules/dnet-graph-domain/trunk/src/main/resources/eu/dnetlib/bootstrap/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/dedupBuildRootsJob.xml
1
<RESOURCE_PROFILE>
2
    <HEADER>
3
        <RESOURCE_IDENTIFIER value="895ce6a9-4131-4954-b9ed-949ff78f5448_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/>
4
        <RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/>
5
        <RESOURCE_KIND value="HadoopJobConfigurationDSResources"/>
6
        <RESOURCE_URI value=""/>
7
        <DATE_OF_CREATION value="2001-12-31T12:00:00"/>
8
    </HEADER>
9
    <BODY>
10
        <HADOOP_JOB name="dedupBuildRootsJob" type="mapreduce">
11
 			<DESCRIPTION>map reduce job that builds the roots and redirects the rels</DESCRIPTION>
12
        	<STATIC_CONFIGURATION>
13

  
14
				<!-- I/O FORMAT -->
15
				<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat" />
16
				<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableOutputFormat" />
17

  
18
        		<!-- MAPPER -->
19
				<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.DedupBuildRootsMapper" />
20
				<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.io.Text" />
21
				<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.hbase.io.ImmutableBytesWritable" />
22

  
23
				<!-- REDUCER -->
24
				<PROPERTY key="mapreduce.reduce.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.DedupBuildRootsReducer" />
25
				<PROPERTY key="mapred.output.key.class" value="org.apache.hadoop.hbase.io.ImmutableBytesWritable" />				
26
				<PROPERTY key="mapred.output.value.class" value="org.apache.hadoop.io.Writable" />				
27
	
28
				<!-- MISC -->				
29
				<PROPERTY key="mapred.reduce.tasks.speculative.execution" value="false" />	
30
				<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" />	
31
				<PROPERTY key="mapreduce.map.speculative" value="false" />
32
                <PROPERTY key="mapreduce.reduce.speculative" value="false"/>
33
                <PROPERTY key="mapred.task.timeout" value="2400000"/>
34
			
35
				<PROPERTY key="mapred.reduce.tasks" value="10" />
36
				<!-- <PROPERTY key="user.name" value="dnet" /> -->
37
	
38
		<!--  	Uncomment to override the default lib path -->			
39
		<!--	<PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> -->
40
        	</STATIC_CONFIGURATION>
41
        	<JOB_INTERFACE>
42
        		<PARAM name="hbase.mapred.inputtable" required="true" description="source hbase table" />
43
        		<PARAM name="hbase.mapred.outputtable" required="true" description="target hbase table" />
44
        		<PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table" />
45
        		<PARAM name="hbase.mapreduce.outputtable" required="true" description="target hbase table" />     
46
        	</JOB_INTERFACE>
47
        	<SCAN caching="10">
48
        		<FILTERS operator="MUST_PASS_ALL">
49
        			<FILTER type="prefix" param="entityTypeId" />
50
        		</FILTERS>
51
        		<FAMILIES/>        		
52
        	</SCAN>
53
        </HADOOP_JOB>
54
        <STATUS>
55
            <LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/>
56
            <RUNNING_INSTANCES value="0"/>
57
            <CUMULATIVE_RUN value="0" />
58
        </STATUS>
59
        <SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
60
    </BODY>
61
</RESOURCE_PROFILE>
modules/dnet-graph-domain/trunk/src/main/resources/eu/dnetlib/bootstrap/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/elasticsearchFeedDliScholixDataJob.xml
1
<RESOURCE_PROFILE>
2
    <HEADER>
3
        <RESOURCE_IDENTIFIER
4
                        value="6e96b76e-8398-4afe-a74b-56cea0b1dc68_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/>
5
        <RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/>
6
        <RESOURCE_KIND value="HadoopJobConfigurationDSResources"/>
7
        <RESOURCE_URI value=""/>
8
        <DATE_OF_CREATION value="2001-12-31T12:00:00"/>
9
    </HEADER>
10
    <BODY>
11
        <HADOOP_JOB name="elasticsearchFeedDliScholixDataJob" type="mapreduce">
12
            <DESCRIPTION>map reduce job that joins the entities on the hbase table and produces scholix records that are indexed on ES</DESCRIPTION>
13
            <STATIC_CONFIGURATION>
14

  
15
                <!-- I/O FORMAT -->
16
                <PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat"/>
17
                <PROPERTY key="mapreduce.outputformat.class" value="org.elasticsearch.hadoop.mr.EsOutputFormat" />
18

  
19
                <!-- MAPPER -->
20
                <PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.dli.PrepareScholixDataMapper"/>
21
                <PROPERTY key="mapred.mapoutput.key.class" value="eu.dnetlib.data.mapreduce.hbase.dli.kv.DliKey"/>
22
                <PROPERTY key="mapreduce.mapoutput.key.class" value="eu.dnetlib.data.mapreduce.hbase.dli.kv.DliKey"/>
23
                <PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.hbase.io.ImmutableBytesWritable"/>
24

  
25
                <!-- PARTITIONER -->
26
                <PROPERTY key="mapred.partitioner.class" value="eu.dnetlib.data.mapreduce.hbase.dli.kv.NaturalKeyPartitioner"/>
27
                <PROPERTY key="mapreduce.partitioner.class" value="eu.dnetlib.data.mapreduce.hbase.dli.kv.NaturalKeyPartitioner"/>
28
                <PROPERTY key="mapred.output.value.groupfn.class" value="eu.dnetlib.data.mapreduce.hbase.dli.kv.NaturalKeyGroupingComparator"/>
29
                <PROPERTY key="mapreduce.output.value.groupfn.class" value="eu.dnetlib.data.mapreduce.hbase.dli.kv.NaturalKeyGroupingComparator"/>
30
                <!--<PROPERTY key="mapred.output.key.comparator.class" value="eu.dnetlib.data.mapreduce.hbase.dli.kv.CompositeKeyComparator"/>-->
31

  
32
                <!-- REDUCER -->
33
                <PROPERTY key="mapreduce.reduce.class"
34
                          value="eu.dnetlib.data.mapreduce.hbase.dli.PrepareScholixDataReducer"/>
35
                <PROPERTY key="mapred.output.key.class" value="org.apache.hadoop.io.Text"/>
36
                <PROPERTY key="mapred.output.value.class" value="org.apache.hadoop.io.Text"/>
37

  
38
                <!-- MISC -->
39
                <PROPERTY key="es.nodes"
40
                          value="90.147.167.27:9200,90.147.167.25:9200,90.147.167.28:9200,90.147.167.26:9200"/>
41
                <PROPERTY key="es.nodes.resolve.hostname" value="false"/>
42
                <PROPERTY key="es.nodes.wan.only" value="true"/>
43
                <PROPERTY key="es.resource" value="dli/scholix"/>
44
                <PROPERTY key="es.input.json" value="yes"/>
45
                <PROPERTY key="mapred.reduce.tasks" value="10"/>
46

  
47
                <PROPERTY key="mapred.reduce.tasks.speculative.execution" value="false"/>
48
                <PROPERTY key="mapred.map.tasks.speculative.execution" value="false"/>
49

  
50
                <!-- <PROPERTY key="user.name" value="dnet" /> -->
51

  
52
                <!--  	Uncomment to override the default lib path -->
53
                <!--	<PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> -->
54
            </STATIC_CONFIGURATION>
55
            <JOB_INTERFACE>
56
                <PARAM name="hbase.mapred.inputtable" required="true" description="source hbase table"/>
57
                <PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table"/>
58
            </JOB_INTERFACE>
59
            <SCAN>
60
                <FILTERS/>
61
                <FAMILIES/>
62
            </SCAN>
63
        </HADOOP_JOB>
64
        <STATUS>
65
            <LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/>
66
            <RUNNING_INSTANCES value="0"/>
67
            <CUMULATIVE_RUN value="0"/>
68
        </STATUS>
69
        <SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
70
    </BODY>
71
</RESOURCE_PROFILE>
modules/dnet-graph-domain/trunk/src/main/java/eu/dnetlib/data/actionmanager/actions/AtomicActionDeserialiser.java
1
package eu.dnetlib.data.actionmanager.actions;
2

  
3
import java.lang.reflect.Type;
4

  
5
import com.google.gson.*;
6
import com.googlecode.protobuf.format.JsonFormat;
7
import eu.dnetlib.data.proto.DNGFProtos.DNGF;
8
import eu.dnetlib.rmi.data.hadoop.actionmanager.Agent;
9
import eu.dnetlib.rmi.data.hadoop.actionmanager.actions.AbstractActionSerializer;
10
import eu.dnetlib.rmi.data.hadoop.actionmanager.actions.AtomicAction;
11
import org.apache.commons.codec.binary.Base64;
12
import org.apache.commons.lang3.StringUtils;
13
import org.apache.commons.logging.Log;
14
import org.apache.commons.logging.LogFactory;
15

  
16
/**
17
 * Created by claudio on 30/09/15.
18
 */
19
public class AtomicActionDeserialiser extends AbstractActionSerializer implements JsonDeserializer<AtomicAction> {
20

  
21
	private static final Log log = LogFactory.getLog(AtomicActionDeserialiser.class);
22

  
23
	public static AtomicAction fromJSON(String s) {
24
		final GsonBuilder gson = new GsonBuilder();
25

  
26
		gson.registerTypeAdapter(AtomicAction.class, new AtomicActionDeserialiser());
27

  
28
		return gson.create().fromJson(s, AtomicAction.class);
29
	}
30

  
31
	@Override
32
	public AtomicAction deserialize(final JsonElement json, final Type typeOfT, final JsonDeserializationContext context) throws JsonParseException {
33

  
34
		if (json.isJsonObject()) {
35
			final JsonObject j = (JsonObject) json;
36

  
37
			final Agent a = new Agent();
38

  
39
			final JsonObject aj = j.get(agent).getAsJsonObject();
40
			a.setId(aj.get(agent_id).getAsString());
41
			a.setName(aj.get(agent_name).getAsString());
42
			a.setType(Agent.AGENT_TYPE.valueOf(aj.get(agent_type).getAsString()));
43

  
44
			AtomicAction aa = new AtomicAction(j.get(rawSet).getAsString(), a);
45

  
46
			aa.setTargetColumn(j.get(targetColumn).getAsString());
47
			aa.setTargetColumnFamily(j.get(targetColumnFamily).getAsString());
48
			aa.setTargetRowKey(j.get(targetRowKey).getAsString());
49

  
50
			aa.setRowKey(j.get(rowKey).getAsString());
51
			aa.setTargetValue(decodeTargetValue(j.get(targetValue).getAsString()));
52

  
53
			return aa;
54
		}
55

  
56
		throw new JsonParseException("input is not a json object");
57
	}
58

  
59
	private byte[] decodeTargetValue(final String s) {
60
		if (StringUtils.isBlank(s)) {
61
			return null;
62
		}
63
		try {
64
			final String json = new String(Base64.decodeBase64(s.getBytes()));
65

  
66
			DNGF.Builder oaf = DNGF.newBuilder();
67
			JsonFormat.merge(json, oaf);
68

  
69
			return oaf.build().toByteArray();
70
		} catch (JsonFormat.ParseException e) {
71
			log.error("unable to parse proto", e);
72
			return null;
73
		}
74
	}
75

  
76
}
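
A short note on how the deserializer above is used: the JSON field names (agent, rawSet, targetColumn, ...) come from AbstractActionSerializer, which is outside this changeset, so the sketch below only assumes a payload produced by the matching serializer:

    // String json = ...;                                     // produced by the corresponding action serializer
    // AtomicAction aa = AtomicActionDeserialiser.fromJSON(json);
    // aa.getTargetValue() holds the DNGF payload re-encoded as a protobuf byte array (see decodeTargetValue)
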
77

  
modules/dnet-graph-domain/trunk/src/main/resources/eu/dnetlib/bootstrap/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/dedupFixRelationsJob.xml
1
<RESOURCE_PROFILE>
2
    <HEADER>
3
        <RESOURCE_IDENTIFIER
4
                value="5b99d940-7477-40db-a458-549d301419c5_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/>
5
        <RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/>
6
        <RESOURCE_KIND value="HadoopJobConfigurationDSResources"/>
7
        <RESOURCE_URI value=""/>
8
        <DATE_OF_CREATION value="2001-12-31T12:00:00"/>
9
    </HEADER>
10
    <BODY>
11
        <HADOOP_JOB name="dedupFixRelationsJob" type="mapreduce">
12
            <DESCRIPTION>map reduce job that redirects the rels to fix the graph consistency</DESCRIPTION>
13
            <STATIC_CONFIGURATION><!-- I/O FORMAT -->
14
                <PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat"/>
15
                <PROPERTY key="mapreduce.outputformat.class"
16
                          value="org.apache.hadoop.hbase.mapreduce.TableOutputFormat"/>
17
                <!-- MAPPER -->
18
                <PROPERTY key="mapreduce.map.class"
19
                          value="eu.dnetlib.data.mapreduce.hbase.dedup.DedupFixRelationMapper"/>
20
                <PROPERTY key="mapreduce.mapoutput.key.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.kv.DNGFKey"/>
21
                <PROPERTY key="mapred.mapoutput.key.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.kv.DNGFKey"/>
22
                <PROPERTY key="mapreduce.mapoutput.value.class"
23
                          value="org.apache.hadoop.hbase.io.ImmutableBytesWritable"/>
24
                <PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.hbase.io.ImmutableBytesWritable"/>
25
                <!-- PARTITIONER / SECONDARY SORTING -->
26
                <PROPERTY key="mapreduce.partitioner.class"
27
                          value="eu.dnetlib.data.mapreduce.hbase.dedup.kv.DNGFKeyPartitioner"/>
28
                <PROPERTY key="mapred.partitioner.class"
29
                          value="eu.dnetlib.data.mapreduce.hbase.dedup.kv.DNGFKeyPartitioner"/>
30

  
31
                <PROPERTY key="mapred.output.value.groupfn.class"
32
                          value="eu.dnetlib.data.mapreduce.hbase.dedup.kv.DNGFKeyGroupingComparator"/>
33
                <PROPERTY key="mapreduce.output.value.groupfn.class"
34
                          value="eu.dnetlib.data.mapreduce.hbase.dedup.kv.DNGFKeyGroupingComparator"/>
35
				<!-- REDUCER -->
36
                <PROPERTY key="mapreduce.reduce.class"
37
                          value="eu.dnetlib.data.mapreduce.hbase.dedup.DedupFixRelationReducer"/>
38
                <PROPERTY key="mapreduce.output.key.class" value="org.apache.hadoop.hbase.io.ImmutableBytesWritable"/>
39
                <PROPERTY key="mapreduce.output.value.class" value="org.apache.hadoop.io.Writable"/>
40
                <!-- MISC -->
41
                <PROPERTY key="mapreduce.reduce.tasks.speculative.execution" value="false"/>
42
                <PROPERTY key="mapreduce.map.tasks.speculative.execution" value="false"/>
43
                <PROPERTY key="mapreduce.map.speculative" value="false"/>
44
                <PROPERTY key="mapreduce.reduce.speculative" value="false"/>
45
                <PROPERTY key="mapred.reduce.tasks.speculative.execution" value="false"/>
46
                <PROPERTY key="mapred.map.tasks.speculative.execution" value="false"/>
47
                <PROPERTY key="mapred.map.speculative" value="false"/>
48
                <PROPERTY key="mapred.reduce.speculative" value="false"/>
49
                <PROPERTY key="mapred.reduce.tasks" value="100"/>
50

  
51
            </STATIC_CONFIGURATION>
52
            <JOB_INTERFACE>
53
                <PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table"/>
54
                <PARAM name="hbase.mapreduce.outputtable" required="true" description="target hbase table"/>
55
                <PARAM name="ontologies" required="true"
56
                       description="ontology values fetched from the information system"/>
57
            </JOB_INTERFACE>
58
            <SCAN caching="10">
59
                <FILTERS operator="MUST_PASS_ALL">
60
                    <FILTER type="prefix" param="entityTypeId"/>
61
                </FILTERS>
62
                <FAMILIES/>
63
            </SCAN>
64
        </HADOOP_JOB>
65
        <STATUS>
66
            <LAST_SUBMISSION_DATE value="2017-03-16T10:45:48+01:00"/>
67
            <RUNNING_INSTANCES value="0"/>
68
            <CUMULATIVE_RUN value="2"/>
69
        </STATUS>
70
        <SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
71
    </BODY>
72
</RESOURCE_PROFILE>
modules/dnet-graph-domain/trunk/src/main/java/eu/dnetlib/data/index/CloudIndexClient.java
1
package eu.dnetlib.data.index;
2

  
3
import java.util.List;
4

  
5
import com.google.common.base.Function;
6

  
7

  
8
import org.apache.commons.logging.Log;
9
import org.apache.commons.logging.LogFactory;
10
import org.apache.solr.client.solrj.SolrQuery;
11
import org.apache.solr.client.solrj.impl.CloudSolrServer;
12
import org.apache.solr.client.solrj.response.QueryResponse;
13
import org.apache.solr.client.solrj.response.UpdateResponse;
14
import org.apache.solr.common.SolrInputDocument;
15

  
16
/**
17
 * Created by michele on 11/11/15.
18
 */
19
public class CloudIndexClient {
20

  
21
	private static final Log log = LogFactory.getLog(CloudIndexClient.class);
22
	private static final String INDEX_RECORD_RESULT_FIELD = "dnetResult";
23

  
24
	private final CloudSolrServer solrServer;
25

  
26
	protected CloudIndexClient(final CloudSolrServer solrServer) {
27
		this.solrServer = solrServer;
28
	}
29

  
30
	public int feed(final String record, final String indexDsId, final Function<String, String> toIndexRecord) throws CloudIndexClientException {
31
		try {
32
			final SolrInputDocument doc = prepareSolrDocument(record, indexDsId, toIndexRecord);
33
			if ((doc == null) || doc.isEmpty()) throw new CloudIndexClientException("Invalid solr document");
34
			return feed(doc);
35
		} catch (final Throwable e) {
36
			throw new CloudIndexClientException("Error feeding document", e);
37
		}
38
	}
39

  
40
	public int feed(final SolrInputDocument document) throws CloudIndexClientException {
41
		try {
42
			final UpdateResponse res = solrServer.add(document);
43
			//log.debug("feed time for single records, elapsed time: " + HumanTime.exactly(res.getElapsedTime()));
44
			if (res.getStatus() != 0) {
45
				throw new CloudIndexClientException("bad status: " + res.getStatus());
46
			}
47
			solrServer.commit();
48
			return res.getStatus();
49
		} catch (final Throwable e) {
50
			throw new CloudIndexClientException("Error feeding document", e);
51
		}
52
	}
53

  
54
	public void feed(final List<SolrInputDocument> docs, final AfterFeedingCallback callback) throws CloudIndexClientException {
55
		try {
56
			final UpdateResponse res = solrServer.add(docs);
57
			//log.debug("feed time for " + docs.size() + " records, elapsed tipe: : " + HumanTime.exactly(res.getElapsedTime()));
58
			solrServer.commit();
59
			if (callback != null) {
60
				callback.doAfterFeeding(res);
61
			}
62
			if (res.getStatus() != 0) throw new CloudIndexClientException("bad status: " + res.getStatus());
63
		} catch (final Throwable e) {
64
			throw new CloudIndexClientException("Error feeding documents", e);
65
		}
66
	}
67

  
68
	public SolrInputDocument prepareSolrDocument(final String record, final String indexDsId, final Function<String, String> toIndexRecord)
69
			throws CloudIndexClientException {
70
		try {
71
			/*
72
			final StreamingInputDocumentFactory documentFactory = new StreamingInputDocumentFactory();
73

  
74
			final String version = (new SimpleDateFormat("yyyy-MM-dd\'T\'hh:mm:ss\'Z\'")).format(new Date());
75
			final String indexRecord = toIndexRecord.apply(record);
76

  
77
			if (log.isDebugEnabled()) {
78
				log.debug("***************************************\nSubmitting index record:\n" + indexRecord + "\n***************************************\n");
79
			}
80

  
81
			return documentFactory.parseDocument(version, indexRecord, indexDsId, INDEX_RECORD_RESULT_FIELD);*/
82
			return null;
83
		} catch (final Throwable e) {
84
			throw new CloudIndexClientException("Error creating solr document", e);
85
		}
86
	}
87

  
88
	public boolean isRecordIndexed(final String id) throws CloudIndexClientException {
89
		try {
90
			final SolrQuery query = new SolrQuery();
91
			query.setQuery("objidentifier:\"" + id + "\"");
92
			final QueryResponse res = solrServer.query(query);
93
			return res.getResults().size() > 0;
94
		} catch (final Throwable e) {
95
			throw new CloudIndexClientException("Error searching documents", e);
96
		}
97
	}
98

  
99
	public void close() {
100
		if (solrServer != null) {
101
			solrServer.shutdown();
102
		}
103
	}
104

  
105
	public interface AfterFeedingCallback {
106

  
107
		void doAfterFeeding(final UpdateResponse response);
108
	}
109
}
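
A hedged usage sketch for the client above; the constructor is protected, so obtaining an instance from a factory is an assumption, and the field names are illustrative (the class itself only relies on "objidentifier" in isRecordIndexed):

    import org.apache.solr.common.SolrInputDocument;

    // CloudIndexClient client = ...;                         // obtained from a factory wrapping a CloudSolrServer
    SolrInputDocument doc = new SolrInputDocument();
    doc.addField("objidentifier", "dummy::0001");
    doc.addField("dnetResult", "<record/>");
    // int status = client.feed(doc);                         // adds the document and commits
    // boolean indexed = client.isRecordIndexed("dummy::0001");
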
modules/dnet-graph-domain/trunk/src/main/resources/eu/dnetlib/bootstrap/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/dedupMarkDeletedEntityJob.xml
1
<RESOURCE_PROFILE>
2
	<HEADER>
3
		<RESOURCE_IDENTIFIER
4
				value="667fe203-ee51-4dff-8c9c-b90e66e96eb4_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/>
5
		<RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/>
6
		<RESOURCE_KIND value="HadoopJobConfigurationDSResources"/>
7
		<RESOURCE_URI value=""/>
8
		<DATE_OF_CREATION value="2001-12-31T12:00:00"/>
9
	</HEADER>
10
	<BODY>
11
		<HADOOP_JOB name="dedupMarkDeletedEntityJob" type="mapreduce">
12
			<DESCRIPTION>map only job that marks the deduplicated entities as deleted</DESCRIPTION>
13
			<STATIC_CONFIGURATION>
14

  
15
				<!-- I/O FORMAT -->
16
				<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat"/>
17
				<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableOutputFormat"/>
18

  
19
				<!-- MAPPER -->
20
				<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.DedupMarkDeletedEntityMapper"/>
21
				<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.hbase.io.ImmutableBytesWritable"/>
22
				<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.hbase.client.Put"/>
23

  
24
				<!-- MISC -->
25
				<PROPERTY key="mapred.reduce.tasks.speculative.execution" value="false"/>
26
				<PROPERTY key="mapred.map.tasks.speculative.execution" value="false"/>
27
				<PROPERTY key="mapreduce.map.speculative" value="false"/>
28
				<PROPERTY key="mapreduce.reduce.speculative" value="false"/>
29

  
30
				<PROPERTY key="mapred.reduce.tasks" value="0"/>
31
				<!-- <PROPERTY key="user.name" value="dnet" /> -->
32

  
33
				<!--  	Uncomment to override the default lib path -->
34
				<!--	<PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> -->
35
			</STATIC_CONFIGURATION>
36
			<JOB_INTERFACE>
37
				<PARAM name="hbase.mapred.inputtable" required="true" description="source hbase table"/>
38
				<PARAM name="hbase.mapred.outputtable" required="true" description="target hbase table"/>
39
				<PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table"/>
40
				<PARAM name="hbase.mapreduce.outputtable" required="true" description="target hbase table"/>
41
			</JOB_INTERFACE>
42
			<SCAN>
43
				<FILTERS operator="MUST_PASS_ALL">
44
					<FILTER type="prefix" param="entityTypeId"/>
45
				</FILTERS>
46
				<FAMILIES>
47
					<FAMILY value="metadata"/>
48
					<FAMILY value="rels"/>
49
				</FAMILIES>
50
			</SCAN>
51
		</HADOOP_JOB>
52
		<STATUS>
53
			<LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/>
54
			<RUNNING_INSTANCES value="0"/>
55
			<CUMULATIVE_RUN value="0"/>
56
		</STATUS>
57
		<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
58
	</BODY>
59
</RESOURCE_PROFILE>
modules/dnet-graph-domain/trunk/src/main/java/eu/dnetlib/data/index/CloudIndexClientException.java
1
package eu.dnetlib.data.index;
2

  
3
/**
4
 * Created by michele on 23/11/15.
5
 */
6
public class CloudIndexClientException extends Exception {
7

  
8
	public CloudIndexClientException(final String message) {
9
		super(message);
10
	}
11

  
12
	public CloudIndexClientException(final String message, final Throwable cause) {
13
		super(message, cause);
14
	}
15
}
modules/dnet-graph-domain/trunk/src/main/resources/eu/dnetlib/bootstrap/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/dedupMinDistGraphJob.xml
1
<RESOURCE_PROFILE>
2
	<HEADER>
3
		<RESOURCE_IDENTIFIER
4
				value="de888da6-2d10-4d42-a624-a44d4083414a_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/>
5
		<RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/>
6
		<RESOURCE_KIND value="HadoopJobConfigurationDSResources"/>
7
		<RESOURCE_URI value=""/>
8
		<DATE_OF_CREATION value="2001-12-31T12:00:00"/>
9
	</HEADER>
10
	<BODY>
11
		<HADOOP_JOB name="dedupMinDistGraphJob" type="mapreduce">
12
			<DESCRIPTION>map reduce job that finds the minimum vertex in each connected component in the input graph (as adjacency lists)</DESCRIPTION>
13
			<STATIC_CONFIGURATION>
14

  
15
				<!-- I/O FORMAT -->
16
				<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat"/>
17
				<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat"/>
18

  
19
				<!-- MAPPER -->
20
				<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.cc.MindistSearchMapper"/>
21
				<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.io.Text"/>
22
				<PROPERTY key="mapred.mapoutput.value.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.cc.VertexWritable"/>
23

  
24
				<!-- REDUCER -->
25
				<PROPERTY key="mapreduce.reduce.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.cc.MindistSearchReducer"/>
26
				<PROPERTY key="mapred.output.key.class" value="org.apache.hadoop.io.Text"/>
27
				<PROPERTY key="mapred.output.value.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.cc.VertexWritable"/>
28

  
29
				<!-- MISC -->
30
				<PROPERTY key="mapred.compress.map.output" value="false"/>
31
				<PROPERTY key="mapred.reduce.tasks.speculative.execution" value="false"/>
32
				<PROPERTY key="mapred.map.tasks.speculative.execution" value="false"/>
33
				<PROPERTY key="mapreduce.map.speculative" value="false"/>
34
				<PROPERTY key="mapreduce.reduce.speculative" value="false"/>
35

  
36
				<PROPERTY key="mapred.reduce.tasks" value="100"/>
37
				<!-- <PROPERTY key="user.name" value="dnet" /> -->
38

  
39
				<!--  	Uncomment to override the default lib path -->
40
				<!--	<PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> -->
41
			</STATIC_CONFIGURATION>
42
			<JOB_INTERFACE>
43
				<PARAM name="mapred.input.dir" required="true" description="source sequence file on hdfs"/>
44
				<PARAM name="mapred.output.dir" required="true" description="target sequence file on hdfs"/>
45
			</JOB_INTERFACE>
46
			<SCAN>
47
				<FILTERS/>
48
				<FAMILIES/>
49
			</SCAN>
50
		</HADOOP_JOB>
51
		<STATUS>
52
			<LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/>
53
			<RUNNING_INSTANCES value="0"/>
54
			<CUMULATIVE_RUN value="0"/>
55
		</STATUS>
56
		<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
57
	</BODY>
58
</RESOURCE_PROFILE>
59

  
modules/dnet-graph-domain/trunk/src/main/java/eu/dnetlib/data/transform/xml/AbstractDNetXsltFunctions.java
1
package eu.dnetlib.data.transform.xml;
2

  
3
import com.google.common.base.Predicate;
4
import com.google.common.base.Predicates;
5
import com.google.common.base.Splitter;
6
import com.google.common.collect.Iterables;
7
import com.google.common.collect.Lists;
8
import com.google.protobuf.Descriptors.Descriptor;
9
import com.google.protobuf.Descriptors.FieldDescriptor;
10
import com.google.protobuf.InvalidProtocolBufferException;
11
import com.google.protobuf.Message;
12
import com.google.protobuf.Message.Builder;
13
import com.google.protobuf.ProtocolMessageEnum;
14
import eu.dnetlib.data.proto.DNGFProtos.DNGF;
15
import eu.dnetlib.data.proto.DNGFProtos.DNGFEntity;
16
import eu.dnetlib.data.proto.DNGFProtos.DNGFRel;
17
import eu.dnetlib.data.proto.FieldTypeProtos.*;
18
import eu.dnetlib.data.proto.FieldTypeProtos.OAIProvenance.OriginDescription;
19
import eu.dnetlib.data.proto.KindProtos.Kind;
20
import eu.dnetlib.data.proto.TypeProtos.Type;
21
import eu.dnetlib.miscutils.collections.Pair;
22
import org.apache.commons.codec.binary.Base64;
23
import org.apache.commons.codec.binary.Hex;
24
import org.apache.commons.lang3.StringUtils;
25
import org.apache.commons.lang3.math.NumberUtils;
26
import org.w3c.dom.NamedNodeMap;
27
import org.w3c.dom.Node;
28
import org.w3c.dom.NodeList;
29

  
30
import java.nio.charset.Charset;
31
import java.security.MessageDigest;
32
import java.text.ParseException;
33
import java.text.SimpleDateFormat;
34
import java.util.Date;
35
import java.util.Iterator;
36
import java.util.List;
37
import java.util.Map;
38

  
39
public abstract class AbstractDNetXsltFunctions {
40

  
41
	public static final String URL_REGEX = "^(http|https|ftp)\\://.*";
42
	private static final int MAX_NSPREFIX_LEN = 12;
43

  
44

  
45
	public static Predicate<String> urlFilter = s -> s.trim().matches(URL_REGEX);
46

  
47
	// Builder for Entities
48
	protected static DNGF getOaf(final DNGFEntity.Builder entity, final DataInfo.Builder info) {
49
		return _getOaf(DNGF.newBuilder(), info).setKind(Kind.entity).setEntity(entity).build();
50
	}
51

  
52
	// Builder for Rels
53
	protected static DNGF getOaf(final DNGFRel.Builder rel, final DataInfo.Builder info) {
54
		return _getOaf(DNGF.newBuilder(), info).setKind(Kind.relation).setRel(rel).build();
55
	}
56

  
57
	private static DNGF.Builder _getOaf(final DNGF.Builder oaf, final DataInfo.Builder info) {
58
		if (info != null) {
59
			return oaf.setDataInfo(ensureDataInfo(info));
60
		} else return oaf;
61
	}
62

  
63
	protected static DataInfo.Builder ensureDataInfo(final DataInfo.Builder info) {
64
		if (info.isInitialized()) return info;
65
		return getDataInfo(null, "UNKNOWN", "0.9", false, false);
66
	}
67

  
68
	protected static KeyValue getKV(final String id, final String name) {
69
		return KeyValue.newBuilder().setKey(id).setValue(name).build();
70
	}
71

  
72
	protected static DNGFRel.Builder getRel(
73
			final String sourceId,
74
			final Type sourceType,
75
			final String targetId,
76
			final Type targetType,
77
			final Qualifier relType,
78
			final boolean isChild) {
79
		return DNGFRel.newBuilder().setSource(sourceId).setTarget(targetId)
80
				.setSourceType(sourceType)
81
				.setTargetType(targetType).setRelType(relType)
82
				.setChild(isChild);
83
	}
84

  
85
	protected static DNGFEntity.Builder getEntity(final Type type,
86
			final String id,
87
			final KeyValue collectedFrom,
88
			final List<String> originalIds,
89
			final String dateOfCollection,
90
			final String dateOfTransformation,
91
			final List<StructuredProperty> pids) {
92
		final DNGFEntity.Builder builder = DNGFEntity.newBuilder().setType(type).setId(id);
93
		if (collectedFrom != null) builder.addCollectedfrom(collectedFrom);
94
		builder.setDateoftransformation(StringUtils.isBlank(dateOfTransformation) ? "" : dateOfTransformation);
95
		builder.setDateofcollection(StringUtils.isBlank(dateOfCollection) ? "" : dateOfCollection);
96

  
97
		if ((originalIds != null) && !originalIds.isEmpty()) {
98
			builder.addAllOriginalId(Iterables.filter(originalIds, getPredicateNotBlankString()));
99
		}
100

  
101
		if ((pids != null) && !pids.isEmpty()) {
102
			builder.addAllPid(Iterables.filter(pids, Predicates.notNull()));
103
		}
104

  
105
		return builder;
106
	}
107

  
108
    public static Predicate<String> getPredicateNotBlankString() {
109
        return s -> StringUtils.isNotBlank(s);
110
	}
111

  
112

  
113
	public static Node getDataInfo(final NodeList about) {
114
		if (about.getLength() > 0) {
115
			final NodeList aboutChildren = about.item(0).getChildNodes();
116
			for (int i = 0; i < aboutChildren.getLength(); i++) {
117
				final Node currentNode = aboutChildren.item(i);
118
				if ("datainfo".equals(currentNode.getLocalName())) {
119
					return currentNode;
120
				}
121
			}
122
		}
123
		return null;
124
	}
125

  
126

  
127
	public static DataInfo.Builder getDataInfo(final NodeList about,
128
			final String provenanceaction,
129
			final String trust,
130
			final boolean deletedbyinference,
131
			final boolean inferred) {
132

  
133
		final DataInfo.Builder dataInfoBuilder = DataInfo.newBuilder();
134
		dataInfoBuilder.setInferred(Boolean.valueOf(inferred));
135
		dataInfoBuilder.setDeletedbyinference(Boolean.valueOf(deletedbyinference));
136
		dataInfoBuilder.setTrust(trust);
137
		dataInfoBuilder.setProvenanceaction(getSimpleQualifier(provenanceaction, "dnet:provenanceActions").build());
138

  
139
		// checking instanceof because when receiving an empty <oaf:datainfo> we don't want to parse it.
140
		if (((about != null) && (about.getLength() > 0)) /* && (dataInfo instanceof org.w3c.dom.Element) */) {
141

  
142
			final org.w3c.dom.Element dataInfoElement = getDirectChild((org.w3c.dom.Element) about.item(0), "datainfo");
143
			if (dataInfoElement != null) {
144
				org.w3c.dom.Element elem = getDirectChild(dataInfoElement, "inferred");
145
				dataInfoBuilder.setInferred(Boolean.valueOf(getStringValue(elem, String.valueOf(inferred))));
146

  
147
				elem = getDirectChild(dataInfoElement, "deletedbyinference");
148
				dataInfoBuilder.setDeletedbyinference(Boolean.valueOf(getStringValue(elem, String.valueOf(deletedbyinference))));
149

  
150
				elem = getDirectChild(dataInfoElement, "trust");
151
				dataInfoBuilder.setTrust(getStringValue(elem, trust));
152

  
153
				elem = getDirectChild(dataInfoElement, "inferenceprovenance");
154
				dataInfoBuilder.setInferenceprovenance(getStringValue(elem));
155

  
156
				elem = getDirectChild(dataInfoElement, "provenanceaction");
157
				final Qualifier.Builder pBuilder = Qualifier.newBuilder();
158
				if (elem != null && elem.hasAttributes()) {
159
					final NamedNodeMap attributes = elem.getAttributes();
160
					pBuilder.setClassid(getAttributeValue(attributes, "classid"));
161
					pBuilder.setClassname(getAttributeValue(attributes, "classname"));
162
					pBuilder.setSchemeid(getAttributeValue(attributes, "schemeid"));
163
					pBuilder.setSchemename(getAttributeValue(attributes, "schemename"));
164
				} else {
165
					pBuilder.mergeFrom(getSimpleQualifier(provenanceaction, "dnet:provenanceActions").build());
166
				}
167
				dataInfoBuilder.setProvenanceaction(pBuilder);
168
			}
169
		}
170

  
171
		return dataInfoBuilder;
172
	}
173

  
174

  
175

  
176
	protected static OAIProvenance getOAIProvenance(final NodeList about) {
177

  
178
		OAIProvenance.Builder oaiProv = OAIProvenance.newBuilder();
179

  
180
		if (((about != null) && (about.getLength() > 0))) {
181

  
182
			final org.w3c.dom.Element provenance = getDirectChild((org.w3c.dom.Element) about.item(0), "provenance");
183

  
184
			if (provenance != null) {
185
				final org.w3c.dom.Element origDesc = getDirectChild(provenance, "originDescription");
186
				oaiProv.setOriginDescription(buildOriginDescription(origDesc, OriginDescription.newBuilder()));
187
			}
188
		}
189

  
190
		return oaiProv.build();
191
	}
192

  
193
	private static OriginDescription buildOriginDescription(final org.w3c.dom.Element origDesc, final OriginDescription.Builder od) {
194
		od.setHarvestDate(origDesc.getAttribute("harvestDate")).setAltered(Boolean.valueOf(origDesc.getAttribute("altered")));
195

  
196
		org.w3c.dom.Element elem = getDirectChild(origDesc, "baseURL");
197
		od.setBaseURL(getStringValue(elem));
198

  
199
		elem = getDirectChild(origDesc, "identifier");
200
		od.setIdentifier(getStringValue(elem));
201

  
202
		elem = getDirectChild(origDesc, "datestamp");
203
		od.setDatestamp(getStringValue(elem));
204

  
205
		elem = getDirectChild(origDesc, "metadataNamespace");
206
		od.setMetadataNamespace(getStringValue(elem));
207

  
208
		elem = getDirectChild(origDesc, "originDescription");
209

  
210
		if (elem != null) {
211

  
212
			od.setOriginDescription(buildOriginDescription(elem, OriginDescription.newBuilder()));
213
		}
214

  
215
		return od.build();
216
	}
217

  
218
	protected static String getStringValue(final org.w3c.dom.Element elem, final String defaultValue) {
219
		return (elem != null && elem.getTextContent() != null) ? elem.getTextContent() : defaultValue;
220
	}
221

  
222
	protected static String getStringValue(final org.w3c.dom.Element elem) {
223
		return getStringValue(elem, "");
224
	}
225

  
226
	protected static String getAttributeValue(final NamedNodeMap attributes, final String name) {
227
		final Node attr = attributes.getNamedItem(name);
228
		if (attr == null) return "";
229
		final String value = attr.getNodeValue();
230
		return value != null ? value : "";
231
	}
232

  
233
	protected static org.w3c.dom.Element getDirectChild(final org.w3c.dom.Element parent, final String name) {
234
		for (Node child = parent.getFirstChild(); child != null; child = child.getNextSibling()) {
235
			if ((child instanceof org.w3c.dom.Element) && name.equals(child.getLocalName())) return (org.w3c.dom.Element) child;
236
		}
237
		return null;
238
	}
239

  
240
	protected static Qualifier.Builder getSimpleQualifier(final String classname, final String schemename) {
241
		return getQualifier(classname, classname, schemename, schemename);
242
	}
243

  
244
	protected static Qualifier.Builder getSimpleQualifier(final ProtocolMessageEnum classname, final String schemename) {
245
		return getQualifier(classname.toString(), classname.toString(), schemename, schemename);
246
	}
247

  
248
	protected static Qualifier.Builder getQualifier(final String classid, final String classname, final String schemeid, final String schemename) {
249
		return Qualifier.newBuilder().setClassid(classid).setClassname(classname).setSchemeid(schemeid).setSchemename(schemename);
250
	}
251

  
252
	protected static Qualifier.Builder setQualifier(final Qualifier.Builder qualifier, final List<String> fields) {
253
		if ((fields == null) || fields.isEmpty() || fields.get(0).isEmpty()) return null;
254

  
255
		if ((fields != null) && !fields.isEmpty() && (fields.get(0) != null)) {
256
			qualifier.setClassid(fields.get(0));
257
			qualifier.setClassname(fields.get(0));
258
		}
259
		return qualifier;
260
	}
261

  
262
	protected static void addStructuredProps(final Builder builder,
263
			final FieldDescriptor fd,
264
			final List<String> values,
265
			final String classid,
266
			final String schemeid) {
267
		if (values != null) {
268
			for (final String s : values) {
269
				addField(builder, fd, getStructuredProperty(s, classid, classid, schemeid, schemeid));
270
			}
271
		}
272
	}

	protected static List<StructuredProperty> parsePids(final NodeList nodelist) {

		final List<StructuredProperty> pids = Lists.newArrayList();

		for (int i = 0; i < nodelist.getLength(); i++) {
			final Node node = nodelist.item(i);
			Node pidType = null;
			if (node.getNodeType() == Node.ELEMENT_NODE) {
				if (node.getLocalName().equalsIgnoreCase("identifier")) {
					pidType = node.getAttributes().getNamedItem("identifierType");
				}
				// this is to handle dataset pids
				if (node.getLocalName().equalsIgnoreCase("alternateIdentifier")) {
					pidType = node.getAttributes().getNamedItem("alternateIdentifierType");
				}

				for (int j = 0; j < node.getChildNodes().getLength(); j++) {
					final Node child = node.getChildNodes().item(j);

					if ((child.getNodeType() == Node.TEXT_NODE) && (pidType != null) && (pidType.getNodeValue() != null) && !pidType.getNodeValue().isEmpty()
							&& !pidType.getNodeValue().equalsIgnoreCase("url")) {

						final String type = pidType.getNodeValue().toLowerCase();
						final String value = child.getTextContent();

						pids.add(getStructuredProperty(value, type, type, "dnet:pid_types", "dnet:pid_types"));
						break;
					}
				}
			}
		}
		return pids;
	}
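
	// Illustrative sketch only, not part of the original source: parsePids expects DataCite-style
	// nodes such as <identifier identifierType="DOI">10.1000/182</identifier> or
	// <alternateIdentifier alternateIdentifierType="Handle">1234/5678</alternateIdentifier>,
	// and skips url-typed identifiers. A hypothetical call site over an already parsed document:
	private static List<StructuredProperty> examplePids(final org.w3c.dom.Document doc) {
		return parsePids(doc.getElementsByTagNameNS("*", "identifier"));
	}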

	@SuppressWarnings("unchecked")
	protected static void addField(final Builder builder, final FieldDescriptor descriptor, Object value) {

		if (value == null) return;

		if (value instanceof List<?>) {
			for (final Object o : (List<Object>) value) {
				addField(builder, descriptor, o);
			}
		} else {
			Object fieldValue = value;
			switch (descriptor.getType()) {
			case BOOL:
				fieldValue = Boolean.valueOf(value.toString());
				break;
			case BYTES:
				fieldValue = value.toString().getBytes(Charset.forName("UTF-8"));
				break;
			case DOUBLE:
				fieldValue = Double.valueOf(value.toString());
				break;
			case FLOAT:
				fieldValue = Float.valueOf(value.toString());
				break;
			case INT32:
			case SINT32:
				fieldValue = Integer.valueOf(value.toString());
				break;
			case INT64:
			case SINT64:
				// 64-bit fields must be set as Long: protobuf reflection rejects an Integer here
				fieldValue = Long.valueOf(value.toString());
				break;
			case MESSAGE:
				final Builder q = builder.newBuilderForField(descriptor);

				if (value instanceof Builder) {
					value = ((Builder) value).build();
					final byte[] b = ((Message) value).toByteArray();
					try {
						q.mergeFrom(b);
					} catch (final InvalidProtocolBufferException e) {
						throw new IllegalArgumentException("Unable to merge value: " + value + " with builder: " + q.getDescriptorForType().getName());
					}
				} else if (Qualifier.getDescriptor().getName().equals(q.getDescriptorForType().getName())) {
					if (value instanceof Qualifier) {
						q.mergeFrom((Qualifier) value);
					} else {
						parseMessage(q, Qualifier.getDescriptor(), value.toString(), "@@@");
					}
				} else if (StructuredProperty.getDescriptor().getName().equals(q.getDescriptorForType().getName())) {
					if (value instanceof StructuredProperty) {
						q.mergeFrom((StructuredProperty) value);
					} else {
						parseMessage(q, StructuredProperty.getDescriptor(), value.toString(), "###");
					}
				} else if (KeyValue.getDescriptor().getName().equals(q.getDescriptorForType().getName())) {
					if (value instanceof KeyValue) {
						q.mergeFrom((KeyValue) value);
					} else {
						parseMessage(q, KeyValue.getDescriptor(), value.toString(), "&&&");
					}
				} else if (StringField.getDescriptor().getName().equals(q.getDescriptorForType().getName())) {
					if (value instanceof StringField) {
						q.mergeFrom((StringField) value);
					} else {
						q.setField(StringField.getDescriptor().findFieldByName("value"), value);
					}
				} else if (BoolField.getDescriptor().getName().equals(q.getDescriptorForType().getName())) {
					if (value instanceof BoolField) {
						q.mergeFrom((BoolField) value);
					} else if (value instanceof String) {
						q.setField(BoolField.getDescriptor().findFieldByName("value"), Boolean.valueOf((String) value));
					} else {
						q.setField(BoolField.getDescriptor().findFieldByName("value"), value);
					}
				} else if (IntField.getDescriptor().getName().equals(q.getDescriptorForType().getName())) {
					if (value instanceof IntField) {
						q.mergeFrom((IntField) value);
					} else if (value instanceof String) {
						q.setField(IntField.getDescriptor().findFieldByName("value"), NumberUtils.toInt((String) value));
					} else {
						q.setField(IntField.getDescriptor().findFieldByName("value"), value);
					}
				}

				fieldValue = q.buildPartial();
				break;
			default:
				break;
			}

			doAddField(builder, descriptor, fieldValue);
		}

	}

	protected static void doAddField(final Builder builder, final FieldDescriptor fd, final Object value) {
		if (value != null) {
			if (fd.isRepeated()) {
				builder.addRepeatedField(fd, value);
			} else if (fd.isOptional() || fd.isRequired()) {
				builder.setField(fd, value);
			}
		}
	}

	protected static void parseMessage(final Builder builder, final Descriptor descriptor, final String value, final String split) {

		final Iterable<Pair> iterablePair = () -> {

			final Iterator<FieldDescriptor> fields = descriptor.getFields().iterator();
			final Iterator<String> values = Lists.newArrayList(Splitter.on(split).trimResults().split(value)).iterator();

			return new Iterator<Pair>() {
				@Override
				public boolean hasNext() {
					return fields.hasNext() && values.hasNext();
				}

				@Override
				public Pair next() {
					final FieldDescriptor field = fields.next();
					final String value1 = values.next();
					return new Pair(field, value1);
				}

				@Override
				public void remove() {
					throw new UnsupportedOperationException("cannot remove");
				}
			};
		};

//		final IterablePair<FieldDescriptor, String> iterablePair =
//				new IterablePair<FieldDescriptor, String>(descriptor.getFields(), Lists.newArrayList(Splitter
//						.on(split).trimResults().split(value)));

		for (final Pair<FieldDescriptor, String> p : iterablePair) {
			addField(builder, p.getKey(), p.getValue());
		}
	}
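
	// Illustrative sketch only, not part of the original source: addField hands a plain string to
	// parseMessage, which splits it on the given separator and assigns the tokens to the message
	// fields in declaration order. Assuming Qualifier declares classid, classname, schemeid and
	// schemename in that order, the "@@@"-separated string below fills all four fields.
	private static Qualifier exampleParsedQualifier() {
		final Qualifier.Builder q = Qualifier.newBuilder();
		parseMessage(q, Qualifier.getDescriptor(), "OPEN@@@Open Access@@@dnet:access_modes@@@dnet:access_modes", "@@@");
		return q.buildPartial();
	}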

	protected static String base64(final byte[] data) {
		final byte[] bytes = Base64.encodeBase64(data);
		return new String(bytes);
	}

	public static String replace(final String s, final String regex, final String replacement) {
		return s.replaceAll(regex, replacement);
	}

	public static String trim(final String s) {
		return s.trim();
	}

	protected static String removePrefix(final Type type, final String s) {
		return removePrefix(type.toString(), s);
	}

	private static String removePrefix(final String prefix, final String s) {
		return StringUtils.removeStart("" + s, prefix + "|");
	}

	protected static Qualifier.Builder getDefaultQualifier(final String scheme) {
		final Qualifier.Builder qualifier = Qualifier.newBuilder().setSchemeid(scheme).setSchemename(scheme);
		return qualifier;
	}

	protected static StructuredProperty getStructuredProperty(final String value,
			final String classid,
			final String classname,
... This diff was truncated because it exceeds the maximum size that can be displayed.

Also available in: Unified diff