Project

General

Profile

1 26600 sandro.lab
package eu.dnetlib.data.transform.xml;
2
3 52411 claudio.at
import java.util.HashMap;
4 26600 sandro.lab
import java.util.List;
5
6 52231 claudio.at
import com.google.common.collect.Iterables;
7 26600 sandro.lab
import com.google.common.collect.Lists;
8
import com.google.protobuf.Descriptors.Descriptor;
9
import eu.dnetlib.data.mapreduce.util.OafRowKeyDecoder;
10 49028 claudio.at
import eu.dnetlib.data.proto.FieldTypeProtos.Author;
11 42532 alessia.ba
import eu.dnetlib.data.proto.FieldTypeProtos.KeyValue;
12 28092 claudio.at
import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty;
13 26600 sandro.lab
import eu.dnetlib.data.proto.OafProtos.Oaf;
14
import eu.dnetlib.data.proto.OafProtos.OafEntity;
15
import eu.dnetlib.data.proto.ResultProtos.Result;
16
import eu.dnetlib.data.proto.ResultProtos.Result.Context;
17
import eu.dnetlib.data.proto.ResultProtos.Result.ExternalReference;
18
import eu.dnetlib.data.proto.ResultProtos.Result.Instance;
19
import eu.dnetlib.data.proto.TypeProtos.Type;
20 52169 claudio.at
import org.apache.commons.lang3.StringUtils;
21 40014 claudio.at
import org.w3c.dom.NodeList;
22 26600 sandro.lab
23 40198 claudio.at
public class OafToHbaseXsltFunctions extends CommonDNetXsltFunctions {
24 26600 sandro.lab
25 40198 claudio.at
	public static String oafResult(
26
			final String resultId,
27 49480 claudio.at
			final boolean invisible,
28 40198 claudio.at
			final String provenance,
29 30969 claudio.at
			final String trust,
30 38972 claudio.at
			final NodeList about,
31 26600 sandro.lab
			final String originalId,
32
			final String dateOfCollection,
33 40313 claudio.at
			final String dateOfTransformation,
34 52169 claudio.at
			final NodeList metadata) {
35
36
		ValueMap values = null;
37 26600 sandro.lab
		try {
38
			final String entityId = OafRowKeyDecoder.decode(resultId).getKey();
39 52169 claudio.at
			values = ValueMap.parseNodeList(metadata);
40 42532 alessia.ba
			final Descriptor mDesc = Result.Metadata.getDescriptor();
41 26600 sandro.lab
42 52169 claudio.at
			final List<KeyValue> collectedFrom = getKeyValues(values, "collectedfrom", Type.datasource);
43
			final List<KeyValue> hostedBy = getKeyValues(values, "hostedby", Type.datasource);
44
45
			final Result.Metadata.Builder metadataBuilder = buildMetadata(values, mDesc);
46
			final Result.Builder result = buildResult(metadataBuilder, values, collectedFrom, hostedBy);
47
			final OafEntity.Builder entity = buildOafEntity(result, entityId, metadata, collectedFrom, originalId);
48 42532 alessia.ba
			entity.setDateofcollection(dateOfCollection)
49
					.setDateoftransformation(dateOfTransformation).setOaiprovenance(getOAIProvenance(about));
50 26600 sandro.lab
51 49480 claudio.at
			final Oaf oaf = getOaf(entity, getDataInfo(invisible, about, provenance, trust, false, false));
52 42532 alessia.ba
			return base64(oaf.toByteArray());
53
		} catch (final Throwable e) {
54 52169 claudio.at
			handleException(e, resultId, values);
55 42532 alessia.ba
		}
56
		return null;
57
	}
58
59
	public static String oafResultUpdate(final String resultId,
60
			final String provenance,
61
			final String trust,
62 52169 claudio.at
			final NodeList nodelist) {
63
		ValueMap values = null;
64 42532 alessia.ba
		try {
65
			final String entityId = OafRowKeyDecoder.decode(resultId).getKey();
66 52169 claudio.at
			values = ValueMap.parseNodeList(nodelist);
67
			final List<KeyValue> hostedBy = getKeyValues(values, "hostedby", Type.datasource);
68
69 26600 sandro.lab
			final Descriptor mDesc = Result.Metadata.getDescriptor();
70
71 42532 alessia.ba
			final Result.Metadata.Builder metadata = buildMetadata(values, mDesc);
72 48679 claudio.at
			final Result.Builder result = buildResult(metadata, values, null, hostedBy);
73 35824 claudio.at
74 42532 alessia.ba
			final OafEntity.Builder entity = buildOafEntity(result, entityId, nodelist, null, null);
75
			final Oaf oaf = getOaf(entity, null);
76
			return base64(oaf.toByteArray());
77
		} catch (final Throwable e) {
78 52169 claudio.at
			handleException(e, resultId, values);
79 42532 alessia.ba
		}
80
		return null;
81
	}
82 35824 claudio.at
83 52169 claudio.at
	private static OafEntity.Builder buildOafEntity(
84
			final Result.Builder result,
85 42532 alessia.ba
			final String entityId,
86
			final NodeList nodelist,
87 52169 claudio.at
			final List<KeyValue> collectedFrom,
88
			final String originalId) {
89 35824 claudio.at
90 42532 alessia.ba
		final List<StructuredProperty> pids = Lists.newArrayList();
91
		pids.addAll(parsePids(nodelist));
92 35824 claudio.at
93 42532 alessia.ba
		final OafEntity.Builder entity =
94
				getEntity(Type.result, entityId, collectedFrom, StringUtils.isBlank(originalId) ? null : Lists.newArrayList(originalId), null, null, pids)
95
						.setResult(result);
96
		return entity;
97
	}
98
99
	private static Result.Metadata.Builder buildMetadata(final ValueMap values, final Descriptor mDesc) {
100
		final Result.Metadata.Builder metadata = Result.Metadata.newBuilder();
101 43986 claudio.at
102 49028 claudio.at
		if (values.get("creator") != null) {
103
			for (final Element e : values.get("creator")) {
104
105
				final Author.Builder author = Author.newBuilder();
106
107
				final String fullname = e.getText();
108
				author.setFullname(fullname);
109
				author.setRank(Integer.valueOf(e.getAttributeValue(ValueMap.IDX_ATTRIBUTE)));
110
111 52169 claudio.at
				final String nameIdentifier = e.getAttributeValue("nameIdentifier");
112
				final String nameIdentifierScheme = e.getAttributeValue("nameIdentifierScheme");
113
114
				if (StringUtils.isNotBlank(nameIdentifier) && StringUtils.isNotBlank(nameIdentifierScheme)) {
115
					author.addPid(getKV(nameIdentifierScheme, nameIdentifier));
116
				}
117
118 49028 claudio.at
				final eu.dnetlib.pace.model.Person p = new eu.dnetlib.pace.model.Person(fullname, false);
119
				if (p.isAccurate()) {
120
					author.setName(p.getNormalisedFirstName());
121
					author.setSurname(p.getNormalisedSurname());
122
				}
123
				metadata.addAuthor(author);
124
			}
125
		}
126
127 44552 claudio.at
		addStructuredProps(metadata, mDesc.findFieldByName("subject"), values.get("subject"), "keyword", "dnet:subject_classification_typologies");
128 43986 claudio.at
		addStructuredProps(metadata, mDesc.findFieldByName("title"), values.get("title"), "main title", "dnet:dataCite_title");
129 42532 alessia.ba
		for (final String fieldname : Lists.newArrayList("description", "source", "contributor")) {
130
			if (values.get(fieldname) != null) {
131
				for (final String s : values.get(fieldname).listValues()) {
132
					addField(metadata, mDesc.findFieldByName(fieldname), s);
133 35824 claudio.at
				}
134
			}
135 42532 alessia.ba
		}
136
		addField(metadata, mDesc.findFieldByName("language"), setQualifier(getDefaultQualifier("dnet:languages"), values.get("language").listValues()));
137
		addField(metadata, mDesc.findFieldByName("dateofacceptance"), values.get("dateaccepted").listValues());
138
		addField(metadata, mDesc.findFieldByName("publisher"), values.get("publisher").listValues());
139
		addField(metadata, mDesc.findFieldByName("embargoenddate"), values.get("embargoenddate").listValues());
140
		addField(metadata, mDesc.findFieldByName("storagedate"), values.get("storagedate").listValues());
141 35824 claudio.at
142 52411 claudio.at
		String resulttype = getResultType(values);
143 47480 claudio.at
		addField(metadata, mDesc.findFieldByName("resulttype"), getSimpleQualifier(resulttype, "dnet:result_typologies"));
144
145 42532 alessia.ba
		addField(metadata, mDesc.findFieldByName("fulltext"), values.get("fulltext").listValues());
146
		addField(metadata, mDesc.findFieldByName("format"), values.get("format").listValues());
147
		if (values.get("concept") != null) {
148
			for (final Element e : values.get("concept")) {
149
				final String id = e.getAttributes().get("id");
150 52945 claudio.at
				if (StringUtils.isNotBlank(id)) {
151
					metadata.addContext(Context.newBuilder().setId(id));
152
				}
153 26600 sandro.lab
			}
154 42532 alessia.ba
		}
155
		if (values.get("journal") != null) {
156
			for (final Element e : values.get("journal")) {
157 54977 alessia.ba
				addJournal(metadata, e);
158 26600 sandro.lab
			}
159 42532 alessia.ba
		}
160
		return metadata;
161
	}
162 26600 sandro.lab
163 52411 claudio.at
	private static String getResultType(final ValueMap values) {
164
165
		final Element cobjcategory = values.get("cobjcategory").stream()
166
				.map(e -> StringUtils.isNotBlank(e.getText()) ? e : new Element("0000", e.getAttributes()))
167
				.findFirst()
168
				.orElse(new Element("0000", new HashMap<>()));
169
170
		final String resulttype = cobjcategory.getAttributeValue("type");
171
		if (StringUtils.isNotBlank(resulttype)) {
172
			return resulttype;
173
		}
174
175 52525 claudio.at
		return getDefaultResulttype(cobjcategory);
176 52411 claudio.at
	}
177
178 42532 alessia.ba
	private static Result.Builder buildResult(final Result.Metadata.Builder metadata,
179
			final ValueMap values,
180 52169 claudio.at
			final List<KeyValue> collectedFrom,
181
			final List<KeyValue> hostedBy) {
182 42532 alessia.ba
		final Result.Builder result = Result.newBuilder();
183 49028 claudio.at
184 52169 claudio.at
		final Instance.Builder instance = Instance.newBuilder();
185 26600 sandro.lab
186 49095 claudio.at
		addField(instance, Instance.getDescriptor().findFieldByName("license"), values.get("license").listValues());
187
188
		addField(instance, Instance.getDescriptor().findFieldByName("accessright"),
189 42532 alessia.ba
				setQualifier(getDefaultQualifier("dnet:access_modes"), values.get("accessrights").listValues()));
190 49095 claudio.at
191 42532 alessia.ba
		addField(instance, Instance.getDescriptor().findFieldByName("instancetype"),
192
				setQualifier(getDefaultQualifier("dnet:publication_resource"), values.get("cobjcategory").listValues()));
193 26600 sandro.lab
194 52169 claudio.at
		addField(instance, Instance.getDescriptor().findFieldByName("hostedby"), hostedBy);
195 48679 claudio.at
		addField(instance, Instance.getDescriptor().findFieldByName("collectedfrom"), collectedFrom);
196
		addField(instance, Instance.getDescriptor().findFieldByName("dateofacceptance"), values.get("dateaccepted").listValues());
197
198 42532 alessia.ba
		if (values.get("identifier") != null) {
199 52231 claudio.at
			addField(instance, Instance.getDescriptor().findFieldByName("url"),
200
					Lists.newArrayList(Iterables.filter(values.get("identifier").listValues(), urlFilter)));
201 42532 alessia.ba
		}
202 26600 sandro.lab
203 42532 alessia.ba
		result.addInstance(instance);
204 26600 sandro.lab
205 42532 alessia.ba
		final List<Element> extrefs = values.get("reference");
206
		if (!extrefs.isEmpty()) {
207
			final Descriptor extDesc = ExternalReference.getDescriptor();
208
			for (final Element element : extrefs) {
209
				final ExternalReference.Builder extref = ExternalReference.newBuilder();
210
				addField(extref, extDesc.findFieldByName("url"), element.getText());
211
				addField(extref, extDesc.findFieldByName("sitename"), element.getAttributes().get("source"));
212
				addField(extref, extDesc.findFieldByName("refidentifier"), element.getAttributes().get("identifier"));
213
				addField(extref, extDesc.findFieldByName("label"), element.getAttributes().get("title"));
214
				addField(extref, extDesc.findFieldByName("query"), element.getAttributes().get("query"));
215
				addField(extref, extDesc.findFieldByName("qualifier"),
216
						setQualifier(getDefaultQualifier("dnet:externalReference_typologies"), Lists.newArrayList(element.getAttributes().get("type")))
217
								.build());
218 26600 sandro.lab
219 42532 alessia.ba
				result.addExternalReference(extref);
220
			}
221
		}
222 38972 claudio.at
223 42532 alessia.ba
		return result.setMetadata(metadata);
224 26600 sandro.lab
	}
225
226 52169 claudio.at
	private static void handleException(Throwable e, final String resultId, final ValueMap values) {
227 42532 alessia.ba
		System.err.println("resultId: " + resultId);
228 52169 claudio.at
		if (values != null) {
229
			System.err.println("values: " + values);
230
		}
231 42532 alessia.ba
		e.printStackTrace();
232
		throw new RuntimeException(e);
233
	}
234 26600 sandro.lab
}