Project

General

Profile

1 42814 claudio.at
package eu.dnetlib.data.transform.xml;
2
3
import java.util.List;
4
import java.util.Map;
5
6
import com.google.common.collect.Iterables;
7
import com.google.common.collect.Lists;
8
import com.google.protobuf.Descriptors.Descriptor;
9 43514 claudio.at
import eu.dnetlib.data.mapreduce.util.DNGFRowKeyDecoder;
10
import eu.dnetlib.data.proto.DNGFProtos.DNGF;
11
import eu.dnetlib.data.proto.DNGFProtos.DNGFEntity;
12
import eu.dnetlib.data.proto.FieldTypeProtos.*;
13 42814 claudio.at
import eu.dnetlib.data.proto.PersonProtos.Person;
14 43514 claudio.at
import eu.dnetlib.data.proto.PublicationProtos.Publication;
15 42814 claudio.at
import eu.dnetlib.data.proto.TypeProtos.Type;
16
import org.apache.commons.lang3.StringUtils;
17
import org.w3c.dom.NodeList;
18
19
public class OafToHbaseXsltFunctions extends CommonDNetXsltFunctions {
20
21 43514 claudio.at
	public static String oafPublication(
22 42814 claudio.at
			final String resultId,
23
			final String provenance,
24
			final String trust,
25
			final NodeList about,
26
			final String hostedbyId,
27
			final String hostedbyName,
28
			final String collectedFromId,
29
			final String collectedFromName,
30
			final String originalId,
31
			final String dateOfCollection,
32
			final String dateOfTransformation,
33
			final NodeList nodelist) {
34
		try {
35 43514 claudio.at
			final String entityId = DNGFRowKeyDecoder.decode(resultId).getKey();
36 42814 claudio.at
			final ValueMap values = ValueMap.parseNodeList(nodelist);
37 43514 claudio.at
			final Descriptor mDesc = Publication.Metadata.getDescriptor();
38 42814 claudio.at
39 43514 claudio.at
			final Publication.Metadata.Builder metadata = buildMetadata(values, mDesc);
40
			final Publication.Builder result = buildPublication(metadata, values, mDesc, hostedbyId, hostedbyName);
41
			final DNGFEntity.Builder entity = buildOafEntity(result, entityId, nodelist, getKV(collectedFromId, collectedFromName), originalId);
42 42814 claudio.at
			entity.setDateofcollection(dateOfCollection)
43
					.setDateoftransformation(dateOfTransformation).setOaiprovenance(getOAIProvenance(about));
44
45 43514 claudio.at
			final DNGF oaf = getOaf(entity, getDataInfo(about, provenance, trust, false, false));
46 42814 claudio.at
			return base64(oaf.toByteArray());
47
		} catch (final Throwable e) {
48
			handleException(e, resultId, hostedbyId, hostedbyName, provenance, trust, collectedFromId, collectedFromName, originalId, dateOfCollection);
49
		}
50
		return null;
51
	}
52
53 43514 claudio.at
	public static String oafPublicationUpdate(final String resultId,
54 42814 claudio.at
			final String provenance,
55
			final String trust,
56
			final NodeList nodelist,
57
			final String hostedbyId,
58
			final String hostedbyName) {
59
		try {
60 43514 claudio.at
			final String entityId = DNGFRowKeyDecoder.decode(resultId).getKey();
61 42814 claudio.at
			final ValueMap values = ValueMap.parseNodeList(nodelist);
62 43514 claudio.at
			final Descriptor mDesc = Publication.Metadata.getDescriptor();
63 42814 claudio.at
64 43514 claudio.at
			final Publication.Metadata.Builder metadata = buildMetadata(values, mDesc);
65
			final Publication.Builder result = buildPublication(metadata, values, mDesc, hostedbyId, hostedbyName);
66 42814 claudio.at
67 43514 claudio.at
			final DNGFEntity.Builder entity = buildOafEntity(result, entityId, nodelist, null, null);
68
			final DNGF oaf = getOaf(entity, null);
69 42814 claudio.at
			return base64(oaf.toByteArray());
70
		} catch (final Throwable e) {
71
			handleException(e, resultId, hostedbyId, hostedbyName, provenance, trust, null, null, null, null);
72
		}
73
		return null;
74
	}
75
76 43514 claudio.at
	private static DNGFEntity.Builder buildOafEntity(final Publication.Builder pub,
77 42814 claudio.at
			final String entityId,
78
			final NodeList nodelist,
79
			KeyValue collectedFrom,
80
			String originalId) {
81
82
		final List<StructuredProperty> pids = Lists.newArrayList();
83
		pids.addAll(parsePids(nodelist));
84
85 43514 claudio.at
		final DNGFEntity.Builder entity =
86
				getEntity(Type.publication, entityId, collectedFrom, StringUtils.isBlank(originalId) ? null : Lists.newArrayList(originalId), null, null, pids)
87
						.setPublication(pub);
88 42814 claudio.at
		return entity;
89
	}
90
91 44352 sandro.lab
	protected static Publication.Metadata.Builder buildMetadata(final ValueMap values, final Descriptor mDesc) {
92 43514 claudio.at
		final Publication.Metadata.Builder metadata = Publication.Metadata.newBuilder();
93 42814 claudio.at
		addStructuredProps(metadata, mDesc.findFieldByName("subject"), values.get("subject").listValues(), "keyword", "dnet:result_subject");
94
		addStructuredProps(metadata, mDesc.findFieldByName("title"), values.get("title").listValues(), "main title", "dnet:dataCite_title");
95
		for (final String fieldname : Lists.newArrayList("description", "source", "contributor")) {
96
			if (values.get(fieldname) != null) {
97
				for (final String s : values.get(fieldname).listValues()) {
98
					addField(metadata, mDesc.findFieldByName(fieldname), s);
99
				}
100
			}
101
		}
102
		addField(metadata, mDesc.findFieldByName("language"), setQualifier(getDefaultQualifier("dnet:languages"), values.get("language").listValues()));
103
		addField(metadata, mDesc.findFieldByName("dateofacceptance"), values.get("dateaccepted").listValues());
104
		addField(metadata, mDesc.findFieldByName("publisher"), values.get("publisher").listValues());
105
		addField(metadata, mDesc.findFieldByName("embargoenddate"), values.get("embargoenddate").listValues());
106
		addField(metadata, mDesc.findFieldByName("storagedate"), values.get("storagedate").listValues());
107
108
		addField(metadata, mDesc.findFieldByName("resulttype"), getSimpleQualifier("publication", "dnet:result_typologies"));
109
110
		addField(metadata, mDesc.findFieldByName("fulltext"), values.get("fulltext").listValues());
111
		addField(metadata, mDesc.findFieldByName("format"), values.get("format").listValues());
112
		if (values.get("concept") != null) {
113
			for (final Element e : values.get("concept")) {
114
				final String id = e.getAttributes().get("id");
115
				if (StringUtils.isBlank(id)) throw new IllegalArgumentException("Context id cannot be blank");
116
				metadata.addContext(Context.newBuilder().setId(id));
117
			}
118
		}
119
		if (values.get("journal") != null) {
120
			for (final Element e : values.get("journal")) {
121
122
				final Journal.Builder journal = Journal.newBuilder();
123
				if (e.getText() != null) {
124
					journal.setName(e.getText());
125
				}
126
127
				final Map<String, String> attr = e.getAttributes();
128
				if (attr != null) {
129
					if (attr.get("issn") != null) {
130
						journal.setIssnPrinted(attr.get("issn"));
131
					}
132
					if (attr.get("eissn") != null) {
133
						journal.setIssnOnline(attr.get("eissn"));
134
					}
135
					if (attr.get("lissn") != null) {
136
						journal.setIssnLinking(attr.get("lissn"));
137
					}
138
				}
139
				metadata.setJournal(journal.build());
140
			}
141
		}
142
		return metadata;
143
	}
144
145 44352 sandro.lab
	protected static Publication.Builder buildPublication(final Publication.Metadata.Builder metadata,
146 42814 claudio.at
			final ValueMap values,
147
			final Descriptor mDesc,
148
			final String hostedbyId,
149
			final String hostedbyName) {
150 43514 claudio.at
		final Publication.Builder result = Publication.newBuilder();
151 42814 claudio.at
		if (values.get("creator") != null) {
152
			for (final String fullname : Iterables.limit(values.get("creator").listValues(), 10)) {
153
154
				final Person.Metadata.Builder authorMetadata = Person.Metadata.newBuilder();
155
156
				authorMetadata.setFullname(sf(fullname));
157
158
				final eu.dnetlib.pace.model.Person p = new eu.dnetlib.pace.model.Person(fullname, false);
159
				if (p.isAccurate()) {
160
					authorMetadata.setFirstname(sf(p.getNormalisedFirstName()));
161
					authorMetadata.clearSecondnames().addSecondnames(sf(p.getNormalisedSurname()));
162
					authorMetadata.setFullname(sf(p.getNormalisedFullname()));
163
				}
164
165
				result.addAuthor(Person.newBuilder().setMetadata(authorMetadata));
166
			}
167
		}
168
169
		final Instance.Builder instance = Instance.newBuilder().setHostedby(getKV(hostedbyId, hostedbyName));
170
171
		addField(instance, Instance.getDescriptor().findFieldByName("licence"),
172
				setQualifier(getDefaultQualifier("dnet:access_modes"), values.get("accessrights").listValues()));
173
		addField(instance, Instance.getDescriptor().findFieldByName("instancetype"),
174
				setQualifier(getDefaultQualifier("dnet:publication_resource"), values.get("cobjcategory").listValues()));
175
176
		if (values.get("identifier") != null) {
177
			addField(instance, Instance.getDescriptor().findFieldByName("url"),
178
					Lists.newArrayList(Iterables.filter(values.get("identifier").listValues(), urlFilter)));
179
		}
180
181
		result.addInstance(instance);
182
183
		final List<Element> extrefs = values.get("reference");
184
		if (!extrefs.isEmpty()) {
185
			final Descriptor extDesc = ExternalReference.getDescriptor();
186
			for (final Element element : extrefs) {
187
				final ExternalReference.Builder extref = ExternalReference.newBuilder();
188
				addField(extref, extDesc.findFieldByName("url"), element.getText());
189
				addField(extref, extDesc.findFieldByName("sitename"), element.getAttributes().get("source"));
190
				addField(extref, extDesc.findFieldByName("refidentifier"), element.getAttributes().get("identifier"));
191
				addField(extref, extDesc.findFieldByName("label"), element.getAttributes().get("title"));
192
				addField(extref, extDesc.findFieldByName("query"), element.getAttributes().get("query"));
193
				addField(extref, extDesc.findFieldByName("qualifier"),
194
						setQualifier(getDefaultQualifier("dnet:externalReference_typologies"), Lists.newArrayList(element.getAttributes().get("type")))
195
								.build());
196
197
				result.addExternalReference(extref);
198
			}
199
		}
200
201
		return result.setMetadata(metadata);
202
	}
203
204
	private static void handleException(Throwable e, final String resultId, final String hostedbyId, final String hostedbyName,
205
			final String provenance, final String trust, final String collectedFromId, final String collectedFromName,
206
			final String originalId, final String dateOfCollection) {
207
		System.err.println("resultId: " + resultId);
208
		if (StringUtils.isNotBlank(hostedbyId)) System.err.println("hostedbyId: " + hostedbyId);
209
		if (StringUtils.isNotBlank(hostedbyName)) System.err.println("hostedbyName: " + hostedbyName);
210
		if (StringUtils.isNotBlank(provenance)) System.err.println("provenance: " + provenance);
211
		if (StringUtils.isNotBlank(trust)) System.err.println("trust: " + trust);
212
		if (StringUtils.isNotBlank(collectedFromId)) System.err.println("collectedFromId: " + collectedFromId);
213
		if (StringUtils.isNotBlank(collectedFromName)) System.err.println("collectedFromName: " + collectedFromName);
214
		if (StringUtils.isNotBlank(originalId)) System.err.println("originalId: " + originalId);
215
		if (StringUtils.isNotBlank(dateOfCollection)) System.err.println("dateOfCollection: " + dateOfCollection);
216
		e.printStackTrace();
217
		throw new RuntimeException(e);
218
	}
219
}