Project

General

Profile

1
package eu.dnetlib.data.mapreduce.hbase.dataimport;
2

    
3
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getArrayValues;
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getDefaultResulttype;
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getQualifier;
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getStringValue;
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.isValidDate;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;

import com.google.gson.Gson;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.reflect.TypeToken;
import com.googlecode.protobuf.format.JsonFormat;

import eu.dnetlib.actionmanager.actions.ActionFactory;
import eu.dnetlib.actionmanager.actions.AtomicAction;
import eu.dnetlib.actionmanager.common.Agent;
import eu.dnetlib.data.mapreduce.hbase.Reporter;
import eu.dnetlib.data.mapreduce.util.StreamUtils;
import eu.dnetlib.data.proto.FieldTypeProtos;
import eu.dnetlib.data.proto.FieldTypeProtos.Author;
import eu.dnetlib.data.proto.FieldTypeProtos.DataInfo;
import eu.dnetlib.data.proto.FieldTypeProtos.KeyValue;
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
import eu.dnetlib.data.proto.FieldTypeProtos.StringField;
import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty;
import eu.dnetlib.data.proto.KindProtos;
import eu.dnetlib.data.proto.OafProtos;
import eu.dnetlib.data.proto.ResultProtos;
import eu.dnetlib.data.proto.TypeProtos;
import eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions;
import eu.dnetlib.miscutils.collections.Pair;
import eu.dnetlib.miscutils.datetime.DateUtils;
import eu.dnetlib.pace.model.Person;
45

    
46
public class OrcidToActions {
47

    
48
	public static final String ORCID = "ORCID";
49
	public final static String orcidPREFIX = "orcid_______";
50
	public static final String OPENAIRE_PREFIX = "openaire____";
51
	public static final String SEPARATOR = "::";
52

    
53
	private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {
54

    
55
		{
56
			put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
57

    
58
		}
59
	};
60

    
61
	// json external id will be mapped to oaf:pid/@classid Map to oaf:pid/@classname
62
	private static Map<String, Pair<String, String>> externalIds = new HashMap<String, Pair<String, String>>() {
63

    
64
		{
65
			put("ark".toLowerCase(), new Pair<>("ark", "ark"));
66
			put("arxiv".toLowerCase(), new Pair<>("arxiv", "arXiv"));
67
			put("pmc".toLowerCase(), new Pair<>("pmc", "pmc"));
68
			put("pmid".toLowerCase(), new Pair<>("pmid", "pmid"));
69
			put("source-work-id".toLowerCase(), new Pair<>("orcidworkid", "orcidworkid"));
70
			put("urn".toLowerCase(), new Pair<>("urn", "urn"));
71
		}
72
	};
73

    
74
	static Map<String, Map<String, String>> typologiesMapping;
75

    
76
	static {
77
		try {
78
			final InputStream is = OrcidToActions.class.getResourceAsStream("/eu/dnetlib/data/mapreduce/hbase/dataimport/mapping_typologies_orcid.json");
79
			final String tt = IOUtils.toString(is);
80
			typologiesMapping = new Gson().fromJson(tt, Map.class);
81
		} catch (final IOException e) {
82
			e.printStackTrace();
83
		}
84
	}
85

    
86
	public static final String PID_TYPES = "dnet:pid_types";
87

    
88
	public static List<AtomicAction> generatePublicationActionsFromDump(final JsonObject rootElement,
89
			final ActionFactory factory,
90
			final String setName,
91
			final Agent agent,
92
			final Reporter context) {
93

    
94
		if (!isValid(rootElement, context)) { return null; }
95

    
96
		// Create OAF proto
97

    
98
		final OafProtos.Oaf.Builder oaf = OafProtos.Oaf.newBuilder();
99

    
100
		oaf.setDataInfo(
101
				DataInfo.newBuilder()
102
						.setDeletedbyinference(false)
103
						.setInferred(false)
104
						.setTrust("0.9")
105
						.setProvenanceaction(getQualifier("sysimport:actionset:orcidworks-no-doi", "dnet:provenanceActions"))
106
						.build());
107

    
108
		// Adding kind
109
		oaf.setKind(KindProtos.Kind.entity);
110

    
111
		oaf.setLastupdatetimestamp(DateUtils.now());
112

    
113
		// creating result proto
114
		final OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder().setType(TypeProtos.Type.result);
115

    
116
		entity.setDateofcollection("2018-10-22");
117
		entity.setDateoftransformation(DateUtils.now_ISO8601());
118

    
119
		// Adding external ids
120
		StreamUtils.toStream(externalIds.keySet().iterator())
121
				.forEach(jsonExtId -> {
122
					final String classid = externalIds.get(jsonExtId.toLowerCase()).getValue();
123
					final String classname = externalIds.get(jsonExtId.toLowerCase()).getKey();
124
					final String extId = getStringValue(rootElement, jsonExtId);
125
					if (StringUtils.isNotBlank(extId)) {
126
						entity.addPid(StructuredProperty.newBuilder()
127
								.setValue(extId)
128
								.setQualifier(Qualifier.newBuilder().setClassid(classid).setClassname(classname).setSchemeid("dnet:pid_types")
129
										.setSchemename("dnet:pid_types").build())
130
								.build());
131
					}
132
				});
133

    
134
		// Create result field
135
		final ResultProtos.Result.Builder result = ResultProtos.Result.newBuilder();
136

    
137
		// Create metadata proto
138
		final ResultProtos.Result.Metadata.Builder metadata = ResultProtos.Result.Metadata.newBuilder();
139

    
140
		// Adding source
141
		final String source = getStringValue(rootElement, "source");
142
		if (StringUtils.isNotBlank(source)) {
143
			metadata.addSource(StringField.newBuilder().setValue(source).build());
144
		}
145

    
146
		// Adding title
147
		final String title = createRepeatedField(rootElement, "titles");
148
		if (StringUtils.isBlank(title)) {
149
			context.incrementCounter("filtered", "title_not_found", 1);
150
			return null;
151
		}
152
		metadata.addTitle(FieldTypeProtos.StructuredProperty.newBuilder()
153
				.setValue(title)
154
				.setQualifier(getQualifier("main title", "dnet:dataCite_title"))
155
				.build());
156

    
157
		// Adding identifier
158
		final String id = getStringValue(rootElement, "id");
159
		String sourceId = null;
160
		if (id != null) {
161
			entity.addOriginalId(id);
162
			sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, AbstractDNetXsltFunctions.md5(id));
163
		} else {
164
			sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, AbstractDNetXsltFunctions.md5(title));
165
		}
166
		entity.setId(sourceId);
167

    
168
		// Adding relevant date
169
		settingRelevantDate(rootElement, metadata, "publication_date", "issued", true);
170

    
171
		// Adding collectedfrom
172
		final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
173
				.setValue(ORCID)
174
				.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a")
175
				.build();
176
		entity.addCollectedfrom(collectedFrom);
177

    
178
		// Adding type
179
		final String type = getStringValue(rootElement, "type");
180
		String cobjValue = "";
181
		if (StringUtils.isNotBlank(type)) {
182

    
183
			metadata.setResourcetype(FieldTypeProtos.Qualifier.newBuilder()
184
					.setClassid(type)
185
					.setClassname(type)
186
					.setSchemeid("dnet:dataCite_resource")
187
					.setSchemename("dnet:dataCite_resource")
188
					.build());
189

    
190
			final String typeValue = typologiesMapping.get(type).get("value");
191
			cobjValue = typologiesMapping.get(type).get("cobj");
192
			final ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder();
193

    
194
			// Adding hostedby
195
			instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
196
					.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c")
197
					.setValue("Unknown Repository")
198
					.build());
199

    
200
			// Adding url
201
			final String url = createRepeatedField(rootElement, "urls");
202
			if (StringUtils.isNotBlank(url)) {
203
				instance.addUrl(url);
204
			}
205

    
206
			final String pubDate = getPublicationDate(rootElement, "publication_date");
207
			if (StringUtils.isNotBlank(pubDate)) {
208
				instance.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(pubDate).build());
209
			}
210

    
211
			instance.setCollectedfrom(collectedFrom);
212

    
213
			// Adding accessright
214
			instance.setAccessright(FieldTypeProtos.Qualifier.newBuilder()
215
					.setClassid("UNKNOWN")
216
					.setClassname("UNKNOWN")
217
					.setSchemeid("dnet:access_modes")
218
					.setSchemename("dnet:access_modes")
219
					.build());
220

    
221
			// Adding type
222
			instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder()
223
					.setClassid(cobjValue)
224
					.setClassname(typeValue)
225
					.setSchemeid("dnet:publication_resource")
226
					.setSchemename("dnet:publication_resource")
227
					.build());
228

    
229
			result.addInstance(instance);
230
		} else {
231
			context.incrementCounter("filtered", "type_not_found", 1);
232
			return null;
233
		}
234

    
235
		// Adding authors
236
		final List<Author> authors = createAuthors(rootElement);
237
		if (authors != null && authors.size() > 0) {
238
			metadata.addAllAuthor(authors);
239
		} else {
240
			context.incrementCounter("filtered", "author_not_found", 1);
241
			return null;
242
		}
243

    
244
		metadata.setResulttype(getQualifier(getDefaultResulttype(cobjValue), "dnet:result_typologies"));
245
		result.setMetadata(metadata.build());
246
		entity.setResult(result.build());
247
		oaf.setEntity(entity.build());
248

    
249
		final List<AtomicAction> actionList = new ArrayList<>();
250

    
251
		actionList.add(factory.createAtomicAction(setName, agent, oaf.getEntity().getId(), "result", "body", oaf.build().toByteArray()));
252

    
253
//		 System.out.println(JsonFormat.printToString(oaf.build()));
254
		return actionList;
255

    
256
	}
257

    
258
	public static List<Author> createAuthors(final JsonObject root) {
259

    
260
		final String authorsJSONFieldName = "authors";
261

    
262
		if (root.has(authorsJSONFieldName) && root.get(authorsJSONFieldName).isJsonArray()) {
263

    
264
			final List<Author> authors = new ArrayList<>();
265
			final JsonArray jsonAuthors = root.getAsJsonArray(authorsJSONFieldName);
266
			int firstCounter = 0;
267
			int defaultCounter = 0;
268
			int rank = 1;
269
			int currentRank = 0;
270

    
271
			for (final JsonElement item : jsonAuthors) {
272
				final JsonObject author = item.getAsJsonObject();
273
				final Author.Builder result = Author.newBuilder();
274
				if (item.isJsonObject()) {
275
					final String surname = getStringValue(author, "surname");
276
					final String name = getStringValue(author, "name");
277
					final String oid = getStringValue(author, "oid");
278
					final String seq = getStringValue(author, "seq");
279
					if (StringUtils.isNotBlank(seq)) {
280
						if (seq.equals("first")) {
281
							firstCounter += 1;
282
							rank = firstCounter;
283

    
284
						} else if (seq.equals("additional")) {
285
							rank = currentRank + 1;
286
						} else {
287
							defaultCounter += 1;
288
							rank = defaultCounter;
289
						}
290
					}
291

    
292
					if (StringUtils.isNotBlank(oid)) {
293
						result.addPid(KeyValue.newBuilder()
294
								.setValue(oid)
295
								.setKey("ORCID")
296
								.build());
297
						result.setFullname(name + " " + surname);
298
						if (StringUtils.isNotBlank(name)) {
299
							result.setName(name);
300
						}
301
						if (StringUtils.isNotBlank(surname)) {
302
							result.setSurname(surname);
303
						}
304
					} else {
305
						String fullname = "";
306
						if (StringUtils.isNotBlank(name)) {
307
							fullname = name;
308
						} else {
309
							if (StringUtils.isNotBlank(surname)) {
310
								fullname = surname;
311
							}
312
						}
313
						Person p = new Person(fullname, false);
314
						if (p.isAccurate()) {
315
							result.setName(p.getNormalisedFirstName());
316
							result.setSurname(p.getNormalisedSurname());
317
							result.setFullname(p.getNormalisedFullname());
318
						}
319
						else {
320
							result.setFullname(fullname);
321
						}
322
					}
323
				}
324
				result.setRank(rank);
325
				authors.add(result.build());
326
				currentRank = rank;
327
			}
328
			return authors;
329

    
330
		}
331
		return null;
332
	}
333

    
334
	private static String createRepeatedField(final JsonObject rootElement, final String fieldName) {
335
		String field = "";
336
		if (!rootElement.has(fieldName)) { return null; }
337
		if (rootElement.has(fieldName) && rootElement.get(fieldName).isJsonNull()) { return null; }
338
		if (rootElement.get(fieldName).isJsonArray()) {
339
			if (!isValidJsonArray(rootElement, fieldName)) { return null; }
340
			final StringBuilder ttl = new StringBuilder();
341
			getArrayValues(rootElement, fieldName).forEach(ttl::append);
342
			field = ttl.toString();
343
		} else {
344
			field = getStringValue(rootElement, fieldName);
345
		}
346

    
347
		if (StringUtils.isNotBlank(field) && field.charAt(0) == '"' && field.charAt(field.length() - 1) == '"') {
348
			field = StringUtils.strip(field, "\"");
349
		}
350
		return field;
351
	}
352

    
353
	private static void settingRelevantDate(final JsonObject rootElement,
354
			final ResultProtos.Result.Metadata.Builder metadata,
355
			final String jsonKey,
356
			final String dictionaryKey,
357
			final boolean addToDateOfAcceptance) {
358

    
359
		final String pubDate = getPublicationDate(rootElement, "publication_date");
360
		if (StringUtils.isNotBlank(pubDate)) {
361
			if (addToDateOfAcceptance) {
362
				metadata.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(pubDate).build());
363
			}
364
			metadata.addRelevantdate(FieldTypeProtos.StructuredProperty.newBuilder()
365
					.setValue(pubDate)
366
					.setQualifier(getQualifier(dictionaryKey, "dnet:dataCite_date"))
367
					.build());
368
		}
369
	}
370

    
371
	private static String getPublicationDate(final JsonObject rootElement,
372
			final String jsonKey) {
373

    
374
		final JsonObject pubDateJson = rootElement.getAsJsonObject(jsonKey);
375
		if (pubDateJson == null) { return null; }
376
		final String year = getStringValue(pubDateJson, "year");
377
		final String month = getStringValue(pubDateJson, "month");
378
		final String day = getStringValue(pubDateJson, "day");
379

    
380
		if (StringUtils.isBlank(year)) { return null; }
381
		String pubDate = "".concat(year);
382
		if (StringUtils.isNotBlank(month)) {
383
			pubDate = pubDate.concat("-" + month);
384
			if (StringUtils.isNotBlank(day)) {
385
				pubDate = pubDate.concat("-" + day);
386
			} else {
387
				pubDate += "-01";
388
			}
389
		} else {
390
			pubDate += "-01-01";
391
		}
392
		if (isValidDate(pubDate)) { return pubDate; }
393
		return null;
394
	}
395

    
396
	protected static boolean isValid(final JsonObject rootElement, final Reporter context) {
397

    
398
		final String type = getStringValue(rootElement, "type");
399
		if (!typologiesMapping.containsKey(type)) {
400
			context.incrementCounter("filtered", "unknowntype_" + type, 1);
401
			return false;
402
		}
403

    
404
		if (!isValidJsonArray(rootElement, "titles")) {
405
			context.incrementCounter("filtered", "invalid_title", 1);
406
			return false;
407
		}
408
		return true;
409
	}
410

    
411
	private static boolean isValidJsonArray(final JsonObject rootElement, final String fieldName) {
412
		if (!rootElement.has(fieldName)) { return false; }
413
		final JsonElement jsonElement = rootElement.get(fieldName);
414
		if (jsonElement.isJsonNull()) { return false; }
415
		if (jsonElement.isJsonArray()) {
416
			final JsonArray jsonArray = jsonElement.getAsJsonArray();
417
			if (jsonArray.isJsonNull()) { return false; }
418
			if (jsonArray.get(0).isJsonNull()) { return false; }
419
		}
420
		return true;
421
	}
422
}
(14-14/18)