Project

General

Profile

1
package eu.dnetlib.data.mapreduce.hbase.dataimport;
2

    
3
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getArrayValues;
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getDefaultResulttype;
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getQualifier;
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getStringValue;
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.isValidDate;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;

import com.google.gson.Gson;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;

import eu.dnetlib.actionmanager.actions.ActionFactory;
import eu.dnetlib.actionmanager.actions.AtomicAction;
import eu.dnetlib.actionmanager.common.Agent;
import eu.dnetlib.data.mapreduce.hbase.Reporter;
import eu.dnetlib.data.mapreduce.util.StreamUtils;
import eu.dnetlib.data.proto.FieldTypeProtos;
import eu.dnetlib.data.proto.FieldTypeProtos.Author;
import eu.dnetlib.data.proto.FieldTypeProtos.DataInfo;
import eu.dnetlib.data.proto.FieldTypeProtos.KeyValue;
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
import eu.dnetlib.data.proto.FieldTypeProtos.StringField;
import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty;
import eu.dnetlib.data.proto.KindProtos;
import eu.dnetlib.data.proto.OafProtos;
import eu.dnetlib.data.proto.ResultProtos;
import eu.dnetlib.data.proto.TypeProtos;
import eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions;
import eu.dnetlib.miscutils.collections.Pair;
import eu.dnetlib.miscutils.datetime.DateUtils;
43

    
44
public class OrcidToActions {
45

    
46
	public static final String ORCID = "ORCID";
47
	public final static String orcidPREFIX = "orcid_______";
48
	public static final String OPENAIRE_PREFIX = "openaire____";
49
	public static final String SEPARATOR = "::";
50

    
51
	private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {
52

    
53
		{
54
			put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
55

    
56
		}
57
	};
58

    
59
	// json external id will be mapped to oaf:pid/@classid Map to oaf:pid/@classname
60
	private static Map<String, Pair<String, String>> externalIds = new HashMap<String, Pair<String, String>>() {
61

    
62
		{
63
			put("ark".toLowerCase(), new Pair<>("ark", "ark"));
64
			put("arxiv".toLowerCase(), new Pair<>("arxiv", "arXiv"));
65
			put("pmc".toLowerCase(), new Pair<>("pmc", "pmc"));
66
			put("pmid".toLowerCase(), new Pair<>("pmid", "pmid"));
67
			put("source-work-id".toLowerCase(), new Pair<>("orcidworkid", "orcidworkid"));
68
			put("urn".toLowerCase(), new Pair<>("urn", "urn"));
69
		}
70
	};
71

    
72
	/**
	 * Mapping of the record "type" values to their typology descriptor, loaded once from the classpath
	 * resource {@code mapping_typologies_orcid.json}. Each descriptor is read through its {@code "value"}
	 * and {@code "cobj"} entries.
	 */
	static Map<String, Map<String, String>> typologiesMapping;

	static {
		typologiesMapping = loadTypologiesMapping("/eu/dnetlib/data/mapreduce/hbase/dataimport/mapping_typologies_orcid.json");
	}

	/**
	 * Reads and parses the typologies mapping.
	 * <p>
	 * The stream is closed after reading and decoded as UTF-8 instead of the platform default charset.
	 * Failures are raised immediately rather than printed and ignored: without the mapping every later
	 * validation would fail on a null map anyway.
	 *
	 * @param resourcePath
	 *            absolute classpath location of the JSON mapping
	 * @return the parsed mapping
	 * @throws IllegalStateException
	 *             if the resource is missing or cannot be read
	 */
	@SuppressWarnings("unchecked")
	private static Map<String, Map<String, String>> loadTypologiesMapping(final String resourcePath) {
		try (final InputStream is = OrcidToActions.class.getResourceAsStream(resourcePath)) {
			if (is == null) { throw new IllegalStateException("Typologies mapping resource not found: " + resourcePath); }
			// Gson yields a raw Map here; the JSON is expected to be an object of string-to-string objects
			return new Gson().fromJson(IOUtils.toString(is, "UTF-8"), Map.class);
		} catch (final IOException e) {
			throw new IllegalStateException("Unable to load typologies mapping from " + resourcePath, e);
		}
	}
83

    
84
	/** Scheme id / scheme name value ({@code dnet:pid_types}) applied to the pid qualifiers built by this class. */
	public static final String PID_TYPES = "dnet:pid_types";
85

    
86
	public static List<AtomicAction> generatePublicationActionsFromDump(final JsonObject rootElement,
87
			final ActionFactory factory,
88
			final String setName,
89
			final Agent agent,
90
			final Reporter context) {
91

    
92
		if (!isValid(rootElement, context)) { return null; }
93

    
94
		// Create OAF proto
95

    
96
		final OafProtos.Oaf.Builder oaf = OafProtos.Oaf.newBuilder();
97

    
98
		oaf.setDataInfo(
99
				DataInfo.newBuilder()
100
						.setDeletedbyinference(false)
101
						.setInferred(false)
102
						.setTrust("0.9")
103
						.setProvenanceaction(getQualifier("sysimport:actionset:orcidworks-no-doi", "dnet:provenanceActions"))
104
						.build());
105

    
106
		// Adding kind
107
		oaf.setKind(KindProtos.Kind.entity);
108

    
109
		oaf.setLastupdatetimestamp(DateUtils.now());
110

    
111
		// creating result proto
112
		final OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder().setType(TypeProtos.Type.result);
113

    
114
		entity.setDateofcollection("2018-10-22");
115
		entity.setDateoftransformation(DateUtils.now_ISO8601());
116

    
117
		// Adding external ids
118
		StreamUtils.toStream(externalIds.keySet().iterator())
119
				.forEach(jsonExtId -> {
120
					final String classid = externalIds.get(jsonExtId.toLowerCase()).getValue();
121
					final String classname = externalIds.get(jsonExtId.toLowerCase()).getKey();
122
					final String extId = getStringValue(rootElement, jsonExtId);
123
					if (StringUtils.isNotBlank(extId)) {
124
						entity.addPid(StructuredProperty.newBuilder()
125
								.setValue(extId)
126
								.setQualifier(Qualifier.newBuilder().setClassid(classid).setClassname(classname).setSchemeid("dnet:pid_types")
127
										.setSchemename("dnet:pid_types").build())
128
								.build());
129
					}
130
				});
131

    
132
		// Create result field
133
		final ResultProtos.Result.Builder result = ResultProtos.Result.newBuilder();
134

    
135
		// Create metadata proto
136
		final ResultProtos.Result.Metadata.Builder metadata = ResultProtos.Result.Metadata.newBuilder();
137

    
138
		// Adding source
139
		final String source = getStringValue(rootElement, "source");
140
		if (StringUtils.isNotBlank(source)) {
141
			metadata.addSource(StringField.newBuilder().setValue(source).build());
142
		}
143

    
144
		// Adding title
145
		final String title = createRepeatedField(rootElement, "titles");
146
		if (StringUtils.isBlank(title)) {
147
			context.incrementCounter("filtered", "title_not_found", 1);
148
			return null;
149
		}
150
		metadata.addTitle(FieldTypeProtos.StructuredProperty.newBuilder()
151
				.setValue(title)
152
				.setQualifier(getQualifier("main title", "dnet:dataCite_title"))
153
				.build());
154

    
155
		// Adding identifier
156
		final String id = getStringValue(rootElement, "id");
157
		String sourceId = null;
158
		if (id != null) {
159
			entity.addOriginalId(id);
160
			sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, AbstractDNetXsltFunctions.md5(id));
161
		} else {
162
			sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, AbstractDNetXsltFunctions.md5(title));
163
		}
164
		entity.setId(sourceId);
165

    
166
		// Adding relevant date
167
		settingRelevantDate(rootElement, metadata, "publication_date", "issued", true);
168

    
169
		// Adding collectedfrom
170
		final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
171
				.setValue(ORCID)
172
				.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a")
173
				.build();
174
		entity.addCollectedfrom(collectedFrom);
175

    
176
		// Adding type
177
		final String type = getStringValue(rootElement, "type");
178
		String cobjValue = "";
179
		if (StringUtils.isNotBlank(type)) {
180

    
181
			metadata.setResourcetype(FieldTypeProtos.Qualifier.newBuilder()
182
					.setClassid(type)
183
					.setClassname(type)
184
					.setSchemeid("dnet:dataCite_resource")
185
					.setSchemename("dnet:dataCite_resource")
186
					.build());
187

    
188
			final String typeValue = typologiesMapping.get(type).get("value");
189
			cobjValue = typologiesMapping.get(type).get("cobj");
190
			final ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder();
191

    
192
			// Adding hostedby
193
			instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
194
					.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c")
195
					.setValue("Unknown Repository")
196
					.build());
197

    
198
			// Adding url
199
			final String url = createRepeatedField(rootElement, "urls");
200
			if (StringUtils.isNotBlank(url)) {
201
				instance.addUrl(url);
202
			}
203

    
204
			final String pubDate = getPublicationDate(rootElement, "publication_date");
205
			if (StringUtils.isNotBlank(pubDate)) {
206
				instance.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(pubDate).build());
207
			}
208

    
209
			instance.setCollectedfrom(collectedFrom);
210

    
211
			// Adding accessright
212
			instance.setAccessright(FieldTypeProtos.Qualifier.newBuilder()
213
					.setClassid("UNKNOWN")
214
					.setClassname("UNKNOWN")
215
					.setSchemeid("dnet:access_modes")
216
					.setSchemename("dnet:access_modes")
217
					.build());
218

    
219
			// Adding type
220
			instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder()
221
					.setClassid(cobjValue)
222
					.setClassname(typeValue)
223
					.setSchemeid("dnet:publication_resource")
224
					.setSchemename("dnet:publication_resource")
225
					.build());
226

    
227
			result.addInstance(instance);
228
		} else {
229
			context.incrementCounter("filtered", "type_not_found", 1);
230
			return null;
231
		}
232

    
233
		// Adding authors
234
		final List<Author> authors = createAuthors(rootElement);
235
		if (authors != null && authors.size() > 0) {
236
			metadata.addAllAuthor(authors);
237
		} else {
238
			context.incrementCounter("filtered", "author_not_found", 1);
239
			return null;
240
		}
241

    
242
		metadata.setResulttype(getQualifier(getDefaultResulttype(cobjValue), "dnet:result_typologies"));
243
		result.setMetadata(metadata.build());
244
		entity.setResult(result.build());
245
		oaf.setEntity(entity.build());
246

    
247
		final List<AtomicAction> actionList = new ArrayList<>();
248

    
249
		actionList.add(factory.createAtomicAction(setName, agent, oaf.getEntity().getId(), "result", "body", oaf.build().toByteArray()));
250

    
251
		// System.out.println(JsonFormat.printToString(oaf.build()));
252
		return actionList;
253

    
254
	}
255

    
256
	public static List<Author> createAuthors(final JsonObject root) {
257

    
258
		final String authorsJSONFieldName = "authors";
259

    
260
		if (root.has(authorsJSONFieldName) && root.get(authorsJSONFieldName).isJsonArray()) {
261

    
262
			final List<Author> authors = new ArrayList<>();
263
			final JsonArray jsonAuthors = root.getAsJsonArray(authorsJSONFieldName);
264
			int firstCounter = 0;
265
			int defaultCounter = 0;
266
			int rank = 1;
267
			int currentRank = 0;
268

    
269
			for (final JsonElement item : jsonAuthors) {
270
				final JsonObject author = item.getAsJsonObject();
271
				final Author.Builder result = Author.newBuilder();
272
				if (item.isJsonObject()) {
273
					final String surname = getStringValue(author, "surname");
274
					final String name = getStringValue(author, "name");
275
					final String oid = getStringValue(author, "oid");
276
					final String seq = getStringValue(author, "seq");
277
					if (StringUtils.isNotBlank(seq)) {
278
						if (seq.equals("first")) {
279
							firstCounter += 1;
280
							rank = firstCounter;
281

    
282
						} else if (seq.equals("additional")) {
283
							rank = currentRank + 1;
284
						} else {
285
							defaultCounter += 1;
286
							rank = defaultCounter;
287
						}
288
					}
289

    
290
					if (StringUtils.isNotBlank(oid)) {
291
						result.addPid(KeyValue.newBuilder()
292
								.setValue(oid)
293
								.setKey("ORCID")
294
								.build());
295
						result.setFullname(name + " " + surname);
296
						if (StringUtils.isNotBlank(name)) {
297
							result.setName(name);
298
						}
299
						if (StringUtils.isNotBlank(surname)) {
300
							result.setSurname(surname);
301
						}
302
					} else {
303
						if (StringUtils.isNotBlank(name)) {
304
							result.setFullname(name);
305
						} else {
306
							if (StringUtils.isNotBlank(surname)) {
307
								result.setFullname(surname);
308
							}
309
						}
310
					}
311
				}
312
				result.setRank(rank);
313
				authors.add(result.build());
314
				currentRank = rank;
315
			}
316
			return authors;
317

    
318
		}
319
		return null;
320
	}
321

    
322
	private static String createRepeatedField(final JsonObject rootElement, final String fieldName) {
323
		String field = "";
324
		if (!rootElement.has(fieldName)) { return null; }
325
		if (rootElement.has(fieldName) && rootElement.get(fieldName).isJsonNull()) { return null; }
326
		if (rootElement.get(fieldName).isJsonArray()) {
327
			if (!isValidJsonArray(rootElement, fieldName)) { return null; }
328
			final StringBuilder ttl = new StringBuilder();
329
			getArrayValues(rootElement, fieldName).forEach(ttl::append);
330
			field = ttl.toString();
331
		} else {
332
			field = getStringValue(rootElement, fieldName);
333
		}
334

    
335
		if (field != null && !field.isEmpty() && field.charAt(0) == '"' && field.charAt(field.length() - 1) == '"') {
336
			field = field.substring(1, field.length() - 1);
337
		}
338
		return field;
339
	}
340

    
341
	private static void settingRelevantDate(final JsonObject rootElement,
342
			final ResultProtos.Result.Metadata.Builder metadata,
343
			final String jsonKey,
344
			final String dictionaryKey,
345
			final boolean addToDateOfAcceptance) {
346

    
347
		final String pubDate = getPublicationDate(rootElement, "publication_date");
348
		if (StringUtils.isNotBlank(pubDate)) {
349
			if (addToDateOfAcceptance) {
350
				metadata.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(pubDate).build());
351
			}
352
			metadata.addRelevantdate(FieldTypeProtos.StructuredProperty.newBuilder()
353
					.setValue(pubDate)
354
					.setQualifier(getQualifier(dictionaryKey, "dnet:dataCite_date"))
355
					.build());
356
		}
357
	}
358

    
359
	private static String getPublicationDate(final JsonObject rootElement,
360
			final String jsonKey) {
361

    
362
		final JsonObject pubDateJson = rootElement.getAsJsonObject(jsonKey);
363
		if (pubDateJson == null) { return null; }
364
		final String year = getStringValue(pubDateJson, "year");
365
		final String month = getStringValue(pubDateJson, "month");
366
		final String day = getStringValue(pubDateJson, "day");
367

    
368
		if (StringUtils.isBlank(year)) { return null; }
369
		String pubDate = "".concat(year);
370
		if (StringUtils.isNotBlank(month)) {
371
			pubDate = pubDate.concat("-" + month);
372
			if (StringUtils.isNotBlank(day)) {
373
				pubDate = pubDate.concat("-" + day);
374
			} else {
375
				pubDate += "-01";
376
			}
377
		} else {
378
			pubDate += "-01-01";
379
		}
380
		if (isValidDate(pubDate)) { return pubDate; }
381
		return null;
382
	}
383

    
384
	protected static boolean isValid(final JsonObject rootElement, final Reporter context) {
385

    
386
		final String type = getStringValue(rootElement, "type");
387
		if (!typologiesMapping.containsKey(type)) {
388
			context.incrementCounter("filtered", "unknowntype_" + type, 1);
389
			return false;
390
		}
391

    
392
		if (!isValidJsonArray(rootElement, "titles")) {
393
			context.incrementCounter("filtered", "invalid_title", 1);
394
			return false;
395
		}
396
		return true;
397
	}
398

    
399
	private static boolean isValidJsonArray(final JsonObject rootElement, final String fieldName) {
400
		if (!rootElement.has(fieldName)) { return false; }
401
		final JsonElement jsonElement = rootElement.get(fieldName);
402
		if (jsonElement.isJsonNull()) { return false; }
403
		if (jsonElement.isJsonArray()) {
404
			final JsonArray jsonArray = jsonElement.getAsJsonArray();
405
			if (jsonArray.isJsonNull()) { return false; }
406
			if (jsonArray.get(0).isJsonNull()) { return false; }
407
		}
408
		return true;
409
	}
410
}
(14-14/18)