Project

General

Profile

1
package eu.dnetlib.data.mapreduce.hbase.dataimport;
2

    
3
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getArrayValues;
4
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getDefaultResulttype;
5
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getQualifier;
6
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getStringValue;
7
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.isValidDate;
8

    
9
import java.io.IOException;
10
import java.io.InputStream;
11
import java.util.ArrayList;
12
import java.util.HashMap;
13
import java.util.List;
14
import java.util.Map;
15

    
16
import org.apache.commons.io.IOUtils;
17
import org.apache.commons.lang3.StringUtils;
18

    
19
import com.google.gson.Gson;
20
import com.google.gson.JsonArray;
21
import com.google.gson.JsonElement;
22
import com.google.gson.JsonObject;
23

    
24
import eu.dnetlib.actionmanager.actions.ActionFactory;
25
import eu.dnetlib.actionmanager.actions.AtomicAction;
26
import eu.dnetlib.actionmanager.common.Agent;
27
import eu.dnetlib.data.mapreduce.hbase.Reporter;
28
import eu.dnetlib.data.mapreduce.util.StreamUtils;
29
import eu.dnetlib.data.proto.FieldTypeProtos;
30
import eu.dnetlib.data.proto.FieldTypeProtos.Author;
31
import eu.dnetlib.data.proto.FieldTypeProtos.KeyValue;
32
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
33
import eu.dnetlib.data.proto.FieldTypeProtos.StringField;
34
import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty;
35
import eu.dnetlib.data.proto.KindProtos;
36
import eu.dnetlib.data.proto.OafProtos;
37
import eu.dnetlib.data.proto.ResultProtos;
38
import eu.dnetlib.data.proto.TypeProtos;
39
import eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions;
40
import eu.dnetlib.miscutils.collections.Pair;
41

    
42
public class OrcidToActions {
43

    
44
	public static final String ORCID = "ORCID";
45
	public final static String orcidPREFIX = "orcid____";
46
	public static final String OPENAIRE_PREFIX = "openaire____";
47
	public static final String SEPARATOR = "::";
48

    
49
	private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {
50

    
51
		{
52
			put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
53

    
54
		}
55
	};
56

    
57
	// json external id will be mapped to oaf:pid/@classid Map to oaf:pid/@classname
58
	private static Map<String, Pair<String, String>> externalIds = new HashMap<String, Pair<String, String>>() {
59

    
60
		{
61
			put("ark".toLowerCase(), new Pair<>("ark", "ark"));
62
			put("arxiv".toLowerCase(), new Pair<>("arxiv", "arXiv"));
63
			put("pmc".toLowerCase(), new Pair<>("pmc", "pmc"));
64
			put("pmid".toLowerCase(), new Pair<>("pmid", "pmid"));
65
			put("source-work-id".toLowerCase(), new Pair<>("orcidworkid", "orcidworkid"));
66
			put("urn".toLowerCase(), new Pair<>("urn", "urn"));
67
		}
68
	};
69

    
70
	static Map<String, Map<String, String>> typologiesMapping;
71

    
72
	static {
73
		try {
74
			final InputStream is = OrcidToActions.class.getResourceAsStream("/eu/dnetlib/data/mapreduce/hbase/dataimport/mapping_typologies_orcid.json");
75
			final String tt = IOUtils.toString(is);
76
			typologiesMapping = new Gson().fromJson(tt, Map.class);
77
		} catch (final IOException e) {
78
			e.printStackTrace();
79
		}
80
	}
81

    
82
	/** Scheme name of persistent identifier types; the same value is the scheme id and scheme name of the pid qualifiers built by this class. */
	public static final String PID_TYPES = "dnet:pid_types";
83

    
84
	public static List<AtomicAction> generatePublicationActionsFromDump(final JsonObject rootElement,
85
			final ActionFactory factory,
86
			final String setName,
87
			final Agent agent,
88
			final Reporter context) {
89

    
90
		if (!isValid(rootElement, context)) { return null; }
91

    
92
		// Create OAF proto
93

    
94
		final OafProtos.Oaf.Builder oaf = OafProtos.Oaf.newBuilder();
95

    
96
		// Adding kind
97
		oaf.setKind(KindProtos.Kind.entity);
98

    
99
		// creating result proto
100
		final OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder().setType(TypeProtos.Type.result);
101

    
102
		entity.setDateofcollection("2018-10-22");
103

    
104
		// Adding external ids
105
		StreamUtils.toStream(externalIds.keySet().iterator())
106
				.forEach(jsonExtId -> {
107
					final String classid = externalIds.get(jsonExtId.toLowerCase()).getValue();
108
					final String classname = externalIds.get(jsonExtId.toLowerCase()).getKey();
109
					final String extId = getStringValue(rootElement, jsonExtId);
110
					if (StringUtils.isNotBlank(extId)) {
111
						entity.addPid(StructuredProperty.newBuilder()
112
								.setValue(extId)
113
								.setQualifier(Qualifier.newBuilder().setClassid(classid).setClassname(classname).setSchemeid("dnet:pid_types")
114
										.setSchemename("dnet:pid_types").build())
115
								.build());
116
					}
117
				});
118

    
119
		// Create result field
120
		final ResultProtos.Result.Builder result = ResultProtos.Result.newBuilder();
121

    
122
		// Create metadata proto
123
		final ResultProtos.Result.Metadata.Builder metadata = ResultProtos.Result.Metadata.newBuilder();
124

    
125
		// Adding source
126
		final String source = getStringValue(rootElement, "source");
127
		if (StringUtils.isNotBlank(source)) {
128
			metadata.addSource(StringField.newBuilder().setValue(source).build());
129
		}
130

    
131
		// Adding title
132
		final String title = createRepeatedField(rootElement, "titles");
133
		metadata.addTitle(FieldTypeProtos.StructuredProperty.newBuilder()
134
				.setValue(title)
135
				.setQualifier(getQualifier("main title", "dnet:dataCite_title"))
136
				.build());
137

    
138
		// Adding identifier
139
		final String id = getStringValue(rootElement, "id");
140
		String sourceId = null;
141
		if (id != null) {
142
			sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, AbstractDNetXsltFunctions.md5(id));
143
		} else {
144
			sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, AbstractDNetXsltFunctions.md5(title));
145
		}
146
		entity.setId(sourceId);
147

    
148
		// Adding relevant date
149
		settingRelevantDate(rootElement, metadata, "publication_date", "issued", true);
150

    
151
		// Adding type
152
		final String type = getStringValue(rootElement, "type");
153
		String cobjValue = "";
154
		if (StringUtils.isNotBlank(type)) {
155

    
156
			metadata.setResourcetype(FieldTypeProtos.Qualifier.newBuilder()
157
					.setClassid(type)
158
					.setClassname(type)
159
					.setSchemeid("dnet:dataCite_resource")
160
					.setSchemename("dnet:dataCite_resource")
161
					.build());
162

    
163
			final String typeValue = typologiesMapping.get(type).get("value");
164
			cobjValue = typologiesMapping.get(type).get("cobj");
165
			final ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder();
166

    
167
			// Adding hostedby
168
			instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
169
					.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c")
170
					.setValue("Unknown Repository")
171
					.build());
172

    
173
			// Adding url
174
			final String url = createRepeatedField(rootElement, "urls");
175
			if (StringUtils.isNotBlank(url)) {
176
				instance.addUrl(url);
177
			}
178

    
179
			final String pubDate = getPublicationDate(rootElement, "publication_date");
180
			if (StringUtils.isNotBlank(pubDate)) {
181
				instance.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(pubDate).build());
182
			}
183

    
184
			// Adding collectedfrom
185
			final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
186
					.setValue(ORCID)
187
					.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a")
188
					.build();
189
			instance.setCollectedfrom(collectedFrom);
190

    
191
			// Adding accessright
192
			instance.setAccessright(FieldTypeProtos.Qualifier.newBuilder()
193
					.setClassid("UNKNOWN")
194
					.setClassname("UNKNOWN")
195
					.setSchemeid("dnet:access_modes")
196
					.setSchemename("dnet:access_modes")
197
					.build());
198

    
199
			// Adding type
200
			instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder()
201
					.setClassid(cobjValue)
202
					.setClassname(typeValue)
203
					.setSchemeid("dnet:publication_resource")
204
					.setSchemename("dnet:publication_resource")
205
					.build());
206

    
207
			result.addInstance(instance);
208
		}
209

    
210
		// Adding authors
211
		final List<Author> authors = createAuthors(rootElement);
212
		if (authors != null) {
213
			metadata.addAllAuthor(authors);
214
		}
215

    
216
		metadata.setResulttype(getQualifier(getDefaultResulttype(cobjValue), "dnet:result_typologies"));
217
		result.setMetadata(metadata.build());
218
		entity.setResult(result.build());
219
		oaf.setEntity(entity.build());
220

    
221
		final List<AtomicAction> actionList = new ArrayList<>();
222

    
223
		actionList.add(factory.createAtomicAction(setName, agent, oaf.getEntity().getId(), "result", "body", oaf.build().toByteArray()));
224

    
225
		return actionList;
226

    
227
	}
228

    
229
	public static List<Author> createAuthors(final JsonObject root) {
230

    
231
		final String authorsJSONFieldName = "authors";
232

    
233
		if (root.has(authorsJSONFieldName) && root.get(authorsJSONFieldName).isJsonArray()) {
234

    
235
			final List<Author> authors = new ArrayList<>();
236
			final JsonArray jsonAuthors = root.getAsJsonArray(authorsJSONFieldName);
237
			int firstCounter = 0;
238
			int defaultCounter = 0;
239
			int rank = 1;
240
			int currentRank = 0;
241

    
242
			for (final JsonElement item : jsonAuthors) {
243
				final JsonObject author = item.getAsJsonObject();
244
				final Author.Builder result = Author.newBuilder();
245
				if (item.isJsonObject()) {
246
					final String surname = getStringValue(author, "surname");
247
					final String name = getStringValue(author, "name");
248
					final String oid = getStringValue(author, "oid");
249
					final String seq = getStringValue(author, "seq");
250
					if (StringUtils.isNotBlank(seq)) {
251
						if (seq.equals("first")) {
252
							firstCounter += 1;
253
							rank = firstCounter;
254

    
255
						} else if (seq.equals("additional")) {
256
							rank = currentRank + 1;
257
						} else {
258
							defaultCounter += 1;
259
							rank = defaultCounter;
260
						}
261
					}
262

    
263
					if (StringUtils.isNotBlank(oid)) {
264
						result.addPid(KeyValue.newBuilder()
265
								.setValue(oid)
266
								.setKey("ORCID")
267
								.build());
268
						result.setFullname(name + " " + surname);
269
						if (StringUtils.isNotBlank(name)) {
270
							result.setName(name);
271
						}
272
						if (StringUtils.isNotBlank(surname)) {
273
							result.setSurname(surname);
274
						}
275
					} else {
276
						if (StringUtils.isNotBlank(name)) {
277
							result.setFullname(name);
278
						} else {
279
							if (StringUtils.isNotBlank(surname)) {
280
								result.setFullname(surname);
281
							}
282
						}
283
					}
284
				}
285
				result.setRank(rank);
286
				authors.add(result.build());
287
				currentRank = rank;
288
			}
289
			return authors;
290

    
291
		}
292
		return null;
293
	}
294

    
295
	private static String createRepeatedField(final JsonObject rootElement, final String fieldName) {
296
		String field = "";
297
		if (!rootElement.has(fieldName)) { return null; }
298
		if (rootElement.has(fieldName) && rootElement.get(fieldName).isJsonNull()) { return null; }
299
		if (rootElement.get(fieldName).isJsonArray()) {
300
			if (!isValidJsonArray(rootElement, fieldName)) { return null; }
301
			final StringBuilder ttl = new StringBuilder();
302
			getArrayValues(rootElement, fieldName).forEach(ttl::append);
303
			field = ttl.toString();
304
		} else {
305
			field = getStringValue(rootElement, fieldName);
306
		}
307

    
308
		if (field != null && !field.isEmpty() && field.charAt(0) == '"' && field.charAt(field.length() - 1) == '"') {
309
			field = field.substring(1, field.length() - 1);
310
		}
311
		return field;
312
	}
313

    
314
	private static void settingRelevantDate(final JsonObject rootElement,
315
			final ResultProtos.Result.Metadata.Builder metadata,
316
			final String jsonKey,
317
			final String dictionaryKey,
318
			final boolean addToDateOfAcceptance) {
319

    
320
		// final JsonObject pubDateJson = rootElement.getAsJsonObject(jsonKey);
321
		// if (pubDateJson == null) { return; }
322
		// final String year = getStringValue(pubDateJson, "year");
323
		// final String month = getStringValue(pubDateJson, "month");
324
		// final String day = getStringValue(pubDateJson, "day");
325
		//
326
		// if (StringUtils.isBlank(year)) { return; }
327
		// String pubDate = "".concat(year);
328
		// if (StringUtils.isNotBlank(month)) {
329
		// pubDate = pubDate.concat("-" + month);
330
		// if (StringUtils.isNotBlank(day)) {
331
		// pubDate = pubDate.concat("-" + day);
332
		// } else {
333
		// pubDate += "-01";
334
		// }
335
		// } else {
336
		// pubDate += "-01-01";
337
		// }
338

    
339
		final String pubDate = getPublicationDate(rootElement, "publication_date");
340
		if (StringUtils.isNotBlank(pubDate)) {
341
			// if (addToDateOfAcceptance) {
342
			// metadata.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(pubDate).build());
343
			// }
344
			metadata.addRelevantdate(FieldTypeProtos.StructuredProperty.newBuilder()
345
					.setValue(pubDate)
346
					.setQualifier(getQualifier(dictionaryKey, "dnet:dataCite_date"))
347
					.build());
348
		}
349
	}
350

    
351
	private static String getPublicationDate(final JsonObject rootElement,
352
			final String jsonKey) {
353

    
354
		final JsonObject pubDateJson = rootElement.getAsJsonObject(jsonKey);
355
		if (pubDateJson == null) { return null; }
356
		final String year = getStringValue(pubDateJson, "year");
357
		final String month = getStringValue(pubDateJson, "month");
358
		final String day = getStringValue(pubDateJson, "day");
359

    
360
		if (StringUtils.isBlank(year)) { return null; }
361
		String pubDate = "".concat(year);
362
		if (StringUtils.isNotBlank(month)) {
363
			pubDate = pubDate.concat("-" + month);
364
			if (StringUtils.isNotBlank(day)) {
365
				pubDate = pubDate.concat("-" + day);
366
			} else {
367
				pubDate += "-01";
368
			}
369
		} else {
370
			pubDate += "-01-01";
371
		}
372
		if (isValidDate(pubDate)) { return pubDate; }
373
		return null;
374
	}
375

    
376
	protected static boolean isValid(final JsonObject rootElement, final Reporter context) {
377

    
378
		final String type = getStringValue(rootElement, "type");
379
		if (!typologiesMapping.containsKey(type)) {
380
			context.incrementCounter("filtered", "unknowntype_" + type, 1);
381
			return false;
382
		}
383

    
384
		if (!isValidJsonArray(rootElement, "titles")) {
385
			context.incrementCounter("filtered", "invalid_title", 1);
386
			return false;
387
		}
388
		return true;
389
	}
390

    
391
	private static boolean isValidJsonArray(final JsonObject rootElement, final String fieldName) {
392
		if (!rootElement.has(fieldName)) { return false; }
393
		final JsonElement jsonElement = rootElement.get(fieldName);
394
		if (jsonElement.isJsonNull()) { return false; }
395
		if (jsonElement.isJsonArray()) {
396
			final JsonArray jsonArray = jsonElement.getAsJsonArray();
397
			if (jsonArray.isJsonNull()) { return false; }
398
			if (jsonArray.get(0).isJsonNull()) { return false; }
399
		}
400
		return true;
401
	}
402
}
(14-14/18)