Project

General

Profile

1
package eu.dnetlib.data.mapreduce.hbase.dataimport;
2

    
3
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getArrayValues;
4
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getQualifier;
5
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getStringValue;
6
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.isValidDate;
7

    
8
import java.io.IOException;
9
import java.io.InputStream;
10
import java.util.ArrayList;
11
import java.util.HashMap;
12
import java.util.List;
13
import java.util.Map;
14

    
15
import org.apache.commons.io.IOUtils;
16
import org.apache.commons.lang3.StringUtils;
17

    
18
import com.google.gson.Gson;
19
import com.google.gson.JsonArray;
20
import com.google.gson.JsonElement;
21
import com.google.gson.JsonObject;
22

    
23
import eu.dnetlib.actionmanager.actions.ActionFactory;
24
import eu.dnetlib.actionmanager.actions.AtomicAction;
25
import eu.dnetlib.actionmanager.common.Agent;
26
import eu.dnetlib.data.mapreduce.hbase.Reporter;
27
import eu.dnetlib.data.mapreduce.util.StreamUtils;
28
import eu.dnetlib.data.proto.FieldTypeProtos;
29
import eu.dnetlib.data.proto.FieldTypeProtos.Author;
30
import eu.dnetlib.data.proto.FieldTypeProtos.KeyValue;
31
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
32
import eu.dnetlib.data.proto.FieldTypeProtos.StringField;
33
import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty;
34
import eu.dnetlib.data.proto.KindProtos;
35
import eu.dnetlib.data.proto.OafProtos;
36
import eu.dnetlib.data.proto.ResultProtos;
37
import eu.dnetlib.data.proto.TypeProtos;
38
import eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions;
39
import eu.dnetlib.miscutils.collections.Pair;
40

    
41
public class OrcidToActions {
42

    
43
	public static final String ORCID = "ORCID";
44
	public final static String orcidPREFIX = "orcid____";
45
	public static final String OPENAIRE_PREFIX = "openaire____";
46
	public static final String SEPARATOR = "::";
47

    
48
	private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {
49

    
50
		{
51
			put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
52

    
53
		}
54
	};
55

    
56
	// json external id will be mapped to oaf:pid/@classid Map to oaf:pid/@classname
57
	private static Map<String, Pair<String, String>> externalIds = new HashMap<String, Pair<String, String>>() {
58

    
59
		{
60
			put("ark".toLowerCase(), new Pair<>("ark", "ark"));
61
			put("arxiv".toLowerCase(), new Pair<>("arxiv", "arXiv"));
62
			put("pmc".toLowerCase(), new Pair<>("pmc", "pmc"));
63
			put("pmid".toLowerCase(), new Pair<>("pmid", "pmid"));
64
			put("source-work-id".toLowerCase(), new Pair<>("orcidworkid", "orcidworkid"));
65
			put("urn".toLowerCase(), new Pair<>("urn", "urn"));
66
		}
67
	};
68

    
69
	static Map<String, Map<String, String>> typologiesMapping;
70

    
71
	static {
72
		try {
73
			final InputStream is = OrcidToActions.class.getResourceAsStream("/eu/dnetlib/data/mapreduce/hbase/dataimport/mapping_typologies.json");
74
			final String tt = IOUtils.toString(is);
75
			typologiesMapping = new Gson().fromJson(tt, Map.class);
76
		} catch (final IOException e) {
77
			e.printStackTrace();
78
		}
79
	}
80

    
81
	/** Name of the pid types scheme; this value is set as both scheme id and scheme name of pid qualifiers. */
	public static final String PID_TYPES = "dnet:pid_types";
82

    
83
	public static List<AtomicAction> generatePublicationActionsFromDump(final JsonObject rootElement,
84
			final ActionFactory factory,
85
			final String setName,
86
			final Agent agent,
87
			final Reporter context) {
88

    
89
		if (!isValid(rootElement, context)) { return null; }
90

    
91
		// Create OAF proto
92

    
93
		final OafProtos.Oaf.Builder oaf = OafProtos.Oaf.newBuilder();
94

    
95
		// Adding kind
96
		oaf.setKind(KindProtos.Kind.entity);
97

    
98
		// creating result proto
99
		final OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder().setType(TypeProtos.Type.result);
100

    
101
		entity.setDateofcollection("2018-10-22");
102

    
103
		// Adding external ids
104
		StreamUtils.toStream(externalIds.keySet().iterator())
105
				.forEach(jsonExtId -> {
106
					final String classid = externalIds.get(jsonExtId.toLowerCase()).getValue();
107
					final String classname = externalIds.get(jsonExtId.toLowerCase()).getKey();
108
					final String extId = getStringValue(rootElement, jsonExtId);
109
					if (StringUtils.isNotBlank(extId)) {
110
						entity.addPid(StructuredProperty.newBuilder()
111
								.setValue(extId)
112
								.setQualifier(Qualifier.newBuilder().setClassid(classid).setClassname(classname).setSchemeid("dnet:pid_types")
113
										.setSchemename("dnet:pid_types").build())
114
								.build());
115
					}
116
				});
117

    
118
		// Create result field
119
		final ResultProtos.Result.Builder result = ResultProtos.Result.newBuilder();
120

    
121
		// Create metadata proto
122
		final ResultProtos.Result.Metadata.Builder metadata = ResultProtos.Result.Metadata.newBuilder();
123

    
124
		// Adding source
125
		final String source = getStringValue(rootElement, "source");
126
		if (StringUtils.isNotBlank(source)) {
127
			metadata.addSource(StringField.newBuilder().setValue(source).build());
128
		}
129

    
130
		// Adding title
131
		final String title = createRepeatedField(rootElement, "titles");
132
		metadata.addTitle(FieldTypeProtos.StructuredProperty.newBuilder()
133
				.setValue(title)
134
				.setQualifier(getQualifier("main title", "dnet:dataCite_title"))
135
				.build());
136

    
137
		// Adding identifier
138
		final String id = getStringValue(rootElement, "id");
139
		String sourceId = null;
140
		if (id != null) {
141
			sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, AbstractDNetXsltFunctions.md5(id));
142
		} else {
143
			sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, AbstractDNetXsltFunctions.md5(title));
144
		}
145
		entity.setId(sourceId);
146

    
147
		// Adding relevant date
148
		settingRelevantDate(rootElement, metadata, "publication_date", "issued", true);
149

    
150
		// Adding type
151
		final String type = getStringValue(rootElement, "type");
152
		if (StringUtils.isNotBlank(type)) {
153
			final String typeValue = typologiesMapping.get(type).get("value");
154
			final String cobjValue = typologiesMapping.get(type).get("cobj");
155
			final ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder();
156

    
157
			// Adding hostedby
158
			instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
159
					.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c")
160
					.setValue("Unknown Repository")
161
					.build());
162

    
163
			// Adding url
164
			final String url = createRepeatedField(rootElement, "urls");
165
			if (StringUtils.isNotBlank(url)) {
166
				instance.addUrl(url);
167
			}
168

    
169
			final String pubDate = getPublicationDate(rootElement, "publication_date");
170
			if (StringUtils.isNotBlank(pubDate)) {
171
				instance.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(pubDate).build());
172
			}
173

    
174
			// Adding collectedfrom
175
			final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
176
					.setValue(ORCID)
177
					.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a")
178
					.build();
179
			instance.setCollectedfrom(collectedFrom);
180

    
181
			// Adding accessright
182
			instance.setAccessright(FieldTypeProtos.Qualifier.newBuilder()
183
					.setClassid("UNKNOWN")
184
					.setClassname("UNKNOWN")
185
					.setSchemeid("dnet:access_modes")
186
					.setSchemename("dnet:access_modes")
187
					.build());
188

    
189
			// Adding type
190
			instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder()
191
					.setClassid(cobjValue)
192
					.setClassname(typeValue)
193
					.setSchemeid("dnet:publication_resource")
194
					.setSchemename("dnet:publication_resource")
195
					.build());
196

    
197
			result.addInstance(instance);
198
		}
199

    
200
		// Adding authors
201
		final List<Author> authors = createAuthors(rootElement);
202
		if (authors != null) {
203
			metadata.addAllAuthor(authors);
204
		}
205

    
206
		result.setMetadata(metadata.build());
207

    
208
		entity.setResult(result.build());
209

    
210
		oaf.setEntity(entity.build());
211

    
212
		// System.out.println("Proto dump: " + com.googlecode.protobuf.format.JsonFormat.printToString(oaf.build()));
213

    
214
		final List<AtomicAction> actionList = new ArrayList<>();
215

    
216
		actionList.add(factory.createAtomicAction(setName, agent, oaf.getEntity().getId(), "result", "body", oaf.build().toByteArray()));
217

    
218
		return actionList;
219

    
220
	}
221

    
222
	public static List<Author> createAuthors(final JsonObject root) {
223

    
224
		final String authorsJSONFieldName = "authors";
225

    
226
		if (root.has(authorsJSONFieldName) && root.get(authorsJSONFieldName).isJsonArray()) {
227

    
228
			final List<Author> authors = new ArrayList<>();
229
			final JsonArray jsonAuthors = root.getAsJsonArray(authorsJSONFieldName);
230
			int firstCounter = 0;
231
			int defaultCounter = 0;
232
			int rank = 1;
233
			int currentRank = 0;
234

    
235
			for (final JsonElement item : jsonAuthors) {
236
				final JsonObject author = item.getAsJsonObject();
237
				final Author.Builder result = Author.newBuilder();
238
				if (item.isJsonObject()) {
239
					final String surname = getStringValue(author, "surname");
240
					final String name = getStringValue(author, "name");
241
					final String oid = getStringValue(author, "oid");
242
					final String seq = getStringValue(author, "seq");
243
					if (StringUtils.isNotBlank(seq)) {
244
						if (seq.equals("first")) {
245
							firstCounter += 1;
246
							rank = firstCounter;
247

    
248
						} else if (seq.equals("additional")) {
249
							rank = currentRank + 1;
250
						} else {
251
							defaultCounter += 1;
252
							rank = defaultCounter;
253
						}
254
					}
255

    
256
					if (StringUtils.isNotBlank(oid)) {
257
						result.addPid(KeyValue.newBuilder()
258
								.setValue(oid)
259
								.setKey("ORCID")
260
								.build());
261
						result.setFullname(name + " " + surname);
262
						if (StringUtils.isNotBlank(name)) {
263
							result.setName(name);
264
						}
265
						if (StringUtils.isNotBlank(surname)) {
266
							result.setSurname(surname);
267
						}
268
					} else {
269
						if (StringUtils.isNotBlank(name)) {
270
							result.setFullname(name);
271
						} else {
272
							if (StringUtils.isNotBlank(surname)) {
273
								result.setFullname(surname);
274
							}
275
						}
276
					}
277
				}
278
				result.setRank(rank);
279
				authors.add(result.build());
280
				currentRank = rank;
281
			}
282
			return authors;
283

    
284
		}
285
		return null;
286
	}
287

    
288
	private static String createRepeatedField(final JsonObject rootElement, final String fieldName) {
289
		String field = "";
290
		if (!rootElement.has(fieldName)) { return null; }
291
		if (rootElement.has(fieldName) && rootElement.get(fieldName).isJsonNull()) { return null; }
292
		if (rootElement.get(fieldName).isJsonArray()) {
293
			if (!isValidJsonArray(rootElement, fieldName)) { return null; }
294
			final StringBuilder ttl = new StringBuilder();
295
			getArrayValues(rootElement, fieldName).forEach(ttl::append);
296
			field = ttl.toString();
297
		} else {
298
			field = getStringValue(rootElement, fieldName);
299
		}
300

    
301
		if (field != null && field.charAt(0) == '"' && field.charAt(field.length() - 1) == '"') {
302
			field = field.substring(1, field.length() - 1);
303
		}
304
		return field;
305
	}
306

    
307
	private static void settingRelevantDate(final JsonObject rootElement,
308
			final ResultProtos.Result.Metadata.Builder metadata,
309
			final String jsonKey,
310
			final String dictionaryKey,
311
			final boolean addToDateOfAcceptance) {
312

    
313
		// final JsonObject pubDateJson = rootElement.getAsJsonObject(jsonKey);
314
		// if (pubDateJson == null) { return; }
315
		// final String year = getStringValue(pubDateJson, "year");
316
		// final String month = getStringValue(pubDateJson, "month");
317
		// final String day = getStringValue(pubDateJson, "day");
318
		//
319
		// if (StringUtils.isBlank(year)) { return; }
320
		// String pubDate = "".concat(year);
321
		// if (StringUtils.isNotBlank(month)) {
322
		// pubDate = pubDate.concat("-" + month);
323
		// if (StringUtils.isNotBlank(day)) {
324
		// pubDate = pubDate.concat("-" + day);
325
		// } else {
326
		// pubDate += "-01";
327
		// }
328
		// } else {
329
		// pubDate += "-01-01";
330
		// }
331

    
332
		final String pubDate = getPublicationDate(rootElement, "publication_date");
333
		if (StringUtils.isNotBlank(pubDate)) {
334
			// if (addToDateOfAcceptance) {
335
			// metadata.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(pubDate).build());
336
			// }
337
			metadata.addRelevantdate(FieldTypeProtos.StructuredProperty.newBuilder()
338
					.setValue(pubDate)
339
					.setQualifier(getQualifier(dictionaryKey, "dnet:dataCite_date"))
340
					.build());
341
		}
342
	}
343

    
344
	private static String getPublicationDate(final JsonObject rootElement,
345
			final String jsonKey) {
346

    
347
		final JsonObject pubDateJson = rootElement.getAsJsonObject(jsonKey);
348
		if (pubDateJson == null) { return null; }
349
		final String year = getStringValue(pubDateJson, "year");
350
		final String month = getStringValue(pubDateJson, "month");
351
		final String day = getStringValue(pubDateJson, "day");
352

    
353
		if (StringUtils.isBlank(year)) { return null; }
354
		String pubDate = "".concat(year);
355
		if (StringUtils.isNotBlank(month)) {
356
			pubDate = pubDate.concat("-" + month);
357
			if (StringUtils.isNotBlank(day)) {
358
				pubDate = pubDate.concat("-" + day);
359
			} else {
360
				pubDate += "-01";
361
			}
362
		} else {
363
			pubDate += "-01-01";
364
		}
365
		if (isValidDate(pubDate)) { return pubDate; }
366
		return null;
367
	}
368

    
369
	protected static boolean isValid(final JsonObject rootElement, final Reporter context) {
370

    
371
		final String type = getStringValue(rootElement, "type");
372
		if (!typologiesMapping.containsKey(type)) {
373
			context.incrementCounter("filtered", "unknowntype_" + type, 1);
374
			return false;
375
		}
376

    
377
		if (!isValidJsonArray(rootElement, "titles")) {
378
			context.incrementCounter("filtered", "invalid_title", 1);
379
			return false;
380
		}
381
		return true;
382
	}
383

    
384
	private static boolean isValidJsonArray(final JsonObject rootElement, final String fieldName) {
385
		if (!rootElement.has(fieldName)) { return false; }
386
		final JsonElement jsonElement = rootElement.get(fieldName);
387
		if (jsonElement.isJsonNull()) { return false; }
388
		if (jsonElement.isJsonArray()) {
389
			final JsonArray jsonArray = jsonElement.getAsJsonArray();
390
			if (jsonArray.isJsonNull()) { return false; }
391
			if (jsonArray.get(0).isJsonNull()) { return false; }
392
		}
393
		return true;
394
	}
395
}
(13-13/17)