Project

General

Profile

1
package eu.dnetlib.data.mapreduce.hbase.dataimport;
2

    
3
import java.io.IOException;
4
import java.io.InputStream;
5
import java.util.ArrayList;
6
import java.util.HashMap;
7
import java.util.List;
8
import java.util.Map;
9

    
10
import com.google.gson.Gson;
11
import com.google.gson.JsonArray;
12
import com.google.gson.JsonElement;
13
import com.google.gson.JsonObject;
14
import eu.dnetlib.actionmanager.actions.ActionFactory;
15
import eu.dnetlib.actionmanager.actions.AtomicAction;
16
import eu.dnetlib.actionmanager.common.Agent;
17
import eu.dnetlib.data.mapreduce.hbase.Reporter;
18
import eu.dnetlib.data.mapreduce.util.StreamUtils;
19
import eu.dnetlib.data.proto.*;
20
import eu.dnetlib.data.proto.FieldTypeProtos.*;
21
import eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions;
22
import eu.dnetlib.miscutils.collections.Pair;
23
import org.apache.commons.io.IOUtils;
24
import org.apache.commons.lang3.StringUtils;
25

    
26
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.*;
27

    
28
public class OrcidToActions {
29

    
30
	/** Name of the ORCID data source. */
	public static final String ORCID = "ORCID";
	/** Namespace prefix used when building the identifier of a result. */
	public final static String orcidPREFIX = "orcid____";
	/** Namespace prefix used when building data source identifiers. */
	public static final String OPENAIRE_PREFIX = "openaire____";
	/** Separator placed between a namespace prefix and the local part of an identifier. */
	public static final String SEPARATOR = "::";

	/** Data source key -> pair (data source name, data source identifier). */
	private static final Map<String, Pair<String, String>> datasources = new HashMap<>();

	/**
	 * Name of an external id in the JSON record -> pair of values used to fill
	 * oaf:pid/@classid and oaf:pid/@classname. Keys are lower case.
	 */
	private static final Map<String, Pair<String, String>> externalIds = new HashMap<>();

	// Plain static initialisation instead of "double brace" initialisation, which would create one
	// anonymous HashMap subclass per map.
	static {
		// Locale.ROOT keeps the key stable whatever the default locale of the JVM is.
		datasources.put(ORCID.toLowerCase(java.util.Locale.ROOT), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));

		externalIds.put("ark", new Pair<>("ark", "ark"));
		externalIds.put("arxiv", new Pair<>("arxiv", "arXiv"));
		externalIds.put("pmc", new Pair<>("pmc", "pmc"));
		externalIds.put("pmid", new Pair<>("pmid", "pmid"));
		externalIds.put("source-work-id", new Pair<>("orcidworkid", "orcidworkid"));
		externalIds.put("urn", new Pair<>("urn", "urn"));
	}
55

    
56
	static Map<String, Map<String, String>> typologiesMapping;
57

    
58
	static {
59
		try {
60
			final InputStream is = OrcidToActions.class.getResourceAsStream("/eu/dnetlib/data/mapreduce/hbase/dataimport/mapping_typologies_orcid.json");
61
			final String tt = IOUtils.toString(is);
62
			typologiesMapping = new Gson().fromJson(tt, Map.class);
63
		} catch (final IOException e) {
64
			e.printStackTrace();
65
		}
66
	}
67

    
68
	public static final String PID_TYPES = "dnet:pid_types";
69

    
70
	public static List<AtomicAction> generatePublicationActionsFromDump(final JsonObject rootElement,
71
			final ActionFactory factory,
72
			final String setName,
73
			final Agent agent,
74
			final Reporter context) {
75

    
76
		if (!isValid(rootElement, context)) { return null; }
77

    
78
		// Create OAF proto
79

    
80
		final OafProtos.Oaf.Builder oaf = OafProtos.Oaf.newBuilder();
81

    
82
		// Adding kind
83
		oaf.setKind(KindProtos.Kind.entity);
84

    
85
		// creating result proto
86
		final OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder().setType(TypeProtos.Type.result);
87

    
88
		entity.setDateofcollection("2018-10-22");
89

    
90
		// Adding external ids
91
		StreamUtils.toStream(externalIds.keySet().iterator())
92
				.forEach(jsonExtId -> {
93
					final String classid = externalIds.get(jsonExtId.toLowerCase()).getValue();
94
					final String classname = externalIds.get(jsonExtId.toLowerCase()).getKey();
95
					final String extId = getStringValue(rootElement, jsonExtId);
96
					if (StringUtils.isNotBlank(extId)) {
97
						entity.addPid(StructuredProperty.newBuilder()
98
								.setValue(extId)
99
								.setQualifier(Qualifier.newBuilder().setClassid(classid).setClassname(classname).setSchemeid("dnet:pid_types")
100
										.setSchemename("dnet:pid_types").build())
101
								.build());
102
					}
103
				});
104

    
105
		// Create result field
106
		final ResultProtos.Result.Builder result = ResultProtos.Result.newBuilder();
107

    
108
		// Create metadata proto
109
		final ResultProtos.Result.Metadata.Builder metadata = ResultProtos.Result.Metadata.newBuilder();
110

    
111
		// Adding source
112
		final String source = getStringValue(rootElement, "source");
113
		if (StringUtils.isNotBlank(source)) {
114
			metadata.addSource(StringField.newBuilder().setValue(source).build());
115
		}
116

    
117
		// Adding title
118
		final String title = createRepeatedField(rootElement, "titles");
119
		metadata.addTitle(FieldTypeProtos.StructuredProperty.newBuilder()
120
				.setValue(title)
121
				.setQualifier(getQualifier("main title", "dnet:dataCite_title"))
122
				.build());
123

    
124
		// Adding identifier
125
		final String id = getStringValue(rootElement, "id");
126
		String sourceId = null;
127
		if (id != null) {
128
			sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, AbstractDNetXsltFunctions.md5(id));
129
		} else {
130
			sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, AbstractDNetXsltFunctions.md5(title));
131
		}
132
		entity.setId(sourceId);
133

    
134
		// Adding relevant date
135
		settingRelevantDate(rootElement, metadata, "publication_date", "issued", true);
136

    
137
		// Adding type
138
		final String type = getStringValue(rootElement, "type");
139
		String cobjValue = "";
140
		if (StringUtils.isNotBlank(type)) {
141

    
142
			metadata.setResourcetype(FieldTypeProtos.Qualifier.newBuilder()
143
					.setClassid(type)
144
					.setClassname(type)
145
					.setSchemeid("dnet:dataCite_resource")
146
					.setSchemename("dnet:dataCite_resource")
147
					.build());
148

    
149
			final String typeValue = typologiesMapping.get(type).get("value");
150
			cobjValue = typologiesMapping.get(type).get("cobj");
151
			final ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder();
152

    
153
			// Adding hostedby
154
			instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
155
					.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c")
156
					.setValue("Unknown Repository")
157
					.build());
158

    
159
			// Adding url
160
			final String url = createRepeatedField(rootElement, "urls");
161
			if (StringUtils.isNotBlank(url)) {
162
				instance.addUrl(url);
163
			}
164

    
165
			final String pubDate = getPublicationDate(rootElement, "publication_date");
166
			if (StringUtils.isNotBlank(pubDate)) {
167
				instance.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(pubDate).build());
168
			}
169

    
170
			// Adding collectedfrom
171
			final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
172
					.setValue(ORCID)
173
					.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a")
174
					.build();
175
			instance.setCollectedfrom(collectedFrom);
176

    
177
			// Adding accessright
178
			instance.setAccessright(FieldTypeProtos.Qualifier.newBuilder()
179
					.setClassid("UNKNOWN")
180
					.setClassname("UNKNOWN")
181
					.setSchemeid("dnet:access_modes")
182
					.setSchemename("dnet:access_modes")
183
					.build());
184

    
185
			// Adding type
186
			instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder()
187
					.setClassid(cobjValue)
188
					.setClassname(typeValue)
189
					.setSchemeid("dnet:publication_resource")
190
					.setSchemename("dnet:publication_resource")
191
					.build());
192

    
193
			result.addInstance(instance);
194
		}
195

    
196
		// Adding authors
197
		final List<Author> authors = createAuthors(rootElement);
198
		if (authors != null) {
199
			metadata.addAllAuthor(authors);
200
		}
201

    
202
		metadata.setResulttype(getQualifier(getDefaultResulttype(cobjValue), "dnet:result_typologies"));
203
		result.setMetadata(metadata.build());
204
		entity.setResult(result.build());
205
		oaf.setEntity(entity.build());
206

    
207
		final List<AtomicAction> actionList = new ArrayList<>();
208

    
209
		actionList.add(factory.createAtomicAction(setName, agent, oaf.getEntity().getId(), "result", "body", oaf.build().toByteArray()));
210

    
211
		//System.out.println(JsonFormat.printToString(oaf.build()));
212
		return actionList;
213

    
214
	}
215

    
216
	public static List<Author> createAuthors(final JsonObject root) {
217

    
218
		final String authorsJSONFieldName = "authors";
219

    
220
		if (root.has(authorsJSONFieldName) && root.get(authorsJSONFieldName).isJsonArray()) {
221

    
222
			final List<Author> authors = new ArrayList<>();
223
			final JsonArray jsonAuthors = root.getAsJsonArray(authorsJSONFieldName);
224
			int firstCounter = 0;
225
			int defaultCounter = 0;
226
			int rank = 1;
227
			int currentRank = 0;
228

    
229
			for (final JsonElement item : jsonAuthors) {
230
				final JsonObject author = item.getAsJsonObject();
231
				final Author.Builder result = Author.newBuilder();
232
				if (item.isJsonObject()) {
233
					final String surname = getStringValue(author, "surname");
234
					final String name = getStringValue(author, "name");
235
					final String oid = getStringValue(author, "oid");
236
					final String seq = getStringValue(author, "seq");
237
					if (StringUtils.isNotBlank(seq)) {
238
						if (seq.equals("first")) {
239
							firstCounter += 1;
240
							rank = firstCounter;
241

    
242
						} else if (seq.equals("additional")) {
243
							rank = currentRank + 1;
244
						} else {
245
							defaultCounter += 1;
246
							rank = defaultCounter;
247
						}
248
					}
249

    
250
					if (StringUtils.isNotBlank(oid)) {
251
						result.addPid(KeyValue.newBuilder()
252
								.setValue(oid)
253
								.setKey("ORCID")
254
								.build());
255
						result.setFullname(name + " " + surname);
256
						if (StringUtils.isNotBlank(name)) {
257
							result.setName(name);
258
						}
259
						if (StringUtils.isNotBlank(surname)) {
260
							result.setSurname(surname);
261
						}
262
					} else {
263
						if (StringUtils.isNotBlank(name)) {
264
							result.setFullname(name);
265
						} else {
266
							if (StringUtils.isNotBlank(surname)) {
267
								result.setFullname(surname);
268
							}
269
						}
270
					}
271
				}
272
				result.setRank(rank);
273
				authors.add(result.build());
274
				currentRank = rank;
275
			}
276
			return authors;
277

    
278
		}
279
		return null;
280
	}
281

    
282
	private static String createRepeatedField(final JsonObject rootElement, final String fieldName) {
283
		String field = "";
284
		if (!rootElement.has(fieldName)) { return null; }
285
		if (rootElement.has(fieldName) && rootElement.get(fieldName).isJsonNull()) { return null; }
286
		if (rootElement.get(fieldName).isJsonArray()) {
287
			if (!isValidJsonArray(rootElement, fieldName)) { return null; }
288
			final StringBuilder ttl = new StringBuilder();
289
			getArrayValues(rootElement, fieldName).forEach(ttl::append);
290
			field = ttl.toString();
291
		} else {
292
			field = getStringValue(rootElement, fieldName);
293
		}
294

    
295
		if (field != null && !field.isEmpty() && field.charAt(0) == '"' && field.charAt(field.length() - 1) == '"') {
296
			field = field.substring(1, field.length() - 1);
297
		}
298
		return field;
299
	}
300

    
301
	private static void settingRelevantDate(final JsonObject rootElement,
302
			final ResultProtos.Result.Metadata.Builder metadata,
303
			final String jsonKey,
304
			final String dictionaryKey,
305
			final boolean addToDateOfAcceptance) {
306

    
307
		// final JsonObject pubDateJson = rootElement.getAsJsonObject(jsonKey);
308
		// if (pubDateJson == null) { return; }
309
		// final String year = getStringValue(pubDateJson, "year");
310
		// final String month = getStringValue(pubDateJson, "month");
311
		// final String day = getStringValue(pubDateJson, "day");
312
		//
313
		// if (StringUtils.isBlank(year)) { return; }
314
		// String pubDate = "".concat(year);
315
		// if (StringUtils.isNotBlank(month)) {
316
		// pubDate = pubDate.concat("-" + month);
317
		// if (StringUtils.isNotBlank(day)) {
318
		// pubDate = pubDate.concat("-" + day);
319
		// } else {
320
		// pubDate += "-01";
321
		// }
322
		// } else {
323
		// pubDate += "-01-01";
324
		// }
325

    
326
		final String pubDate = getPublicationDate(rootElement, "publication_date");
327
		if (StringUtils.isNotBlank(pubDate)) {
328
			if (addToDateOfAcceptance) {
329
			 metadata.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(pubDate).build());
330
			}
331
			metadata.addRelevantdate(FieldTypeProtos.StructuredProperty.newBuilder()
332
					.setValue(pubDate)
333
					.setQualifier(getQualifier(dictionaryKey, "dnet:dataCite_date"))
334
					.build());
335
		}
336
	}
337

    
338
	private static String getPublicationDate(final JsonObject rootElement,
339
			final String jsonKey) {
340

    
341
		final JsonObject pubDateJson = rootElement.getAsJsonObject(jsonKey);
342
		if (pubDateJson == null) { return null; }
343
		final String year = getStringValue(pubDateJson, "year");
344
		final String month = getStringValue(pubDateJson, "month");
345
		final String day = getStringValue(pubDateJson, "day");
346

    
347
		if (StringUtils.isBlank(year)) { return null; }
348
		String pubDate = "".concat(year);
349
		if (StringUtils.isNotBlank(month)) {
350
			pubDate = pubDate.concat("-" + month);
351
			if (StringUtils.isNotBlank(day)) {
352
				pubDate = pubDate.concat("-" + day);
353
			} else {
354
				pubDate += "-01";
355
			}
356
		} else {
357
			pubDate += "-01-01";
358
		}
359
		if (isValidDate(pubDate)) { return pubDate; }
360
		return null;
361
	}
362

    
363
	protected static boolean isValid(final JsonObject rootElement, final Reporter context) {
364

    
365
		final String type = getStringValue(rootElement, "type");
366
		if (!typologiesMapping.containsKey(type)) {
367
			context.incrementCounter("filtered", "unknowntype_" + type, 1);
368
			return false;
369
		}
370

    
371
		if (!isValidJsonArray(rootElement, "titles")) {
372
			context.incrementCounter("filtered", "invalid_title", 1);
373
			return false;
374
		}
375
		return true;
376
	}
377

    
378
	private static boolean isValidJsonArray(final JsonObject rootElement, final String fieldName) {
379
		if (!rootElement.has(fieldName)) { return false; }
380
		final JsonElement jsonElement = rootElement.get(fieldName);
381
		if (jsonElement.isJsonNull()) { return false; }
382
		if (jsonElement.isJsonArray()) {
383
			final JsonArray jsonArray = jsonElement.getAsJsonArray();
384
			if (jsonArray.isJsonNull()) { return false; }
385
			if (jsonArray.get(0).isJsonNull()) { return false; }
386
		}
387
		return true;
388
	}
389
}
(14-14/18)