Project

General

Profile

1
package eu.dnetlib.data.collector.plugins.schemaorg;
2

    
3
import org.apache.commons.logging.Log;
4
import org.apache.commons.logging.LogFactory;
5
import org.json.JSONObject;
6

    
7
import java.net.URL;
8
import java.time.LocalDate;
9
import java.time.format.DateTimeFormatter;
10
import java.util.*;
11

    
12
public class DatasetMappingIterator implements Iterator<String> {
13
	private static final Log log = LogFactory.getLog(EndpointAccessIterator.class);
14

    
15
	public static class Options {
16
		public static class IdentifierOptions{
17
			public List<String> mappingARK;
18
			public List<String> mappingDOI;
19
			public List<String> mappingHandle;
20
			public List<String> mappingPURL;
21
			public List<String> mappingURN;
22
			public List<String> mappingURL;
23
			public DatasetDocument.Identifier.IdentifierType fallbackType;
24
			public Boolean fallbackURL;
25
		}
26

    
27
		public static class ContributorOptions{
28
			public DatasetDocument.Contributor.ContributorType fallbackType;
29
		}
30

    
31
		public static class PublicationDateOptions{
32
			public String format;
33
		}
34

    
35
		public static class CreatedDateOptions{
36
			public String format;
37
		}
38

    
39
		public static class UpdatedDateOptions{
40
			public String format;
41
		}
42

    
43
		private IdentifierOptions identifierOptions;
44
		private PublicationDateOptions publicationDateOptions;
45
		private ContributorOptions contributorOptions;
46
		private CreatedDateOptions createdDateOptions;
47
		private UpdatedDateOptions updatedDateOptions;
48

    
49
		public UpdatedDateOptions getUpdatedDateOptions() {
50
			return updatedDateOptions;
51
		}
52

    
53
		public void setUpdatedDateOptions(UpdatedDateOptions updatedDateOptions) {
54
			this.updatedDateOptions = updatedDateOptions;
55
		}
56

    
57
		public CreatedDateOptions getCreatedDateOptions() {
58
			return createdDateOptions;
59
		}
60

    
61
		public void setCreatedDateOptions(CreatedDateOptions createdDateOptions) {
62
			this.createdDateOptions = createdDateOptions;
63
		}
64

    
65
		public ContributorOptions getContributorOptions() {
66
			return contributorOptions;
67
		}
68

    
69
		public void setContributorOptions(ContributorOptions contributorOptions) {
70
			this.contributorOptions = contributorOptions;
71
		}
72

    
73
		public PublicationDateOptions getPublicationDateOptions() {
74
			return publicationDateOptions;
75
		}
76

    
77
		public void setPublicationDateOptions(PublicationDateOptions publicationDateOptions) {
78
			this.publicationDateOptions = publicationDateOptions;
79
		}
80

    
81
		public IdentifierOptions getIdentifierOptions() {
82
			return identifierOptions;
83
		}
84

    
85
		public void setIdentifierOptions(IdentifierOptions identifierOptions) {
86
			this.identifierOptions = identifierOptions;
87
		}
88
	}
89

    
90
	private Options options;
91
	private EndpointAccessIterator endpointAccessIterator;
92

    
93
	public DatasetMappingIterator(Options options, EndpointAccessIterator endpointAccessIterator) {
94
		this.options = options;
95
		this.endpointAccessIterator = endpointAccessIterator;
96
	}
97

    
98
	@Override
99
	public boolean hasNext() {
100
		return this.endpointAccessIterator.hasNext();
101
	}
102

    
103
	@Override
104
	public String next() {
105
		JSONObject document = this.endpointAccessIterator.next();
106
		String xml = null;
107
		if (document == null) {
108
			log.debug("no document provided to process. returning empty");
109
			xml = DatasetDocument.emptyXml();
110
		}
111
		else {
112
			log.debug("building document");
113
			xml = this.buildDataset(document);
114
			if (!Utils.validateXml(xml)) {
115
				log.debug("xml not valid. setting to empty");
116
				xml = null;
117
			}
118
			if (xml == null) {
119
				log.debug("could not build xml. returning empty");
120
				xml = DatasetDocument.emptyXml();
121
			}
122
		}
123

    
124
		//if all else fails
125
		if(xml == null){
126
			log.debug("could not build xml. returning empty");
127
			xml = "<dataset/>";
128
		}
129

    
130
		log.debug("xml document for dataset is: "+xml);
131

    
132
		return xml;
133
	}
134

    
135
	private String buildDataset(JSONObject document){
136
		String xml = null;
137
		try{
138
			DatasetDocument dataset = new DatasetDocument();
139

    
140
			dataset.setIdentifiers(this.extractIdentifier(document));
141
			dataset.setCreators(this.extractCreator(document));
142
			dataset.setTitles(this.extractTitles(document));
143
			dataset.setAlternativeTitles(this.extractAlternateTitles(document));
144
			dataset.setPublishers(this.extractPublisher(document));
145
			dataset.setPublicationDates(this.extractPublicationDate(document));
146
			dataset.setSubjects(this.extractSubjects(document));
147
			dataset.setContributors(this.extractContributors(document));
148
			dataset.setCreatedDates(this.extractCreatedDate(document));
149
			dataset.setUpdatedDates(this.extractUpdatedDate(document));
150
			dataset.setLanguages(this.extractLanguages(document));
151
			dataset.setResourceTypes(this.extractResourceTypes(document));
152
			dataset.setAlternateIdentifier(this.extractAlternateIdentifiers(document));
153
			dataset.setCitations(this.extractCitations(document));
154
			dataset.setSizes(this.extractSize(document));
155
			dataset.setFormat(this.extractEncodingFormat(document));
156
			dataset.setVersion(this.extractVersion(document));
157
			dataset.setLicenses(this.extractLicense(document));
158
			dataset.setDescriptions(this.extractDescription(document));
159
			dataset.setDisambiguatingDescriptions(this.extractDisambiguatingDescription(document));
160
			dataset.setGeoLocations(this.extractSpatialCoverage(document));
161

    
162
			log.debug("document contains native identifier: : "+(dataset.getIdentifiers()!=null && dataset.getIdentifiers().size() > 0));
163

    
164
			if((dataset.getIdentifiers() == null || dataset.getIdentifiers().size() == 0) &&
165
					this.options.getIdentifierOptions().fallbackURL){
166
				log.debug("falling back to url identifier");
167
				dataset.setIdentifiers(this.extractIdentifierFallbackURL(document));
168
				log.debug("document contains overridden identifier: : "+(dataset.getIdentifiers()!=null && dataset.getIdentifiers().size() > 0));
169
			}
170

    
171
			xml = dataset.toXml();
172
		}
173
		catch(Exception ex){
174
			log.error("problem constructing dataset xml. returning empty", ex);
175
			xml = null;
176
		}
177
		return xml;
178
	}
179

    
180
	private List<DatasetDocument.Identifier> extractIdentifierFallbackURL(JSONObject document){
181
		List<String> urls = JSONLDUtils.extractString(document, "url");
182

    
183
		ArrayList<DatasetDocument.Identifier> curated = new ArrayList<>();
184
		for(String item : urls){
185
			if(item == null || item.trim().length() == 0) continue;
186
			curated.add(new DatasetDocument.Identifier(DatasetDocument.Identifier.IdentifierType.URL,  item.trim()));
187
		}
188
		return curated;
189
	}
190

    
191
	private List<DatasetDocument.SpatialCoverage> extractSpatialCoverage(JSONObject document){
192
		List<JSONLDUtils.PlaceInfo> spatials = JSONLDUtils.extractPlaces(document, "spatialCoverage");
193

    
194
		ArrayList<DatasetDocument.SpatialCoverage> curated = new ArrayList<>();
195
		for(JSONLDUtils.PlaceInfo item : spatials){
196
			if((item.name == null || item.name.trim().length() == 0) &&
197
					(item.geoCoordinates == null || item.geoCoordinates.size() == 0) &&
198
					(item.geoShapes == null || item.geoShapes.size() == 0)) continue;
199

    
200
			List<DatasetDocument.SpatialCoverage.Point> points = new ArrayList<>();
201
			List<String> boxes = new ArrayList<>();
202
			if(item.geoCoordinates!=null) {
203
				for (JSONLDUtils.GeoCoordinatesInfo iter : item.geoCoordinates){
204
					points.add(new DatasetDocument.SpatialCoverage.Point(iter.latitude, iter.longitude));
205
				}
206
			}
207
			if(item.geoShapes!=null) {
208
				for (JSONLDUtils.GeoShapeInfo iter : item.geoShapes){
209
					boxes.add(iter.box);
210
				}
211
			}
212
			curated.add(new DatasetDocument.SpatialCoverage(item.name, points, boxes));
213
		}
214
		return curated;
215
	}
216

    
217
	private List<String> extractDescription(JSONObject document){
218
		List<String> descriptions = JSONLDUtils.extractString(document, "description");
219

    
220
		ArrayList<String> curated = new ArrayList<>();
221
		for(String item : descriptions){
222
			if(item == null || item.trim().length() == 0) continue;
223
			curated.add(item);
224
		}
225
		return curated;
226
	}
227

    
228
	private List<String> extractDisambiguatingDescription(JSONObject document){
229
		List<String> descriptions = JSONLDUtils.extractString(document, "disambiguatingDescription");
230

    
231
		ArrayList<String> curated = new ArrayList<>();
232
		for(String item : descriptions){
233
			if(item == null || item.trim().length() == 0) continue;
234
			curated.add(item);
235
		}
236
		return curated;
237
	}
238

    
239
	private List<DatasetDocument.License> extractLicense(JSONObject document){
240
		List<JSONLDUtils.LicenseInfo> licenses = JSONLDUtils.extractLicenses(document, "license");
241

    
242
		ArrayList<DatasetDocument.License> curated = new ArrayList<>();
243
		for(JSONLDUtils.LicenseInfo item : licenses){
244
			if(item.url == null || item.url.trim().length() == 0) continue;
245
			curated.add(new DatasetDocument.License(item.name, item.url));
246
		}
247
		return curated;
248
	}
249

    
250
	private List<String> extractVersion(JSONObject document){
251
		List<String> versions = JSONLDUtils.extractString(document, "version");
252

    
253
		ArrayList<String> curated = new ArrayList<>();
254
		for(String item : versions){
255
			if(item == null || item.trim().length() == 0) continue;
256
			curated.add(item);
257
		}
258
		return curated;
259
	}
260

    
261
	private List<String> extractSize(JSONObject document) {
262
		List<String> sizes = JSONLDUtils.extractSize(document, "distribution");
263

    
264
		HashSet<String> curated = new HashSet<>();
265
		for (String item : sizes) {
266
			if (item == null || item.trim().length() == 0) continue;
267
			curated.add(item);
268
		}
269
		return new ArrayList<>(curated);
270
	}
271

    
272
	private List<String> extractEncodingFormat(JSONObject document){
273
		List<String> formats = JSONLDUtils.extractEncodingFormat(document, "distribution");
274

    
275
		HashSet<String> curated = new HashSet<>();
276
		for(String item : formats){
277
			if(item == null || item.trim().length() == 0) continue;
278
			curated.add(item);
279
		}
280
		return new ArrayList<>(curated);
281
	}
282

    
283
	//TODO: Handle different citation types. Currently only urls
284
	private List<DatasetDocument.Citation> extractCitations(JSONObject document){
285
		List<JSONLDUtils.CitationInfo> citations = JSONLDUtils.extractCitations(document, "citation");
286

    
287
		ArrayList<DatasetDocument.Citation> curated = new ArrayList<>();
288
		for(JSONLDUtils.CitationInfo item : citations){
289
			if(item.url == null || item.url.trim().length() == 0) continue;
290
			try{
291
				new URL(item.url);
292
			}catch (Exception ex){
293
				continue;
294
			}
295
			curated.add(new DatasetDocument.Citation(item.url, DatasetDocument.Citation.CitationIdentifierType.URL));
296
		}
297
		return curated;
298
	}
299

    
300
	private List<DatasetDocument.AlternateIdentifier> extractAlternateIdentifiers(JSONObject document){
301
		List<String> issns = JSONLDUtils.extractString(document, "issn");
302
		List<String> urls = JSONLDUtils.extractString(document, "url");
303

    
304
		ArrayList<DatasetDocument.AlternateIdentifier> curated = new ArrayList<>();
305
		for(String item : issns){
306
			if(item == null || item.trim().length() == 0) continue;
307
			curated.add(new DatasetDocument.AlternateIdentifier(item.trim(), "ISSN"));
308
		}
309
		for(String item : urls){
310
			if(item == null || item.trim().length() == 0) continue;
311
			curated.add(new DatasetDocument.AlternateIdentifier(item.trim(), "URL"));
312
		}
313
		return curated;
314
	}
315

    
316
	private List<DatasetDocument.ResourceType> extractResourceTypes(JSONObject document){
317
		List<DatasetDocument.ResourceType> resourceTypes = new ArrayList<>();
318
		resourceTypes.add(new DatasetDocument.ResourceType(DatasetDocument.ResourceType.ResourceTypeGeneralType.Dataset));
319
		return resourceTypes;
320
	}
321

    
322
	private List<String> extractLanguages(JSONObject document){
323
		List<String> languages = JSONLDUtils.extractLanguage(document, "inLanguage");
324

    
325
		ArrayList<String> curated = new ArrayList<>();
326
		for(String item : languages){
327
			if(item == null || item.trim().length() == 0) continue;
328
			curated.add(item);
329
		}
330
		return curated;
331
	}
332

    
333
	private List<LocalDate> extractUpdatedDate(JSONObject document){
334
		List<LocalDate> updatedDates = new ArrayList<>();
335
		if(this.options.getUpdatedDateOptions() == null || this.options.getUpdatedDateOptions().format == null || this.options.getUpdatedDateOptions().format.length() == 0) return updatedDates;
336

    
337
		DateTimeFormatter formatter = DateTimeFormatter.ofPattern(this.options.getPublicationDateOptions().format);
338

    
339
		List<String> dates = JSONLDUtils.extractString(document, "dateModified");
340
		for(String updatedDate : dates){
341
			if(updatedDate == null || updatedDate.trim().length() == 0) continue;
342
			try {
343
				LocalDate localDate = LocalDate.parse(updatedDate, formatter);
344
				updatedDates.add(localDate);
345
			} catch (Exception e) {
346
				continue;
347
			}
348
		}
349
		return updatedDates;
350
	}
351

    
352
	private List<LocalDate> extractCreatedDate(JSONObject document){
353
		List<LocalDate> createdDates = new ArrayList<>();
354
		if(this.options.getCreatedDateOptions() == null || this.options.getCreatedDateOptions().format == null || this.options.getCreatedDateOptions().format.length() == 0) return createdDates;
355

    
356
		DateTimeFormatter formatter = DateTimeFormatter.ofPattern(this.options.getCreatedDateOptions().format);
357

    
358
		List<String> dates = JSONLDUtils.extractString(document, "dateCreated");
359
		for(String createdDate : dates){
360
			if(createdDate == null || createdDate.trim().length() == 0) continue;
361
			try {
362
				LocalDate localDate = LocalDate.parse(createdDate, formatter);
363
				createdDates.add(localDate);
364
			} catch (Exception e) {
365
				continue;
366
			}
367
		}
368
		return createdDates;
369
	}
370

    
371
	private List<DatasetDocument.Contributor> extractContributors(JSONObject document){
372
		List<JSONLDUtils.PrincipalInfo> editors = JSONLDUtils.extractPrincipal(document, "editor");
373
		List<JSONLDUtils.PrincipalInfo> funders = JSONLDUtils.extractPrincipal(document, "funder");
374
		List<JSONLDUtils.PrincipalInfo> producers = JSONLDUtils.extractPrincipal(document, "producer");
375
		List<JSONLDUtils.PrincipalInfo> sponsors = JSONLDUtils.extractPrincipal(document, "sponsor");
376
		List<JSONLDUtils.PrincipalInfo> constributors = JSONLDUtils.extractPrincipal(document, "contributor");
377

    
378
		ArrayList<DatasetDocument.Contributor> curated = new ArrayList<>();
379
		for(JSONLDUtils.PrincipalInfo item : editors){
380
			if(item.name() == null || item.name().trim().length() == 0) continue;
381
			curated.add(new DatasetDocument.Contributor(item.name(), item.affiliationNames(), DatasetDocument.Contributor.ContributorType.Editor));
382
		}
383
		for(JSONLDUtils.PrincipalInfo item : funders){
384
			if(item.name() == null || item.name().trim().length() == 0) continue;
385
			curated.add(new DatasetDocument.Contributor(item.name(), item.affiliationNames(), DatasetDocument.Contributor.ContributorType.Funder));
386
		}
387
		for(JSONLDUtils.PrincipalInfo item : producers){
388
			if(item.name() == null || item.name().trim().length() == 0) continue;
389
			curated.add(new DatasetDocument.Contributor(item.name(), item.affiliationNames(), DatasetDocument.Contributor.ContributorType.Producer));
390
		}
391
		for(JSONLDUtils.PrincipalInfo item : sponsors){
392
			if(item.name() == null || item.name().trim().length() == 0) continue;
393
			curated.add(new DatasetDocument.Contributor(item.name(), item.affiliationNames(), DatasetDocument.Contributor.ContributorType.Sponsor));
394
		}
395
		for(JSONLDUtils.PrincipalInfo item : constributors){
396
			if(item.name() == null || item.name().trim().length() == 0) continue;
397
			DatasetDocument.Contributor.ContributorType type = DatasetDocument.Contributor.ContributorType.Other;
398
			if(this.options.getContributorOptions()!=null && this.options.getContributorOptions().fallbackType != null) type = this.options.getContributorOptions().fallbackType;
399
			curated.add(new DatasetDocument.Contributor(item.name(), item.affiliationNames(), type));
400
		}
401
		return curated;
402
	}
403

    
404
	private List<String> extractSubjects(JSONObject document){
405
		List<String> subjects = JSONLDUtils.extractString(document, "keywords");
406

    
407
		ArrayList<String> curated = new ArrayList<>();
408
		for(String item : subjects){
409
			if(item == null || item.trim().length() == 0) continue;
410
			curated.add(item);
411
		}
412
		return curated;
413
	}
414

    
415
	private List<LocalDate> extractPublicationDate(JSONObject document){
416
		List<LocalDate> publicationDates = new ArrayList<>();
417
		if(this.options.getPublicationDateOptions() == null || this.options.getPublicationDateOptions().format == null || this.options.getPublicationDateOptions().format.length() == 0) return publicationDates;
418

    
419
		DateTimeFormatter formatter = DateTimeFormatter.ofPattern(this.options.getPublicationDateOptions().format);
420

    
421
		List<String> dates = JSONLDUtils.extractString(document, "datePublished");
422
		for(String publicationDate : dates){
423
			if(publicationDate == null || publicationDate.trim().length() == 0) continue;
424
			try {
425
				LocalDate localDate = LocalDate.parse(publicationDate, formatter);
426
				publicationDates.add(localDate);
427
			} catch (Exception e) {
428
				continue;
429
			}
430
		}
431
		return publicationDates;
432
	}
433

    
434
	private List<String> extractPublisher(JSONObject document){
435
		List<JSONLDUtils.PrincipalInfo> publishers = JSONLDUtils.extractPrincipal(document, "publisher");
436

    
437
		ArrayList<String> curated = new ArrayList<>();
438
		for(JSONLDUtils.PrincipalInfo item : publishers){
439
			if(item.name() == null || item.name().trim().length() == 0) continue;
440
			curated.add(item.name());
441
		}
442
		return curated;
443
	}
444

    
445
	private List<String> extractTitles(JSONObject document){
446
		List<String> names = JSONLDUtils.extractString(document, "name");
447
		List<String> headlines = JSONLDUtils.extractString(document, "headline");
448

    
449
		HashSet<String> titles = new HashSet<>();
450
		titles.addAll(names);
451
		titles.addAll(headlines);
452
		return new ArrayList<>(titles);
453
	}
454

    
455
	private List<String> extractAlternateTitles(JSONObject document){
456
		List<String> names = JSONLDUtils.extractString(document, "alternateName");
457
		List<String> headlines = JSONLDUtils.extractString(document, "alternativeHeadline");
458

    
459
		HashSet<String> titles = new HashSet<>();
460
		titles.addAll(names);
461
		titles.addAll(headlines);
462
		return new ArrayList<>(titles);
463
	}
464

    
465
	private List<DatasetDocument.Identifier> extractIdentifier(JSONObject document){
466
		List<DatasetDocument.Identifier> curated = new ArrayList<>();
467

    
468
		List<JSONLDUtils.IdentifierInfo> identifiers = JSONLDUtils.extractIdentifier(document, "identifier");
469

    
470
		for(JSONLDUtils.IdentifierInfo item : identifiers){
471
			if(item.value == null || item.value.trim().length() == 0) continue;
472
			if(item.type == null || item.type.trim().length() == 0) {
473
				if (this.options.getIdentifierOptions().fallbackType == null) continue;
474
				curated.add(new DatasetDocument.Identifier(this.options.getIdentifierOptions().fallbackType, item.value.trim()));
475
			}
476
			else {
477
				DatasetDocument.Identifier.IdentifierType type = null;
478
				if(this.options.getIdentifierOptions().mappingARK != null && this.options.getIdentifierOptions().mappingARK.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.ARK;
479
				else if(this.options.getIdentifierOptions().mappingDOI != null && this.options.getIdentifierOptions().mappingDOI.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.DOI;
480
				else if(this.options.getIdentifierOptions().mappingHandle != null && this.options.getIdentifierOptions().mappingHandle.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.Handle;
481
				else if(this.options.getIdentifierOptions().mappingPURL != null && this.options.getIdentifierOptions().mappingPURL.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.PURL;
482
				else if(this.options.getIdentifierOptions().mappingURL != null && this.options.getIdentifierOptions().mappingURL.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.URL;
483
				else if(this.options.getIdentifierOptions().mappingURN != null && this.options.getIdentifierOptions().mappingURN.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.URN;
484

    
485
				if(type == null) continue;
486
				curated.add(new DatasetDocument.Identifier(type, item.value.trim()));
487
			}
488
		}
489
		return curated;
490
	}
491

    
492
	private List<DatasetDocument.Creator> extractCreator(JSONObject document){
493
		List<JSONLDUtils.PrincipalInfo> creators = JSONLDUtils.extractPrincipal(document, "creator");
494
		List<JSONLDUtils.PrincipalInfo> authors = JSONLDUtils.extractPrincipal(document, "author");
495

    
496
		HashSet<String> foundNames = new HashSet<>();
497
		List<DatasetDocument.Creator> curated = new ArrayList<>();
498
		for(JSONLDUtils.PrincipalInfo item : creators){
499
			if(item.name() == null || item.name().trim().length() == 0) continue;
500
			if(foundNames.contains(item.name())) continue;
501
			foundNames.add(item.name());
502
			curated.add(new DatasetDocument.Creator(item.name(), item.affiliationNames()));
503
		}
504
		for(JSONLDUtils.PrincipalInfo item : authors){
505
			if(item.name() == null || item.name().trim().length() == 0) continue;
506
			if(foundNames.contains(item.name())) continue;
507
			foundNames.add(item.name());
508

    
509
			curated.add(new DatasetDocument.Creator(item.name(), item.affiliationNames()));
510
		}
511
		return curated;
512
	}
513

    
514
}
(2-2/11)