Project

General

Profile

1
package eu.dnetlib.data.collector.plugins.schemaorg;
2

    
3
import org.apache.commons.logging.Log;
4
import org.apache.commons.logging.LogFactory;
5
import org.json.JSONObject;
6

    
7
import java.net.URL;
8
import java.time.LocalDate;
9
import java.time.format.DateTimeFormatter;
10
import java.util.*;
11

    
12
public class DatasetMappingIterator implements Iterator<String> {
13
	// Logger is registered under this class so that mapping failures are attributed
	// to DatasetMappingIterator rather than to the endpoint iterator it wraps.
	private static final Log log = LogFactory.getLog(DatasetMappingIterator.class);
14

    
15
	public static class Options {
16
		public static class IdentifierOptions{
17
			public List<String> mappingARK;
18
			public List<String> mappingDOI;
19
			public List<String> mappingHandle;
20
			public List<String> mappingPURL;
21
			public List<String> mappingURN;
22
			public List<String> mappingURL;
23
			public DatasetDocument.Identifier.IdentifierType fallbackType;
24
			public Boolean fallbackURL;
25
		}
26

    
27
		public static class ContributorOptions{
28
			public DatasetDocument.Contributor.ContributorType fallbackType;
29
		}
30

    
31
		public static class PublicationDateOptions{
32
			public String format;
33
		}
34

    
35
		public static class CreatedDateOptions{
36
			public String format;
37
		}
38

    
39
		public static class UpdatedDateOptions{
40
			public String format;
41
		}
42

    
43
		private IdentifierOptions identifierOptions;
44
		private PublicationDateOptions publicationDateOptions;
45
		private ContributorOptions contributorOptions;
46
		private CreatedDateOptions createdDateOptions;
47
		private UpdatedDateOptions updatedDateOptions;
48

    
49
		public UpdatedDateOptions getUpdatedDateOptions() {
50
			return updatedDateOptions;
51
		}
52

    
53
		public void setUpdatedDateOptions(UpdatedDateOptions updatedDateOptions) {
54
			this.updatedDateOptions = updatedDateOptions;
55
		}
56

    
57
		public CreatedDateOptions getCreatedDateOptions() {
58
			return createdDateOptions;
59
		}
60

    
61
		public void setCreatedDateOptions(CreatedDateOptions createdDateOptions) {
62
			this.createdDateOptions = createdDateOptions;
63
		}
64

    
65
		public ContributorOptions getContributorOptions() {
66
			return contributorOptions;
67
		}
68

    
69
		public void setContributorOptions(ContributorOptions contributorOptions) {
70
			this.contributorOptions = contributorOptions;
71
		}
72

    
73
		public PublicationDateOptions getPublicationDateOptions() {
74
			return publicationDateOptions;
75
		}
76

    
77
		public void setPublicationDateOptions(PublicationDateOptions publicationDateOptions) {
78
			this.publicationDateOptions = publicationDateOptions;
79
		}
80

    
81
		public IdentifierOptions getIdentifierOptions() {
82
			return identifierOptions;
83
		}
84

    
85
		public void setIdentifierOptions(IdentifierOptions identifierOptions) {
86
			this.identifierOptions = identifierOptions;
87
		}
88
	}
89

    
90
	/** Mapping configuration; assigned once in the constructor. */
	private final Options options;
	/** Source of the JSON documents that are mapped to dataset XML; assigned once in the constructor. */
	private final EndpointAccessIterator endpointAccessIterator;

	/**
	 * Creates an iterator that maps every document supplied by the given endpoint iterator.
	 *
	 * @param options                mapping configuration read by the extract methods
	 * @param endpointAccessIterator iterator providing the JSON documents to map
	 */
	public DatasetMappingIterator(Options options, EndpointAccessIterator endpointAccessIterator) {
		this.options = options;
		this.endpointAccessIterator = endpointAccessIterator;
	}
97

    
98
	@Override
99
	public boolean hasNext() {
100
		return this.endpointAccessIterator.hasNext();
101
	}
102

    
103
	@Override
104
	public String next() {
105
		JSONObject document = this.endpointAccessIterator.next();
106
		if (document == null) return null;
107

    
108
		String xml = this.buildDataset(document);
109

    
110
		return xml;
111
	}
112

    
113
	private String buildDataset(JSONObject document){
114
		String xml = null;
115
		try{
116
			DatasetDocument dataset = new DatasetDocument();
117

    
118
			dataset.setIdentifiers(this.extractIdentifier(document));
119
			dataset.setCreators(this.extractCreator(document));
120
			dataset.setTitles(this.extractTitles(document));
121
			dataset.setAlternativeTitles(this.extractAlternateTitles(document));
122
			dataset.setPublishers(this.extractPublisher(document));
123
			dataset.setPublicationDates(this.extractPublicationDate(document));
124
			dataset.setSubjects(this.extractSubjects(document));
125
			dataset.setContributors(this.extractContributors(document));
126
			dataset.setCreatedDates(this.extractCreatedDate(document));
127
			dataset.setUpdatedDates(this.extractUpdatedDate(document));
128
			dataset.setLanguages(this.extractLanguages(document));
129
			dataset.setResourceTypes(this.extractResourceTypes(document));
130
			dataset.setAlternateIdentifier(this.extractAlternateIdentifiers(document));
131
			dataset.setCitations(this.extractCitations(document));
132
			dataset.setSizes(this.extractSize(document));
133
			dataset.setFormat(this.extractEncodingFormat(document));
134
			dataset.setVersion(this.extractVersion(document));
135
			dataset.setLicenses(this.extractLicense(document));
136
			dataset.setDescriptions(this.extractDescription(document));
137
			dataset.setDisambiguatingDescriptions(this.extractDisambiguatingDescription(document));
138
			dataset.setGeoLocations(this.extractSpatialCoverage(document));
139

    
140
			if((dataset.getIdentifiers() == null || dataset.getIdentifiers().size() == 0) &&
141
					this.options.getIdentifierOptions().fallbackURL) dataset.setIdentifiers(this.extractIdentifierFallbackURL(document));
142

    
143
			xml = dataset.toXml();
144
		}
145
		catch(Exception ex){
146
			log.error("problem constructing dataset xml. returning empty", ex);
147
			xml = null;
148
		}
149
		return xml;
150
	}
151

    
152
	private List<DatasetDocument.Identifier> extractIdentifierFallbackURL(JSONObject document){
153
		List<String> urls = JSONLDUtils.extractString(document, "url");
154

    
155
		ArrayList<DatasetDocument.Identifier> curated = new ArrayList<>();
156
		for(String item : urls){
157
			if(item == null || item.trim().length() == 0) continue;
158
			curated.add(new DatasetDocument.Identifier(DatasetDocument.Identifier.IdentifierType.URL,  item.trim()));
159
		}
160
		return curated;
161
	}
162

    
163
	private List<DatasetDocument.SpatialCoverage> extractSpatialCoverage(JSONObject document){
164
		List<JSONLDUtils.PlaceInfo> spatials = JSONLDUtils.extractPlaces(document, "spatialCoverage");
165

    
166
		ArrayList<DatasetDocument.SpatialCoverage> curated = new ArrayList<>();
167
		for(JSONLDUtils.PlaceInfo item : spatials){
168
			if((item.name == null || item.name.trim().length() == 0) &&
169
					(item.geoCoordinates == null || item.geoCoordinates.size() == 0) &&
170
					(item.geoShapes == null || item.geoShapes.size() == 0)) continue;
171

    
172
			List<DatasetDocument.SpatialCoverage.Point> points = new ArrayList<>();
173
			List<String> boxes = new ArrayList<>();
174
			if(item.geoCoordinates!=null) {
175
				for (JSONLDUtils.GeoCoordinatesInfo iter : item.geoCoordinates){
176
					points.add(new DatasetDocument.SpatialCoverage.Point(iter.latitude, iter.longitude));
177
				}
178
			}
179
			if(item.geoShapes!=null) {
180
				for (JSONLDUtils.GeoShapeInfo iter : item.geoShapes){
181
					boxes.add(iter.box);
182
				}
183
			}
184
			curated.add(new DatasetDocument.SpatialCoverage(item.name, points, boxes));
185
		}
186
		return curated;
187
	}
188

    
189
	private List<String> extractDescription(JSONObject document){
190
		List<String> descriptions = JSONLDUtils.extractString(document, "description");
191

    
192
		ArrayList<String> curated = new ArrayList<>();
193
		for(String item : descriptions){
194
			if(item == null || item.trim().length() == 0) continue;
195
			curated.add(item);
196
		}
197
		return curated;
198
	}
199

    
200
	private List<String> extractDisambiguatingDescription(JSONObject document){
201
		List<String> descriptions = JSONLDUtils.extractString(document, "disambiguatingDescription");
202

    
203
		ArrayList<String> curated = new ArrayList<>();
204
		for(String item : descriptions){
205
			if(item == null || item.trim().length() == 0) continue;
206
			curated.add(item);
207
		}
208
		return curated;
209
	}
210

    
211
	private List<DatasetDocument.License> extractLicense(JSONObject document){
212
		List<JSONLDUtils.LicenseInfo> licenses = JSONLDUtils.extractLicenses(document, "license");
213

    
214
		ArrayList<DatasetDocument.License> curated = new ArrayList<>();
215
		for(JSONLDUtils.LicenseInfo item : licenses){
216
			if(item.url == null || item.url.trim().length() == 0) continue;
217
			curated.add(new DatasetDocument.License(item.name, item.url));
218
		}
219
		return curated;
220
	}
221

    
222
	private List<String> extractVersion(JSONObject document){
223
		List<String> versions = JSONLDUtils.extractString(document, "version");
224

    
225
		ArrayList<String> curated = new ArrayList<>();
226
		for(String item : versions){
227
			if(item == null || item.trim().length() == 0) continue;
228
			curated.add(item);
229
		}
230
		return curated;
231
	}
232

    
233
	private List<String> extractSize(JSONObject document) {
234
		List<String> sizes = JSONLDUtils.extractSize(document, "distribution");
235

    
236
		HashSet<String> curated = new HashSet<>();
237
		for (String item : sizes) {
238
			if (item == null || item.trim().length() == 0) continue;
239
			curated.add(item);
240
		}
241
		return new ArrayList<>(curated);
242
	}
243

    
244
	private List<String> extractEncodingFormat(JSONObject document){
245
		List<String> formats = JSONLDUtils.extractEncodingFormat(document, "distribution");
246

    
247
		HashSet<String> curated = new HashSet<>();
248
		for(String item : formats){
249
			if(item == null || item.trim().length() == 0) continue;
250
			curated.add(item);
251
		}
252
		return new ArrayList<>(curated);
253
	}
254

    
255
	//TODO: Handle different citation types. Currently only urls
256
	private List<DatasetDocument.Citation> extractCitations(JSONObject document){
257
		List<JSONLDUtils.CitationInfo> citations = JSONLDUtils.extractCitations(document, "citation");
258

    
259
		ArrayList<DatasetDocument.Citation> curated = new ArrayList<>();
260
		for(JSONLDUtils.CitationInfo item : citations){
261
			if(item.url == null || item.url.trim().length() == 0) continue;
262
			try{
263
				new URL(item.url);
264
			}catch (Exception ex){
265
				continue;
266
			}
267
			curated.add(new DatasetDocument.Citation(item.url, DatasetDocument.Citation.CitationIdentifierType.URL));
268
		}
269
		return curated;
270
	}
271

    
272
	private List<DatasetDocument.AlternateIdentifier> extractAlternateIdentifiers(JSONObject document){
273
		List<String> issns = JSONLDUtils.extractString(document, "issn");
274
		List<String> urls = JSONLDUtils.extractString(document, "url");
275

    
276
		ArrayList<DatasetDocument.AlternateIdentifier> curated = new ArrayList<>();
277
		for(String item : issns){
278
			if(item == null || item.trim().length() == 0) continue;
279
			curated.add(new DatasetDocument.AlternateIdentifier(item.trim(), "ISSN"));
280
		}
281
		for(String item : urls){
282
			if(item == null || item.trim().length() == 0) continue;
283
			curated.add(new DatasetDocument.AlternateIdentifier(item.trim(), "URL"));
284
		}
285
		return curated;
286
	}
287

    
288
	private List<DatasetDocument.ResourceType> extractResourceTypes(JSONObject document){
289
		List<DatasetDocument.ResourceType> resourceTypes = new ArrayList<>();
290
		resourceTypes.add(new DatasetDocument.ResourceType(DatasetDocument.ResourceType.ResourceTypeGeneralType.Dataset));
291
		return resourceTypes;
292
	}
293

    
294
	private List<String> extractLanguages(JSONObject document){
295
		List<String> languages = JSONLDUtils.extractLanguage(document, "inLanguage");
296

    
297
		ArrayList<String> curated = new ArrayList<>();
298
		for(String item : languages){
299
			if(item == null || item.trim().length() == 0) continue;
300
			curated.add(item);
301
		}
302
		return curated;
303
	}
304

    
305
	private List<LocalDate> extractUpdatedDate(JSONObject document){
306
		List<LocalDate> updatedDates = new ArrayList<>();
307
		if(this.options.getUpdatedDateOptions() == null || this.options.getUpdatedDateOptions().format == null || this.options.getUpdatedDateOptions().format.length() == 0) return updatedDates;
308

    
309
		DateTimeFormatter formatter = DateTimeFormatter.ofPattern(this.options.getPublicationDateOptions().format);
310

    
311
		List<String> dates = JSONLDUtils.extractString(document, "dateModified");
312
		for(String updatedDate : dates){
313
			if(updatedDate == null || updatedDate.trim().length() == 0) continue;
314
			try {
315
				LocalDate localDate = LocalDate.parse(updatedDate, formatter);
316
				updatedDates.add(localDate);
317
			} catch (Exception e) {
318
				continue;
319
			}
320
		}
321
		return updatedDates;
322
	}
323

    
324
	private List<LocalDate> extractCreatedDate(JSONObject document){
325
		List<LocalDate> createdDates = new ArrayList<>();
326
		if(this.options.getCreatedDateOptions() == null || this.options.getCreatedDateOptions().format == null || this.options.getCreatedDateOptions().format.length() == 0) return createdDates;
327

    
328
		DateTimeFormatter formatter = DateTimeFormatter.ofPattern(this.options.getCreatedDateOptions().format);
329

    
330
		List<String> dates = JSONLDUtils.extractString(document, "dateCreated");
331
		for(String createdDate : dates){
332
			if(createdDate == null || createdDate.trim().length() == 0) continue;
333
			try {
334
				LocalDate localDate = LocalDate.parse(createdDate, formatter);
335
				createdDates.add(localDate);
336
			} catch (Exception e) {
337
				continue;
338
			}
339
		}
340
		return createdDates;
341
	}
342

    
343
	private List<DatasetDocument.Contributor> extractContributors(JSONObject document){
344
		List<JSONLDUtils.PrincipalInfo> editors = JSONLDUtils.extractPrincipal(document, "editor");
345
		List<JSONLDUtils.PrincipalInfo> funders = JSONLDUtils.extractPrincipal(document, "funder");
346
		List<JSONLDUtils.PrincipalInfo> producers = JSONLDUtils.extractPrincipal(document, "producer");
347
		List<JSONLDUtils.PrincipalInfo> sponsors = JSONLDUtils.extractPrincipal(document, "sponsor");
348
		List<JSONLDUtils.PrincipalInfo> constributors = JSONLDUtils.extractPrincipal(document, "contributor");
349

    
350
		ArrayList<DatasetDocument.Contributor> curated = new ArrayList<>();
351
		for(JSONLDUtils.PrincipalInfo item : editors){
352
			if(item.name() == null || item.name().trim().length() == 0) continue;
353
			curated.add(new DatasetDocument.Contributor(item.name(), item.affiliationNames(), DatasetDocument.Contributor.ContributorType.Editor));
354
		}
355
		for(JSONLDUtils.PrincipalInfo item : funders){
356
			if(item.name() == null || item.name().trim().length() == 0) continue;
357
			curated.add(new DatasetDocument.Contributor(item.name(), item.affiliationNames(), DatasetDocument.Contributor.ContributorType.Funder));
358
		}
359
		for(JSONLDUtils.PrincipalInfo item : producers){
360
			if(item.name() == null || item.name().trim().length() == 0) continue;
361
			curated.add(new DatasetDocument.Contributor(item.name(), item.affiliationNames(), DatasetDocument.Contributor.ContributorType.Producer));
362
		}
363
		for(JSONLDUtils.PrincipalInfo item : sponsors){
364
			if(item.name() == null || item.name().trim().length() == 0) continue;
365
			curated.add(new DatasetDocument.Contributor(item.name(), item.affiliationNames(), DatasetDocument.Contributor.ContributorType.Sponsor));
366
		}
367
		for(JSONLDUtils.PrincipalInfo item : constributors){
368
			if(item.name() == null || item.name().trim().length() == 0) continue;
369
			DatasetDocument.Contributor.ContributorType type = DatasetDocument.Contributor.ContributorType.Other;
370
			if(this.options.getContributorOptions()!=null && this.options.getContributorOptions().fallbackType != null) type = this.options.getContributorOptions().fallbackType;
371
			curated.add(new DatasetDocument.Contributor(item.name(), item.affiliationNames(), type));
372
		}
373
		return curated;
374
	}
375

    
376
	private List<String> extractSubjects(JSONObject document){
377
		List<String> subjects = JSONLDUtils.extractString(document, "keywords");
378

    
379
		ArrayList<String> curated = new ArrayList<>();
380
		for(String item : subjects){
381
			if(item == null || item.trim().length() == 0) continue;
382
			curated.add(item);
383
		}
384
		return curated;
385
	}
386

    
387
	private List<LocalDate> extractPublicationDate(JSONObject document){
388
		List<LocalDate> publicationDates = new ArrayList<>();
389
		if(this.options.getPublicationDateOptions() == null || this.options.getPublicationDateOptions().format == null || this.options.getPublicationDateOptions().format.length() == 0) return publicationDates;
390

    
391
		DateTimeFormatter formatter = DateTimeFormatter.ofPattern(this.options.getPublicationDateOptions().format);
392

    
393
		List<String> dates = JSONLDUtils.extractString(document, "datePublished");
394
		for(String publicationDate : dates){
395
			if(publicationDate == null || publicationDate.trim().length() == 0) continue;
396
			try {
397
				LocalDate localDate = LocalDate.parse(publicationDate, formatter);
398
				publicationDates.add(localDate);
399
			} catch (Exception e) {
400
				continue;
401
			}
402
		}
403
		return publicationDates;
404
	}
405

    
406
	private List<String> extractPublisher(JSONObject document){
407
		List<JSONLDUtils.PrincipalInfo> publishers = JSONLDUtils.extractPrincipal(document, "publisher");
408

    
409
		ArrayList<String> curated = new ArrayList<>();
410
		for(JSONLDUtils.PrincipalInfo item : publishers){
411
			if(item.name() == null || item.name().trim().length() == 0) continue;
412
			curated.add(item.name());
413
		}
414
		return curated;
415
	}
416

    
417
	private List<String> extractTitles(JSONObject document){
418
		List<String> names = JSONLDUtils.extractString(document, "name");
419
		List<String> headlines = JSONLDUtils.extractString(document, "headline");
420

    
421
		HashSet<String> titles = new HashSet<>();
422
		titles.addAll(names);
423
		titles.addAll(headlines);
424
		return new ArrayList<>(titles);
425
	}
426

    
427
	private List<String> extractAlternateTitles(JSONObject document){
428
		List<String> names = JSONLDUtils.extractString(document, "alternateName");
429
		List<String> headlines = JSONLDUtils.extractString(document, "alternativeHeadline");
430

    
431
		HashSet<String> titles = new HashSet<>();
432
		titles.addAll(names);
433
		titles.addAll(headlines);
434
		return new ArrayList<>(titles);
435
	}
436

    
437
	private List<DatasetDocument.Identifier> extractIdentifier(JSONObject document){
438
		List<DatasetDocument.Identifier> curated = new ArrayList<>();
439

    
440
		List<JSONLDUtils.IdentifierInfo> identifiers = JSONLDUtils.extractIdentifier(document, "identifier");
441

    
442
		for(JSONLDUtils.IdentifierInfo item : identifiers){
443
			if(item.value == null || item.value.trim().length() == 0) continue;
444
			if(item.type == null || item.type.trim().length() == 0) {
445
				if (this.options.getIdentifierOptions().fallbackType == null) continue;
446
				curated.add(new DatasetDocument.Identifier(this.options.getIdentifierOptions().fallbackType, item.value.trim()));
447
			}
448
			else {
449
				DatasetDocument.Identifier.IdentifierType type = null;
450
				if(this.options.getIdentifierOptions().mappingARK != null && this.options.getIdentifierOptions().mappingARK.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.ARK;
451
				else if(this.options.getIdentifierOptions().mappingDOI != null && this.options.getIdentifierOptions().mappingDOI.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.DOI;
452
				else if(this.options.getIdentifierOptions().mappingHandle != null && this.options.getIdentifierOptions().mappingHandle.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.Handle;
453
				else if(this.options.getIdentifierOptions().mappingPURL != null && this.options.getIdentifierOptions().mappingPURL.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.PURL;
454
				else if(this.options.getIdentifierOptions().mappingURL != null && this.options.getIdentifierOptions().mappingURL.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.URL;
455
				else if(this.options.getIdentifierOptions().mappingURN != null && this.options.getIdentifierOptions().mappingURN.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.URN;
456

    
457
				if(type == null) continue;
458
				curated.add(new DatasetDocument.Identifier(type, item.value.trim()));
459
			}
460
		}
461
		return curated;
462
	}
463

    
464
	private List<DatasetDocument.Creator> extractCreator(JSONObject document){
465
		List<JSONLDUtils.PrincipalInfo> creators = JSONLDUtils.extractPrincipal(document, "creator");
466
		List<JSONLDUtils.PrincipalInfo> authors = JSONLDUtils.extractPrincipal(document, "author");
467

    
468
		HashSet<String> foundNames = new HashSet<>();
469
		List<DatasetDocument.Creator> curated = new ArrayList<>();
470
		for(JSONLDUtils.PrincipalInfo item : creators){
471
			if(item.name() == null || item.name().trim().length() == 0) continue;
472
			if(foundNames.contains(item.name())) continue;
473
			foundNames.add(item.name());
474
			curated.add(new DatasetDocument.Creator(item.name(), item.affiliationNames()));
475
		}
476
		for(JSONLDUtils.PrincipalInfo item : authors){
477
			if(item.name() == null || item.name().trim().length() == 0) continue;
478
			if(foundNames.contains(item.name())) continue;
479
			foundNames.add(item.name());
480

    
481
			curated.add(new DatasetDocument.Creator(item.name(), item.affiliationNames()));
482
		}
483
		return curated;
484
	}
485

    
486
}
(2-2/9)