1
|
package eu.dnetlib.data.collector.plugins.schemaorg;
|
2
|
|
3
|
import org.apache.commons.logging.Log;
|
4
|
import org.apache.commons.logging.LogFactory;
|
5
|
import org.json.JSONObject;
|
6
|
|
7
|
import java.net.URL;
|
8
|
import java.time.LocalDate;
|
9
|
import java.time.format.DateTimeFormatter;
|
10
|
import java.util.*;
|
11
|
|
12
|
public class DatasetMappingIterator implements Iterator<String> {
|
13
|
// Logger for this class. It is registered under DatasetMappingIterator so that its
// messages are not attributed to (and filtered as) EndpointAccessIterator output.
private static final Log log = LogFactory.getLog(DatasetMappingIterator.class);
|
14
|
|
15
|
public static class Options {
|
16
|
public static class IdentifierOptions{
|
17
|
public List<String> mappingARK;
|
18
|
public List<String> mappingDOI;
|
19
|
public List<String> mappingHandle;
|
20
|
public List<String> mappingPURL;
|
21
|
public List<String> mappingURN;
|
22
|
public List<String> mappingURL;
|
23
|
public DatasetDocument.Identifier.IdentifierType fallbackType;
|
24
|
public Boolean fallbackURL;
|
25
|
}
|
26
|
|
27
|
public static class ContributorOptions{
|
28
|
public DatasetDocument.Contributor.ContributorType fallbackType;
|
29
|
}
|
30
|
|
31
|
public static class PublicationDateOptions{
|
32
|
public String format;
|
33
|
}
|
34
|
|
35
|
public static class CreatedDateOptions{
|
36
|
public String format;
|
37
|
}
|
38
|
|
39
|
public static class UpdatedDateOptions{
|
40
|
public String format;
|
41
|
}
|
42
|
|
43
|
private IdentifierOptions identifierOptions;
|
44
|
private PublicationDateOptions publicationDateOptions;
|
45
|
private ContributorOptions contributorOptions;
|
46
|
private CreatedDateOptions createdDateOptions;
|
47
|
private UpdatedDateOptions updatedDateOptions;
|
48
|
|
49
|
public UpdatedDateOptions getUpdatedDateOptions() {
|
50
|
return updatedDateOptions;
|
51
|
}
|
52
|
|
53
|
public void setUpdatedDateOptions(UpdatedDateOptions updatedDateOptions) {
|
54
|
this.updatedDateOptions = updatedDateOptions;
|
55
|
}
|
56
|
|
57
|
public CreatedDateOptions getCreatedDateOptions() {
|
58
|
return createdDateOptions;
|
59
|
}
|
60
|
|
61
|
public void setCreatedDateOptions(CreatedDateOptions createdDateOptions) {
|
62
|
this.createdDateOptions = createdDateOptions;
|
63
|
}
|
64
|
|
65
|
public ContributorOptions getContributorOptions() {
|
66
|
return contributorOptions;
|
67
|
}
|
68
|
|
69
|
public void setContributorOptions(ContributorOptions contributorOptions) {
|
70
|
this.contributorOptions = contributorOptions;
|
71
|
}
|
72
|
|
73
|
public PublicationDateOptions getPublicationDateOptions() {
|
74
|
return publicationDateOptions;
|
75
|
}
|
76
|
|
77
|
public void setPublicationDateOptions(PublicationDateOptions publicationDateOptions) {
|
78
|
this.publicationDateOptions = publicationDateOptions;
|
79
|
}
|
80
|
|
81
|
public IdentifierOptions getIdentifierOptions() {
|
82
|
return identifierOptions;
|
83
|
}
|
84
|
|
85
|
public void setIdentifierOptions(IdentifierOptions identifierOptions) {
|
86
|
this.identifierOptions = identifierOptions;
|
87
|
}
|
88
|
}
|
89
|
|
90
|
/** Mapping options consulted by the extraction methods. */
private final Options options;

/** Source of the JSON documents that are converted to XML. */
private final EndpointAccessIterator endpointAccessIterator;

/**
 * Creates an iterator that turns each document supplied by
 * {@code endpointAccessIterator} into an XML string.
 *
 * @param options mapping options used while extracting fields
 * @param endpointAccessIterator iterator supplying the documents to map
 */
public DatasetMappingIterator(Options options, EndpointAccessIterator endpointAccessIterator) {
    this.options = options;
    this.endpointAccessIterator = endpointAccessIterator;
}
|
97
|
|
98
|
@Override
|
99
|
public boolean hasNext() {
|
100
|
return this.endpointAccessIterator.hasNext();
|
101
|
}
|
102
|
|
103
|
@Override
|
104
|
public String next() {
|
105
|
JSONObject document = this.endpointAccessIterator.next();
|
106
|
String xml = null;
|
107
|
if (document == null) {
|
108
|
log.debug("no document provided to process. returning empty");
|
109
|
xml = DatasetDocument.emptyXml();
|
110
|
}
|
111
|
else {
|
112
|
log.debug("building document");
|
113
|
xml = this.buildDataset(document);
|
114
|
if (!Utils.validateXml(xml)) {
|
115
|
log.debug("xml not valid. setting to empty");
|
116
|
xml = null;
|
117
|
}
|
118
|
if (xml == null) {
|
119
|
log.debug("could not build xml. returning empty");
|
120
|
xml = DatasetDocument.emptyXml();
|
121
|
}
|
122
|
}
|
123
|
|
124
|
//if all else fails
|
125
|
if(xml == null){
|
126
|
log.debug("could not build xml. returning empty");
|
127
|
xml = "<dataset/>";
|
128
|
}
|
129
|
|
130
|
log.debug("xml document for dataset is: "+xml);
|
131
|
|
132
|
return xml;
|
133
|
}
|
134
|
|
135
|
private String buildDataset(JSONObject document){
|
136
|
String xml = null;
|
137
|
try{
|
138
|
DatasetDocument dataset = new DatasetDocument();
|
139
|
|
140
|
dataset.setIdentifiers(this.extractIdentifier(document));
|
141
|
dataset.setCreators(this.extractCreator(document));
|
142
|
dataset.setTitles(this.extractTitles(document));
|
143
|
dataset.setAlternativeTitles(this.extractAlternateTitles(document));
|
144
|
dataset.setPublishers(this.extractPublisher(document));
|
145
|
dataset.setPublicationDates(this.extractPublicationDate(document));
|
146
|
dataset.setSubjects(this.extractSubjects(document));
|
147
|
dataset.setContributors(this.extractContributors(document));
|
148
|
dataset.setCreatedDates(this.extractCreatedDate(document));
|
149
|
dataset.setUpdatedDates(this.extractUpdatedDate(document));
|
150
|
dataset.setLanguages(this.extractLanguages(document));
|
151
|
dataset.setResourceTypes(this.extractResourceTypes(document));
|
152
|
dataset.setAlternateIdentifier(this.extractAlternateIdentifiers(document));
|
153
|
dataset.setCitations(this.extractCitations(document));
|
154
|
dataset.setSizes(this.extractSize(document));
|
155
|
dataset.setFormat(this.extractEncodingFormat(document));
|
156
|
dataset.setVersion(this.extractVersion(document));
|
157
|
dataset.setLicenses(this.extractLicense(document));
|
158
|
dataset.setDescriptions(this.extractDescription(document));
|
159
|
dataset.setDisambiguatingDescriptions(this.extractDisambiguatingDescription(document));
|
160
|
dataset.setGeoLocations(this.extractSpatialCoverage(document));
|
161
|
|
162
|
log.debug("document contains native identifier: : "+(dataset.getIdentifiers()!=null && dataset.getIdentifiers().size() > 0));
|
163
|
|
164
|
if((dataset.getIdentifiers() == null || dataset.getIdentifiers().size() == 0) &&
|
165
|
this.options.getIdentifierOptions().fallbackURL){
|
166
|
log.debug("falling back to url identifier");
|
167
|
dataset.setIdentifiers(this.extractIdentifierFallbackURL(document));
|
168
|
log.debug("document contains overridden identifier: : "+(dataset.getIdentifiers()!=null && dataset.getIdentifiers().size() > 0));
|
169
|
}
|
170
|
|
171
|
xml = dataset.toXml();
|
172
|
}
|
173
|
catch(Exception ex){
|
174
|
log.error("problem constructing dataset xml. returning empty", ex);
|
175
|
xml = null;
|
176
|
}
|
177
|
return xml;
|
178
|
}
|
179
|
|
180
|
private List<DatasetDocument.Identifier> extractIdentifierFallbackURL(JSONObject document){
|
181
|
List<String> urls = JSONLDUtils.extractString(document, "url");
|
182
|
|
183
|
ArrayList<DatasetDocument.Identifier> curated = new ArrayList<>();
|
184
|
for(String item : urls){
|
185
|
if(item == null || item.trim().length() == 0) continue;
|
186
|
curated.add(new DatasetDocument.Identifier(DatasetDocument.Identifier.IdentifierType.URL, item.trim()));
|
187
|
}
|
188
|
return curated;
|
189
|
}
|
190
|
|
191
|
private List<DatasetDocument.SpatialCoverage> extractSpatialCoverage(JSONObject document){
|
192
|
List<JSONLDUtils.PlaceInfo> spatials = JSONLDUtils.extractPlaces(document, "spatialCoverage");
|
193
|
|
194
|
ArrayList<DatasetDocument.SpatialCoverage> curated = new ArrayList<>();
|
195
|
for(JSONLDUtils.PlaceInfo item : spatials){
|
196
|
if((item.name == null || item.name.trim().length() == 0) &&
|
197
|
(item.geoCoordinates == null || item.geoCoordinates.size() == 0) &&
|
198
|
(item.geoShapes == null || item.geoShapes.size() == 0)) continue;
|
199
|
|
200
|
List<DatasetDocument.SpatialCoverage.Point> points = new ArrayList<>();
|
201
|
List<String> boxes = new ArrayList<>();
|
202
|
if(item.geoCoordinates!=null) {
|
203
|
for (JSONLDUtils.GeoCoordinatesInfo iter : item.geoCoordinates){
|
204
|
points.add(new DatasetDocument.SpatialCoverage.Point(iter.latitude, iter.longitude));
|
205
|
}
|
206
|
}
|
207
|
if(item.geoShapes!=null) {
|
208
|
for (JSONLDUtils.GeoShapeInfo iter : item.geoShapes){
|
209
|
boxes.add(iter.box);
|
210
|
}
|
211
|
}
|
212
|
curated.add(new DatasetDocument.SpatialCoverage(item.name, points, boxes));
|
213
|
}
|
214
|
return curated;
|
215
|
}
|
216
|
|
217
|
private List<String> extractDescription(JSONObject document){
|
218
|
List<String> descriptions = JSONLDUtils.extractString(document, "description");
|
219
|
|
220
|
ArrayList<String> curated = new ArrayList<>();
|
221
|
for(String item : descriptions){
|
222
|
if(item == null || item.trim().length() == 0) continue;
|
223
|
curated.add(item);
|
224
|
}
|
225
|
return curated;
|
226
|
}
|
227
|
|
228
|
private List<String> extractDisambiguatingDescription(JSONObject document){
|
229
|
List<String> descriptions = JSONLDUtils.extractString(document, "disambiguatingDescription");
|
230
|
|
231
|
ArrayList<String> curated = new ArrayList<>();
|
232
|
for(String item : descriptions){
|
233
|
if(item == null || item.trim().length() == 0) continue;
|
234
|
curated.add(item);
|
235
|
}
|
236
|
return curated;
|
237
|
}
|
238
|
|
239
|
private List<DatasetDocument.License> extractLicense(JSONObject document){
|
240
|
List<JSONLDUtils.LicenseInfo> licenses = JSONLDUtils.extractLicenses(document, "license");
|
241
|
|
242
|
ArrayList<DatasetDocument.License> curated = new ArrayList<>();
|
243
|
for(JSONLDUtils.LicenseInfo item : licenses){
|
244
|
if(item.url == null || item.url.trim().length() == 0) continue;
|
245
|
curated.add(new DatasetDocument.License(item.name, item.url));
|
246
|
}
|
247
|
return curated;
|
248
|
}
|
249
|
|
250
|
private List<String> extractVersion(JSONObject document){
|
251
|
List<String> versions = JSONLDUtils.extractString(document, "version");
|
252
|
|
253
|
ArrayList<String> curated = new ArrayList<>();
|
254
|
for(String item : versions){
|
255
|
if(item == null || item.trim().length() == 0) continue;
|
256
|
curated.add(item);
|
257
|
}
|
258
|
return curated;
|
259
|
}
|
260
|
|
261
|
private List<String> extractSize(JSONObject document) {
|
262
|
List<String> sizes = JSONLDUtils.extractSize(document, "distribution");
|
263
|
|
264
|
HashSet<String> curated = new HashSet<>();
|
265
|
for (String item : sizes) {
|
266
|
if (item == null || item.trim().length() == 0) continue;
|
267
|
curated.add(item);
|
268
|
}
|
269
|
return new ArrayList<>(curated);
|
270
|
}
|
271
|
|
272
|
private List<String> extractEncodingFormat(JSONObject document){
|
273
|
List<String> formats = JSONLDUtils.extractEncodingFormat(document, "distribution");
|
274
|
|
275
|
HashSet<String> curated = new HashSet<>();
|
276
|
for(String item : formats){
|
277
|
if(item == null || item.trim().length() == 0) continue;
|
278
|
curated.add(item);
|
279
|
}
|
280
|
return new ArrayList<>(curated);
|
281
|
}
|
282
|
|
283
|
//TODO: Handle different citation types. Currently only urls
|
284
|
private List<DatasetDocument.Citation> extractCitations(JSONObject document){
|
285
|
List<JSONLDUtils.CitationInfo> citations = JSONLDUtils.extractCitations(document, "citation");
|
286
|
|
287
|
ArrayList<DatasetDocument.Citation> curated = new ArrayList<>();
|
288
|
for(JSONLDUtils.CitationInfo item : citations){
|
289
|
if(item.url == null || item.url.trim().length() == 0) continue;
|
290
|
try{
|
291
|
new URL(item.url);
|
292
|
}catch (Exception ex){
|
293
|
continue;
|
294
|
}
|
295
|
curated.add(new DatasetDocument.Citation(item.url, DatasetDocument.Citation.CitationIdentifierType.URL));
|
296
|
}
|
297
|
return curated;
|
298
|
}
|
299
|
|
300
|
private List<DatasetDocument.AlternateIdentifier> extractAlternateIdentifiers(JSONObject document){
|
301
|
List<String> issns = JSONLDUtils.extractString(document, "issn");
|
302
|
List<String> urls = JSONLDUtils.extractString(document, "url");
|
303
|
|
304
|
ArrayList<DatasetDocument.AlternateIdentifier> curated = new ArrayList<>();
|
305
|
for(String item : issns){
|
306
|
if(item == null || item.trim().length() == 0) continue;
|
307
|
curated.add(new DatasetDocument.AlternateIdentifier(item.trim(), "ISSN"));
|
308
|
}
|
309
|
for(String item : urls){
|
310
|
if(item == null || item.trim().length() == 0) continue;
|
311
|
curated.add(new DatasetDocument.AlternateIdentifier(item.trim(), "URL"));
|
312
|
}
|
313
|
return curated;
|
314
|
}
|
315
|
|
316
|
private List<DatasetDocument.ResourceType> extractResourceTypes(JSONObject document){
|
317
|
List<DatasetDocument.ResourceType> resourceTypes = new ArrayList<>();
|
318
|
resourceTypes.add(new DatasetDocument.ResourceType(DatasetDocument.ResourceType.ResourceTypeGeneralType.Dataset));
|
319
|
return resourceTypes;
|
320
|
}
|
321
|
|
322
|
private List<String> extractLanguages(JSONObject document){
|
323
|
List<String> languages = JSONLDUtils.extractLanguage(document, "inLanguage");
|
324
|
|
325
|
ArrayList<String> curated = new ArrayList<>();
|
326
|
for(String item : languages){
|
327
|
if(item == null || item.trim().length() == 0) continue;
|
328
|
curated.add(item);
|
329
|
}
|
330
|
return curated;
|
331
|
}
|
332
|
|
333
|
private List<LocalDate> extractUpdatedDate(JSONObject document){
|
334
|
List<LocalDate> updatedDates = new ArrayList<>();
|
335
|
if(this.options.getUpdatedDateOptions() == null || this.options.getUpdatedDateOptions().format == null || this.options.getUpdatedDateOptions().format.length() == 0) return updatedDates;
|
336
|
|
337
|
DateTimeFormatter formatter = DateTimeFormatter.ofPattern(this.options.getPublicationDateOptions().format);
|
338
|
|
339
|
List<String> dates = JSONLDUtils.extractString(document, "dateModified");
|
340
|
for(String updatedDate : dates){
|
341
|
if(updatedDate == null || updatedDate.trim().length() == 0) continue;
|
342
|
try {
|
343
|
LocalDate localDate = LocalDate.parse(updatedDate, formatter);
|
344
|
updatedDates.add(localDate);
|
345
|
} catch (Exception e) {
|
346
|
continue;
|
347
|
}
|
348
|
}
|
349
|
return updatedDates;
|
350
|
}
|
351
|
|
352
|
private List<LocalDate> extractCreatedDate(JSONObject document){
|
353
|
List<LocalDate> createdDates = new ArrayList<>();
|
354
|
if(this.options.getCreatedDateOptions() == null || this.options.getCreatedDateOptions().format == null || this.options.getCreatedDateOptions().format.length() == 0) return createdDates;
|
355
|
|
356
|
DateTimeFormatter formatter = DateTimeFormatter.ofPattern(this.options.getCreatedDateOptions().format);
|
357
|
|
358
|
List<String> dates = JSONLDUtils.extractString(document, "dateCreated");
|
359
|
for(String createdDate : dates){
|
360
|
if(createdDate == null || createdDate.trim().length() == 0) continue;
|
361
|
try {
|
362
|
LocalDate localDate = LocalDate.parse(createdDate, formatter);
|
363
|
createdDates.add(localDate);
|
364
|
} catch (Exception e) {
|
365
|
continue;
|
366
|
}
|
367
|
}
|
368
|
return createdDates;
|
369
|
}
|
370
|
|
371
|
private List<DatasetDocument.Contributor> extractContributors(JSONObject document){
|
372
|
List<JSONLDUtils.PrincipalInfo> editors = JSONLDUtils.extractPrincipal(document, "editor");
|
373
|
List<JSONLDUtils.PrincipalInfo> funders = JSONLDUtils.extractPrincipal(document, "funder");
|
374
|
List<JSONLDUtils.PrincipalInfo> producers = JSONLDUtils.extractPrincipal(document, "producer");
|
375
|
List<JSONLDUtils.PrincipalInfo> sponsors = JSONLDUtils.extractPrincipal(document, "sponsor");
|
376
|
List<JSONLDUtils.PrincipalInfo> constributors = JSONLDUtils.extractPrincipal(document, "contributor");
|
377
|
|
378
|
ArrayList<DatasetDocument.Contributor> curated = new ArrayList<>();
|
379
|
for(JSONLDUtils.PrincipalInfo item : editors){
|
380
|
if(item.name() == null || item.name().trim().length() == 0) continue;
|
381
|
curated.add(new DatasetDocument.Contributor(item.name(), item.affiliationNames(), DatasetDocument.Contributor.ContributorType.Editor));
|
382
|
}
|
383
|
for(JSONLDUtils.PrincipalInfo item : funders){
|
384
|
if(item.name() == null || item.name().trim().length() == 0) continue;
|
385
|
curated.add(new DatasetDocument.Contributor(item.name(), item.affiliationNames(), DatasetDocument.Contributor.ContributorType.Funder));
|
386
|
}
|
387
|
for(JSONLDUtils.PrincipalInfo item : producers){
|
388
|
if(item.name() == null || item.name().trim().length() == 0) continue;
|
389
|
curated.add(new DatasetDocument.Contributor(item.name(), item.affiliationNames(), DatasetDocument.Contributor.ContributorType.Producer));
|
390
|
}
|
391
|
for(JSONLDUtils.PrincipalInfo item : sponsors){
|
392
|
if(item.name() == null || item.name().trim().length() == 0) continue;
|
393
|
curated.add(new DatasetDocument.Contributor(item.name(), item.affiliationNames(), DatasetDocument.Contributor.ContributorType.Sponsor));
|
394
|
}
|
395
|
for(JSONLDUtils.PrincipalInfo item : constributors){
|
396
|
if(item.name() == null || item.name().trim().length() == 0) continue;
|
397
|
DatasetDocument.Contributor.ContributorType type = DatasetDocument.Contributor.ContributorType.Other;
|
398
|
if(this.options.getContributorOptions()!=null && this.options.getContributorOptions().fallbackType != null) type = this.options.getContributorOptions().fallbackType;
|
399
|
curated.add(new DatasetDocument.Contributor(item.name(), item.affiliationNames(), type));
|
400
|
}
|
401
|
return curated;
|
402
|
}
|
403
|
|
404
|
private List<String> extractSubjects(JSONObject document){
|
405
|
List<String> subjects = JSONLDUtils.extractString(document, "keywords");
|
406
|
|
407
|
ArrayList<String> curated = new ArrayList<>();
|
408
|
for(String item : subjects){
|
409
|
if(item == null || item.trim().length() == 0) continue;
|
410
|
curated.add(item);
|
411
|
}
|
412
|
return curated;
|
413
|
}
|
414
|
|
415
|
private List<LocalDate> extractPublicationDate(JSONObject document){
|
416
|
List<LocalDate> publicationDates = new ArrayList<>();
|
417
|
if(this.options.getPublicationDateOptions() == null || this.options.getPublicationDateOptions().format == null || this.options.getPublicationDateOptions().format.length() == 0) return publicationDates;
|
418
|
|
419
|
DateTimeFormatter formatter = DateTimeFormatter.ofPattern(this.options.getPublicationDateOptions().format);
|
420
|
|
421
|
List<String> dates = JSONLDUtils.extractString(document, "datePublished");
|
422
|
for(String publicationDate : dates){
|
423
|
if(publicationDate == null || publicationDate.trim().length() == 0) continue;
|
424
|
try {
|
425
|
LocalDate localDate = LocalDate.parse(publicationDate, formatter);
|
426
|
publicationDates.add(localDate);
|
427
|
} catch (Exception e) {
|
428
|
continue;
|
429
|
}
|
430
|
}
|
431
|
return publicationDates;
|
432
|
}
|
433
|
|
434
|
private List<String> extractPublisher(JSONObject document){
|
435
|
List<JSONLDUtils.PrincipalInfo> publishers = JSONLDUtils.extractPrincipal(document, "publisher");
|
436
|
|
437
|
ArrayList<String> curated = new ArrayList<>();
|
438
|
for(JSONLDUtils.PrincipalInfo item : publishers){
|
439
|
if(item.name() == null || item.name().trim().length() == 0) continue;
|
440
|
curated.add(item.name());
|
441
|
}
|
442
|
return curated;
|
443
|
}
|
444
|
|
445
|
private List<String> extractTitles(JSONObject document){
|
446
|
List<String> names = JSONLDUtils.extractString(document, "name");
|
447
|
List<String> headlines = JSONLDUtils.extractString(document, "headline");
|
448
|
|
449
|
HashSet<String> titles = new HashSet<>();
|
450
|
titles.addAll(names);
|
451
|
titles.addAll(headlines);
|
452
|
return new ArrayList<>(titles);
|
453
|
}
|
454
|
|
455
|
private List<String> extractAlternateTitles(JSONObject document){
|
456
|
List<String> names = JSONLDUtils.extractString(document, "alternateName");
|
457
|
List<String> headlines = JSONLDUtils.extractString(document, "alternativeHeadline");
|
458
|
|
459
|
HashSet<String> titles = new HashSet<>();
|
460
|
titles.addAll(names);
|
461
|
titles.addAll(headlines);
|
462
|
return new ArrayList<>(titles);
|
463
|
}
|
464
|
|
465
|
private List<DatasetDocument.Identifier> extractIdentifier(JSONObject document){
|
466
|
List<DatasetDocument.Identifier> curated = new ArrayList<>();
|
467
|
|
468
|
List<JSONLDUtils.IdentifierInfo> identifiers = JSONLDUtils.extractIdentifier(document, "identifier");
|
469
|
|
470
|
for(JSONLDUtils.IdentifierInfo item : identifiers){
|
471
|
if(item.value == null || item.value.trim().length() == 0) continue;
|
472
|
if(item.type == null || item.type.trim().length() == 0) {
|
473
|
if (this.options.getIdentifierOptions().fallbackType == null) continue;
|
474
|
curated.add(new DatasetDocument.Identifier(this.options.getIdentifierOptions().fallbackType, item.value.trim()));
|
475
|
}
|
476
|
else {
|
477
|
DatasetDocument.Identifier.IdentifierType type = null;
|
478
|
if(this.options.getIdentifierOptions().mappingARK != null && this.options.getIdentifierOptions().mappingARK.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.ARK;
|
479
|
else if(this.options.getIdentifierOptions().mappingDOI != null && this.options.getIdentifierOptions().mappingDOI.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.DOI;
|
480
|
else if(this.options.getIdentifierOptions().mappingHandle != null && this.options.getIdentifierOptions().mappingHandle.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.Handle;
|
481
|
else if(this.options.getIdentifierOptions().mappingPURL != null && this.options.getIdentifierOptions().mappingPURL.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.PURL;
|
482
|
else if(this.options.getIdentifierOptions().mappingURL != null && this.options.getIdentifierOptions().mappingURL.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.URL;
|
483
|
else if(this.options.getIdentifierOptions().mappingURN != null && this.options.getIdentifierOptions().mappingURN.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.URN;
|
484
|
|
485
|
if(type == null) continue;
|
486
|
curated.add(new DatasetDocument.Identifier(type, item.value.trim()));
|
487
|
}
|
488
|
}
|
489
|
return curated;
|
490
|
}
|
491
|
|
492
|
private List<DatasetDocument.Creator> extractCreator(JSONObject document){
|
493
|
List<JSONLDUtils.PrincipalInfo> creators = JSONLDUtils.extractPrincipal(document, "creator");
|
494
|
List<JSONLDUtils.PrincipalInfo> authors = JSONLDUtils.extractPrincipal(document, "author");
|
495
|
|
496
|
HashSet<String> foundNames = new HashSet<>();
|
497
|
List<DatasetDocument.Creator> curated = new ArrayList<>();
|
498
|
for(JSONLDUtils.PrincipalInfo item : creators){
|
499
|
if(item.name() == null || item.name().trim().length() == 0) continue;
|
500
|
if(foundNames.contains(item.name())) continue;
|
501
|
foundNames.add(item.name());
|
502
|
curated.add(new DatasetDocument.Creator(item.name(), item.affiliationNames()));
|
503
|
}
|
504
|
for(JSONLDUtils.PrincipalInfo item : authors){
|
505
|
if(item.name() == null || item.name().trim().length() == 0) continue;
|
506
|
if(foundNames.contains(item.name())) continue;
|
507
|
foundNames.add(item.name());
|
508
|
|
509
|
curated.add(new DatasetDocument.Creator(item.name(), item.affiliationNames()));
|
510
|
}
|
511
|
return curated;
|
512
|
}
|
513
|
|
514
|
}
|