Project

General

Profile

« Previous | Next » 

Revision 62810

[maven-release-plugin] copy for tag dnet-collector-plugins-1.7.0

View differences:

modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.0/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/DatasetMappingIterator.java
1
package eu.dnetlib.data.collector.plugins.schemaorg;
2

  
3
import org.apache.commons.logging.Log;
4
import org.apache.commons.logging.LogFactory;
5
import org.json.JSONObject;
6

  
7
import java.net.URL;
8
import java.time.LocalDate;
9
import java.time.format.DateTimeFormatter;
10
import java.util.*;
11

  
12
public class DatasetMappingIterator implements Iterator<String> {
13
	private static final Log log = LogFactory.getLog(EndpointAccessIterator.class);
14

  
15
	public static class Options {
16
		public static class IdentifierOptions{
17
			public List<String> mappingARK;
18
			public List<String> mappingDOI;
19
			public List<String> mappingHandle;
20
			public List<String> mappingPURL;
21
			public List<String> mappingURN;
22
			public List<String> mappingURL;
23
			public DatasetDocument.Identifier.IdentifierType fallbackType;
24
			public Boolean fallbackURL;
25
		}
26

  
27
		public static class ContributorOptions{
28
			public DatasetDocument.Contributor.ContributorType fallbackType;
29
		}
30

  
31
		public static class PublicationDateOptions{
32
			public String format;
33
		}
34

  
35
		public static class CreatedDateOptions{
36
			public String format;
37
		}
38

  
39
		public static class UpdatedDateOptions{
40
			public String format;
41
		}
42

  
43
		private IdentifierOptions identifierOptions;
44
		private PublicationDateOptions publicationDateOptions;
45
		private ContributorOptions contributorOptions;
46
		private CreatedDateOptions createdDateOptions;
47
		private UpdatedDateOptions updatedDateOptions;
48

  
49
		public UpdatedDateOptions getUpdatedDateOptions() {
50
			return updatedDateOptions;
51
		}
52

  
53
		public void setUpdatedDateOptions(UpdatedDateOptions updatedDateOptions) {
54
			this.updatedDateOptions = updatedDateOptions;
55
		}
56

  
57
		public CreatedDateOptions getCreatedDateOptions() {
58
			return createdDateOptions;
59
		}
60

  
61
		public void setCreatedDateOptions(CreatedDateOptions createdDateOptions) {
62
			this.createdDateOptions = createdDateOptions;
63
		}
64

  
65
		public ContributorOptions getContributorOptions() {
66
			return contributorOptions;
67
		}
68

  
69
		public void setContributorOptions(ContributorOptions contributorOptions) {
70
			this.contributorOptions = contributorOptions;
71
		}
72

  
73
		public PublicationDateOptions getPublicationDateOptions() {
74
			return publicationDateOptions;
75
		}
76

  
77
		public void setPublicationDateOptions(PublicationDateOptions publicationDateOptions) {
78
			this.publicationDateOptions = publicationDateOptions;
79
		}
80

  
81
		public IdentifierOptions getIdentifierOptions() {
82
			return identifierOptions;
83
		}
84

  
85
		public void setIdentifierOptions(IdentifierOptions identifierOptions) {
86
			this.identifierOptions = identifierOptions;
87
		}
88
	}
89

  
90
	private Options options;
91
	private EndpointAccessIterator endpointAccessIterator;
92

  
93
	public DatasetMappingIterator(Options options, EndpointAccessIterator endpointAccessIterator) {
94
		this.options = options;
95
		this.endpointAccessIterator = endpointAccessIterator;
96
	}
97

  
98
	@Override
99
	public boolean hasNext() {
100
		return this.endpointAccessIterator.hasNext();
101
	}
102

  
103
	@Override
104
	public String next() {
105
		JSONObject document = this.endpointAccessIterator.next();
106
		String xml = null;
107
		if (document == null) {
108
			log.debug("no document provided to process. returning empty");
109
			xml = DatasetDocument.emptyXml();
110
		}
111
		else {
112
			log.debug("building document");
113
			xml = this.buildDataset(document);
114
			if (!Utils.validateXml(xml)) {
115
				log.debug("xml not valid. setting to empty");
116
				xml = null;
117
			}
118
			if (xml == null) {
119
				log.debug("could not build xml. returning empty");
120
				xml = DatasetDocument.emptyXml();
121
			}
122
		}
123

  
124
		//if all else fails
125
		if(xml == null){
126
			log.debug("could not build xml. returning empty");
127
			xml = "<dataset/>";
128
		}
129

  
130
		log.debug("xml document for dataset is: "+xml);
131

  
132
		return xml;
133
	}
134

  
135
	private String buildDataset(JSONObject document){
136
		String xml = null;
137
		try{
138
			DatasetDocument dataset = new DatasetDocument();
139

  
140
			dataset.setIdentifiers(this.extractIdentifier(document));
141
			dataset.setCreators(this.extractCreator(document));
142
			dataset.setTitles(this.extractTitles(document));
143
			dataset.setAlternativeTitles(this.extractAlternateTitles(document));
144
			dataset.setPublishers(this.extractPublisher(document));
145
			dataset.setPublicationDates(this.extractPublicationDate(document));
146
			dataset.setSubjects(this.extractSubjects(document));
147
			dataset.setContributors(this.extractContributors(document));
148
			dataset.setCreatedDates(this.extractCreatedDate(document));
149
			dataset.setUpdatedDates(this.extractUpdatedDate(document));
150
			dataset.setLanguages(this.extractLanguages(document));
151
			dataset.setResourceTypes(this.extractResourceTypes(document));
152
			dataset.setAlternateIdentifier(this.extractAlternateIdentifiers(document));
153
			dataset.setCitations(this.extractCitations(document));
154
			dataset.setSizes(this.extractSize(document));
155
			dataset.setFormat(this.extractEncodingFormat(document));
156
			dataset.setVersion(this.extractVersion(document));
157
			dataset.setLicenses(this.extractLicense(document));
158
			dataset.setDescriptions(this.extractDescription(document));
159
			dataset.setDisambiguatingDescriptions(this.extractDisambiguatingDescription(document));
160
			dataset.setGeoLocations(this.extractSpatialCoverage(document));
161

  
162
			log.debug("document contains native identifier: : "+(dataset.getIdentifiers()!=null && dataset.getIdentifiers().size() > 0));
163

  
164
			if((dataset.getIdentifiers() == null || dataset.getIdentifiers().size() == 0) &&
165
					this.options.getIdentifierOptions().fallbackURL){
166
				log.debug("falling back to url identifier");
167
				dataset.setIdentifiers(this.extractIdentifierFallbackURL(document));
168
				log.debug("document contains overridden identifier: : "+(dataset.getIdentifiers()!=null && dataset.getIdentifiers().size() > 0));
169
			}
170

  
171
			xml = dataset.toXml();
172
		}
173
		catch(Exception ex){
174
			log.error("problem constructing dataset xml. returning empty", ex);
175
			xml = null;
176
		}
177
		return xml;
178
	}
179

  
180
	private List<DatasetDocument.Identifier> extractIdentifierFallbackURL(JSONObject document){
181
		List<String> urls = JSONLDUtils.extractString(document, "url");
182

  
183
		ArrayList<DatasetDocument.Identifier> curated = new ArrayList<>();
184
		for(String item : urls){
185
			if(item == null || item.trim().length() == 0) continue;
186
			curated.add(new DatasetDocument.Identifier(DatasetDocument.Identifier.IdentifierType.URL,  item.trim()));
187
		}
188
		return curated;
189
	}
190

  
191
	private List<DatasetDocument.SpatialCoverage> extractSpatialCoverage(JSONObject document){
192
		List<JSONLDUtils.PlaceInfo> spatials = JSONLDUtils.extractPlaces(document, "spatialCoverage");
193

  
194
		ArrayList<DatasetDocument.SpatialCoverage> curated = new ArrayList<>();
195
		for(JSONLDUtils.PlaceInfo item : spatials){
196
			if((item.name == null || item.name.trim().length() == 0) &&
197
					(item.geoCoordinates == null || item.geoCoordinates.size() == 0) &&
198
					(item.geoShapes == null || item.geoShapes.size() == 0)) continue;
199

  
200
			List<DatasetDocument.SpatialCoverage.Point> points = new ArrayList<>();
201
			List<String> boxes = new ArrayList<>();
202
			if(item.geoCoordinates!=null) {
203
				for (JSONLDUtils.GeoCoordinatesInfo iter : item.geoCoordinates){
204
					points.add(new DatasetDocument.SpatialCoverage.Point(iter.latitude, iter.longitude));
205
				}
206
			}
207
			if(item.geoShapes!=null) {
208
				for (JSONLDUtils.GeoShapeInfo iter : item.geoShapes){
209
					boxes.add(iter.box);
210
				}
211
			}
212
			curated.add(new DatasetDocument.SpatialCoverage(item.name, points, boxes));
213
		}
214
		return curated;
215
	}
216

  
217
	private List<String> extractDescription(JSONObject document){
218
		List<String> descriptions = JSONLDUtils.extractString(document, "description");
219

  
220
		ArrayList<String> curated = new ArrayList<>();
221
		for(String item : descriptions){
222
			if(item == null || item.trim().length() == 0) continue;
223
			curated.add(item);
224
		}
225
		return curated;
226
	}
227

  
228
	private List<String> extractDisambiguatingDescription(JSONObject document){
229
		List<String> descriptions = JSONLDUtils.extractString(document, "disambiguatingDescription");
230

  
231
		ArrayList<String> curated = new ArrayList<>();
232
		for(String item : descriptions){
233
			if(item == null || item.trim().length() == 0) continue;
234
			curated.add(item);
235
		}
236
		return curated;
237
	}
238

  
239
	private List<DatasetDocument.License> extractLicense(JSONObject document){
240
		List<JSONLDUtils.LicenseInfo> licenses = JSONLDUtils.extractLicenses(document, "license");
241

  
242
		ArrayList<DatasetDocument.License> curated = new ArrayList<>();
243
		for(JSONLDUtils.LicenseInfo item : licenses){
244
			if(item.url == null || item.url.trim().length() == 0) continue;
245
			curated.add(new DatasetDocument.License(item.name, item.url));
246
		}
247
		return curated;
248
	}
249

  
250
	private List<String> extractVersion(JSONObject document){
251
		List<String> versions = JSONLDUtils.extractString(document, "version");
252

  
253
		ArrayList<String> curated = new ArrayList<>();
254
		for(String item : versions){
255
			if(item == null || item.trim().length() == 0) continue;
256
			curated.add(item);
257
		}
258
		return curated;
259
	}
260

  
261
	private List<String> extractSize(JSONObject document) {
262
		List<String> sizes = JSONLDUtils.extractSize(document, "distribution");
263

  
264
		HashSet<String> curated = new HashSet<>();
265
		for (String item : sizes) {
266
			if (item == null || item.trim().length() == 0) continue;
267
			curated.add(item);
268
		}
269
		return new ArrayList<>(curated);
270
	}
271

  
272
	private List<String> extractEncodingFormat(JSONObject document){
273
		List<String> formats = JSONLDUtils.extractEncodingFormat(document, "distribution");
274

  
275
		HashSet<String> curated = new HashSet<>();
276
		for(String item : formats){
277
			if(item == null || item.trim().length() == 0) continue;
278
			curated.add(item);
279
		}
280
		return new ArrayList<>(curated);
281
	}
282

  
283
	//TODO: Handle different citation types. Currently only urls
284
	private List<DatasetDocument.Citation> extractCitations(JSONObject document){
285
		List<JSONLDUtils.CitationInfo> citations = JSONLDUtils.extractCitations(document, "citation");
286

  
287
		ArrayList<DatasetDocument.Citation> curated = new ArrayList<>();
288
		for(JSONLDUtils.CitationInfo item : citations){
289
			if(item.url == null || item.url.trim().length() == 0) continue;
290
			try{
291
				new URL(item.url);
292
			}catch (Exception ex){
293
				continue;
294
			}
295
			curated.add(new DatasetDocument.Citation(item.url, DatasetDocument.Citation.CitationIdentifierType.URL));
296
		}
297
		return curated;
298
	}
299

  
300
	private List<DatasetDocument.AlternateIdentifier> extractAlternateIdentifiers(JSONObject document){
301
		List<String> issns = JSONLDUtils.extractString(document, "issn");
302
		List<String> urls = JSONLDUtils.extractString(document, "url");
303

  
304
		ArrayList<DatasetDocument.AlternateIdentifier> curated = new ArrayList<>();
305
		for(String item : issns){
306
			if(item == null || item.trim().length() == 0) continue;
307
			curated.add(new DatasetDocument.AlternateIdentifier(item.trim(), "ISSN"));
308
		}
309
		for(String item : urls){
310
			if(item == null || item.trim().length() == 0) continue;
311
			curated.add(new DatasetDocument.AlternateIdentifier(item.trim(), "URL"));
312
		}
313
		return curated;
314
	}
315

  
316
	private List<DatasetDocument.ResourceType> extractResourceTypes(JSONObject document){
317
		List<DatasetDocument.ResourceType> resourceTypes = new ArrayList<>();
318
		resourceTypes.add(new DatasetDocument.ResourceType(DatasetDocument.ResourceType.ResourceTypeGeneralType.Dataset));
319
		return resourceTypes;
320
	}
321

  
322
	private List<String> extractLanguages(JSONObject document){
323
		List<String> languages = JSONLDUtils.extractLanguage(document, "inLanguage");
324

  
325
		ArrayList<String> curated = new ArrayList<>();
326
		for(String item : languages){
327
			if(item == null || item.trim().length() == 0) continue;
328
			curated.add(item);
329
		}
330
		return curated;
331
	}
332

  
333
	private List<LocalDate> extractUpdatedDate(JSONObject document){
334
		List<LocalDate> updatedDates = new ArrayList<>();
335
		if(this.options.getUpdatedDateOptions() == null || this.options.getUpdatedDateOptions().format == null || this.options.getUpdatedDateOptions().format.length() == 0) return updatedDates;
336

  
337
		DateTimeFormatter formatter = DateTimeFormatter.ofPattern(this.options.getPublicationDateOptions().format);
338

  
339
		List<String> dates = JSONLDUtils.extractString(document, "dateModified");
340
		for(String updatedDate : dates){
341
			if(updatedDate == null || updatedDate.trim().length() == 0) continue;
342
			try {
343
				LocalDate localDate = LocalDate.parse(updatedDate, formatter);
344
				updatedDates.add(localDate);
345
			} catch (Exception e) {
346
				continue;
347
			}
348
		}
349
		return updatedDates;
350
	}
351

  
352
	private List<LocalDate> extractCreatedDate(JSONObject document){
353
		List<LocalDate> createdDates = new ArrayList<>();
354
		if(this.options.getCreatedDateOptions() == null || this.options.getCreatedDateOptions().format == null || this.options.getCreatedDateOptions().format.length() == 0) return createdDates;
355

  
356
		DateTimeFormatter formatter = DateTimeFormatter.ofPattern(this.options.getCreatedDateOptions().format);
357

  
358
		List<String> dates = JSONLDUtils.extractString(document, "dateCreated");
359
		for(String createdDate : dates){
360
			if(createdDate == null || createdDate.trim().length() == 0) continue;
361
			try {
362
				LocalDate localDate = LocalDate.parse(createdDate, formatter);
363
				createdDates.add(localDate);
364
			} catch (Exception e) {
365
				continue;
366
			}
367
		}
368
		return createdDates;
369
	}
370

  
371
	private List<DatasetDocument.Contributor> extractContributors(JSONObject document){
372
		List<JSONLDUtils.PrincipalInfo> editors = JSONLDUtils.extractPrincipal(document, "editor");
373
		List<JSONLDUtils.PrincipalInfo> funders = JSONLDUtils.extractPrincipal(document, "funder");
374
		List<JSONLDUtils.PrincipalInfo> producers = JSONLDUtils.extractPrincipal(document, "producer");
375
		List<JSONLDUtils.PrincipalInfo> sponsors = JSONLDUtils.extractPrincipal(document, "sponsor");
376
		List<JSONLDUtils.PrincipalInfo> constributors = JSONLDUtils.extractPrincipal(document, "contributor");
377

  
378
		ArrayList<DatasetDocument.Contributor> curated = new ArrayList<>();
379
		for(JSONLDUtils.PrincipalInfo item : editors){
380
			if(item.name() == null || item.name().trim().length() == 0) continue;
381
			curated.add(new DatasetDocument.Contributor(item.name(), item.affiliationNames(), DatasetDocument.Contributor.ContributorType.Editor));
382
		}
383
		for(JSONLDUtils.PrincipalInfo item : funders){
384
			if(item.name() == null || item.name().trim().length() == 0) continue;
385
			curated.add(new DatasetDocument.Contributor(item.name(), item.affiliationNames(), DatasetDocument.Contributor.ContributorType.Funder));
386
		}
387
		for(JSONLDUtils.PrincipalInfo item : producers){
388
			if(item.name() == null || item.name().trim().length() == 0) continue;
389
			curated.add(new DatasetDocument.Contributor(item.name(), item.affiliationNames(), DatasetDocument.Contributor.ContributorType.Producer));
390
		}
391
		for(JSONLDUtils.PrincipalInfo item : sponsors){
392
			if(item.name() == null || item.name().trim().length() == 0) continue;
393
			curated.add(new DatasetDocument.Contributor(item.name(), item.affiliationNames(), DatasetDocument.Contributor.ContributorType.Sponsor));
394
		}
395
		for(JSONLDUtils.PrincipalInfo item : constributors){
396
			if(item.name() == null || item.name().trim().length() == 0) continue;
397
			DatasetDocument.Contributor.ContributorType type = DatasetDocument.Contributor.ContributorType.Other;
398
			if(this.options.getContributorOptions()!=null && this.options.getContributorOptions().fallbackType != null) type = this.options.getContributorOptions().fallbackType;
399
			curated.add(new DatasetDocument.Contributor(item.name(), item.affiliationNames(), type));
400
		}
401
		return curated;
402
	}
403

  
404
	private List<String> extractSubjects(JSONObject document){
405
		List<String> subjects = JSONLDUtils.extractString(document, "keywords");
406

  
407
		ArrayList<String> curated = new ArrayList<>();
408
		for(String item : subjects){
409
			if(item == null || item.trim().length() == 0) continue;
410
			curated.add(item);
411
		}
412
		return curated;
413
	}
414

  
415
	private List<LocalDate> extractPublicationDate(JSONObject document){
416
		List<LocalDate> publicationDates = new ArrayList<>();
417
		if(this.options.getPublicationDateOptions() == null || this.options.getPublicationDateOptions().format == null || this.options.getPublicationDateOptions().format.length() == 0) return publicationDates;
418

  
419
		DateTimeFormatter formatter = DateTimeFormatter.ofPattern(this.options.getPublicationDateOptions().format);
420

  
421
		List<String> dates = JSONLDUtils.extractString(document, "datePublished");
422
		for(String publicationDate : dates){
423
			if(publicationDate == null || publicationDate.trim().length() == 0) continue;
424
			try {
425
				LocalDate localDate = LocalDate.parse(publicationDate, formatter);
426
				publicationDates.add(localDate);
427
			} catch (Exception e) {
428
				continue;
429
			}
430
		}
431
		return publicationDates;
432
	}
433

  
434
	private List<String> extractPublisher(JSONObject document){
435
		List<JSONLDUtils.PrincipalInfo> publishers = JSONLDUtils.extractPrincipal(document, "publisher");
436

  
437
		ArrayList<String> curated = new ArrayList<>();
438
		for(JSONLDUtils.PrincipalInfo item : publishers){
439
			if(item.name() == null || item.name().trim().length() == 0) continue;
440
			curated.add(item.name());
441
		}
442
		return curated;
443
	}
444

  
445
	private List<String> extractTitles(JSONObject document){
446
		List<String> names = JSONLDUtils.extractString(document, "name");
447
		List<String> headlines = JSONLDUtils.extractString(document, "headline");
448

  
449
		HashSet<String> titles = new HashSet<>();
450
		titles.addAll(names);
451
		titles.addAll(headlines);
452
		return new ArrayList<>(titles);
453
	}
454

  
455
	private List<String> extractAlternateTitles(JSONObject document){
456
		List<String> names = JSONLDUtils.extractString(document, "alternateName");
457
		List<String> headlines = JSONLDUtils.extractString(document, "alternativeHeadline");
458

  
459
		HashSet<String> titles = new HashSet<>();
460
		titles.addAll(names);
461
		titles.addAll(headlines);
462
		return new ArrayList<>(titles);
463
	}
464

  
465
	private List<DatasetDocument.Identifier> extractIdentifier(JSONObject document){
466
		List<DatasetDocument.Identifier> curated = new ArrayList<>();
467

  
468
		List<JSONLDUtils.IdentifierInfo> identifiers = JSONLDUtils.extractIdentifier(document, "identifier");
469

  
470
		for(JSONLDUtils.IdentifierInfo item : identifiers){
471
			if(item.value == null || item.value.trim().length() == 0) continue;
472
			if(item.type == null || item.type.trim().length() == 0) {
473
				if (this.options.getIdentifierOptions().fallbackType == null) continue;
474
				curated.add(new DatasetDocument.Identifier(this.options.getIdentifierOptions().fallbackType, item.value.trim()));
475
			}
476
			else {
477
				DatasetDocument.Identifier.IdentifierType type = null;
478
				if(this.options.getIdentifierOptions().mappingARK != null && this.options.getIdentifierOptions().mappingARK.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.ARK;
479
				else if(this.options.getIdentifierOptions().mappingDOI != null && this.options.getIdentifierOptions().mappingDOI.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.DOI;
480
				else if(this.options.getIdentifierOptions().mappingHandle != null && this.options.getIdentifierOptions().mappingHandle.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.Handle;
481
				else if(this.options.getIdentifierOptions().mappingPURL != null && this.options.getIdentifierOptions().mappingPURL.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.PURL;
482
				else if(this.options.getIdentifierOptions().mappingURL != null && this.options.getIdentifierOptions().mappingURL.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.URL;
483
				else if(this.options.getIdentifierOptions().mappingURN != null && this.options.getIdentifierOptions().mappingURN.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.URN;
484

  
485
				if(type == null) continue;
486
				curated.add(new DatasetDocument.Identifier(type, item.value.trim()));
487
			}
488
		}
489
		return curated;
490
	}
491

  
492
	private List<DatasetDocument.Creator> extractCreator(JSONObject document){
493
		List<JSONLDUtils.PrincipalInfo> creators = JSONLDUtils.extractPrincipal(document, "creator");
494
		List<JSONLDUtils.PrincipalInfo> authors = JSONLDUtils.extractPrincipal(document, "author");
495

  
496
		HashSet<String> foundNames = new HashSet<>();
497
		List<DatasetDocument.Creator> curated = new ArrayList<>();
498
		for(JSONLDUtils.PrincipalInfo item : creators){
499
			if(item.name() == null || item.name().trim().length() == 0) continue;
500
			if(foundNames.contains(item.name())) continue;
501
			foundNames.add(item.name());
502
			curated.add(new DatasetDocument.Creator(item.name(), item.affiliationNames()));
503
		}
504
		for(JSONLDUtils.PrincipalInfo item : authors){
505
			if(item.name() == null || item.name().trim().length() == 0) continue;
506
			if(foundNames.contains(item.name())) continue;
507
			foundNames.add(item.name());
508

  
509
			curated.add(new DatasetDocument.Creator(item.name(), item.affiliationNames()));
510
		}
511
		return curated;
512
	}
513

  
514
}
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.0/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/SchemaOrgMainKaggle.java
1
package eu.dnetlib.data.collector.plugins.schemaorg;
2

  
3
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
4
import org.apache.commons.io.FileUtils;
5
import org.apache.commons.logging.Log;
6
import org.apache.commons.logging.LogFactory;
7
import org.apache.log4j.ConsoleAppender;
8
import org.apache.log4j.Level;
9
import org.apache.log4j.Logger;
10
import org.apache.log4j.PatternLayout;
11

  
12
import java.io.File;
13
import java.nio.charset.StandardCharsets;
14
import java.util.HashMap;
15
import java.util.concurrent.TimeUnit;
16

  
17
public class SchemaOrgMainKaggle {
18

  
19
    private static final Log log = LogFactory.getLog(SchemaOrgMainKaggle.class);
20

  
21
    public static void main(String[] args) throws Exception {
22

  
23
        ConsoleAppender console = new ConsoleAppender();
24
        console.setLayout(new PatternLayout("%d [%p|%c|%C{1}] %m%n"));
25
        console.setThreshold(Level.DEBUG);
26
        console.activateOptions();
27
        Logger.getLogger("eu.dnetlib.data.collector.plugins").addAppender(console);
28

  
29
        HashMap<String,String> params = new HashMap<>();
30
        params.put("consumerBlockPolling", Boolean.toString(true));
31
        params.put("consumerBlockPollingTimeout", "2");
32
        params.put("consumerBlockPollingTimeoutUnit", TimeUnit.MINUTES.toString());
33
        params.put("endpointCharset", StandardCharsets.UTF_8.name());
34
        params.put("updatedDateFormat", "YYYY-MM-DD");
35
        params.put("createdDateFormat", "YYYY-MM-DD");
36
        params.put("publicationDateFormat", "YYYY-MM-DD");
37
        params.put("contributorFallbackType", DatasetDocument.Contributor.ContributorType.Other.toString());
38
        params.put("identifierFallbackType", DatasetDocument.Identifier.IdentifierType.Handle.toString());
39
        params.put("identifierFallbackURL", Boolean.toString(true));
40
        params.put("identifierMappingARK", "ark, ARK");
41
        params.put("identifierMappingDOI", "doi, DOI");
42
        params.put("identifierMappingHandle", "Handle, HANDLE");
43
        params.put("identifierMappingPURL", "purl, PURL");
44
        params.put("identifierMappingURN", "urn, URN");
45
        params.put("identifierMappingURL", "url, URL");
46

  
47
        params.put("repositoryAccessType", "httpapi-kaggle");
48

  
49
        params.put("httpapi-kaggle_queueSize", "100");
50
        params.put("httpapi-kaggle_APICharset", StandardCharsets.UTF_8.name());
51
        params.put("httpapi-kaggle_queryUrl", "https://www.kaggle.com/datasets_v2.json?sortBy=updated&group=public&page={PAGE}&pageSize=20&size=sizeAll&filetype=fileTypeAll&license=licenseAll");
52
        params.put("httpapi-kaggle_queryPagePlaceholder", "{PAGE}");
53
        params.put("httpapi-kaggle_responsePropertyTotalDataset", "totalDatasetListItems");
54
        params.put("httpapi-kaggle_responsePropertyDatasetList", "datasetListItems");
55
        params.put("httpapi-kaggle_responsePropertyDatasetUrl", "datasetUrl");
56
        params.put("httpapi-kaggle_responseBaseDatasetUrl", "https://www.kaggle.com");
57
        params.put("httpapi-kaggle_producerBlockPollingTimeout", "2");
58
        params.put("httpapi-kaggle_producerBlockPollingTimeoutUnit", TimeUnit.MINUTES.toString());
59

  
60
        InterfaceDescriptor descriptor = new InterfaceDescriptor();
61
        descriptor.setId("schema.org - kaggle");
62
        descriptor.setBaseUrl("https://www.kaggle.com");
63

  
64
        descriptor.setParams(params);
65

  
66
        SchemaOrgPlugin schemaOrgPlugin = new SchemaOrgPlugin();
67

  
68
        Iterable<String> iterable = schemaOrgPlugin.collect(descriptor, null, null);
69

  
70
        String outDir = params.get("repositoryAccessType");
71

  
72
        log.info("saving content in " + outDir);
73

  
74
        File directory = new File(outDir);
75
        if (directory.exists()) {
76
            log.info(directory.getAbsolutePath() + " exists, cleaning up");
77
            FileUtils.deleteDirectory(directory);
78
        }
79
        FileUtils.forceMkdir(directory);
80
        Utils.writeFiles(iterable, outDir);
81

  
82
    }
83

  
84
}
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.0/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/SchemaOrgPlugin.java
1
package eu.dnetlib.data.collector.plugins.schemaorg;
2

  
3
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
4
import eu.dnetlib.data.collector.plugins.schemaorg.httpapi.kaggle.KaggleRepositoryIterable;
5
import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapFileIterator;
6
import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapIndexIterator;
7
import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapIndexRepositoryIterable;
8
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
9
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
10
import org.apache.commons.logging.Log;
11
import org.apache.commons.logging.LogFactory;
12

  
13
import java.net.MalformedURLException;
14
import java.net.URL;
15
import java.nio.charset.StandardCharsets;
16
import java.util.concurrent.TimeUnit;
17

  
18
public class SchemaOrgPlugin extends AbstractCollectorPlugin {
19

  
20
    private static final Log log = LogFactory.getLog(SchemaOrgPlugin.class);
21

  
22
    public String hello(){
23
        return "hello";
24
    }
25

  
26
    @Override
27
    public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate) throws CollectorServiceException {
28
        try {
29
			RepositoryIterable repository = null;
30
        	String repositoryAccessType = Utils.getAsString(interfaceDescriptor.getParams(), "repositoryAccessType", null);
31
        	switch(repositoryAccessType) {
32
				case "sitemapindex": {
33
					SitemapIndexRepositoryIterable.Options repositoryOptions = this.compileSitemapIndexRepositoryOptions(interfaceDescriptor);
34
					SitemapIndexRepositoryIterable repositoryIterable = new SitemapIndexRepositoryIterable(repositoryOptions);
35
					repositoryIterable.bootstrap();
36
					repository = repositoryIterable;
37
					break;
38
				}
39
				case "httpapi-kaggle": {
40
					KaggleRepositoryIterable.Options repositoryOptions = this.compileKaggleRepositoryOptions(interfaceDescriptor);
41
					KaggleRepositoryIterable repositoryIterable = new KaggleRepositoryIterable(repositoryOptions);
42
					repositoryIterable.bootstrap();
43
					repository = repositoryIterable;
44
					break;
45
				}
46
				default:
47
					throw new CollectorServiceException(String.format("unrecognized repository access type ", repositoryAccessType));
48
			}
49
			SchemaOrgIterable.Options schemaOrgOptions = this.compileSchemaOrgOptions(interfaceDescriptor);
50
            SchemaOrgIterable iterable = new SchemaOrgIterable(schemaOrgOptions, repository);
51
            return iterable;
52
        } catch (Exception e) {
53
            throw new CollectorServiceException("Could not create iterator", e);
54
        }
55
    }
56

  
57
	private KaggleRepositoryIterable.Options compileKaggleRepositoryOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
58
		KaggleRepositoryIterable.Options kaggleRepositoryOptions = new KaggleRepositoryIterable.Options();
59
		kaggleRepositoryOptions.setQueueSize(Utils.getAsInt(interfaceDescriptor.getParams(), "httpapi-kaggle_queueSize", 100));
60
		kaggleRepositoryOptions.setPutTimeout(Utils.getAsLong(interfaceDescriptor.getParams(), "httpapi-kaggle_producerBlockPollingTimeout", 20));
61
		kaggleRepositoryOptions.setPutTimeoutUnit(Utils.getAsEnum(interfaceDescriptor.getParams(), "httpapi-kaggle_producerBlockPollingTimeoutUnit", TimeUnit.MINUTES, TimeUnit.class));
62
		kaggleRepositoryOptions.setCharset(Utils.getAsCharset(interfaceDescriptor.getParams(), "httpapi-kaggle_APICharset", StandardCharsets.UTF_8));
63
		kaggleRepositoryOptions.setQueryUrl(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_queryUrl", null));
64
		kaggleRepositoryOptions.setQueryPagePlaceholder(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_queryPagePlaceholder", "{PAGE}"));
65
		kaggleRepositoryOptions.setResponsePropertyTotalDataset(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_responsePropertyTotalDataset", "totalDatasetListItems"));
66
		kaggleRepositoryOptions.setResponsePropertyDatasetList(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_responsePropertyDatasetList", "datasetListItems"));
67
		kaggleRepositoryOptions.setResponsePropertyDatasetUrl(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_responsePropertyDatasetUrl", "datasetUrl"));
68
		kaggleRepositoryOptions.setResponseBaseDatasetUrl(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_responseBaseDatasetUrl", interfaceDescriptor.getBaseUrl()));
69
		kaggleRepositoryOptions.setRepositoryQueueIteratorOptions(this.compileRepositoryQueueOptions(interfaceDescriptor));
70
		return kaggleRepositoryOptions;
71

  
72
	}
73

  
74
    private SitemapIndexIterator.Options compileSitemapIndexOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
75
		SitemapIndexIterator.Options sitemapIndexIteratorOptions = new SitemapIndexIterator.Options();
76
		sitemapIndexIteratorOptions.setCharset(Utils.getAsCharset(interfaceDescriptor.getParams(), "sitemap_IndexCharset", StandardCharsets.UTF_8));
77
		sitemapIndexIteratorOptions.setIndexUrl(new URL(interfaceDescriptor.getBaseUrl()));
78
		return sitemapIndexIteratorOptions;
79

  
80
	}
81

  
82
	private SitemapFileIterator.Options compileSitemapFileOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
83
		SitemapFileIterator.Options sitemapFileIteratorOptions = new SitemapFileIterator.Options();
84
		sitemapFileIteratorOptions.setCharset(Utils.getAsCharset(interfaceDescriptor.getParams(), "sitemap_FileCharset", StandardCharsets.UTF_8));
85
		sitemapFileIteratorOptions.setSchemaType(Utils.getAsEnum(interfaceDescriptor.getParams(), "sitemap_FileSchema", SitemapFileIterator.Options.SitemapSchemaType.Xml, SitemapFileIterator.Options.SitemapSchemaType.class));
86
		sitemapFileIteratorOptions.setFileType(Utils.getAsEnum(interfaceDescriptor.getParams(), "sitemap_FileType", SitemapFileIterator.Options.SitemapFileType.Text, SitemapFileIterator.Options.SitemapFileType.class));
87
		return sitemapFileIteratorOptions;
88
	}
89

  
90
	private RepositoryQueueIterator.Options compileRepositoryQueueOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
91
		RepositoryQueueIterator.Options repositoryQueueIteratorOptions = new RepositoryQueueIterator.Options();
92
		repositoryQueueIteratorOptions.setBlockPolling(Utils.getAsBoolean(interfaceDescriptor.getParams(), "consumerBlockPolling", true));
93
		repositoryQueueIteratorOptions.setPollTimeout(Utils.getAsLong(interfaceDescriptor.getParams(), "consumerBlockPollingTimeout", 2));
94
		repositoryQueueIteratorOptions.setPollTimeoutUnit(Utils.getAsEnum(interfaceDescriptor.getParams(), "consumerBlockPollingTimeoutUnit", TimeUnit.MINUTES, TimeUnit.class));
95
		return repositoryQueueIteratorOptions;
96
	}
97

  
98
	private SitemapIndexRepositoryIterable.Options compileSitemapIndexRepositoryOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
99
		SitemapIndexRepositoryIterable.Options sitemapIndexRepositoryIterableOptions = new SitemapIndexRepositoryIterable.Options();
100
		sitemapIndexRepositoryIterableOptions.setQueueSize(Utils.getAsInt(interfaceDescriptor.getParams(), "sitemap_queueSize", 100));
101
		sitemapIndexRepositoryIterableOptions.setPutTimeout(Utils.getAsLong(interfaceDescriptor.getParams(), "sitemap_producerBlockPollingTimeout", 20));
102
		sitemapIndexRepositoryIterableOptions.setPutTimeoutUnit(Utils.getAsEnum(interfaceDescriptor.getParams(), "sitemap_producerBlockPollingTimeoutUnit", TimeUnit.MINUTES, TimeUnit.class));
103
		sitemapIndexRepositoryIterableOptions.setRepositoryQueueIteratorOptions(this.compileRepositoryQueueOptions(interfaceDescriptor));
104
		sitemapIndexRepositoryIterableOptions.setSitemapFileIteratorOptions(this.compileSitemapFileOptions(interfaceDescriptor));
105
		sitemapIndexRepositoryIterableOptions.setSitemapIndexIteratorOptions(this.compileSitemapIndexOptions(interfaceDescriptor));
106
		return sitemapIndexRepositoryIterableOptions;
107
	}
108

  
109
	private EndpointAccessIterator.Options compileEndpointAccessOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
110
		EndpointAccessIterator.Options endpointAccessIteratorOptions = new EndpointAccessIterator.Options();
111
		endpointAccessIteratorOptions.setCharset(Utils.getAsCharset(interfaceDescriptor.getParams(), "endpointCharset", StandardCharsets.UTF_8));
112
		return endpointAccessIteratorOptions;
113
	}
114

  
115
	private DatasetMappingIterator.Options compileDatasetMappingOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
116
		DatasetMappingIterator.Options datasetMappingIteratorOptions = new DatasetMappingIterator.Options();
117

  
118
		DatasetMappingIterator.Options.UpdatedDateOptions datasetMappingIteratorUpdatedDateOptions = new DatasetMappingIterator.Options.UpdatedDateOptions();
119
		datasetMappingIteratorUpdatedDateOptions.format =Utils.getAsString(interfaceDescriptor.getParams(), "updatedDateFormat", "YYYY-MM-DD");
120
		datasetMappingIteratorOptions.setUpdatedDateOptions(datasetMappingIteratorUpdatedDateOptions);
121

  
122
		DatasetMappingIterator.Options.CreatedDateOptions datasetMappingIteratorCreatedDateOptions = new DatasetMappingIterator.Options.CreatedDateOptions();
123
		datasetMappingIteratorCreatedDateOptions.format =Utils.getAsString(interfaceDescriptor.getParams(), "createdDateFormat", "YYYY-MM-DD");
124
		datasetMappingIteratorOptions.setCreatedDateOptions(datasetMappingIteratorCreatedDateOptions);
125

  
126
		DatasetMappingIterator.Options.PublicationDateOptions datasetMappingIteratorPublicationDateOptions = new DatasetMappingIterator.Options.PublicationDateOptions();
127
		datasetMappingIteratorPublicationDateOptions.format =Utils.getAsString(interfaceDescriptor.getParams(), "publicationDateFormat", "YYYY-MM-DD");
128
		datasetMappingIteratorOptions.setPublicationDateOptions(datasetMappingIteratorPublicationDateOptions);
129

  
130
		DatasetMappingIterator.Options.ContributorOptions datasetMappingIteratorContributorOptions = new DatasetMappingIterator.Options.ContributorOptions();
131
		datasetMappingIteratorContributorOptions.fallbackType =Utils.getAsEnum(interfaceDescriptor.getParams(), "contributorFallbackType",DatasetDocument.Contributor.ContributorType.Other, DatasetDocument.Contributor.ContributorType.class);
132
		datasetMappingIteratorOptions.setContributorOptions(datasetMappingIteratorContributorOptions);
133

  
134
		DatasetMappingIterator.Options.IdentifierOptions datasetMappingIteratorIdentifierOptions = new DatasetMappingIterator.Options.IdentifierOptions();
135
		datasetMappingIteratorIdentifierOptions.fallbackType = Utils.getAsEnum(interfaceDescriptor.getParams(), "identifierFallbackType", null, DatasetDocument.Identifier.IdentifierType.class);
136
		datasetMappingIteratorIdentifierOptions.fallbackURL = Utils.getAsBoolean(interfaceDescriptor.getParams(), "identifierFallbackURL", true);
137
		datasetMappingIteratorIdentifierOptions.mappingARK = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingARK", null);
138
		datasetMappingIteratorIdentifierOptions.mappingDOI = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingDOI", null);
139
		datasetMappingIteratorIdentifierOptions.mappingHandle = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingHandle", null);
140
		datasetMappingIteratorIdentifierOptions.mappingPURL = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingPURL", null);
141
		datasetMappingIteratorIdentifierOptions.mappingURL = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingURL", null);
142
		datasetMappingIteratorIdentifierOptions.mappingURN = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingURN", null);
143
		datasetMappingIteratorOptions.setIdentifierOptions(datasetMappingIteratorIdentifierOptions);
144
		return datasetMappingIteratorOptions;
145
	}
146

  
147
	private SchemaOrgIterable.Options compileSchemaOrgOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
148
		SchemaOrgIterable.Options schemaOrgIterableOptions = new SchemaOrgIterable.Options();
149
		schemaOrgIterableOptions.setDatasetMappingOptions(this.compileDatasetMappingOptions(interfaceDescriptor));
150
		schemaOrgIterableOptions.setEndpointAccessOptions(this.compileEndpointAccessOptions(interfaceDescriptor));
151
		return schemaOrgIterableOptions;
152
	}
153
}
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.0/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/DatasetDocument.java
1
package eu.dnetlib.data.collector.plugins.schemaorg;
2

  
3
import org.w3c.dom.Attr;
4
import org.w3c.dom.Document;
5
import org.w3c.dom.Element;
6

  
7
import javax.xml.parsers.DocumentBuilder;
8
import javax.xml.parsers.DocumentBuilderFactory;
9
import javax.xml.parsers.ParserConfigurationException;
10
import javax.xml.transform.Transformer;
11
import javax.xml.transform.TransformerFactory;
12
import javax.xml.transform.dom.DOMSource;
13
import javax.xml.transform.stream.StreamResult;
14
import java.io.StringWriter;
15
import java.time.LocalDate;
16
import java.time.format.DateTimeFormatter;
17
import java.util.Calendar;
18
import java.util.Date;
19
import java.util.List;
20

  
21
public class DatasetDocument {
22
	private List<Identifier> identifiers;
23
	private List<Creator> creators;
24
	private List<String> titles;
25
	private List<String> alternativeTitles;
26
	private List<String> publishers;
27
	private List<LocalDate> publicationDates;
28
	private List<String> subjects;
29
	private List<Contributor> contributors;
30
	private List<LocalDate> createdDates;
31
	private List<LocalDate> updatedDates;
32
	private List<String> languages;
33
	private List<ResourceType> resourceTypes;
34
	private List<AlternateIdentifier> alternateIdentifier;
35
	private List<Citation> citations;
36
	private List<String> sizes;
37
	private List<String> format;
38
	private List<String> version;
39
	private List<License> licenses;
40
	private List<String> descriptions;
41
	private List<String> disambiguatingDescriptions;
42
	private List<SpatialCoverage> geoLocations;
43

  
44
	public List<Identifier> getIdentifiers() {
45
		return identifiers;
46
	}
47

  
48
	public void setIdentifiers(List<Identifier> identifiers) {
49
		this.identifiers = identifiers;
50
	}
51

  
52
	public List<Creator> getCreators() {
53
		return creators;
54
	}
55

  
56
	public void setCreators(List<Creator> creators) {
57
		this.creators = creators;
58
	}
59

  
60
	public List<String> getTitles() {
61
		return titles;
62
	}
63

  
64
	public void setTitles(List<String> titles) {
65
		this.titles = titles;
66
	}
67

  
68
	public List<String> getAlternativeTitles() {
69
		return alternativeTitles;
70
	}
71

  
72
	public void setAlternativeTitles(List<String> alternativeTitles) {
73
		this.alternativeTitles = alternativeTitles;
74
	}
75

  
76
	public List<String> getPublishers() {
77
		return publishers;
78
	}
79

  
80
	public void setPublishers(List<String> publishers) {
81
		this.publishers = publishers;
82
	}
83

  
84
	public List<LocalDate> getPublicationDates() {
85
		return publicationDates;
86
	}
87

  
88
	public void setPublicationDates(List<LocalDate> publicationDates) {
89
		this.publicationDates = publicationDates;
90
	}
91

  
92
	public List<String> getSubjects() {
93
		return subjects;
94
	}
95

  
96
	public void setSubjects(List<String> subjects) {
97
		this.subjects = subjects;
98
	}
99

  
100
	public List<Contributor> getContributors() {
101
		return contributors;
102
	}
103

  
104
	public void setContributors(List<Contributor> contributors) {
105
		this.contributors = contributors;
106
	}
107

  
108
	public List<LocalDate> getCreatedDates() {
109
		return createdDates;
110
	}
111

  
112
	public void setCreatedDates(List<LocalDate> createdDates) {
113
		this.createdDates = createdDates;
114
	}
115

  
116
	public List<LocalDate> getUpdatedDates() {
117
		return updatedDates;
118
	}
119

  
120
	public void setUpdatedDates(List<LocalDate> updatedDates) {
121
		this.updatedDates = updatedDates;
122
	}
123

  
124
	public List<String> getLanguages() {
125
		return languages;
126
	}
127

  
128
	public void setLanguages(List<String> languages) {
129
		this.languages = languages;
130
	}
131

  
132
	public List<ResourceType> getResourceTypes() {
133
		return resourceTypes;
134
	}
135

  
136
	public void setResourceTypes(List<ResourceType> resourceTypes) {
137
		this.resourceTypes = resourceTypes;
138
	}
139

  
140
	public List<AlternateIdentifier> getAlternateIdentifier() {
141
		return alternateIdentifier;
142
	}
143

  
144
	public void setAlternateIdentifier(List<AlternateIdentifier> alternateIdentifier) {
145
		this.alternateIdentifier = alternateIdentifier;
146
	}
147

  
148
	public List<Citation> getCitations() {
149
		return citations;
150
	}
151

  
152
	public void setCitations(List<Citation> citations) {
153
		this.citations = citations;
154
	}
155

  
156
	public List<String> getSizes() {
157
		return sizes;
158
	}
159

  
160
	public void setSizes(List<String> sizes) {
161
		this.sizes = sizes;
162
	}
163

  
164
	public List<String> getFormat() {
165
		return format;
166
	}
167

  
168
	public void setFormat(List<String> format) {
169
		this.format = format;
170
	}
171

  
172
	public List<String> getVersion() {
173
		return version;
174
	}
175

  
176
	public void setVersion(List<String> version) {
177
		this.version = version;
178
	}
179

  
180
	public List<License> getLicenses() {
181
		return licenses;
182
	}
183

  
184
	public void setLicenses(List<License> licenses) {
185
		this.licenses = licenses;
186
	}
187

  
188
	public List<String> getDescriptions() {
189
		return descriptions;
190
	}
191

  
192
	public void setDescriptions(List<String> descriptions) {
193
		this.descriptions = descriptions;
194
	}
195

  
196
	public List<String> getDisambiguatingDescriptions() {
197
		return disambiguatingDescriptions;
198
	}
199

  
200
	public void setDisambiguatingDescriptions(List<String> disambiguatingDescriptions) {
201
		this.disambiguatingDescriptions = disambiguatingDescriptions;
202
	}
203

  
204
	public List<SpatialCoverage> getGeoLocations() {
205
		return geoLocations;
206
	}
207

  
208
	public void setGeoLocations(List<SpatialCoverage> geoLocations) {
209
		this.geoLocations = geoLocations;
210
	}
211

  
212
	private  static String emptyXml;
213
	private  static Object lockEmptyXml = new Object();
214
	public static String emptyXml() {
215
		if(DatasetDocument.emptyXml!=null) return DatasetDocument.emptyXml;
216

  
217
		String xml = null;
218
		try {
219
			DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
220
			DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
221
			Document doc = docBuilder.newDocument();
222

  
223
			Element root = doc.createElement("dataset");
224
			doc.appendChild(root);
225

  
226
			TransformerFactory tf = TransformerFactory.newInstance();
227
			Transformer transformer = tf.newTransformer();
228
			StringWriter writer = new StringWriter();
229
			transformer.transform(new DOMSource(doc), new StreamResult(writer));
230
			xml = writer.getBuffer().toString();
231
		}catch(Exception ex){
232
			xml = "<dataset/>";
233
		}
234

  
235
		synchronized (DatasetDocument.lockEmptyXml) {
236
			if (DatasetDocument.emptyXml == null) DatasetDocument.emptyXml = xml;
237
		}
238

  
239
		return DatasetDocument.emptyXml;
240
	}
241

  
242
	public String toXml() throws Exception {
243
		DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
244
		DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
245
		Document doc = docBuilder.newDocument();
246

  
247
		Element root = doc.createElement("dataset");
248
		doc.appendChild(root);
249

  
250
		if(this.identifiers!=null){
251
			for(Identifier item : this.identifiers){
252
				item.toXml(root);
253
			}
254
		}
255
		if(this.creators!=null){
256
			Element creators = doc.createElement("creators");
257
			root.appendChild(creators);
258
			for(Creator item : this.creators){
259
				item.toXml(creators);
260
			}
261
		}
262
		if(this.titles!=null || this.alternativeTitles!=null){
263
			Element titles = doc.createElement("titles");
264
			root.appendChild(titles);
265
			if(this.titles!=null) {
266
				for (String item : this.titles) {
267
					Element title = doc.createElement("title");
268
					titles.appendChild(title);
269
					title.appendChild(doc.createTextNode(item));
270
				}
271
			}
272
			if(this.alternativeTitles!=null) {
273
				for (String item : this.alternativeTitles) {
274
					Element title = doc.createElement("title");
275
					titles.appendChild(title);
276
					title.setAttribute("titleType", "AlternativeTitle");
277
					title.appendChild(doc.createTextNode(item));
278
				}
279
			}
280
		}
281
		if(this.publishers!=null){
282
			for(String item : this.publishers){
283
				Element publisher = doc.createElement("publisher");
284
				root.appendChild(publisher);
285
				publisher.appendChild(doc.createTextNode(item));
286
			}
287
		}
288
		if(this.publicationDates!=null){
289
			for(LocalDate item : this.publicationDates){
290
				Element publicationYear = doc.createElement("publicationYear");
291
				root.appendChild(publicationYear);
292
				publicationYear.appendChild(doc.createTextNode(Integer.toString(item.getYear())));
293
			}
294
		}
295
		if(this.subjects!=null){
296
			Element subjects = doc.createElement("subjects");
297
			root.appendChild(subjects);
298
			for(String item : this.subjects){
299
				Element subject = doc.createElement("subject");
300
				subjects.appendChild(subject);
301
				subject.appendChild(doc.createTextNode(item));
302
			}
303
		}
304
		if(this.contributors!=null){
305
			for(Contributor item : this.contributors){
306
				item.toXml(root);
307
			}
308
		}
309
		if(this.createdDates!=null || this.updatedDates!=null){
310
			Element dates = doc.createElement("dates");
311
			root.appendChild(dates);
312

  
313
			DateTimeFormatter formatter = DateTimeFormatter.ofPattern("YYYY-MM-DD");
314

  
315
			if(createdDates!=null) {
316
				for (LocalDate item : this.createdDates) {
317
					Element date = doc.createElement("date");
318
					root.appendChild(date);
319
					date.setAttribute("dateType", "Created");
320
					date.appendChild(doc.createTextNode(item.format(formatter)));
321
				}
322
			}
323
			if(updatedDates!=null) {
324
				for (LocalDate item : this.updatedDates) {
325
					Element date = doc.createElement("date");
326
					root.appendChild(date);
327
					date.setAttribute("dateType", "Updated");
328
					date.appendChild(doc.createTextNode(item.format(formatter)));
329
				}
330
			}
331
		}
332
		if(this.languages!=null){
333
			for(String item : this.languages){
334
				Element language = doc.createElement("language");
335
				root.appendChild(language);
336
				language.appendChild(doc.createTextNode(item));
337
			}
338
		}
339
		if(this.resourceTypes!=null){
340
			for(ResourceType item : this.resourceTypes){
341
				item.toXml(root);
342
			}
343
		}
344
		if(this.alternateIdentifier!=null){
345
			Element alternateIdentifiers = doc.createElement("alternateIdentifiers");
346
			root.appendChild(alternateIdentifiers);
347
			for(AlternateIdentifier item : this.alternateIdentifier){
348
				item.toXml(alternateIdentifiers);
349
			}
350
		}
351
		if(this.citations!=null){
352
			for(Citation item : this.citations){
353
				item.toXml(root);
354
			}
355
		}
356
		if(this.sizes!=null){
357
			Element sizes = doc.createElement("sizes");
358
			root.appendChild(sizes);
359
			for(String item : this.sizes){
360
				Element size = doc.createElement("size");
361
				sizes.appendChild(size);
362
				size.appendChild(doc.createTextNode(item));
363
			}
364
		}
365
		if(this.format!=null){
366
			Element formats = doc.createElement("formats");
367
			root.appendChild(formats);
368
			for(String item : this.format){
369
				Element format = doc.createElement("format");
370
				formats.appendChild(format);
371
				format.appendChild(doc.createTextNode(item));
372
			}
373
		}
374
		if(this.version!=null){
375
			for(String item : this.version){
376
				Element version = doc.createElement("version");
377
				root.appendChild(version);
378
				version.appendChild(doc.createTextNode(item));
379
			}
380
		}
381
		if(this.licenses!=null){
382
			Element rightsList = doc.createElement("rightsList");
383
			root.appendChild(rightsList);
384
			for(License item : this.licenses){
385
				item.toXml(rightsList);
386
			}
387
		}
388
		if(this.descriptions!=null || this.disambiguatingDescriptions!=null){
389
			Element descriptions = doc.createElement("descriptions");
390
			root.appendChild(descriptions);
391
			if(this.descriptions!=null) {
392
				for (String item : this.descriptions) {
393
					Element description = doc.createElement("description");
394
					descriptions.appendChild(description);
395
					description.setAttribute("descriptionType", "Abstract");
396
					description.appendChild(doc.createTextNode(item));
397
				}
398
			}
399
			if(this.disambiguatingDescriptions!=null) {
400
				for (String item : this.disambiguatingDescriptions) {
401
					Element description = doc.createElement("description");
402
					descriptions.appendChild(description);
403
					description.setAttribute("descriptionType", "Other");
404
					description.appendChild(doc.createTextNode(item));
405
				}
406
			}
407
		}
408
		if(this.geoLocations!=null){
409
			Element geoLocations = doc.createElement("geoLocations");
410
			root.appendChild(geoLocations);
411
			for(SpatialCoverage item : this.geoLocations){
412
				item.toXml(geoLocations);
413
			}
414
		}
415

  
416
		TransformerFactory tf = TransformerFactory.newInstance();
417
		Transformer transformer = tf.newTransformer();
418
		StringWriter writer = new StringWriter();
419
		transformer.transform(new DOMSource(doc), new StreamResult(writer));
420
		String xml = writer.getBuffer().toString();
421
		return xml;
422
	}
423

  
424
	public static class SpatialCoverage{
425
		public static class Point{
426
			public String latitude;
427
			public String longitude;
428

  
429
			public Point() {}
430

  
431
			public Point(String latitude, String longitude){
432
				this.latitude = latitude;
433
				this.longitude = longitude;
434
			}
435
		}
436
		public String name;
437
		public List<Point> points;
438
		public List<String> boxes;
439

  
440
		public SpatialCoverage() {}
441

  
442
		public SpatialCoverage(String name, List<Point> points, List<String> boxes ) {
443
			this.name = name;
444
			this.points = points;
445
			this.boxes = boxes;
446
		}
447

  
448
		public void toXml(Element parent){
449
			Element node = parent.getOwnerDocument().createElement("geoLocation");
450
			parent.appendChild(node);
451

  
452
			if(this.points!=null) {
453
				for(Point point : this.points) {
454
					if(point.latitude == null || point.longitude == null) continue;
455
					Element geoLocationPoint = parent.getOwnerDocument().createElement("geoLocationPoint");
456
					geoLocationPoint.appendChild(parent.getOwnerDocument().createTextNode(String.format("%s %s", point.latitude, point.longitude)));
457
					node.appendChild(geoLocationPoint);
458
				}
459
			}
460
			if(this.boxes!=null) {
461
				for(String box : this.boxes) {
462
					if(box == null) continue;
463
					Element geoLocationBox = parent.getOwnerDocument().createElement("geoLocationBox");
464
					geoLocationBox.appendChild(parent.getOwnerDocument().createTextNode(box));
465
					node.appendChild(geoLocationBox);
466
				}
467
			}
468
			if(this.name!=null) {
469
				Element geoLocationPlace = parent.getOwnerDocument().createElement("geoLocationPlace");
470
				geoLocationPlace.appendChild(parent.getOwnerDocument().createTextNode(this.name));
471
				node.appendChild(geoLocationPlace);
472
			}
473
		}
474
	}
475

  
476
	public static class License{
477
		public String name;
478
		public String url;
479

  
480
		public License() {}
481

  
482
		public License(String name, String url) {
483
			this.name = name;
484
			this.url = url;
485
		}
486

  
487
		public void toXml(Element parent){
488
			Element node = parent.getOwnerDocument().createElement("rights");
489
			parent.appendChild(node);
490

  
491
			if(this.url!=null) {
492
				node.setAttribute("rightsURI", this.url);
493
			}
494
			if(this.name!=null) {
495
				node.appendChild(parent.getOwnerDocument().createTextNode(this.name));
496
			}
497
		}
498
	}
499

  
500
	public static class Citation{
501
		public enum CitationIdentifierType{
502
			ARK, arXiv, bibcode, DOI, EAN13, EISSN, Handle, ISBN, ISSN, ISTC, LISSN, LSID, PMID,
503
			PURL, UPC, URL, URN
504
		}
505

  
506
		public CitationIdentifierType type;
507
		public String value;
508

  
509
		public Citation() {}
510

  
511
		public Citation(String value, CitationIdentifierType type) {
512
			this.value = value;
513
			this.type = type;
514
		}
515

  
516
		public void toXml(Element parent){
517
			Element node = parent.getOwnerDocument().createElement("relatedIdentifier");
518
			parent.appendChild(node);
519

  
520
			node.setAttribute("relatedIdentifierType", this.type.toString());
521
			node.setAttribute("relationType", "Cites");
522
			node.appendChild(parent.getOwnerDocument().createTextNode(this.value));
523
		}
524
	}
525

  
526
	public static class Contributor{
527
		public enum ContributorType{
528
			ContactPerson, DataCollector, DataCurator, DataManager, Distributor, Editor, Funder, HostingInstitution,
529
			Producer, ProjectLeader, ProjectManager, ProjectMember, RegistrationAgency, RegistrationAuthority,
530
			RelatedPerson, Researcher, ResearchGroup, RightsHolder, Sponsor, Supervisor, WorkPackageLeader, Other
531
		}
532

  
533
		public String name;
534
		public List<String> affiliations;
535
		public ContributorType type;
536

  
537
		public Contributor() {
538
		}
539

  
540
		public Contributor(String name) {
541
			this.name = name;
542
		}
543

  
544
		public Contributor(String name, List<String> affiliations) {
545
			this.name = name;
546
			this.affiliations = affiliations;
547
		}
548

  
549
		public Contributor(String name, List<String> affiliations, ContributorType type) {
550
			this.name = name;
551
			this.affiliations = affiliations;
552
			this.type = type;
553
		}
554

  
555
		public void toXml(Element parent){
556
			Element node = parent.getOwnerDocument().createElement("contributor");
557
			parent.appendChild(node);
558

  
559
			node.setAttribute("contributorType", this.type.toString());
560

  
561
			if(this.name!=null) {
562
				Element contributorName = parent.getOwnerDocument().createElement("contributorName");
563
				node.appendChild(contributorName);
564
				contributorName.appendChild(parent.getOwnerDocument().createTextNode(this.name));
565
			}
566
			if(this.affiliations!=null) {
567
				for(String item : this.affiliations) {
568
					Element affiliation = parent.getOwnerDocument().createElement("affiliation");
569
					node.appendChild(affiliation);
570
					affiliation.appendChild(parent.getOwnerDocument().createTextNode(item));
571
				}
572
			}
573
		}
574
	}
575

  
576
	public static class AlternateIdentifier{
577
		public String identifier;
578
		public String type;
579

  
580
		public AlternateIdentifier() {}
581

  
582
		public AlternateIdentifier(String identifier, String type) {
583
			this.identifier = identifier;
584
			this.type = type;
585
		}
586

  
587
		public void toXml(Element parent){
588
			Element node = parent.getOwnerDocument().createElement("alternateIdentifier");
589
			parent.appendChild(node);
590

  
591
			if(this.type!=null) {
592
				node.setAttribute("alternateIdentifierType", this.type);
593
			}
594
			if(this.identifier!=null) {
595
				node.appendChild(parent.getOwnerDocument().createTextNode(this.identifier));
596
			}
597
		}
598
	}
599

  
600
	public static class ResourceType{
601
		public enum ResourceTypeGeneralType {
602
			Audiovisual, Collection, Dataset, Event, Image, InteractiveResource, Model, PhysicalObject, Service,
603
			Software, Sound, Text, Workflow, Other
604
		}
605

  
606
		public ResourceTypeGeneralType type;
607

  
608
		public ResourceType() {}
609

  
610
		public ResourceType(ResourceTypeGeneralType type) {
611
			this.type = type;
612
		}
613

  
614
		public void toXml(Element parent){
615
			Element node = parent.getOwnerDocument().createElement("resourceType");
616
			parent.appendChild(node);
617

  
618
			if(this.type!=null) {
619
				node.setAttribute("resourceTypeGeneral", this.type.toString());
620
			}
621
		}
622
	}
623

  
624
	public static class Creator {
625
		public String name;
626
		public List<String> affiliations;
627

  
628
		public Creator() {
629
		}
630

  
631
		public Creator(String name) {
632
			this.name = name;
633
		}
634

  
635
		public Creator(String name, List<String> affiliations) {
636
			this.name = name;
637
			this.affiliations = affiliations;
638
		}
639

  
640
		public void toXml(Element parent){
641
			Element node = parent.getOwnerDocument().createElement("creator");
642
			parent.appendChild(node);
643

  
644
			if(this.name!=null) {
645
				Element creatorName = parent.getOwnerDocument().createElement("creatorName");
646
				node.appendChild(creatorName);
647
				creatorName.appendChild(parent.getOwnerDocument().createTextNode(this.name));
648
			}
649
			if(this.affiliations!=null) {
650
				for(String item : this.affiliations) {
651
					Element affiliation = parent.getOwnerDocument().createElement("affiliation");
652
					node.appendChild(affiliation);
653
					affiliation.appendChild(parent.getOwnerDocument().createTextNode(item));
654
				}
655
			}
656
		}
657
	}
658

  
659
	public static class Identifier {
660
		public enum IdentifierType {
661
			ARK, DOI, Handle, PURL, URN, URL
662
		}
663

  
664
		public String value;
665
		public IdentifierType type;
666

  
667
		public Identifier() {
668
		}
669

  
670
		public Identifier(IdentifierType type, String value) {
671
			this.type = type;
672
			this.value = value;
673
		}
674

  
675
		public void toXml(Element parent){
676
			Element node = parent.getOwnerDocument().createElement("identifier");
677
			parent.appendChild(node);
678

  
679
			node.setAttribute("identifierType", this.type.toString());
680
			if(this.value!=null) {
681
				node.appendChild(parent.getOwnerDocument().createTextNode(this.value));
682
			}
683
		}
684
	}
685
}
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.0/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/httpapi/HttpApiRepositoryIterable.java
1
package eu.dnetlib.data.collector.plugins.schemaorg.httpapi;
2

  
3
import eu.dnetlib.data.collector.plugins.schemaorg.RepositoryIterable;
4

  
5
public interface HttpApiRepositoryIterable extends RepositoryIterable {
6
}
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.0/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/sitemapindex/SitemapIndexIterator.java
1
package eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex;
2

  
3
import eu.dnetlib.data.collector.plugins.schemaorg.Utils;
4
import org.apache.commons.io.IOUtils;
5
import org.apache.commons.logging.Log;
6
import org.apache.commons.logging.LogFactory;
7

  
8
import java.net.URL;
9
import java.nio.charset.Charset;
10
import java.util.*;
11

  
12
public class SitemapIndexIterator implements Iterator<String> {
13
	private static final Log log = LogFactory.getLog(SitemapIndexIterator.class);
14

  
15
	public static class Options {
16
		private URL indexUrl;
17
		private Charset charset;
18

  
19
		public Options(){}
20

  
21
		public Options(URL indexUrl, Charset charset){
22
			this.indexUrl = indexUrl;
23
			this.charset = charset;
24
		}
25

  
26
		public URL getIndexUrl() {
27
			return indexUrl;
28
		}
29

  
... This diff was truncated because it exceeds the maximum size that can be displayed.

Also available in: Unified diff