Revision 62810
Added by Michele Artini about 1 year ago
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.0/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/DatasetMappingIterator.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.schemaorg; |
|
2 |
|
|
3 |
import org.apache.commons.logging.Log; |
|
4 |
import org.apache.commons.logging.LogFactory; |
|
5 |
import org.json.JSONObject; |
|
6 |
|
|
7 |
import java.net.URL; |
|
8 |
import java.time.LocalDate; |
|
9 |
import java.time.format.DateTimeFormatter; |
|
10 |
import java.util.*; |
|
11 |
|
|
12 |
public class DatasetMappingIterator implements Iterator<String> { |
|
13 |
private static final Log log = LogFactory.getLog(EndpointAccessIterator.class); |
|
14 |
|
|
15 |
public static class Options { |
|
16 |
public static class IdentifierOptions{ |
|
17 |
public List<String> mappingARK; |
|
18 |
public List<String> mappingDOI; |
|
19 |
public List<String> mappingHandle; |
|
20 |
public List<String> mappingPURL; |
|
21 |
public List<String> mappingURN; |
|
22 |
public List<String> mappingURL; |
|
23 |
public DatasetDocument.Identifier.IdentifierType fallbackType; |
|
24 |
public Boolean fallbackURL; |
|
25 |
} |
|
26 |
|
|
27 |
public static class ContributorOptions{ |
|
28 |
public DatasetDocument.Contributor.ContributorType fallbackType; |
|
29 |
} |
|
30 |
|
|
31 |
public static class PublicationDateOptions{ |
|
32 |
public String format; |
|
33 |
} |
|
34 |
|
|
35 |
public static class CreatedDateOptions{ |
|
36 |
public String format; |
|
37 |
} |
|
38 |
|
|
39 |
public static class UpdatedDateOptions{ |
|
40 |
public String format; |
|
41 |
} |
|
42 |
|
|
43 |
private IdentifierOptions identifierOptions; |
|
44 |
private PublicationDateOptions publicationDateOptions; |
|
45 |
private ContributorOptions contributorOptions; |
|
46 |
private CreatedDateOptions createdDateOptions; |
|
47 |
private UpdatedDateOptions updatedDateOptions; |
|
48 |
|
|
49 |
public UpdatedDateOptions getUpdatedDateOptions() { |
|
50 |
return updatedDateOptions; |
|
51 |
} |
|
52 |
|
|
53 |
public void setUpdatedDateOptions(UpdatedDateOptions updatedDateOptions) { |
|
54 |
this.updatedDateOptions = updatedDateOptions; |
|
55 |
} |
|
56 |
|
|
57 |
public CreatedDateOptions getCreatedDateOptions() { |
|
58 |
return createdDateOptions; |
|
59 |
} |
|
60 |
|
|
61 |
public void setCreatedDateOptions(CreatedDateOptions createdDateOptions) { |
|
62 |
this.createdDateOptions = createdDateOptions; |
|
63 |
} |
|
64 |
|
|
65 |
public ContributorOptions getContributorOptions() { |
|
66 |
return contributorOptions; |
|
67 |
} |
|
68 |
|
|
69 |
public void setContributorOptions(ContributorOptions contributorOptions) { |
|
70 |
this.contributorOptions = contributorOptions; |
|
71 |
} |
|
72 |
|
|
73 |
public PublicationDateOptions getPublicationDateOptions() { |
|
74 |
return publicationDateOptions; |
|
75 |
} |
|
76 |
|
|
77 |
public void setPublicationDateOptions(PublicationDateOptions publicationDateOptions) { |
|
78 |
this.publicationDateOptions = publicationDateOptions; |
|
79 |
} |
|
80 |
|
|
81 |
public IdentifierOptions getIdentifierOptions() { |
|
82 |
return identifierOptions; |
|
83 |
} |
|
84 |
|
|
85 |
public void setIdentifierOptions(IdentifierOptions identifierOptions) { |
|
86 |
this.identifierOptions = identifierOptions; |
|
87 |
} |
|
88 |
} |
|
89 |
|
|
90 |
/** Source of the JSON-LD documents that are turned into XML. */
private EndpointAccessIterator endpointAccessIterator;
/** Mapping configuration consulted by the extract* methods. */
private Options options;

/**
 * Creates an iterator that maps every document delivered by the given endpoint iterator.
 *
 * @param options                mapping configuration
 * @param endpointAccessIterator iterator supplying the JSON-LD documents
 */
public DatasetMappingIterator(Options options, EndpointAccessIterator endpointAccessIterator) {
    this.endpointAccessIterator = endpointAccessIterator;
    this.options = options;
}
|
97 |
|
|
98 |
@Override |
|
99 |
public boolean hasNext() { |
|
100 |
return this.endpointAccessIterator.hasNext(); |
|
101 |
} |
|
102 |
|
|
103 |
@Override |
|
104 |
public String next() { |
|
105 |
JSONObject document = this.endpointAccessIterator.next(); |
|
106 |
String xml = null; |
|
107 |
if (document == null) { |
|
108 |
log.debug("no document provided to process. returning empty"); |
|
109 |
xml = DatasetDocument.emptyXml(); |
|
110 |
} |
|
111 |
else { |
|
112 |
log.debug("building document"); |
|
113 |
xml = this.buildDataset(document); |
|
114 |
if (!Utils.validateXml(xml)) { |
|
115 |
log.debug("xml not valid. setting to empty"); |
|
116 |
xml = null; |
|
117 |
} |
|
118 |
if (xml == null) { |
|
119 |
log.debug("could not build xml. returning empty"); |
|
120 |
xml = DatasetDocument.emptyXml(); |
|
121 |
} |
|
122 |
} |
|
123 |
|
|
124 |
//if all else fails |
|
125 |
if(xml == null){ |
|
126 |
log.debug("could not build xml. returning empty"); |
|
127 |
xml = "<dataset/>"; |
|
128 |
} |
|
129 |
|
|
130 |
log.debug("xml document for dataset is: "+xml); |
|
131 |
|
|
132 |
return xml; |
|
133 |
} |
|
134 |
|
|
135 |
private String buildDataset(JSONObject document){ |
|
136 |
String xml = null; |
|
137 |
try{ |
|
138 |
DatasetDocument dataset = new DatasetDocument(); |
|
139 |
|
|
140 |
dataset.setIdentifiers(this.extractIdentifier(document)); |
|
141 |
dataset.setCreators(this.extractCreator(document)); |
|
142 |
dataset.setTitles(this.extractTitles(document)); |
|
143 |
dataset.setAlternativeTitles(this.extractAlternateTitles(document)); |
|
144 |
dataset.setPublishers(this.extractPublisher(document)); |
|
145 |
dataset.setPublicationDates(this.extractPublicationDate(document)); |
|
146 |
dataset.setSubjects(this.extractSubjects(document)); |
|
147 |
dataset.setContributors(this.extractContributors(document)); |
|
148 |
dataset.setCreatedDates(this.extractCreatedDate(document)); |
|
149 |
dataset.setUpdatedDates(this.extractUpdatedDate(document)); |
|
150 |
dataset.setLanguages(this.extractLanguages(document)); |
|
151 |
dataset.setResourceTypes(this.extractResourceTypes(document)); |
|
152 |
dataset.setAlternateIdentifier(this.extractAlternateIdentifiers(document)); |
|
153 |
dataset.setCitations(this.extractCitations(document)); |
|
154 |
dataset.setSizes(this.extractSize(document)); |
|
155 |
dataset.setFormat(this.extractEncodingFormat(document)); |
|
156 |
dataset.setVersion(this.extractVersion(document)); |
|
157 |
dataset.setLicenses(this.extractLicense(document)); |
|
158 |
dataset.setDescriptions(this.extractDescription(document)); |
|
159 |
dataset.setDisambiguatingDescriptions(this.extractDisambiguatingDescription(document)); |
|
160 |
dataset.setGeoLocations(this.extractSpatialCoverage(document)); |
|
161 |
|
|
162 |
log.debug("document contains native identifier: : "+(dataset.getIdentifiers()!=null && dataset.getIdentifiers().size() > 0)); |
|
163 |
|
|
164 |
if((dataset.getIdentifiers() == null || dataset.getIdentifiers().size() == 0) && |
|
165 |
this.options.getIdentifierOptions().fallbackURL){ |
|
166 |
log.debug("falling back to url identifier"); |
|
167 |
dataset.setIdentifiers(this.extractIdentifierFallbackURL(document)); |
|
168 |
log.debug("document contains overridden identifier: : "+(dataset.getIdentifiers()!=null && dataset.getIdentifiers().size() > 0)); |
|
169 |
} |
|
170 |
|
|
171 |
xml = dataset.toXml(); |
|
172 |
} |
|
173 |
catch(Exception ex){ |
|
174 |
log.error("problem constructing dataset xml. returning empty", ex); |
|
175 |
xml = null; |
|
176 |
} |
|
177 |
return xml; |
|
178 |
} |
|
179 |
|
|
180 |
private List<DatasetDocument.Identifier> extractIdentifierFallbackURL(JSONObject document){ |
|
181 |
List<String> urls = JSONLDUtils.extractString(document, "url"); |
|
182 |
|
|
183 |
ArrayList<DatasetDocument.Identifier> curated = new ArrayList<>(); |
|
184 |
for(String item : urls){ |
|
185 |
if(item == null || item.trim().length() == 0) continue; |
|
186 |
curated.add(new DatasetDocument.Identifier(DatasetDocument.Identifier.IdentifierType.URL, item.trim())); |
|
187 |
} |
|
188 |
return curated; |
|
189 |
} |
|
190 |
|
|
191 |
private List<DatasetDocument.SpatialCoverage> extractSpatialCoverage(JSONObject document){ |
|
192 |
List<JSONLDUtils.PlaceInfo> spatials = JSONLDUtils.extractPlaces(document, "spatialCoverage"); |
|
193 |
|
|
194 |
ArrayList<DatasetDocument.SpatialCoverage> curated = new ArrayList<>(); |
|
195 |
for(JSONLDUtils.PlaceInfo item : spatials){ |
|
196 |
if((item.name == null || item.name.trim().length() == 0) && |
|
197 |
(item.geoCoordinates == null || item.geoCoordinates.size() == 0) && |
|
198 |
(item.geoShapes == null || item.geoShapes.size() == 0)) continue; |
|
199 |
|
|
200 |
List<DatasetDocument.SpatialCoverage.Point> points = new ArrayList<>(); |
|
201 |
List<String> boxes = new ArrayList<>(); |
|
202 |
if(item.geoCoordinates!=null) { |
|
203 |
for (JSONLDUtils.GeoCoordinatesInfo iter : item.geoCoordinates){ |
|
204 |
points.add(new DatasetDocument.SpatialCoverage.Point(iter.latitude, iter.longitude)); |
|
205 |
} |
|
206 |
} |
|
207 |
if(item.geoShapes!=null) { |
|
208 |
for (JSONLDUtils.GeoShapeInfo iter : item.geoShapes){ |
|
209 |
boxes.add(iter.box); |
|
210 |
} |
|
211 |
} |
|
212 |
curated.add(new DatasetDocument.SpatialCoverage(item.name, points, boxes)); |
|
213 |
} |
|
214 |
return curated; |
|
215 |
} |
|
216 |
|
|
217 |
private List<String> extractDescription(JSONObject document){ |
|
218 |
List<String> descriptions = JSONLDUtils.extractString(document, "description"); |
|
219 |
|
|
220 |
ArrayList<String> curated = new ArrayList<>(); |
|
221 |
for(String item : descriptions){ |
|
222 |
if(item == null || item.trim().length() == 0) continue; |
|
223 |
curated.add(item); |
|
224 |
} |
|
225 |
return curated; |
|
226 |
} |
|
227 |
|
|
228 |
private List<String> extractDisambiguatingDescription(JSONObject document){ |
|
229 |
List<String> descriptions = JSONLDUtils.extractString(document, "disambiguatingDescription"); |
|
230 |
|
|
231 |
ArrayList<String> curated = new ArrayList<>(); |
|
232 |
for(String item : descriptions){ |
|
233 |
if(item == null || item.trim().length() == 0) continue; |
|
234 |
curated.add(item); |
|
235 |
} |
|
236 |
return curated; |
|
237 |
} |
|
238 |
|
|
239 |
private List<DatasetDocument.License> extractLicense(JSONObject document){ |
|
240 |
List<JSONLDUtils.LicenseInfo> licenses = JSONLDUtils.extractLicenses(document, "license"); |
|
241 |
|
|
242 |
ArrayList<DatasetDocument.License> curated = new ArrayList<>(); |
|
243 |
for(JSONLDUtils.LicenseInfo item : licenses){ |
|
244 |
if(item.url == null || item.url.trim().length() == 0) continue; |
|
245 |
curated.add(new DatasetDocument.License(item.name, item.url)); |
|
246 |
} |
|
247 |
return curated; |
|
248 |
} |
|
249 |
|
|
250 |
private List<String> extractVersion(JSONObject document){ |
|
251 |
List<String> versions = JSONLDUtils.extractString(document, "version"); |
|
252 |
|
|
253 |
ArrayList<String> curated = new ArrayList<>(); |
|
254 |
for(String item : versions){ |
|
255 |
if(item == null || item.trim().length() == 0) continue; |
|
256 |
curated.add(item); |
|
257 |
} |
|
258 |
return curated; |
|
259 |
} |
|
260 |
|
|
261 |
private List<String> extractSize(JSONObject document) { |
|
262 |
List<String> sizes = JSONLDUtils.extractSize(document, "distribution"); |
|
263 |
|
|
264 |
HashSet<String> curated = new HashSet<>(); |
|
265 |
for (String item : sizes) { |
|
266 |
if (item == null || item.trim().length() == 0) continue; |
|
267 |
curated.add(item); |
|
268 |
} |
|
269 |
return new ArrayList<>(curated); |
|
270 |
} |
|
271 |
|
|
272 |
private List<String> extractEncodingFormat(JSONObject document){ |
|
273 |
List<String> formats = JSONLDUtils.extractEncodingFormat(document, "distribution"); |
|
274 |
|
|
275 |
HashSet<String> curated = new HashSet<>(); |
|
276 |
for(String item : formats){ |
|
277 |
if(item == null || item.trim().length() == 0) continue; |
|
278 |
curated.add(item); |
|
279 |
} |
|
280 |
return new ArrayList<>(curated); |
|
281 |
} |
|
282 |
|
|
283 |
//TODO: Handle different citation types. Currently only urls |
|
284 |
private List<DatasetDocument.Citation> extractCitations(JSONObject document){ |
|
285 |
List<JSONLDUtils.CitationInfo> citations = JSONLDUtils.extractCitations(document, "citation"); |
|
286 |
|
|
287 |
ArrayList<DatasetDocument.Citation> curated = new ArrayList<>(); |
|
288 |
for(JSONLDUtils.CitationInfo item : citations){ |
|
289 |
if(item.url == null || item.url.trim().length() == 0) continue; |
|
290 |
try{ |
|
291 |
new URL(item.url); |
|
292 |
}catch (Exception ex){ |
|
293 |
continue; |
|
294 |
} |
|
295 |
curated.add(new DatasetDocument.Citation(item.url, DatasetDocument.Citation.CitationIdentifierType.URL)); |
|
296 |
} |
|
297 |
return curated; |
|
298 |
} |
|
299 |
|
|
300 |
private List<DatasetDocument.AlternateIdentifier> extractAlternateIdentifiers(JSONObject document){ |
|
301 |
List<String> issns = JSONLDUtils.extractString(document, "issn"); |
|
302 |
List<String> urls = JSONLDUtils.extractString(document, "url"); |
|
303 |
|
|
304 |
ArrayList<DatasetDocument.AlternateIdentifier> curated = new ArrayList<>(); |
|
305 |
for(String item : issns){ |
|
306 |
if(item == null || item.trim().length() == 0) continue; |
|
307 |
curated.add(new DatasetDocument.AlternateIdentifier(item.trim(), "ISSN")); |
|
308 |
} |
|
309 |
for(String item : urls){ |
|
310 |
if(item == null || item.trim().length() == 0) continue; |
|
311 |
curated.add(new DatasetDocument.AlternateIdentifier(item.trim(), "URL")); |
|
312 |
} |
|
313 |
return curated; |
|
314 |
} |
|
315 |
|
|
316 |
private List<DatasetDocument.ResourceType> extractResourceTypes(JSONObject document){ |
|
317 |
List<DatasetDocument.ResourceType> resourceTypes = new ArrayList<>(); |
|
318 |
resourceTypes.add(new DatasetDocument.ResourceType(DatasetDocument.ResourceType.ResourceTypeGeneralType.Dataset)); |
|
319 |
return resourceTypes; |
|
320 |
} |
|
321 |
|
|
322 |
private List<String> extractLanguages(JSONObject document){ |
|
323 |
List<String> languages = JSONLDUtils.extractLanguage(document, "inLanguage"); |
|
324 |
|
|
325 |
ArrayList<String> curated = new ArrayList<>(); |
|
326 |
for(String item : languages){ |
|
327 |
if(item == null || item.trim().length() == 0) continue; |
|
328 |
curated.add(item); |
|
329 |
} |
|
330 |
return curated; |
|
331 |
} |
|
332 |
|
|
333 |
private List<LocalDate> extractUpdatedDate(JSONObject document){ |
|
334 |
List<LocalDate> updatedDates = new ArrayList<>(); |
|
335 |
if(this.options.getUpdatedDateOptions() == null || this.options.getUpdatedDateOptions().format == null || this.options.getUpdatedDateOptions().format.length() == 0) return updatedDates; |
|
336 |
|
|
337 |
DateTimeFormatter formatter = DateTimeFormatter.ofPattern(this.options.getPublicationDateOptions().format); |
|
338 |
|
|
339 |
List<String> dates = JSONLDUtils.extractString(document, "dateModified"); |
|
340 |
for(String updatedDate : dates){ |
|
341 |
if(updatedDate == null || updatedDate.trim().length() == 0) continue; |
|
342 |
try { |
|
343 |
LocalDate localDate = LocalDate.parse(updatedDate, formatter); |
|
344 |
updatedDates.add(localDate); |
|
345 |
} catch (Exception e) { |
|
346 |
continue; |
|
347 |
} |
|
348 |
} |
|
349 |
return updatedDates; |
|
350 |
} |
|
351 |
|
|
352 |
private List<LocalDate> extractCreatedDate(JSONObject document){ |
|
353 |
List<LocalDate> createdDates = new ArrayList<>(); |
|
354 |
if(this.options.getCreatedDateOptions() == null || this.options.getCreatedDateOptions().format == null || this.options.getCreatedDateOptions().format.length() == 0) return createdDates; |
|
355 |
|
|
356 |
DateTimeFormatter formatter = DateTimeFormatter.ofPattern(this.options.getCreatedDateOptions().format); |
|
357 |
|
|
358 |
List<String> dates = JSONLDUtils.extractString(document, "dateCreated"); |
|
359 |
for(String createdDate : dates){ |
|
360 |
if(createdDate == null || createdDate.trim().length() == 0) continue; |
|
361 |
try { |
|
362 |
LocalDate localDate = LocalDate.parse(createdDate, formatter); |
|
363 |
createdDates.add(localDate); |
|
364 |
} catch (Exception e) { |
|
365 |
continue; |
|
366 |
} |
|
367 |
} |
|
368 |
return createdDates; |
|
369 |
} |
|
370 |
|
|
371 |
private List<DatasetDocument.Contributor> extractContributors(JSONObject document){ |
|
372 |
List<JSONLDUtils.PrincipalInfo> editors = JSONLDUtils.extractPrincipal(document, "editor"); |
|
373 |
List<JSONLDUtils.PrincipalInfo> funders = JSONLDUtils.extractPrincipal(document, "funder"); |
|
374 |
List<JSONLDUtils.PrincipalInfo> producers = JSONLDUtils.extractPrincipal(document, "producer"); |
|
375 |
List<JSONLDUtils.PrincipalInfo> sponsors = JSONLDUtils.extractPrincipal(document, "sponsor"); |
|
376 |
List<JSONLDUtils.PrincipalInfo> constributors = JSONLDUtils.extractPrincipal(document, "contributor"); |
|
377 |
|
|
378 |
ArrayList<DatasetDocument.Contributor> curated = new ArrayList<>(); |
|
379 |
for(JSONLDUtils.PrincipalInfo item : editors){ |
|
380 |
if(item.name() == null || item.name().trim().length() == 0) continue; |
|
381 |
curated.add(new DatasetDocument.Contributor(item.name(), item.affiliationNames(), DatasetDocument.Contributor.ContributorType.Editor)); |
|
382 |
} |
|
383 |
for(JSONLDUtils.PrincipalInfo item : funders){ |
|
384 |
if(item.name() == null || item.name().trim().length() == 0) continue; |
|
385 |
curated.add(new DatasetDocument.Contributor(item.name(), item.affiliationNames(), DatasetDocument.Contributor.ContributorType.Funder)); |
|
386 |
} |
|
387 |
for(JSONLDUtils.PrincipalInfo item : producers){ |
|
388 |
if(item.name() == null || item.name().trim().length() == 0) continue; |
|
389 |
curated.add(new DatasetDocument.Contributor(item.name(), item.affiliationNames(), DatasetDocument.Contributor.ContributorType.Producer)); |
|
390 |
} |
|
391 |
for(JSONLDUtils.PrincipalInfo item : sponsors){ |
|
392 |
if(item.name() == null || item.name().trim().length() == 0) continue; |
|
393 |
curated.add(new DatasetDocument.Contributor(item.name(), item.affiliationNames(), DatasetDocument.Contributor.ContributorType.Sponsor)); |
|
394 |
} |
|
395 |
for(JSONLDUtils.PrincipalInfo item : constributors){ |
|
396 |
if(item.name() == null || item.name().trim().length() == 0) continue; |
|
397 |
DatasetDocument.Contributor.ContributorType type = DatasetDocument.Contributor.ContributorType.Other; |
|
398 |
if(this.options.getContributorOptions()!=null && this.options.getContributorOptions().fallbackType != null) type = this.options.getContributorOptions().fallbackType; |
|
399 |
curated.add(new DatasetDocument.Contributor(item.name(), item.affiliationNames(), type)); |
|
400 |
} |
|
401 |
return curated; |
|
402 |
} |
|
403 |
|
|
404 |
private List<String> extractSubjects(JSONObject document){ |
|
405 |
List<String> subjects = JSONLDUtils.extractString(document, "keywords"); |
|
406 |
|
|
407 |
ArrayList<String> curated = new ArrayList<>(); |
|
408 |
for(String item : subjects){ |
|
409 |
if(item == null || item.trim().length() == 0) continue; |
|
410 |
curated.add(item); |
|
411 |
} |
|
412 |
return curated; |
|
413 |
} |
|
414 |
|
|
415 |
private List<LocalDate> extractPublicationDate(JSONObject document){ |
|
416 |
List<LocalDate> publicationDates = new ArrayList<>(); |
|
417 |
if(this.options.getPublicationDateOptions() == null || this.options.getPublicationDateOptions().format == null || this.options.getPublicationDateOptions().format.length() == 0) return publicationDates; |
|
418 |
|
|
419 |
DateTimeFormatter formatter = DateTimeFormatter.ofPattern(this.options.getPublicationDateOptions().format); |
|
420 |
|
|
421 |
List<String> dates = JSONLDUtils.extractString(document, "datePublished"); |
|
422 |
for(String publicationDate : dates){ |
|
423 |
if(publicationDate == null || publicationDate.trim().length() == 0) continue; |
|
424 |
try { |
|
425 |
LocalDate localDate = LocalDate.parse(publicationDate, formatter); |
|
426 |
publicationDates.add(localDate); |
|
427 |
} catch (Exception e) { |
|
428 |
continue; |
|
429 |
} |
|
430 |
} |
|
431 |
return publicationDates; |
|
432 |
} |
|
433 |
|
|
434 |
private List<String> extractPublisher(JSONObject document){ |
|
435 |
List<JSONLDUtils.PrincipalInfo> publishers = JSONLDUtils.extractPrincipal(document, "publisher"); |
|
436 |
|
|
437 |
ArrayList<String> curated = new ArrayList<>(); |
|
438 |
for(JSONLDUtils.PrincipalInfo item : publishers){ |
|
439 |
if(item.name() == null || item.name().trim().length() == 0) continue; |
|
440 |
curated.add(item.name()); |
|
441 |
} |
|
442 |
return curated; |
|
443 |
} |
|
444 |
|
|
445 |
private List<String> extractTitles(JSONObject document){ |
|
446 |
List<String> names = JSONLDUtils.extractString(document, "name"); |
|
447 |
List<String> headlines = JSONLDUtils.extractString(document, "headline"); |
|
448 |
|
|
449 |
HashSet<String> titles = new HashSet<>(); |
|
450 |
titles.addAll(names); |
|
451 |
titles.addAll(headlines); |
|
452 |
return new ArrayList<>(titles); |
|
453 |
} |
|
454 |
|
|
455 |
private List<String> extractAlternateTitles(JSONObject document){ |
|
456 |
List<String> names = JSONLDUtils.extractString(document, "alternateName"); |
|
457 |
List<String> headlines = JSONLDUtils.extractString(document, "alternativeHeadline"); |
|
458 |
|
|
459 |
HashSet<String> titles = new HashSet<>(); |
|
460 |
titles.addAll(names); |
|
461 |
titles.addAll(headlines); |
|
462 |
return new ArrayList<>(titles); |
|
463 |
} |
|
464 |
|
|
465 |
private List<DatasetDocument.Identifier> extractIdentifier(JSONObject document){ |
|
466 |
List<DatasetDocument.Identifier> curated = new ArrayList<>(); |
|
467 |
|
|
468 |
List<JSONLDUtils.IdentifierInfo> identifiers = JSONLDUtils.extractIdentifier(document, "identifier"); |
|
469 |
|
|
470 |
for(JSONLDUtils.IdentifierInfo item : identifiers){ |
|
471 |
if(item.value == null || item.value.trim().length() == 0) continue; |
|
472 |
if(item.type == null || item.type.trim().length() == 0) { |
|
473 |
if (this.options.getIdentifierOptions().fallbackType == null) continue; |
|
474 |
curated.add(new DatasetDocument.Identifier(this.options.getIdentifierOptions().fallbackType, item.value.trim())); |
|
475 |
} |
|
476 |
else { |
|
477 |
DatasetDocument.Identifier.IdentifierType type = null; |
|
478 |
if(this.options.getIdentifierOptions().mappingARK != null && this.options.getIdentifierOptions().mappingARK.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.ARK; |
|
479 |
else if(this.options.getIdentifierOptions().mappingDOI != null && this.options.getIdentifierOptions().mappingDOI.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.DOI; |
|
480 |
else if(this.options.getIdentifierOptions().mappingHandle != null && this.options.getIdentifierOptions().mappingHandle.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.Handle; |
|
481 |
else if(this.options.getIdentifierOptions().mappingPURL != null && this.options.getIdentifierOptions().mappingPURL.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.PURL; |
|
482 |
else if(this.options.getIdentifierOptions().mappingURL != null && this.options.getIdentifierOptions().mappingURL.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.URL; |
|
483 |
else if(this.options.getIdentifierOptions().mappingURN != null && this.options.getIdentifierOptions().mappingURN.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.URN; |
|
484 |
|
|
485 |
if(type == null) continue; |
|
486 |
curated.add(new DatasetDocument.Identifier(type, item.value.trim())); |
|
487 |
} |
|
488 |
} |
|
489 |
return curated; |
|
490 |
} |
|
491 |
|
|
492 |
private List<DatasetDocument.Creator> extractCreator(JSONObject document){ |
|
493 |
List<JSONLDUtils.PrincipalInfo> creators = JSONLDUtils.extractPrincipal(document, "creator"); |
|
494 |
List<JSONLDUtils.PrincipalInfo> authors = JSONLDUtils.extractPrincipal(document, "author"); |
|
495 |
|
|
496 |
HashSet<String> foundNames = new HashSet<>(); |
|
497 |
List<DatasetDocument.Creator> curated = new ArrayList<>(); |
|
498 |
for(JSONLDUtils.PrincipalInfo item : creators){ |
|
499 |
if(item.name() == null || item.name().trim().length() == 0) continue; |
|
500 |
if(foundNames.contains(item.name())) continue; |
|
501 |
foundNames.add(item.name()); |
|
502 |
curated.add(new DatasetDocument.Creator(item.name(), item.affiliationNames())); |
|
503 |
} |
|
504 |
for(JSONLDUtils.PrincipalInfo item : authors){ |
|
505 |
if(item.name() == null || item.name().trim().length() == 0) continue; |
|
506 |
if(foundNames.contains(item.name())) continue; |
|
507 |
foundNames.add(item.name()); |
|
508 |
|
|
509 |
curated.add(new DatasetDocument.Creator(item.name(), item.affiliationNames())); |
|
510 |
} |
|
511 |
return curated; |
|
512 |
} |
|
513 |
|
|
514 |
} |
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.0/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/SchemaOrgMainKaggle.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.schemaorg; |
|
2 |
|
|
3 |
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; |
|
4 |
import org.apache.commons.io.FileUtils; |
|
5 |
import org.apache.commons.logging.Log; |
|
6 |
import org.apache.commons.logging.LogFactory; |
|
7 |
import org.apache.log4j.ConsoleAppender; |
|
8 |
import org.apache.log4j.Level; |
|
9 |
import org.apache.log4j.Logger; |
|
10 |
import org.apache.log4j.PatternLayout; |
|
11 |
|
|
12 |
import java.io.File; |
|
13 |
import java.nio.charset.StandardCharsets; |
|
14 |
import java.util.HashMap; |
|
15 |
import java.util.concurrent.TimeUnit; |
|
16 |
|
|
17 |
public class SchemaOrgMainKaggle { |
|
18 |
|
|
19 |
private static final Log log = LogFactory.getLog(SchemaOrgMainKaggle.class); |
|
20 |
|
|
21 |
public static void main(String[] args) throws Exception { |
|
22 |
|
|
23 |
ConsoleAppender console = new ConsoleAppender(); |
|
24 |
console.setLayout(new PatternLayout("%d [%p|%c|%C{1}] %m%n")); |
|
25 |
console.setThreshold(Level.DEBUG); |
|
26 |
console.activateOptions(); |
|
27 |
Logger.getLogger("eu.dnetlib.data.collector.plugins").addAppender(console); |
|
28 |
|
|
29 |
HashMap<String,String> params = new HashMap<>(); |
|
30 |
params.put("consumerBlockPolling", Boolean.toString(true)); |
|
31 |
params.put("consumerBlockPollingTimeout", "2"); |
|
32 |
params.put("consumerBlockPollingTimeoutUnit", TimeUnit.MINUTES.toString()); |
|
33 |
params.put("endpointCharset", StandardCharsets.UTF_8.name()); |
|
34 |
params.put("updatedDateFormat", "YYYY-MM-DD"); |
|
35 |
params.put("createdDateFormat", "YYYY-MM-DD"); |
|
36 |
params.put("publicationDateFormat", "YYYY-MM-DD"); |
|
37 |
params.put("contributorFallbackType", DatasetDocument.Contributor.ContributorType.Other.toString()); |
|
38 |
params.put("identifierFallbackType", DatasetDocument.Identifier.IdentifierType.Handle.toString()); |
|
39 |
params.put("identifierFallbackURL", Boolean.toString(true)); |
|
40 |
params.put("identifierMappingARK", "ark, ARK"); |
|
41 |
params.put("identifierMappingDOI", "doi, DOI"); |
|
42 |
params.put("identifierMappingHandle", "Handle, HANDLE"); |
|
43 |
params.put("identifierMappingPURL", "purl, PURL"); |
|
44 |
params.put("identifierMappingURN", "urn, URN"); |
|
45 |
params.put("identifierMappingURL", "url, URL"); |
|
46 |
|
|
47 |
params.put("repositoryAccessType", "httpapi-kaggle"); |
|
48 |
|
|
49 |
params.put("httpapi-kaggle_queueSize", "100"); |
|
50 |
params.put("httpapi-kaggle_APICharset", StandardCharsets.UTF_8.name()); |
|
51 |
params.put("httpapi-kaggle_queryUrl", "https://www.kaggle.com/datasets_v2.json?sortBy=updated&group=public&page={PAGE}&pageSize=20&size=sizeAll&filetype=fileTypeAll&license=licenseAll"); |
|
52 |
params.put("httpapi-kaggle_queryPagePlaceholder", "{PAGE}"); |
|
53 |
params.put("httpapi-kaggle_responsePropertyTotalDataset", "totalDatasetListItems"); |
|
54 |
params.put("httpapi-kaggle_responsePropertyDatasetList", "datasetListItems"); |
|
55 |
params.put("httpapi-kaggle_responsePropertyDatasetUrl", "datasetUrl"); |
|
56 |
params.put("httpapi-kaggle_responseBaseDatasetUrl", "https://www.kaggle.com"); |
|
57 |
params.put("httpapi-kaggle_producerBlockPollingTimeout", "2"); |
|
58 |
params.put("httpapi-kaggle_producerBlockPollingTimeoutUnit", TimeUnit.MINUTES.toString()); |
|
59 |
|
|
60 |
InterfaceDescriptor descriptor = new InterfaceDescriptor(); |
|
61 |
descriptor.setId("schema.org - kaggle"); |
|
62 |
descriptor.setBaseUrl("https://www.kaggle.com"); |
|
63 |
|
|
64 |
descriptor.setParams(params); |
|
65 |
|
|
66 |
SchemaOrgPlugin schemaOrgPlugin = new SchemaOrgPlugin(); |
|
67 |
|
|
68 |
Iterable<String> iterable = schemaOrgPlugin.collect(descriptor, null, null); |
|
69 |
|
|
70 |
String outDir = params.get("repositoryAccessType"); |
|
71 |
|
|
72 |
log.info("saving content in " + outDir); |
|
73 |
|
|
74 |
File directory = new File(outDir); |
|
75 |
if (directory.exists()) { |
|
76 |
log.info(directory.getAbsolutePath() + " exists, cleaning up"); |
|
77 |
FileUtils.deleteDirectory(directory); |
|
78 |
} |
|
79 |
FileUtils.forceMkdir(directory); |
|
80 |
Utils.writeFiles(iterable, outDir); |
|
81 |
|
|
82 |
} |
|
83 |
|
|
84 |
} |
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.0/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/SchemaOrgPlugin.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.schemaorg; |
|
2 |
|
|
3 |
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin; |
|
4 |
import eu.dnetlib.data.collector.plugins.schemaorg.httpapi.kaggle.KaggleRepositoryIterable; |
|
5 |
import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapFileIterator; |
|
6 |
import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapIndexIterator; |
|
7 |
import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapIndexRepositoryIterable; |
|
8 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
9 |
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; |
|
10 |
import org.apache.commons.logging.Log; |
|
11 |
import org.apache.commons.logging.LogFactory; |
|
12 |
|
|
13 |
import java.net.MalformedURLException; |
|
14 |
import java.net.URL; |
|
15 |
import java.nio.charset.StandardCharsets; |
|
16 |
import java.util.concurrent.TimeUnit; |
|
17 |
|
|
18 |
public class SchemaOrgPlugin extends AbstractCollectorPlugin { |
|
19 |
|
|
20 |
private static final Log log = LogFactory.getLog(SchemaOrgPlugin.class); |
|
21 |
|
|
22 |
public String hello(){ |
|
23 |
return "hello"; |
|
24 |
} |
|
25 |
|
|
26 |
@Override |
|
27 |
public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate) throws CollectorServiceException { |
|
28 |
try { |
|
29 |
RepositoryIterable repository = null; |
|
30 |
String repositoryAccessType = Utils.getAsString(interfaceDescriptor.getParams(), "repositoryAccessType", null); |
|
31 |
switch(repositoryAccessType) { |
|
32 |
case "sitemapindex": { |
|
33 |
SitemapIndexRepositoryIterable.Options repositoryOptions = this.compileSitemapIndexRepositoryOptions(interfaceDescriptor); |
|
34 |
SitemapIndexRepositoryIterable repositoryIterable = new SitemapIndexRepositoryIterable(repositoryOptions); |
|
35 |
repositoryIterable.bootstrap(); |
|
36 |
repository = repositoryIterable; |
|
37 |
break; |
|
38 |
} |
|
39 |
case "httpapi-kaggle": { |
|
40 |
KaggleRepositoryIterable.Options repositoryOptions = this.compileKaggleRepositoryOptions(interfaceDescriptor); |
|
41 |
KaggleRepositoryIterable repositoryIterable = new KaggleRepositoryIterable(repositoryOptions); |
|
42 |
repositoryIterable.bootstrap(); |
|
43 |
repository = repositoryIterable; |
|
44 |
break; |
|
45 |
} |
|
46 |
default: |
|
47 |
throw new CollectorServiceException(String.format("unrecognized repository access type ", repositoryAccessType)); |
|
48 |
} |
|
49 |
SchemaOrgIterable.Options schemaOrgOptions = this.compileSchemaOrgOptions(interfaceDescriptor); |
|
50 |
SchemaOrgIterable iterable = new SchemaOrgIterable(schemaOrgOptions, repository); |
|
51 |
return iterable; |
|
52 |
} catch (Exception e) { |
|
53 |
throw new CollectorServiceException("Could not create iterator", e); |
|
54 |
} |
|
55 |
} |
|
56 |
|
|
57 |
private KaggleRepositoryIterable.Options compileKaggleRepositoryOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException { |
|
58 |
KaggleRepositoryIterable.Options kaggleRepositoryOptions = new KaggleRepositoryIterable.Options(); |
|
59 |
kaggleRepositoryOptions.setQueueSize(Utils.getAsInt(interfaceDescriptor.getParams(), "httpapi-kaggle_queueSize", 100)); |
|
60 |
kaggleRepositoryOptions.setPutTimeout(Utils.getAsLong(interfaceDescriptor.getParams(), "httpapi-kaggle_producerBlockPollingTimeout", 20)); |
|
61 |
kaggleRepositoryOptions.setPutTimeoutUnit(Utils.getAsEnum(interfaceDescriptor.getParams(), "httpapi-kaggle_producerBlockPollingTimeoutUnit", TimeUnit.MINUTES, TimeUnit.class)); |
|
62 |
kaggleRepositoryOptions.setCharset(Utils.getAsCharset(interfaceDescriptor.getParams(), "httpapi-kaggle_APICharset", StandardCharsets.UTF_8)); |
|
63 |
kaggleRepositoryOptions.setQueryUrl(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_queryUrl", null)); |
|
64 |
kaggleRepositoryOptions.setQueryPagePlaceholder(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_queryPagePlaceholder", "{PAGE}")); |
|
65 |
kaggleRepositoryOptions.setResponsePropertyTotalDataset(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_responsePropertyTotalDataset", "totalDatasetListItems")); |
|
66 |
kaggleRepositoryOptions.setResponsePropertyDatasetList(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_responsePropertyDatasetList", "datasetListItems")); |
|
67 |
kaggleRepositoryOptions.setResponsePropertyDatasetUrl(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_responsePropertyDatasetUrl", "datasetUrl")); |
|
68 |
kaggleRepositoryOptions.setResponseBaseDatasetUrl(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_responseBaseDatasetUrl", interfaceDescriptor.getBaseUrl())); |
|
69 |
kaggleRepositoryOptions.setRepositoryQueueIteratorOptions(this.compileRepositoryQueueOptions(interfaceDescriptor)); |
|
70 |
return kaggleRepositoryOptions; |
|
71 |
|
|
72 |
} |
|
73 |
|
|
74 |
private SitemapIndexIterator.Options compileSitemapIndexOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException { |
|
75 |
SitemapIndexIterator.Options sitemapIndexIteratorOptions = new SitemapIndexIterator.Options(); |
|
76 |
sitemapIndexIteratorOptions.setCharset(Utils.getAsCharset(interfaceDescriptor.getParams(), "sitemap_IndexCharset", StandardCharsets.UTF_8)); |
|
77 |
sitemapIndexIteratorOptions.setIndexUrl(new URL(interfaceDescriptor.getBaseUrl())); |
|
78 |
return sitemapIndexIteratorOptions; |
|
79 |
|
|
80 |
} |
|
81 |
|
|
82 |
private SitemapFileIterator.Options compileSitemapFileOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException { |
|
83 |
SitemapFileIterator.Options sitemapFileIteratorOptions = new SitemapFileIterator.Options(); |
|
84 |
sitemapFileIteratorOptions.setCharset(Utils.getAsCharset(interfaceDescriptor.getParams(), "sitemap_FileCharset", StandardCharsets.UTF_8)); |
|
85 |
sitemapFileIteratorOptions.setSchemaType(Utils.getAsEnum(interfaceDescriptor.getParams(), "sitemap_FileSchema", SitemapFileIterator.Options.SitemapSchemaType.Xml, SitemapFileIterator.Options.SitemapSchemaType.class)); |
|
86 |
sitemapFileIteratorOptions.setFileType(Utils.getAsEnum(interfaceDescriptor.getParams(), "sitemap_FileType", SitemapFileIterator.Options.SitemapFileType.Text, SitemapFileIterator.Options.SitemapFileType.class)); |
|
87 |
return sitemapFileIteratorOptions; |
|
88 |
} |
|
89 |
|
|
90 |
private RepositoryQueueIterator.Options compileRepositoryQueueOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException { |
|
91 |
RepositoryQueueIterator.Options repositoryQueueIteratorOptions = new RepositoryQueueIterator.Options(); |
|
92 |
repositoryQueueIteratorOptions.setBlockPolling(Utils.getAsBoolean(interfaceDescriptor.getParams(), "consumerBlockPolling", true)); |
|
93 |
repositoryQueueIteratorOptions.setPollTimeout(Utils.getAsLong(interfaceDescriptor.getParams(), "consumerBlockPollingTimeout", 2)); |
|
94 |
repositoryQueueIteratorOptions.setPollTimeoutUnit(Utils.getAsEnum(interfaceDescriptor.getParams(), "consumerBlockPollingTimeoutUnit", TimeUnit.MINUTES, TimeUnit.class)); |
|
95 |
return repositoryQueueIteratorOptions; |
|
96 |
} |
|
97 |
|
|
98 |
private SitemapIndexRepositoryIterable.Options compileSitemapIndexRepositoryOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException { |
|
99 |
SitemapIndexRepositoryIterable.Options sitemapIndexRepositoryIterableOptions = new SitemapIndexRepositoryIterable.Options(); |
|
100 |
sitemapIndexRepositoryIterableOptions.setQueueSize(Utils.getAsInt(interfaceDescriptor.getParams(), "sitemap_queueSize", 100)); |
|
101 |
sitemapIndexRepositoryIterableOptions.setPutTimeout(Utils.getAsLong(interfaceDescriptor.getParams(), "sitemap_producerBlockPollingTimeout", 20)); |
|
102 |
sitemapIndexRepositoryIterableOptions.setPutTimeoutUnit(Utils.getAsEnum(interfaceDescriptor.getParams(), "sitemap_producerBlockPollingTimeoutUnit", TimeUnit.MINUTES, TimeUnit.class)); |
|
103 |
sitemapIndexRepositoryIterableOptions.setRepositoryQueueIteratorOptions(this.compileRepositoryQueueOptions(interfaceDescriptor)); |
|
104 |
sitemapIndexRepositoryIterableOptions.setSitemapFileIteratorOptions(this.compileSitemapFileOptions(interfaceDescriptor)); |
|
105 |
sitemapIndexRepositoryIterableOptions.setSitemapIndexIteratorOptions(this.compileSitemapIndexOptions(interfaceDescriptor)); |
|
106 |
return sitemapIndexRepositoryIterableOptions; |
|
107 |
} |
|
108 |
|
|
109 |
private EndpointAccessIterator.Options compileEndpointAccessOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException { |
|
110 |
EndpointAccessIterator.Options endpointAccessIteratorOptions = new EndpointAccessIterator.Options(); |
|
111 |
endpointAccessIteratorOptions.setCharset(Utils.getAsCharset(interfaceDescriptor.getParams(), "endpointCharset", StandardCharsets.UTF_8)); |
|
112 |
return endpointAccessIteratorOptions; |
|
113 |
} |
|
114 |
|
|
115 |
private DatasetMappingIterator.Options compileDatasetMappingOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException { |
|
116 |
DatasetMappingIterator.Options datasetMappingIteratorOptions = new DatasetMappingIterator.Options(); |
|
117 |
|
|
118 |
DatasetMappingIterator.Options.UpdatedDateOptions datasetMappingIteratorUpdatedDateOptions = new DatasetMappingIterator.Options.UpdatedDateOptions(); |
|
119 |
datasetMappingIteratorUpdatedDateOptions.format =Utils.getAsString(interfaceDescriptor.getParams(), "updatedDateFormat", "YYYY-MM-DD"); |
|
120 |
datasetMappingIteratorOptions.setUpdatedDateOptions(datasetMappingIteratorUpdatedDateOptions); |
|
121 |
|
|
122 |
DatasetMappingIterator.Options.CreatedDateOptions datasetMappingIteratorCreatedDateOptions = new DatasetMappingIterator.Options.CreatedDateOptions(); |
|
123 |
datasetMappingIteratorCreatedDateOptions.format =Utils.getAsString(interfaceDescriptor.getParams(), "createdDateFormat", "YYYY-MM-DD"); |
|
124 |
datasetMappingIteratorOptions.setCreatedDateOptions(datasetMappingIteratorCreatedDateOptions); |
|
125 |
|
|
126 |
DatasetMappingIterator.Options.PublicationDateOptions datasetMappingIteratorPublicationDateOptions = new DatasetMappingIterator.Options.PublicationDateOptions(); |
|
127 |
datasetMappingIteratorPublicationDateOptions.format =Utils.getAsString(interfaceDescriptor.getParams(), "publicationDateFormat", "YYYY-MM-DD"); |
|
128 |
datasetMappingIteratorOptions.setPublicationDateOptions(datasetMappingIteratorPublicationDateOptions); |
|
129 |
|
|
130 |
DatasetMappingIterator.Options.ContributorOptions datasetMappingIteratorContributorOptions = new DatasetMappingIterator.Options.ContributorOptions(); |
|
131 |
datasetMappingIteratorContributorOptions.fallbackType =Utils.getAsEnum(interfaceDescriptor.getParams(), "contributorFallbackType",DatasetDocument.Contributor.ContributorType.Other, DatasetDocument.Contributor.ContributorType.class); |
|
132 |
datasetMappingIteratorOptions.setContributorOptions(datasetMappingIteratorContributorOptions); |
|
133 |
|
|
134 |
DatasetMappingIterator.Options.IdentifierOptions datasetMappingIteratorIdentifierOptions = new DatasetMappingIterator.Options.IdentifierOptions(); |
|
135 |
datasetMappingIteratorIdentifierOptions.fallbackType = Utils.getAsEnum(interfaceDescriptor.getParams(), "identifierFallbackType", null, DatasetDocument.Identifier.IdentifierType.class); |
|
136 |
datasetMappingIteratorIdentifierOptions.fallbackURL = Utils.getAsBoolean(interfaceDescriptor.getParams(), "identifierFallbackURL", true); |
|
137 |
datasetMappingIteratorIdentifierOptions.mappingARK = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingARK", null); |
|
138 |
datasetMappingIteratorIdentifierOptions.mappingDOI = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingDOI", null); |
|
139 |
datasetMappingIteratorIdentifierOptions.mappingHandle = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingHandle", null); |
|
140 |
datasetMappingIteratorIdentifierOptions.mappingPURL = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingPURL", null); |
|
141 |
datasetMappingIteratorIdentifierOptions.mappingURL = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingURL", null); |
|
142 |
datasetMappingIteratorIdentifierOptions.mappingURN = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingURN", null); |
|
143 |
datasetMappingIteratorOptions.setIdentifierOptions(datasetMappingIteratorIdentifierOptions); |
|
144 |
return datasetMappingIteratorOptions; |
|
145 |
} |
|
146 |
|
|
147 |
private SchemaOrgIterable.Options compileSchemaOrgOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException { |
|
148 |
SchemaOrgIterable.Options schemaOrgIterableOptions = new SchemaOrgIterable.Options(); |
|
149 |
schemaOrgIterableOptions.setDatasetMappingOptions(this.compileDatasetMappingOptions(interfaceDescriptor)); |
|
150 |
schemaOrgIterableOptions.setEndpointAccessOptions(this.compileEndpointAccessOptions(interfaceDescriptor)); |
|
151 |
return schemaOrgIterableOptions; |
|
152 |
} |
|
153 |
} |
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.0/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/DatasetDocument.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.schemaorg; |
|
2 |
|
|
3 |
import org.w3c.dom.Attr; |
|
4 |
import org.w3c.dom.Document; |
|
5 |
import org.w3c.dom.Element; |
|
6 |
|
|
7 |
import javax.xml.parsers.DocumentBuilder; |
|
8 |
import javax.xml.parsers.DocumentBuilderFactory; |
|
9 |
import javax.xml.parsers.ParserConfigurationException; |
|
10 |
import javax.xml.transform.Transformer; |
|
11 |
import javax.xml.transform.TransformerFactory; |
|
12 |
import javax.xml.transform.dom.DOMSource; |
|
13 |
import javax.xml.transform.stream.StreamResult; |
|
14 |
import java.io.StringWriter; |
|
15 |
import java.time.LocalDate; |
|
16 |
import java.time.format.DateTimeFormatter; |
|
17 |
import java.util.Calendar; |
|
18 |
import java.util.Date; |
|
19 |
import java.util.List; |
|
20 |
|
|
21 |
public class DatasetDocument { |
|
22 |
private List<Identifier> identifiers; |
|
23 |
private List<Creator> creators; |
|
24 |
private List<String> titles; |
|
25 |
private List<String> alternativeTitles; |
|
26 |
private List<String> publishers; |
|
27 |
private List<LocalDate> publicationDates; |
|
28 |
private List<String> subjects; |
|
29 |
private List<Contributor> contributors; |
|
30 |
private List<LocalDate> createdDates; |
|
31 |
private List<LocalDate> updatedDates; |
|
32 |
private List<String> languages; |
|
33 |
private List<ResourceType> resourceTypes; |
|
34 |
private List<AlternateIdentifier> alternateIdentifier; |
|
35 |
private List<Citation> citations; |
|
36 |
private List<String> sizes; |
|
37 |
private List<String> format; |
|
38 |
private List<String> version; |
|
39 |
private List<License> licenses; |
|
40 |
private List<String> descriptions; |
|
41 |
private List<String> disambiguatingDescriptions; |
|
42 |
private List<SpatialCoverage> geoLocations; |
|
43 |
|
|
44 |
public List<Identifier> getIdentifiers() { |
|
45 |
return identifiers; |
|
46 |
} |
|
47 |
|
|
48 |
public void setIdentifiers(List<Identifier> identifiers) { |
|
49 |
this.identifiers = identifiers; |
|
50 |
} |
|
51 |
|
|
52 |
public List<Creator> getCreators() { |
|
53 |
return creators; |
|
54 |
} |
|
55 |
|
|
56 |
public void setCreators(List<Creator> creators) { |
|
57 |
this.creators = creators; |
|
58 |
} |
|
59 |
|
|
60 |
public List<String> getTitles() { |
|
61 |
return titles; |
|
62 |
} |
|
63 |
|
|
64 |
public void setTitles(List<String> titles) { |
|
65 |
this.titles = titles; |
|
66 |
} |
|
67 |
|
|
68 |
public List<String> getAlternativeTitles() { |
|
69 |
return alternativeTitles; |
|
70 |
} |
|
71 |
|
|
72 |
public void setAlternativeTitles(List<String> alternativeTitles) { |
|
73 |
this.alternativeTitles = alternativeTitles; |
|
74 |
} |
|
75 |
|
|
76 |
public List<String> getPublishers() { |
|
77 |
return publishers; |
|
78 |
} |
|
79 |
|
|
80 |
public void setPublishers(List<String> publishers) { |
|
81 |
this.publishers = publishers; |
|
82 |
} |
|
83 |
|
|
84 |
public List<LocalDate> getPublicationDates() { |
|
85 |
return publicationDates; |
|
86 |
} |
|
87 |
|
|
88 |
public void setPublicationDates(List<LocalDate> publicationDates) { |
|
89 |
this.publicationDates = publicationDates; |
|
90 |
} |
|
91 |
|
|
92 |
public List<String> getSubjects() { |
|
93 |
return subjects; |
|
94 |
} |
|
95 |
|
|
96 |
public void setSubjects(List<String> subjects) { |
|
97 |
this.subjects = subjects; |
|
98 |
} |
|
99 |
|
|
100 |
public List<Contributor> getContributors() { |
|
101 |
return contributors; |
|
102 |
} |
|
103 |
|
|
104 |
public void setContributors(List<Contributor> contributors) { |
|
105 |
this.contributors = contributors; |
|
106 |
} |
|
107 |
|
|
108 |
public List<LocalDate> getCreatedDates() { |
|
109 |
return createdDates; |
|
110 |
} |
|
111 |
|
|
112 |
public void setCreatedDates(List<LocalDate> createdDates) { |
|
113 |
this.createdDates = createdDates; |
|
114 |
} |
|
115 |
|
|
116 |
public List<LocalDate> getUpdatedDates() { |
|
117 |
return updatedDates; |
|
118 |
} |
|
119 |
|
|
120 |
public void setUpdatedDates(List<LocalDate> updatedDates) { |
|
121 |
this.updatedDates = updatedDates; |
|
122 |
} |
|
123 |
|
|
124 |
public List<String> getLanguages() { |
|
125 |
return languages; |
|
126 |
} |
|
127 |
|
|
128 |
public void setLanguages(List<String> languages) { |
|
129 |
this.languages = languages; |
|
130 |
} |
|
131 |
|
|
132 |
public List<ResourceType> getResourceTypes() { |
|
133 |
return resourceTypes; |
|
134 |
} |
|
135 |
|
|
136 |
public void setResourceTypes(List<ResourceType> resourceTypes) { |
|
137 |
this.resourceTypes = resourceTypes; |
|
138 |
} |
|
139 |
|
|
140 |
public List<AlternateIdentifier> getAlternateIdentifier() { |
|
141 |
return alternateIdentifier; |
|
142 |
} |
|
143 |
|
|
144 |
public void setAlternateIdentifier(List<AlternateIdentifier> alternateIdentifier) { |
|
145 |
this.alternateIdentifier = alternateIdentifier; |
|
146 |
} |
|
147 |
|
|
148 |
public List<Citation> getCitations() { |
|
149 |
return citations; |
|
150 |
} |
|
151 |
|
|
152 |
public void setCitations(List<Citation> citations) { |
|
153 |
this.citations = citations; |
|
154 |
} |
|
155 |
|
|
156 |
public List<String> getSizes() { |
|
157 |
return sizes; |
|
158 |
} |
|
159 |
|
|
160 |
public void setSizes(List<String> sizes) { |
|
161 |
this.sizes = sizes; |
|
162 |
} |
|
163 |
|
|
164 |
public List<String> getFormat() { |
|
165 |
return format; |
|
166 |
} |
|
167 |
|
|
168 |
public void setFormat(List<String> format) { |
|
169 |
this.format = format; |
|
170 |
} |
|
171 |
|
|
172 |
public List<String> getVersion() { |
|
173 |
return version; |
|
174 |
} |
|
175 |
|
|
176 |
public void setVersion(List<String> version) { |
|
177 |
this.version = version; |
|
178 |
} |
|
179 |
|
|
180 |
public List<License> getLicenses() { |
|
181 |
return licenses; |
|
182 |
} |
|
183 |
|
|
184 |
public void setLicenses(List<License> licenses) { |
|
185 |
this.licenses = licenses; |
|
186 |
} |
|
187 |
|
|
188 |
public List<String> getDescriptions() { |
|
189 |
return descriptions; |
|
190 |
} |
|
191 |
|
|
192 |
public void setDescriptions(List<String> descriptions) { |
|
193 |
this.descriptions = descriptions; |
|
194 |
} |
|
195 |
|
|
196 |
public List<String> getDisambiguatingDescriptions() { |
|
197 |
return disambiguatingDescriptions; |
|
198 |
} |
|
199 |
|
|
200 |
public void setDisambiguatingDescriptions(List<String> disambiguatingDescriptions) { |
|
201 |
this.disambiguatingDescriptions = disambiguatingDescriptions; |
|
202 |
} |
|
203 |
|
|
204 |
public List<SpatialCoverage> getGeoLocations() { |
|
205 |
return geoLocations; |
|
206 |
} |
|
207 |
|
|
208 |
public void setGeoLocations(List<SpatialCoverage> geoLocations) { |
|
209 |
this.geoLocations = geoLocations; |
|
210 |
} |
|
211 |
|
|
212 |
private static String emptyXml; |
|
213 |
private static Object lockEmptyXml = new Object(); |
|
214 |
public static String emptyXml() { |
|
215 |
if(DatasetDocument.emptyXml!=null) return DatasetDocument.emptyXml; |
|
216 |
|
|
217 |
String xml = null; |
|
218 |
try { |
|
219 |
DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance(); |
|
220 |
DocumentBuilder docBuilder = docFactory.newDocumentBuilder(); |
|
221 |
Document doc = docBuilder.newDocument(); |
|
222 |
|
|
223 |
Element root = doc.createElement("dataset"); |
|
224 |
doc.appendChild(root); |
|
225 |
|
|
226 |
TransformerFactory tf = TransformerFactory.newInstance(); |
|
227 |
Transformer transformer = tf.newTransformer(); |
|
228 |
StringWriter writer = new StringWriter(); |
|
229 |
transformer.transform(new DOMSource(doc), new StreamResult(writer)); |
|
230 |
xml = writer.getBuffer().toString(); |
|
231 |
}catch(Exception ex){ |
|
232 |
xml = "<dataset/>"; |
|
233 |
} |
|
234 |
|
|
235 |
synchronized (DatasetDocument.lockEmptyXml) { |
|
236 |
if (DatasetDocument.emptyXml == null) DatasetDocument.emptyXml = xml; |
|
237 |
} |
|
238 |
|
|
239 |
return DatasetDocument.emptyXml; |
|
240 |
} |
|
241 |
|
|
242 |
public String toXml() throws Exception { |
|
243 |
DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance(); |
|
244 |
DocumentBuilder docBuilder = docFactory.newDocumentBuilder(); |
|
245 |
Document doc = docBuilder.newDocument(); |
|
246 |
|
|
247 |
Element root = doc.createElement("dataset"); |
|
248 |
doc.appendChild(root); |
|
249 |
|
|
250 |
if(this.identifiers!=null){ |
|
251 |
for(Identifier item : this.identifiers){ |
|
252 |
item.toXml(root); |
|
253 |
} |
|
254 |
} |
|
255 |
if(this.creators!=null){ |
|
256 |
Element creators = doc.createElement("creators"); |
|
257 |
root.appendChild(creators); |
|
258 |
for(Creator item : this.creators){ |
|
259 |
item.toXml(creators); |
|
260 |
} |
|
261 |
} |
|
262 |
if(this.titles!=null || this.alternativeTitles!=null){ |
|
263 |
Element titles = doc.createElement("titles"); |
|
264 |
root.appendChild(titles); |
|
265 |
if(this.titles!=null) { |
|
266 |
for (String item : this.titles) { |
|
267 |
Element title = doc.createElement("title"); |
|
268 |
titles.appendChild(title); |
|
269 |
title.appendChild(doc.createTextNode(item)); |
|
270 |
} |
|
271 |
} |
|
272 |
if(this.alternativeTitles!=null) { |
|
273 |
for (String item : this.alternativeTitles) { |
|
274 |
Element title = doc.createElement("title"); |
|
275 |
titles.appendChild(title); |
|
276 |
title.setAttribute("titleType", "AlternativeTitle"); |
|
277 |
title.appendChild(doc.createTextNode(item)); |
|
278 |
} |
|
279 |
} |
|
280 |
} |
|
281 |
if(this.publishers!=null){ |
|
282 |
for(String item : this.publishers){ |
|
283 |
Element publisher = doc.createElement("publisher"); |
|
284 |
root.appendChild(publisher); |
|
285 |
publisher.appendChild(doc.createTextNode(item)); |
|
286 |
} |
|
287 |
} |
|
288 |
if(this.publicationDates!=null){ |
|
289 |
for(LocalDate item : this.publicationDates){ |
|
290 |
Element publicationYear = doc.createElement("publicationYear"); |
|
291 |
root.appendChild(publicationYear); |
|
292 |
publicationYear.appendChild(doc.createTextNode(Integer.toString(item.getYear()))); |
|
293 |
} |
|
294 |
} |
|
295 |
if(this.subjects!=null){ |
|
296 |
Element subjects = doc.createElement("subjects"); |
|
297 |
root.appendChild(subjects); |
|
298 |
for(String item : this.subjects){ |
|
299 |
Element subject = doc.createElement("subject"); |
|
300 |
subjects.appendChild(subject); |
|
301 |
subject.appendChild(doc.createTextNode(item)); |
|
302 |
} |
|
303 |
} |
|
304 |
if(this.contributors!=null){ |
|
305 |
for(Contributor item : this.contributors){ |
|
306 |
item.toXml(root); |
|
307 |
} |
|
308 |
} |
|
309 |
if(this.createdDates!=null || this.updatedDates!=null){ |
|
310 |
Element dates = doc.createElement("dates"); |
|
311 |
root.appendChild(dates); |
|
312 |
|
|
313 |
DateTimeFormatter formatter = DateTimeFormatter.ofPattern("YYYY-MM-DD"); |
|
314 |
|
|
315 |
if(createdDates!=null) { |
|
316 |
for (LocalDate item : this.createdDates) { |
|
317 |
Element date = doc.createElement("date"); |
|
318 |
root.appendChild(date); |
|
319 |
date.setAttribute("dateType", "Created"); |
|
320 |
date.appendChild(doc.createTextNode(item.format(formatter))); |
|
321 |
} |
|
322 |
} |
|
323 |
if(updatedDates!=null) { |
|
324 |
for (LocalDate item : this.updatedDates) { |
|
325 |
Element date = doc.createElement("date"); |
|
326 |
root.appendChild(date); |
|
327 |
date.setAttribute("dateType", "Updated"); |
|
328 |
date.appendChild(doc.createTextNode(item.format(formatter))); |
|
329 |
} |
|
330 |
} |
|
331 |
} |
|
332 |
if(this.languages!=null){ |
|
333 |
for(String item : this.languages){ |
|
334 |
Element language = doc.createElement("language"); |
|
335 |
root.appendChild(language); |
|
336 |
language.appendChild(doc.createTextNode(item)); |
|
337 |
} |
|
338 |
} |
|
339 |
if(this.resourceTypes!=null){ |
|
340 |
for(ResourceType item : this.resourceTypes){ |
|
341 |
item.toXml(root); |
|
342 |
} |
|
343 |
} |
|
344 |
if(this.alternateIdentifier!=null){ |
|
345 |
Element alternateIdentifiers = doc.createElement("alternateIdentifiers"); |
|
346 |
root.appendChild(alternateIdentifiers); |
|
347 |
for(AlternateIdentifier item : this.alternateIdentifier){ |
|
348 |
item.toXml(alternateIdentifiers); |
|
349 |
} |
|
350 |
} |
|
351 |
if(this.citations!=null){ |
|
352 |
for(Citation item : this.citations){ |
|
353 |
item.toXml(root); |
|
354 |
} |
|
355 |
} |
|
356 |
if(this.sizes!=null){ |
|
357 |
Element sizes = doc.createElement("sizes"); |
|
358 |
root.appendChild(sizes); |
|
359 |
for(String item : this.sizes){ |
|
360 |
Element size = doc.createElement("size"); |
|
361 |
sizes.appendChild(size); |
|
362 |
size.appendChild(doc.createTextNode(item)); |
|
363 |
} |
|
364 |
} |
|
365 |
if(this.format!=null){ |
|
366 |
Element formats = doc.createElement("formats"); |
|
367 |
root.appendChild(formats); |
|
368 |
for(String item : this.format){ |
|
369 |
Element format = doc.createElement("format"); |
|
370 |
formats.appendChild(format); |
|
371 |
format.appendChild(doc.createTextNode(item)); |
|
372 |
} |
|
373 |
} |
|
374 |
if(this.version!=null){ |
|
375 |
for(String item : this.version){ |
|
376 |
Element version = doc.createElement("version"); |
|
377 |
root.appendChild(version); |
|
378 |
version.appendChild(doc.createTextNode(item)); |
|
379 |
} |
|
380 |
} |
|
381 |
if(this.licenses!=null){ |
|
382 |
Element rightsList = doc.createElement("rightsList"); |
|
383 |
root.appendChild(rightsList); |
|
384 |
for(License item : this.licenses){ |
|
385 |
item.toXml(rightsList); |
|
386 |
} |
|
387 |
} |
|
388 |
if(this.descriptions!=null || this.disambiguatingDescriptions!=null){ |
|
389 |
Element descriptions = doc.createElement("descriptions"); |
|
390 |
root.appendChild(descriptions); |
|
391 |
if(this.descriptions!=null) { |
|
392 |
for (String item : this.descriptions) { |
|
393 |
Element description = doc.createElement("description"); |
|
394 |
descriptions.appendChild(description); |
|
395 |
description.setAttribute("descriptionType", "Abstract"); |
|
396 |
description.appendChild(doc.createTextNode(item)); |
|
397 |
} |
|
398 |
} |
|
399 |
if(this.disambiguatingDescriptions!=null) { |
|
400 |
for (String item : this.disambiguatingDescriptions) { |
|
401 |
Element description = doc.createElement("description"); |
|
402 |
descriptions.appendChild(description); |
|
403 |
description.setAttribute("descriptionType", "Other"); |
|
404 |
description.appendChild(doc.createTextNode(item)); |
|
405 |
} |
|
406 |
} |
|
407 |
} |
|
408 |
if(this.geoLocations!=null){ |
|
409 |
Element geoLocations = doc.createElement("geoLocations"); |
|
410 |
root.appendChild(geoLocations); |
|
411 |
for(SpatialCoverage item : this.geoLocations){ |
|
412 |
item.toXml(geoLocations); |
|
413 |
} |
|
414 |
} |
|
415 |
|
|
416 |
TransformerFactory tf = TransformerFactory.newInstance(); |
|
417 |
Transformer transformer = tf.newTransformer(); |
|
418 |
StringWriter writer = new StringWriter(); |
|
419 |
transformer.transform(new DOMSource(doc), new StreamResult(writer)); |
|
420 |
String xml = writer.getBuffer().toString(); |
|
421 |
return xml; |
|
422 |
} |
|
423 |
|
|
424 |
public static class SpatialCoverage{ |
|
425 |
public static class Point{ |
|
426 |
public String latitude; |
|
427 |
public String longitude; |
|
428 |
|
|
429 |
public Point() {} |
|
430 |
|
|
431 |
public Point(String latitude, String longitude){ |
|
432 |
this.latitude = latitude; |
|
433 |
this.longitude = longitude; |
|
434 |
} |
|
435 |
} |
|
436 |
public String name; |
|
437 |
public List<Point> points; |
|
438 |
public List<String> boxes; |
|
439 |
|
|
440 |
public SpatialCoverage() {} |
|
441 |
|
|
442 |
public SpatialCoverage(String name, List<Point> points, List<String> boxes ) { |
|
443 |
this.name = name; |
|
444 |
this.points = points; |
|
445 |
this.boxes = boxes; |
|
446 |
} |
|
447 |
|
|
448 |
public void toXml(Element parent){ |
|
449 |
Element node = parent.getOwnerDocument().createElement("geoLocation"); |
|
450 |
parent.appendChild(node); |
|
451 |
|
|
452 |
if(this.points!=null) { |
|
453 |
for(Point point : this.points) { |
|
454 |
if(point.latitude == null || point.longitude == null) continue; |
|
455 |
Element geoLocationPoint = parent.getOwnerDocument().createElement("geoLocationPoint"); |
|
456 |
geoLocationPoint.appendChild(parent.getOwnerDocument().createTextNode(String.format("%s %s", point.latitude, point.longitude))); |
|
457 |
node.appendChild(geoLocationPoint); |
|
458 |
} |
|
459 |
} |
|
460 |
if(this.boxes!=null) { |
|
461 |
for(String box : this.boxes) { |
|
462 |
if(box == null) continue; |
|
463 |
Element geoLocationBox = parent.getOwnerDocument().createElement("geoLocationBox"); |
|
464 |
geoLocationBox.appendChild(parent.getOwnerDocument().createTextNode(box)); |
|
465 |
node.appendChild(geoLocationBox); |
|
466 |
} |
|
467 |
} |
|
468 |
if(this.name!=null) { |
|
469 |
Element geoLocationPlace = parent.getOwnerDocument().createElement("geoLocationPlace"); |
|
470 |
geoLocationPlace.appendChild(parent.getOwnerDocument().createTextNode(this.name)); |
|
471 |
node.appendChild(geoLocationPlace); |
|
472 |
} |
|
473 |
} |
|
474 |
} |
|
475 |
|
|
476 |
public static class License{ |
|
477 |
public String name; |
|
478 |
public String url; |
|
479 |
|
|
480 |
public License() {} |
|
481 |
|
|
482 |
public License(String name, String url) { |
|
483 |
this.name = name; |
|
484 |
this.url = url; |
|
485 |
} |
|
486 |
|
|
487 |
public void toXml(Element parent){ |
|
488 |
Element node = parent.getOwnerDocument().createElement("rights"); |
|
489 |
parent.appendChild(node); |
|
490 |
|
|
491 |
if(this.url!=null) { |
|
492 |
node.setAttribute("rightsURI", this.url); |
|
493 |
} |
|
494 |
if(this.name!=null) { |
|
495 |
node.appendChild(parent.getOwnerDocument().createTextNode(this.name)); |
|
496 |
} |
|
497 |
} |
|
498 |
} |
|
499 |
|
|
500 |
public static class Citation{ |
|
501 |
public enum CitationIdentifierType{ |
|
502 |
ARK, arXiv, bibcode, DOI, EAN13, EISSN, Handle, ISBN, ISSN, ISTC, LISSN, LSID, PMID, |
|
503 |
PURL, UPC, URL, URN |
|
504 |
} |
|
505 |
|
|
506 |
public CitationIdentifierType type; |
|
507 |
public String value; |
|
508 |
|
|
509 |
public Citation() {} |
|
510 |
|
|
511 |
public Citation(String value, CitationIdentifierType type) { |
|
512 |
this.value = value; |
|
513 |
this.type = type; |
|
514 |
} |
|
515 |
|
|
516 |
public void toXml(Element parent){ |
|
517 |
Element node = parent.getOwnerDocument().createElement("relatedIdentifier"); |
|
518 |
parent.appendChild(node); |
|
519 |
|
|
520 |
node.setAttribute("relatedIdentifierType", this.type.toString()); |
|
521 |
node.setAttribute("relationType", "Cites"); |
|
522 |
node.appendChild(parent.getOwnerDocument().createTextNode(this.value)); |
|
523 |
} |
|
524 |
} |
|
525 |
|
|
526 |
public static class Contributor{ |
|
527 |
public enum ContributorType{ |
|
528 |
ContactPerson, DataCollector, DataCurator, DataManager, Distributor, Editor, Funder, HostingInstitution, |
|
529 |
Producer, ProjectLeader, ProjectManager, ProjectMember, RegistrationAgency, RegistrationAuthority, |
|
530 |
RelatedPerson, Researcher, ResearchGroup, RightsHolder, Sponsor, Supervisor, WorkPackageLeader, Other |
|
531 |
} |
|
532 |
|
|
533 |
public String name; |
|
534 |
public List<String> affiliations; |
|
535 |
public ContributorType type; |
|
536 |
|
|
537 |
public Contributor() { |
|
538 |
} |
|
539 |
|
|
540 |
public Contributor(String name) { |
|
541 |
this.name = name; |
|
542 |
} |
|
543 |
|
|
544 |
public Contributor(String name, List<String> affiliations) { |
|
545 |
this.name = name; |
|
546 |
this.affiliations = affiliations; |
|
547 |
} |
|
548 |
|
|
549 |
public Contributor(String name, List<String> affiliations, ContributorType type) { |
|
550 |
this.name = name; |
|
551 |
this.affiliations = affiliations; |
|
552 |
this.type = type; |
|
553 |
} |
|
554 |
|
|
555 |
public void toXml(Element parent){ |
|
556 |
Element node = parent.getOwnerDocument().createElement("contributor"); |
|
557 |
parent.appendChild(node); |
|
558 |
|
|
559 |
node.setAttribute("contributorType", this.type.toString()); |
|
560 |
|
|
561 |
if(this.name!=null) { |
|
562 |
Element contributorName = parent.getOwnerDocument().createElement("contributorName"); |
|
563 |
node.appendChild(contributorName); |
|
564 |
contributorName.appendChild(parent.getOwnerDocument().createTextNode(this.name)); |
|
565 |
} |
|
566 |
if(this.affiliations!=null) { |
|
567 |
for(String item : this.affiliations) { |
|
568 |
Element affiliation = parent.getOwnerDocument().createElement("affiliation"); |
|
569 |
node.appendChild(affiliation); |
|
570 |
affiliation.appendChild(parent.getOwnerDocument().createTextNode(item)); |
|
571 |
} |
|
572 |
} |
|
573 |
} |
|
574 |
} |
|
575 |
|
|
576 |
public static class AlternateIdentifier{ |
|
577 |
public String identifier; |
|
578 |
public String type; |
|
579 |
|
|
580 |
public AlternateIdentifier() {} |
|
581 |
|
|
582 |
public AlternateIdentifier(String identifier, String type) { |
|
583 |
this.identifier = identifier; |
|
584 |
this.type = type; |
|
585 |
} |
|
586 |
|
|
587 |
public void toXml(Element parent){ |
|
588 |
Element node = parent.getOwnerDocument().createElement("alternateIdentifier"); |
|
589 |
parent.appendChild(node); |
|
590 |
|
|
591 |
if(this.type!=null) { |
|
592 |
node.setAttribute("alternateIdentifierType", this.type); |
|
593 |
} |
|
594 |
if(this.identifier!=null) { |
|
595 |
node.appendChild(parent.getOwnerDocument().createTextNode(this.identifier)); |
|
596 |
} |
|
597 |
} |
|
598 |
} |
|
599 |
|
|
600 |
public static class ResourceType{ |
|
601 |
public enum ResourceTypeGeneralType { |
|
602 |
Audiovisual, Collection, Dataset, Event, Image, InteractiveResource, Model, PhysicalObject, Service, |
|
603 |
Software, Sound, Text, Workflow, Other |
|
604 |
} |
|
605 |
|
|
606 |
public ResourceTypeGeneralType type; |
|
607 |
|
|
608 |
public ResourceType() {} |
|
609 |
|
|
610 |
public ResourceType(ResourceTypeGeneralType type) { |
|
611 |
this.type = type; |
|
612 |
} |
|
613 |
|
|
614 |
public void toXml(Element parent){ |
|
615 |
Element node = parent.getOwnerDocument().createElement("resourceType"); |
|
616 |
parent.appendChild(node); |
|
617 |
|
|
618 |
if(this.type!=null) { |
|
619 |
node.setAttribute("resourceTypeGeneral", this.type.toString()); |
|
620 |
} |
|
621 |
} |
|
622 |
} |
|
623 |
|
|
624 |
public static class Creator { |
|
625 |
public String name; |
|
626 |
public List<String> affiliations; |
|
627 |
|
|
628 |
public Creator() { |
|
629 |
} |
|
630 |
|
|
631 |
public Creator(String name) { |
|
632 |
this.name = name; |
|
633 |
} |
|
634 |
|
|
635 |
public Creator(String name, List<String> affiliations) { |
|
636 |
this.name = name; |
|
637 |
this.affiliations = affiliations; |
|
638 |
} |
|
639 |
|
|
640 |
public void toXml(Element parent){ |
|
641 |
Element node = parent.getOwnerDocument().createElement("creator"); |
|
642 |
parent.appendChild(node); |
|
643 |
|
|
644 |
if(this.name!=null) { |
|
645 |
Element creatorName = parent.getOwnerDocument().createElement("creatorName"); |
|
646 |
node.appendChild(creatorName); |
|
647 |
creatorName.appendChild(parent.getOwnerDocument().createTextNode(this.name)); |
|
648 |
} |
|
649 |
if(this.affiliations!=null) { |
|
650 |
for(String item : this.affiliations) { |
|
651 |
Element affiliation = parent.getOwnerDocument().createElement("affiliation"); |
|
652 |
node.appendChild(affiliation); |
|
653 |
affiliation.appendChild(parent.getOwnerDocument().createTextNode(item)); |
|
654 |
} |
|
655 |
} |
|
656 |
} |
|
657 |
} |
|
658 |
|
|
659 |
public static class Identifier { |
|
660 |
public enum IdentifierType { |
|
661 |
ARK, DOI, Handle, PURL, URN, URL |
|
662 |
} |
|
663 |
|
|
664 |
public String value; |
|
665 |
public IdentifierType type; |
|
666 |
|
|
667 |
public Identifier() { |
|
668 |
} |
|
669 |
|
|
670 |
public Identifier(IdentifierType type, String value) { |
|
671 |
this.type = type; |
|
672 |
this.value = value; |
|
673 |
} |
|
674 |
|
|
675 |
public void toXml(Element parent){ |
|
676 |
Element node = parent.getOwnerDocument().createElement("identifier"); |
|
677 |
parent.appendChild(node); |
|
678 |
|
|
679 |
node.setAttribute("identifierType", this.type.toString()); |
|
680 |
if(this.value!=null) { |
|
681 |
node.appendChild(parent.getOwnerDocument().createTextNode(this.value)); |
|
682 |
} |
|
683 |
} |
|
684 |
} |
|
685 |
} |
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.0/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/httpapi/HttpApiRepositoryIterable.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.schemaorg.httpapi; |
|
2 |
|
|
3 |
import eu.dnetlib.data.collector.plugins.schemaorg.RepositoryIterable; |
|
4 |
|
|
5 |
public interface HttpApiRepositoryIterable extends RepositoryIterable { |
|
6 |
} |
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.0/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/sitemapindex/SitemapIndexIterator.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex; |
|
2 |
|
|
3 |
import eu.dnetlib.data.collector.plugins.schemaorg.Utils; |
|
4 |
import org.apache.commons.io.IOUtils; |
|
5 |
import org.apache.commons.logging.Log; |
|
6 |
import org.apache.commons.logging.LogFactory; |
|
7 |
|
|
8 |
import java.net.URL; |
|
9 |
import java.nio.charset.Charset; |
|
10 |
import java.util.*; |
|
11 |
|
|
12 |
public class SitemapIndexIterator implements Iterator<String> { |
|
13 |
private static final Log log = LogFactory.getLog(SitemapIndexIterator.class); |
|
14 |
|
|
15 |
public static class Options { |
|
16 |
private URL indexUrl; |
|
17 |
private Charset charset; |
|
18 |
|
|
19 |
/** Creates options with neither the index URL nor the charset assigned. */
public Options() {
}
|
20 |
|
|
21 |
/**
 * Creates options with both settings supplied.
 *
 * @param indexUrl value stored in the {@code indexUrl} field
 * @param charset  value stored in the {@code charset} field
 */
public Options(URL indexUrl, Charset charset) {
    // Independent assignments; order is irrelevant.
    this.charset = charset;
    this.indexUrl = indexUrl;
}
|
25 |
|
|
26 |
public URL getIndexUrl() { |
|
27 |
return indexUrl; |
|
28 |
} |
|
29 |
|
Also available in: Unified diff
[maven-release-plugin] copy for tag dnet-collector-plugins-1.7.0