/modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/DatasetMappingIterator.java - D-Net - D-Net project tracking tool

dnet45/modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/DatasetMappingIterator.java @ 53685

       package eu.dnetlib.data.collector.plugins.schemaorg;
       import org.apache.commons.logging.Log;
       import org.apache.commons.logging.LogFactory;
       import org.json.JSONObject;
       import java.net.URL;
       import java.time.LocalDate;
       import java.time.format.DateTimeFormatter;
       import java.util.*;
       public class DatasetMappingIterator implements Iterator<String> {
       	/** Logger bound to this class, so that its messages are reported under the right category. */
       	private static final Log log = LogFactory.getLog(DatasetMappingIterator.class);
       	public static class Options {
       		public static class IdentifierOptions{
       			public List<String> mappingARK;
       			public List<String> mappingDOI;
       			public List<String> mappingHandle;
       			public List<String> mappingPURL;
       			public List<String> mappingURN;
       			public List<String> mappingURL;
       			public DatasetDocument.Identifier.IdentifierType fallbackType;
       			public Boolean fallbackURL;
+      		}
       		public static class ContributorOptions{
       			public DatasetDocument.Contributor.ContributorType fallbackType;
+      		}
       		public static class PublicationDateOptions{
       			public String format;
+      		}
       		public static class CreatedDateOptions{
       			public String format;
+      		}
       		public static class UpdatedDateOptions{
       			public String format;
+      		}
       		private IdentifierOptions identifierOptions;
       		private PublicationDateOptions publicationDateOptions;
       		private ContributorOptions contributorOptions;
       		private CreatedDateOptions createdDateOptions;
       		private UpdatedDateOptions updatedDateOptions;
       		public UpdatedDateOptions getUpdatedDateOptions() {
       			return updatedDateOptions;
+      		}
       		public void setUpdatedDateOptions(UpdatedDateOptions updatedDateOptions) {
       			this.updatedDateOptions = updatedDateOptions;
+      		}
       		public CreatedDateOptions getCreatedDateOptions() {
       			return createdDateOptions;
+      		}
       		public void setCreatedDateOptions(CreatedDateOptions createdDateOptions) {
       			this.createdDateOptions = createdDateOptions;
+      		}
       		public ContributorOptions getContributorOptions() {
       			return contributorOptions;
+      		}
       		public void setContributorOptions(ContributorOptions contributorOptions) {
       			this.contributorOptions = contributorOptions;
+      		}
       		public PublicationDateOptions getPublicationDateOptions() {
       			return publicationDateOptions;
+      		}
       		public void setPublicationDateOptions(PublicationDateOptions publicationDateOptions) {
       			this.publicationDateOptions = publicationDateOptions;
+      		}
       		public IdentifierOptions getIdentifierOptions() {
       			return identifierOptions;
+      		}
       		public void setIdentifierOptions(IdentifierOptions identifierOptions) {
       			this.identifierOptions = identifierOptions;
+      		}
+      	}
       	/** Mapping configuration applied to every document. */
       	private final Options options;
       	/** Supplier of the JSON documents that are mapped to dataset xml. */
       	private final EndpointAccessIterator endpointAccessIterator;

       	/**
       	 * Creates an iterator that maps each document of the given endpoint iterator to xml.
       	 *
       	 * @param options mapping configuration
       	 * @param endpointAccessIterator iterator supplying the JSON documents
       	 */
       	public DatasetMappingIterator(Options options, EndpointAccessIterator endpointAccessIterator) {
       		this.endpointAccessIterator = endpointAccessIterator;
       		this.options = options;
       	}
       	@Override
       	public boolean hasNext() {
       		return this.endpointAccessIterator.hasNext();
+      	}
       	@Override
       	public String next() {
       		JSONObject document = this.endpointAccessIterator.next();
       		String xml = null;
       		if (document == null) {
       			log.debug("no document provided to process. returning empty");
       			xml = DatasetDocument.emptyXml();
+      		}
       		else {
       			log.debug("building document");
       			xml = this.buildDataset(document);
       			if (!Utils.validateXml(xml)) {
       				log.debug("xml not valid. setting to empty");
       				xml = null;
+      			}
       			if (xml == null) {
       				log.debug("could not build xml. returning empty");
       				xml = DatasetDocument.emptyXml();
+      			}
+      		}
       		//if all else fails
       		if(xml == null){
       			log.debug("could not build xml. returning empty");
       			xml = "<dataset/>";
+      		}
       		log.debug("xml document for dataset is: "+xml);
       		return xml;
+      	}
       	private String buildDataset(JSONObject document){
       		String xml = null;
       		try{
       			DatasetDocument dataset = new DatasetDocument();
       			dataset.setIdentifiers(this.extractIdentifier(document));
       			dataset.setCreators(this.extractCreator(document));
       			dataset.setTitles(this.extractTitles(document));
       			dataset.setAlternativeTitles(this.extractAlternateTitles(document));
       			dataset.setPublishers(this.extractPublisher(document));
       			dataset.setPublicationDates(this.extractPublicationDate(document));
       			dataset.setSubjects(this.extractSubjects(document));
       			dataset.setContributors(this.extractContributors(document));
       			dataset.setCreatedDates(this.extractCreatedDate(document));
       			dataset.setUpdatedDates(this.extractUpdatedDate(document));
       			dataset.setLanguages(this.extractLanguages(document));
       			dataset.setResourceTypes(this.extractResourceTypes(document));
       			dataset.setAlternateIdentifier(this.extractAlternateIdentifiers(document));
       			dataset.setCitations(this.extractCitations(document));
       			dataset.setSizes(this.extractSize(document));
       			dataset.setFormat(this.extractEncodingFormat(document));
       			dataset.setVersion(this.extractVersion(document));
       			dataset.setLicenses(this.extractLicense(document));
       			dataset.setDescriptions(this.extractDescription(document));
       			dataset.setDisambiguatingDescriptions(this.extractDisambiguatingDescription(document));
       			dataset.setGeoLocations(this.extractSpatialCoverage(document));
       			log.debug("document contains native identifier: : "+(dataset.getIdentifiers()!=null && dataset.getIdentifiers().size() > 0));
       			if((dataset.getIdentifiers() == null || dataset.getIdentifiers().size() == 0) &&
       					this.options.getIdentifierOptions().fallbackURL){
       				log.debug("falling back to url identifier");
       				dataset.setIdentifiers(this.extractIdentifierFallbackURL(document));
       				log.debug("document contains overridden identifier: : "+(dataset.getIdentifiers()!=null && dataset.getIdentifiers().size() > 0));
+      			}
       			xml = dataset.toXml();
+      		}
       		catch(Exception ex){
       			log.error("problem constructing dataset xml. returning empty", ex);
       			xml = null;
+      		}
       		return xml;
+      	}
       	private List<DatasetDocument.Identifier> extractIdentifierFallbackURL(JSONObject document){
       		List<String> urls = JSONLDUtils.extractString(document, "url");
       		ArrayList<DatasetDocument.Identifier> curated = new ArrayList<>();
       		for(String item : urls){
       			if(item == null || item.trim().length() == 0) continue;
       			curated.add(new DatasetDocument.Identifier(DatasetDocument.Identifier.IdentifierType.URL,  item.trim()));
+      		}
       		return curated;
+      	}
       	private List<DatasetDocument.SpatialCoverage> extractSpatialCoverage(JSONObject document){
       		List<JSONLDUtils.PlaceInfo> spatials = JSONLDUtils.extractPlaces(document, "spatialCoverage");
       		ArrayList<DatasetDocument.SpatialCoverage> curated = new ArrayList<>();
       		for(JSONLDUtils.PlaceInfo item : spatials){
       			if((item.name == null || item.name.trim().length() == 0) &&
       					(item.geoCoordinates == null || item.geoCoordinates.size() == 0) &&
       					(item.geoShapes == null || item.geoShapes.size() == 0)) continue;
       			List<DatasetDocument.SpatialCoverage.Point> points = new ArrayList<>();
       			List<String> boxes = new ArrayList<>();
       			if(item.geoCoordinates!=null) {
       				for (JSONLDUtils.GeoCoordinatesInfo iter : item.geoCoordinates){
       					points.add(new DatasetDocument.SpatialCoverage.Point(iter.latitude, iter.longitude));
+      				}
+      			}
       			if(item.geoShapes!=null) {
       				for (JSONLDUtils.GeoShapeInfo iter : item.geoShapes){
       					boxes.add(iter.box);
+      				}
+      			}
       			curated.add(new DatasetDocument.SpatialCoverage(item.name, points, boxes));
+      		}
       		return curated;
+      	}
       	private List<String> extractDescription(JSONObject document){
       		List<String> descriptions = JSONLDUtils.extractString(document, "description");
       		ArrayList<String> curated = new ArrayList<>();
       		for(String item : descriptions){
       			if(item == null || item.trim().length() == 0) continue;
       			curated.add(item);
+      		}
       		return curated;
+      	}
       	private List<String> extractDisambiguatingDescription(JSONObject document){
       		List<String> descriptions = JSONLDUtils.extractString(document, "disambiguatingDescription");
       		ArrayList<String> curated = new ArrayList<>();
       		for(String item : descriptions){
       			if(item == null || item.trim().length() == 0) continue;
       			curated.add(item);
+      		}
       		return curated;
+      	}
       	private List<DatasetDocument.License> extractLicense(JSONObject document){
       		List<JSONLDUtils.LicenseInfo> licenses = JSONLDUtils.extractLicenses(document, "license");
       		ArrayList<DatasetDocument.License> curated = new ArrayList<>();
       		for(JSONLDUtils.LicenseInfo item : licenses){
       			if(item.url == null || item.url.trim().length() == 0) continue;
       			curated.add(new DatasetDocument.License(item.name, item.url));
+      		}
       		return curated;
+      	}
       	private List<String> extractVersion(JSONObject document){
       		List<String> versions = JSONLDUtils.extractString(document, "version");
       		ArrayList<String> curated = new ArrayList<>();
       		for(String item : versions){
       			if(item == null || item.trim().length() == 0) continue;
       			curated.add(item);
+      		}
       		return curated;
+      	}
       	private List<String> extractSize(JSONObject document) {
       		List<String> sizes = JSONLDUtils.extractSize(document, "distribution");
       		HashSet<String> curated = new HashSet<>();
       		for (String item : sizes) {
       			if (item == null || item.trim().length() == 0) continue;
       			curated.add(item);
+      		}
       		return new ArrayList<>(curated);
+      	}
       	private List<String> extractEncodingFormat(JSONObject document){
       		List<String> formats = JSONLDUtils.extractEncodingFormat(document, "distribution");
       		HashSet<String> curated = new HashSet<>();
       		for(String item : formats){
       			if(item == null || item.trim().length() == 0) continue;
       			curated.add(item);
+      		}
       		return new ArrayList<>(curated);
+      	}
       	//TODO: Handle different citation types. Currently only urls
       	private List<DatasetDocument.Citation> extractCitations(JSONObject document){
       		List<JSONLDUtils.CitationInfo> citations = JSONLDUtils.extractCitations(document, "citation");
       		ArrayList<DatasetDocument.Citation> curated = new ArrayList<>();
       		for(JSONLDUtils.CitationInfo item : citations){
       			if(item.url == null || item.url.trim().length() == 0) continue;
       			try{
       				new URL(item.url);
       			}catch (Exception ex){
       				continue;
+      			}
       			curated.add(new DatasetDocument.Citation(item.url, DatasetDocument.Citation.CitationIdentifierType.URL));
+      		}
       		return curated;
+      	}
       	private List<DatasetDocument.AlternateIdentifier> extractAlternateIdentifiers(JSONObject document){
       		List<String> issns = JSONLDUtils.extractString(document, "issn");
       		List<String> urls = JSONLDUtils.extractString(document, "url");
       		ArrayList<DatasetDocument.AlternateIdentifier> curated = new ArrayList<>();
       		for(String item : issns){
       			if(item == null || item.trim().length() == 0) continue;
       			curated.add(new DatasetDocument.AlternateIdentifier(item.trim(), "ISSN"));
+      		}
       		for(String item : urls){
       			if(item == null || item.trim().length() == 0) continue;
       			curated.add(new DatasetDocument.AlternateIdentifier(item.trim(), "URL"));
+      		}
       		return curated;
+      	}
       	private List<DatasetDocument.ResourceType> extractResourceTypes(JSONObject document){
       		List<DatasetDocument.ResourceType> resourceTypes = new ArrayList<>();
       		resourceTypes.add(new DatasetDocument.ResourceType(DatasetDocument.ResourceType.ResourceTypeGeneralType.Dataset));
       		return resourceTypes;
+      	}
       	private List<String> extractLanguages(JSONObject document){
       		List<String> languages = JSONLDUtils.extractLanguage(document, "inLanguage");
       		ArrayList<String> curated = new ArrayList<>();
       		for(String item : languages){
       			if(item == null || item.trim().length() == 0) continue;
       			curated.add(item);
+      		}
       		return curated;
+      	}
       	private List<LocalDate> extractUpdatedDate(JSONObject document){
       		List<LocalDate> updatedDates = new ArrayList<>();
       		if(this.options.getUpdatedDateOptions() == null || this.options.getUpdatedDateOptions().format == null || this.options.getUpdatedDateOptions().format.length() == 0) return updatedDates;
       		DateTimeFormatter formatter = DateTimeFormatter.ofPattern(this.options.getPublicationDateOptions().format);
       		List<String> dates = JSONLDUtils.extractString(document, "dateModified");
       		for(String updatedDate : dates){
       			if(updatedDate == null || updatedDate.trim().length() == 0) continue;
       			try {
       				LocalDate localDate = LocalDate.parse(updatedDate, formatter);
       				updatedDates.add(localDate);
       			} catch (Exception e) {
       				continue;
+      			}
+      		}
       		return updatedDates;
+      	}
       	private List<LocalDate> extractCreatedDate(JSONObject document){
       		List<LocalDate> createdDates = new ArrayList<>();
       		if(this.options.getCreatedDateOptions() == null || this.options.getCreatedDateOptions().format == null || this.options.getCreatedDateOptions().format.length() == 0) return createdDates;
       		DateTimeFormatter formatter = DateTimeFormatter.ofPattern(this.options.getCreatedDateOptions().format);
       		List<String> dates = JSONLDUtils.extractString(document, "dateCreated");
       		for(String createdDate : dates){
       			if(createdDate == null || createdDate.trim().length() == 0) continue;
       			try {
       				LocalDate localDate = LocalDate.parse(createdDate, formatter);
       				createdDates.add(localDate);
       			} catch (Exception e) {
       				continue;
+      			}
+      		}
       		return createdDates;
+      	}
       	private List<DatasetDocument.Contributor> extractContributors(JSONObject document){
       		List<JSONLDUtils.PrincipalInfo> editors = JSONLDUtils.extractPrincipal(document, "editor");
       		List<JSONLDUtils.PrincipalInfo> funders = JSONLDUtils.extractPrincipal(document, "funder");
       		List<JSONLDUtils.PrincipalInfo> producers = JSONLDUtils.extractPrincipal(document, "producer");
       		List<JSONLDUtils.PrincipalInfo> sponsors = JSONLDUtils.extractPrincipal(document, "sponsor");
       		List<JSONLDUtils.PrincipalInfo> constributors = JSONLDUtils.extractPrincipal(document, "contributor");
       		ArrayList<DatasetDocument.Contributor> curated = new ArrayList<>();
       		for(JSONLDUtils.PrincipalInfo item : editors){
       			if(item.name() == null || item.name().trim().length() == 0) continue;
       			curated.add(new DatasetDocument.Contributor(item.name(), item.affiliationNames(), DatasetDocument.Contributor.ContributorType.Editor));
+      		}
       		for(JSONLDUtils.PrincipalInfo item : funders){
       			if(item.name() == null || item.name().trim().length() == 0) continue;
       			curated.add(new DatasetDocument.Contributor(item.name(), item.affiliationNames(), DatasetDocument.Contributor.ContributorType.Funder));
+      		}
       		for(JSONLDUtils.PrincipalInfo item : producers){
       			if(item.name() == null || item.name().trim().length() == 0) continue;
       			curated.add(new DatasetDocument.Contributor(item.name(), item.affiliationNames(), DatasetDocument.Contributor.ContributorType.Producer));
+      		}
       		for(JSONLDUtils.PrincipalInfo item : sponsors){
       			if(item.name() == null || item.name().trim().length() == 0) continue;
       			curated.add(new DatasetDocument.Contributor(item.name(), item.affiliationNames(), DatasetDocument.Contributor.ContributorType.Sponsor));
+      		}
       		for(JSONLDUtils.PrincipalInfo item : constributors){
       			if(item.name() == null || item.name().trim().length() == 0) continue;
       			DatasetDocument.Contributor.ContributorType type = DatasetDocument.Contributor.ContributorType.Other;
       			if(this.options.getContributorOptions()!=null && this.options.getContributorOptions().fallbackType != null) type = this.options.getContributorOptions().fallbackType;
       			curated.add(new DatasetDocument.Contributor(item.name(), item.affiliationNames(), type));
+      		}
       		return curated;
+      	}
       	private List<String> extractSubjects(JSONObject document){
       		List<String> subjects = JSONLDUtils.extractString(document, "keywords");
       		ArrayList<String> curated = new ArrayList<>();
       		for(String item : subjects){
       			if(item == null || item.trim().length() == 0) continue;
       			curated.add(item);
+      		}
       		return curated;
+      	}
       	private List<LocalDate> extractPublicationDate(JSONObject document){
       		List<LocalDate> publicationDates = new ArrayList<>();
       		if(this.options.getPublicationDateOptions() == null || this.options.getPublicationDateOptions().format == null || this.options.getPublicationDateOptions().format.length() == 0) return publicationDates;
       		DateTimeFormatter formatter = DateTimeFormatter.ofPattern(this.options.getPublicationDateOptions().format);
       		List<String> dates = JSONLDUtils.extractString(document, "datePublished");
       		for(String publicationDate : dates){
       			if(publicationDate == null || publicationDate.trim().length() == 0) continue;
       			try {
       				LocalDate localDate = LocalDate.parse(publicationDate, formatter);
       				publicationDates.add(localDate);
       			} catch (Exception e) {
       				continue;
+      			}
+      		}
       		return publicationDates;
+      	}
       	private List<String> extractPublisher(JSONObject document){
       		List<JSONLDUtils.PrincipalInfo> publishers = JSONLDUtils.extractPrincipal(document, "publisher");
       		ArrayList<String> curated = new ArrayList<>();
       		for(JSONLDUtils.PrincipalInfo item : publishers){
       			if(item.name() == null || item.name().trim().length() == 0) continue;
       			curated.add(item.name());
+      		}
       		return curated;
+      	}
       	private List<String> extractTitles(JSONObject document){
       		List<String> names = JSONLDUtils.extractString(document, "name");
       		List<String> headlines = JSONLDUtils.extractString(document, "headline");
       		HashSet<String> titles = new HashSet<>();
       		titles.addAll(names);
       		titles.addAll(headlines);
       		return new ArrayList<>(titles);
+      	}
       	private List<String> extractAlternateTitles(JSONObject document){
       		List<String> names = JSONLDUtils.extractString(document, "alternateName");
       		List<String> headlines = JSONLDUtils.extractString(document, "alternativeHeadline");
       		HashSet<String> titles = new HashSet<>();
       		titles.addAll(names);
       		titles.addAll(headlines);
       		return new ArrayList<>(titles);
+      	}
       	private List<DatasetDocument.Identifier> extractIdentifier(JSONObject document){
       		List<DatasetDocument.Identifier> curated = new ArrayList<>();
       		List<JSONLDUtils.IdentifierInfo> identifiers = JSONLDUtils.extractIdentifier(document, "identifier");
       		for(JSONLDUtils.IdentifierInfo item : identifiers){
       			if(item.value == null || item.value.trim().length() == 0) continue;
       			if(item.type == null || item.type.trim().length() == 0) {
       				if (this.options.getIdentifierOptions().fallbackType == null) continue;
       				curated.add(new DatasetDocument.Identifier(this.options.getIdentifierOptions().fallbackType, item.value.trim()));
+      			}
       			else {
       				DatasetDocument.Identifier.IdentifierType type = null;
       				if(this.options.getIdentifierOptions().mappingARK != null && this.options.getIdentifierOptions().mappingARK.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.ARK;
       				else if(this.options.getIdentifierOptions().mappingDOI != null && this.options.getIdentifierOptions().mappingDOI.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.DOI;
       				else if(this.options.getIdentifierOptions().mappingHandle != null && this.options.getIdentifierOptions().mappingHandle.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.Handle;
       				else if(this.options.getIdentifierOptions().mappingPURL != null && this.options.getIdentifierOptions().mappingPURL.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.PURL;
       				else if(this.options.getIdentifierOptions().mappingURL != null && this.options.getIdentifierOptions().mappingURL.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.URL;
       				else if(this.options.getIdentifierOptions().mappingURN != null && this.options.getIdentifierOptions().mappingURN.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.URN;
       				if(type == null) continue;
       				curated.add(new DatasetDocument.Identifier(type, item.value.trim()));
+      			}
+      		}
       		return curated;
+      	}
       	private List<DatasetDocument.Creator> extractCreator(JSONObject document){
       		List<JSONLDUtils.PrincipalInfo> creators = JSONLDUtils.extractPrincipal(document, "creator");
       		List<JSONLDUtils.PrincipalInfo> authors = JSONLDUtils.extractPrincipal(document, "author");
       		HashSet<String> foundNames = new HashSet<>();
       		List<DatasetDocument.Creator> curated = new ArrayList<>();
       		for(JSONLDUtils.PrincipalInfo item : creators){
       			if(item.name() == null || item.name().trim().length() == 0) continue;
       			if(foundNames.contains(item.name())) continue;
       			foundNames.add(item.name());
       			curated.add(new DatasetDocument.Creator(item.name(), item.affiliationNames()));
+      		}
       		for(JSONLDUtils.PrincipalInfo item : authors){
       			if(item.name() == null || item.name().trim().length() == 0) continue;
       			if(foundNames.contains(item.name())) continue;
       			foundNames.add(item.name());
       			curated.add(new DatasetDocument.Creator(item.name(), item.affiliationNames()));
+      		}
       		return curated;
+      	}
+      }

(2-2/11)

Project

General

Profile

D-Net