Revision 53662
Added by Claudio Atzori almost 6 years ago
modules/dnet-collector-plugins/trunk/src/test/java/eu/dnetlib/data/collector/plugins/schemaorg/SchemaOrgMainKaggle.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.schemaorg; |
|
2 |
|
|
3 |
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; |
|
4 |
import org.apache.commons.io.FileUtils; |
|
5 |
import org.apache.commons.lang3.StringUtils; |
|
6 |
import org.apache.commons.logging.Log; |
|
7 |
import org.apache.commons.logging.LogFactory; |
|
8 |
import org.apache.log4j.ConsoleAppender; |
|
9 |
import org.apache.log4j.Level; |
|
10 |
import org.apache.log4j.Logger; |
|
11 |
import org.apache.log4j.PatternLayout; |
|
12 |
import org.dom4j.Document; |
|
13 |
import org.dom4j.io.SAXReader; |
|
14 |
|
|
15 |
import java.io.File; |
|
16 |
import java.io.FileWriter; |
|
17 |
import java.io.StringReader; |
|
18 |
import java.nio.charset.StandardCharsets; |
|
19 |
import java.util.HashMap; |
|
20 |
import java.util.concurrent.TimeUnit; |
|
21 |
|
|
22 |
public class SchemaOrgMainKaggle { |
|
23 |
|
|
24 |
private static final Log log = LogFactory.getLog(SchemaOrgMainKaggle.class); |
|
25 |
|
|
26 |
public static void main(String[] args) throws Exception { |
|
27 |
|
|
28 |
ConsoleAppender console = new ConsoleAppender(); |
|
29 |
console.setLayout(new PatternLayout("%d [%p|%c|%C{1}] %m%n")); |
|
30 |
console.setThreshold(Level.DEBUG); |
|
31 |
console.activateOptions(); |
|
32 |
Logger.getLogger("eu.dnetlib.data.collector.plugins").addAppender(console); |
|
33 |
|
|
34 |
HashMap<String,String> params = new HashMap<>(); |
|
35 |
params.put("consumerBlockPolling", Boolean.toString(true)); |
|
36 |
params.put("consumerBlockPollingTimeout", "2"); |
|
37 |
params.put("consumerBlockPollingTimeoutUnit", TimeUnit.MINUTES.toString()); |
|
38 |
params.put("endpointCharset", StandardCharsets.UTF_8.name()); |
|
39 |
params.put("updatedDateFormat", "YYYY-MM-DD"); |
|
40 |
params.put("createdDateFormat", "YYYY-MM-DD"); |
|
41 |
params.put("publicationDateFormat", "YYYY-MM-DD"); |
|
42 |
params.put("contributorFallbackType", DatasetDocument.Contributor.ContributorType.Other.toString()); |
|
43 |
params.put("identifierFallbackType", "Handle"); |
|
44 |
params.put("identifierFallbackURL", Boolean.toString(true)); |
|
45 |
params.put("identifierMappingARK", "ark, ARK"); |
|
46 |
params.put("identifierMappingDOI", "doi, DOI"); |
|
47 |
params.put("identifierMappingHandle", "Handle, HANDLE"); |
|
48 |
params.put("identifierMappingPURL", "purl, PURL"); |
|
49 |
params.put("identifierMappingURN", "urn, URN"); |
|
50 |
params.put("identifierMappingURL", "url, URL"); |
|
51 |
|
|
52 |
params.put("repositoryAccessType", "httpapi-kaggle"); |
|
53 |
|
|
54 |
params.put("httpapi-kaggle_queueSize", "100"); |
|
55 |
params.put("httpapi-kaggle_APICharset", StandardCharsets.UTF_8.name()); |
|
56 |
params.put("httpapi-kaggle_queryUrl", "https://www.kaggle.com/datasets_v2.json?sortBy=updated&group=public&page={PAGE}&pageSize=20&size=sizeAll&filetype=fileTypeAll&license=licenseAll"); |
|
57 |
params.put("httpapi-kaggle_queryPagePlaceholder", "{PAGE}"); |
|
58 |
params.put("httpapi-kaggle_responsePropertyTotalDataset", "totalDatasetListItems"); |
|
59 |
params.put("httpapi-kaggle_responsePropertyDatasetList", "datasetListItems"); |
|
60 |
params.put("httpapi-kaggle_responsePropertyDatasetUrl", "datasetUrl"); |
|
61 |
params.put("httpapi-kaggle_responseBaseDatasetUrl", "https://www.kaggle.com"); |
|
62 |
|
|
63 |
InterfaceDescriptor descriptor = new InterfaceDescriptor(); |
|
64 |
descriptor.setId("schema.org - kaggle"); |
|
65 |
descriptor.setBaseUrl("https://www.kaggle.com"); |
|
66 |
|
|
67 |
descriptor.setParams(params); |
|
68 |
|
|
69 |
SchemaOrgPlugin schemaOrgPlugin = new SchemaOrgPlugin(); |
|
70 |
|
|
71 |
Iterable<String> iterable = schemaOrgPlugin.collect(descriptor, null, null); |
|
72 |
|
|
73 |
String outDir = params.get("repositoryAccessType"); |
|
74 |
|
|
75 |
log.info("saving content in " + outDir); |
|
76 |
|
|
77 |
File directory = new File(outDir); |
|
78 |
if (directory.exists()) { |
|
79 |
log.info(directory.getAbsolutePath() + " exists, cleaning up"); |
|
80 |
FileUtils.deleteDirectory(directory); |
|
81 |
} |
|
82 |
FileUtils.forceMkdir(directory); |
|
83 |
|
|
84 |
int skipped = 0; |
|
85 |
for(String item : iterable) { |
|
86 |
|
|
87 |
final Document doc = new SAXReader().read(new StringReader(item)); |
|
88 |
|
|
89 |
String id = doc.valueOf("/*[local-name() = 'dataset']/*[local-name() = 'identifier']/text()"); |
|
90 |
if (StringUtils.isNotBlank(id)) { |
|
91 |
log.info(item); |
|
92 |
String fileName = outDir + "/" + id; |
|
93 |
FileWriter w = new FileWriter(fileName); |
|
94 |
w.write(item); |
|
95 |
w.close(); |
|
96 |
log.info("wrote " + fileName); |
|
97 |
} else { |
|
98 |
skipped++; |
|
99 |
} |
|
100 |
if (skipped % 100 == 0) { |
|
101 |
log.info("skipped so far " + skipped); |
|
102 |
} |
|
103 |
} |
|
104 |
|
|
105 |
log.info("Done! skipped " + skipped); |
|
106 |
} |
|
107 |
|
|
108 |
} |
modules/dnet-collector-plugins/trunk/src/test/java/eu/dnetlib/data/collector/plugins/schemaorg/SchemaOrgMainReactome.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.schemaorg; |
|
2 |
|
|
3 |
import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapFileIterator; |
|
4 |
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; |
|
5 |
import org.apache.commons.io.FileUtils; |
|
6 |
import org.apache.commons.lang3.StringUtils; |
|
7 |
import org.apache.commons.logging.Log; |
|
8 |
import org.apache.commons.logging.LogFactory; |
|
9 |
import org.apache.log4j.ConsoleAppender; |
|
10 |
import org.apache.log4j.Level; |
|
11 |
import org.apache.log4j.Logger; |
|
12 |
import org.apache.log4j.PatternLayout; |
|
13 |
import org.dom4j.Document; |
|
14 |
import org.dom4j.io.SAXReader; |
|
15 |
|
|
16 |
import java.io.File; |
|
17 |
import java.io.FileWriter; |
|
18 |
import java.io.StringReader; |
|
19 |
import java.nio.charset.StandardCharsets; |
|
20 |
import java.util.HashMap; |
|
21 |
import java.util.concurrent.TimeUnit; |
|
22 |
|
|
23 |
public class SchemaOrgMainReactome { |
|
24 |
|
|
25 |
private static final Log log = LogFactory.getLog(SchemaOrgMainReactome.class); |
|
26 |
|
|
27 |
public static void main(String[] args) throws Exception { |
|
28 |
|
|
29 |
ConsoleAppender console = new ConsoleAppender(); |
|
30 |
console.setLayout(new PatternLayout("%d [%p|%c|%C{1}] %m%n")); |
|
31 |
console.setThreshold(Level.DEBUG); |
|
32 |
console.activateOptions(); |
|
33 |
Logger.getLogger("eu.dnetlib.data.collector.plugins").addAppender(console); |
|
34 |
|
|
35 |
HashMap<String,String> params = new HashMap<>(); |
|
36 |
params.put("consumerBlockPolling", Boolean.toString(true)); |
|
37 |
params.put("consumerBlockPollingTimeout", "2"); |
|
38 |
params.put("consumerBlockPollingTimeoutUnit", TimeUnit.MINUTES.toString()); |
|
39 |
params.put("endpointCharset", StandardCharsets.UTF_8.name()); |
|
40 |
params.put("updatedDateFormat", "YYYY-MM-DD"); |
|
41 |
params.put("createdDateFormat", "YYYY-MM-DD"); |
|
42 |
params.put("publicationDateFormat", "YYYY-MM-DD"); |
|
43 |
params.put("contributorFallbackType", DatasetDocument.Contributor.ContributorType.Other.toString()); |
|
44 |
params.put("identifierFallbackType", "Handle"); |
|
45 |
params.put("identifierFallbackURL", Boolean.toString(true)); |
|
46 |
params.put("identifierMappingARK", "ark, ARK"); |
|
47 |
params.put("identifierMappingDOI", "doi, DOI"); |
|
48 |
params.put("identifierMappingHandle", "Handle, HANDLE"); |
|
49 |
params.put("identifierMappingPURL", "purl, PURL"); |
|
50 |
params.put("identifierMappingURN", "urn, URN"); |
|
51 |
params.put("identifierMappingURL", "url, URL"); |
|
52 |
|
|
53 |
params.put("repositoryAccessType", "sitemapindex"); |
|
54 |
params.put("sitemap_queueSize", "100"); |
|
55 |
params.put("sitemap_IndexCharset", StandardCharsets.UTF_8.name()); |
|
56 |
params.put("sitemap_FileCharset", StandardCharsets.UTF_8.name()); |
|
57 |
params.put("sitemap_FileSchema", SitemapFileIterator.Options.SitemapSchemaType.Text.toString()); |
|
58 |
params.put("sitemap_FileType", SitemapFileIterator.Options.SitemapFileType.GZ.toString()); |
|
59 |
|
|
60 |
|
|
61 |
|
|
62 |
InterfaceDescriptor descriptor = new InterfaceDescriptor(); |
|
63 |
descriptor.setId("schema.org - reactome"); |
|
64 |
descriptor.setBaseUrl("https://reactome.org/sitemapindex.xml"); |
|
65 |
|
|
66 |
descriptor.setParams(params); |
|
67 |
|
|
68 |
SchemaOrgPlugin schemaOrgPlugin = new SchemaOrgPlugin(); |
|
69 |
|
|
70 |
Iterable<String> iterable = schemaOrgPlugin.collect(descriptor, null, null); |
|
71 |
|
|
72 |
String outDir = params.get("repositoryAccessType"); |
|
73 |
|
|
74 |
log.info("saving content in " + outDir); |
|
75 |
|
|
76 |
File directory = new File(outDir); |
|
77 |
if (directory.exists()) { |
|
78 |
log.info(directory.getAbsolutePath() + " exists, cleaning up"); |
|
79 |
FileUtils.deleteDirectory(directory); |
|
80 |
} |
|
81 |
FileUtils.forceMkdir(directory); |
|
82 |
|
|
83 |
int skipped = 0; |
|
84 |
for(String item : iterable) { |
|
85 |
|
|
86 |
final Document doc = new SAXReader().read(new StringReader(item)); |
|
87 |
|
|
88 |
String id = doc.valueOf("/*[local-name() = 'dataset']/*[local-name() = 'identifier']/text()"); |
|
89 |
if (StringUtils.isNotBlank(id)) { |
|
90 |
log.info(item); |
|
91 |
String fileName = outDir + "/" + id; |
|
92 |
FileWriter w = new FileWriter(fileName); |
|
93 |
w.write(item); |
|
94 |
w.close(); |
|
95 |
log.info("wrote " + fileName); |
|
96 |
} else { |
|
97 |
skipped++; |
|
98 |
} |
|
99 |
if (skipped % 100 == 0) { |
|
100 |
log.info("skipped so far " + skipped); |
|
101 |
} |
|
102 |
} |
|
103 |
|
|
104 |
log.info("Done! skipped " + skipped); |
|
105 |
} |
|
106 |
|
|
107 |
} |
modules/dnet-collector-plugins/trunk/pom.xml | ||
---|---|---|
11 | 11 |
<scm> |
12 | 12 |
<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet45/modules/dnet-collector-plugins/trunk</developerConnection> |
13 | 13 |
</scm> |
14 |
|
|
15 |
<!--
    Configures maven-assembly-plugin with the predefined "jar-with-dependencies" descriptor and
    sets SchemaOrgMainReactome as the Main-Class of the assembled jar's manifest.
    NOTE(review): this changeset adds SchemaOrgMainReactome under src/test/java; confirm that the
    assembly really contains that class, otherwise the manifest entry points to a missing class.
-->
<build>
    <plugins>
        <plugin>
            <artifactId>maven-assembly-plugin</artifactId>
            <configuration>
                <descriptorRefs>
                    <descriptorRef>jar-with-dependencies</descriptorRef>
                </descriptorRefs>
                <archive>
                    <manifest>
                        <mainClass>eu.dnetlib.data.collector.plugins.schemaorg.SchemaOrgMainReactome</mainClass>
                    </manifest>
                </archive>
            </configuration>
        </plugin>
    </plugins>
</build>
|
32 |
|
|
14 | 33 |
<dependencies> |
15 | 34 |
<dependency> |
16 | 35 |
<groupId>eu.dnetlib</groupId> |
Also available in: Unified diff
added main classes to verify the content collected from Kaggle and Reactome