Project

General

Profile

1 53662 claudio.at
package eu.dnetlib.data.collector.plugins.schemaorg;
2
3
import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapFileIterator;
4
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
5
import org.apache.commons.io.FileUtils;
6
import org.apache.commons.lang3.StringUtils;
7
import org.apache.commons.logging.Log;
8
import org.apache.commons.logging.LogFactory;
9
import org.apache.log4j.ConsoleAppender;
10
import org.apache.log4j.Level;
11
import org.apache.log4j.Logger;
12
import org.apache.log4j.PatternLayout;
13
import org.dom4j.Document;
14
import org.dom4j.io.SAXReader;
15
16
import java.io.File;
17
import java.io.FileWriter;
18
import java.io.StringReader;
19
import java.nio.charset.StandardCharsets;
20
import java.util.HashMap;
21
import java.util.concurrent.TimeUnit;
22
23
public class SchemaOrgMainReactome {
24
25
    private static final Log log = LogFactory.getLog(SchemaOrgMainReactome.class);
26
27
    public static void main(String[] args) throws Exception {
28
29
        ConsoleAppender console = new ConsoleAppender();
30
        console.setLayout(new PatternLayout("%d [%p|%c|%C{1}] %m%n"));
31
        console.setThreshold(Level.DEBUG);
32
        console.activateOptions();
33
        Logger.getLogger("eu.dnetlib.data.collector.plugins").addAppender(console);
34
35
        HashMap<String,String> params = new HashMap<>();
36
        params.put("consumerBlockPolling", Boolean.toString(true));
37
        params.put("consumerBlockPollingTimeout", "2");
38
        params.put("consumerBlockPollingTimeoutUnit", TimeUnit.MINUTES.toString());
39
        params.put("endpointCharset", StandardCharsets.UTF_8.name());
40
        params.put("updatedDateFormat", "YYYY-MM-DD");
41
        params.put("createdDateFormat", "YYYY-MM-DD");
42
        params.put("publicationDateFormat", "YYYY-MM-DD");
43
        params.put("contributorFallbackType", DatasetDocument.Contributor.ContributorType.Other.toString());
44
        params.put("identifierFallbackType", "Handle");
45
        params.put("identifierFallbackURL", Boolean.toString(true));
46
        params.put("identifierMappingARK", "ark, ARK");
47
        params.put("identifierMappingDOI", "doi, DOI");
48
        params.put("identifierMappingHandle", "Handle, HANDLE");
49
        params.put("identifierMappingPURL", "purl, PURL");
50
        params.put("identifierMappingURN", "urn, URN");
51
        params.put("identifierMappingURL", "url, URL");
52
53
        params.put("repositoryAccessType", "sitemapindex");
54
        params.put("sitemap_queueSize", "100");
55
        params.put("sitemap_IndexCharset", StandardCharsets.UTF_8.name());
56
        params.put("sitemap_FileCharset", StandardCharsets.UTF_8.name());
57
        params.put("sitemap_FileSchema", SitemapFileIterator.Options.SitemapSchemaType.Text.toString());
58
        params.put("sitemap_FileType", SitemapFileIterator.Options.SitemapFileType.GZ.toString());
59
60
61
62
        InterfaceDescriptor descriptor = new InterfaceDescriptor();
63
        descriptor.setId("schema.org - reactome");
64
        descriptor.setBaseUrl("https://reactome.org/sitemapindex.xml");
65
66
        descriptor.setParams(params);
67
68
        SchemaOrgPlugin schemaOrgPlugin = new SchemaOrgPlugin();
69
70
        Iterable<String> iterable = schemaOrgPlugin.collect(descriptor, null, null);
71
72
        String outDir = params.get("repositoryAccessType");
73
74
        log.info("saving content in " + outDir);
75
76
        File directory = new File(outDir);
77
        if (directory.exists()) {
78
            log.info(directory.getAbsolutePath() + " exists, cleaning up");
79
            FileUtils.deleteDirectory(directory);
80
        }
81
        FileUtils.forceMkdir(directory);
82
83
        int skipped = 0;
84
        for(String item : iterable) {
85
86
            final Document doc = new SAXReader().read(new StringReader(item));
87
88
            String id = doc.valueOf("/*[local-name() = 'dataset']/*[local-name() = 'identifier']/text()");
89
            if (StringUtils.isNotBlank(id)) {
90
                log.info(item);
91
                String fileName = outDir + "/" + id;
92
                FileWriter w = new FileWriter(fileName);
93
                w.write(item);
94
                w.close();
95
                log.info("wrote " + fileName);
96
            } else {
97
                skipped++;
98
            }
99
            if (skipped % 100 == 0) {
100
                log.info("skipped so far " + skipped);
101
            }
102
        }
103
104
        log.info("Done! skipped " + skipped);
105
    }
106
107
}