Project

General

Profile

1
package eu.dnetlib.data.collector.plugins.schemaorg;
2

    
3
import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapFileIterator;
4
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
5
import org.apache.commons.io.FileUtils;
6
import org.apache.commons.logging.Log;
7
import org.apache.commons.logging.LogFactory;
8
import org.apache.log4j.ConsoleAppender;
9
import org.apache.log4j.Level;
10
import org.apache.log4j.Logger;
11
import org.apache.log4j.PatternLayout;
12

    
13
import java.io.File;
14
import java.nio.charset.StandardCharsets;
15
import java.util.HashMap;
16
import java.util.concurrent.TimeUnit;
17

    
18
public class SchemaOrgMainReactome {
19

    
20
    private static final Log log = LogFactory.getLog(SchemaOrgMainReactome.class);
21

    
22
    public static void main(String[] args) throws Exception {
23

    
24
        ConsoleAppender console = new ConsoleAppender();
25
        console.setLayout(new PatternLayout("%d [%p|%c|%C{1}] %m%n"));
26
        console.setThreshold(Level.DEBUG);
27
        console.activateOptions();
28
        Logger.getLogger("eu.dnetlib.data.collector.plugins").addAppender(console);
29

    
30
        HashMap<String,String> params = new HashMap<>();
31
        params.put("consumerBlockPolling", Boolean.toString(true));
32
        params.put("consumerBlockPollingTimeout", "2");
33
        params.put("consumerBlockPollingTimeoutUnit", TimeUnit.MINUTES.toString());
34
        params.put("endpointCharset", StandardCharsets.UTF_8.name());
35
        params.put("updatedDateFormat", "YYYY-MM-DD");
36
        params.put("createdDateFormat", "YYYY-MM-DD");
37
        params.put("publicationDateFormat", "YYYY-MM-DD");
38
        params.put("contributorFallbackType", DatasetDocument.Contributor.ContributorType.Other.toString());
39
        params.put("identifierFallbackType", DatasetDocument.Identifier.IdentifierType.Handle.toString());
40
        params.put("identifierFallbackURL", Boolean.toString(true));
41
        params.put("identifierMappingARK", "ark, ARK");
42
        params.put("identifierMappingDOI", "doi, DOI");
43
        params.put("identifierMappingHandle", "Handle, HANDLE");
44
        params.put("identifierMappingPURL", "purl, PURL");
45
        params.put("identifierMappingURN", "urn, URN");
46
        params.put("identifierMappingURL", "url, URL");
47

    
48
        params.put("repositoryAccessType", "sitemapindex");
49
        params.put("sitemap_queueSize", "100");
50
        params.put("sitemap_IndexCharset", StandardCharsets.UTF_8.name());
51
        params.put("sitemap_FileCharset", StandardCharsets.UTF_8.name());
52
        params.put("sitemap_FileSchema", SitemapFileIterator.Options.SitemapSchemaType.Text.toString());
53
        params.put("sitemap_FileType", SitemapFileIterator.Options.SitemapFileType.GZ.toString());
54
        params.put("sitemap_producerBlockPollingTimeout", "2");
55
        params.put("sitemap_producerBlockPollingTimeoutUnit", TimeUnit.MINUTES.toString());
56

    
57
        InterfaceDescriptor descriptor = new InterfaceDescriptor();
58
        descriptor.setId("schema.org - reactome");
59
        descriptor.setBaseUrl("https://reactome.org/sitemapindex.xml");
60

    
61
        descriptor.setParams(params);
62

    
63
        SchemaOrgPlugin schemaOrgPlugin = new SchemaOrgPlugin();
64

    
65
        Iterable<String> iterable = schemaOrgPlugin.collect(descriptor, null, null);
66

    
67
        String outDir = params.get("repositoryAccessType");
68

    
69
        log.info("saving content in " + outDir);
70

    
71
        File directory = new File(outDir);
72
        if (directory.exists()) {
73
            log.info(directory.getAbsolutePath() + " exists, cleaning up");
74
            FileUtils.deleteDirectory(directory);
75
        }
76
        FileUtils.forceMkdir(directory);
77
        Utils.writeFiles(iterable, outDir);
78
    }
79

    
80
}
(9-9/11)