Project

General

Profile

1
package eu.dnetlib.data.collector.plugins.schemaorg;
2

    
3
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
4
import org.apache.commons.io.FileUtils;
5
import org.apache.commons.lang3.StringUtils;
6
import org.apache.commons.logging.Log;
7
import org.apache.commons.logging.LogFactory;
8
import org.apache.log4j.ConsoleAppender;
9
import org.apache.log4j.Level;
10
import org.apache.log4j.Logger;
11
import org.apache.log4j.PatternLayout;
12
import org.dom4j.Document;
13
import org.dom4j.io.SAXReader;
14

    
15
import java.io.File;
16
import java.io.FileWriter;
17
import java.io.StringReader;
18
import java.nio.charset.StandardCharsets;
19
import java.util.HashMap;
20
import java.util.concurrent.TimeUnit;
21

    
22
public class SchemaOrgMainKaggle {
23

    
24
    private static final Log log = LogFactory.getLog(SchemaOrgMainKaggle.class);
25

    
26
    public static void main(String[] args) throws Exception {
27

    
28
        ConsoleAppender console = new ConsoleAppender();
29
        console.setLayout(new PatternLayout("%d [%p|%c|%C{1}] %m%n"));
30
        console.setThreshold(Level.DEBUG);
31
        console.activateOptions();
32
        Logger.getLogger("eu.dnetlib.data.collector.plugins").addAppender(console);
33

    
34
        HashMap<String,String> params = new HashMap<>();
35
        params.put("consumerBlockPolling", Boolean.toString(true));
36
        params.put("consumerBlockPollingTimeout", "2");
37
        params.put("consumerBlockPollingTimeoutUnit", TimeUnit.MINUTES.toString());
38
        params.put("endpointCharset", StandardCharsets.UTF_8.name());
39
        params.put("updatedDateFormat", "YYYY-MM-DD");
40
        params.put("createdDateFormat", "YYYY-MM-DD");
41
        params.put("publicationDateFormat", "YYYY-MM-DD");
42
        params.put("contributorFallbackType", DatasetDocument.Contributor.ContributorType.Other.toString());
43
        params.put("identifierFallbackType", "Handle");
44
        params.put("identifierFallbackURL", Boolean.toString(true));
45
        params.put("identifierMappingARK", "ark, ARK");
46
        params.put("identifierMappingDOI", "doi, DOI");
47
        params.put("identifierMappingHandle", "Handle, HANDLE");
48
        params.put("identifierMappingPURL", "purl, PURL");
49
        params.put("identifierMappingURN", "urn, URN");
50
        params.put("identifierMappingURL", "url, URL");
51

    
52
        params.put("repositoryAccessType", "httpapi-kaggle");
53

    
54
        params.put("httpapi-kaggle_queueSize", "100");
55
        params.put("httpapi-kaggle_APICharset", StandardCharsets.UTF_8.name());
56
        params.put("httpapi-kaggle_queryUrl", "https://www.kaggle.com/datasets_v2.json?sortBy=updated&group=public&page={PAGE}&pageSize=20&size=sizeAll&filetype=fileTypeAll&license=licenseAll");
57
        params.put("httpapi-kaggle_queryPagePlaceholder", "{PAGE}");
58
        params.put("httpapi-kaggle_responsePropertyTotalDataset", "totalDatasetListItems");
59
        params.put("httpapi-kaggle_responsePropertyDatasetList", "datasetListItems");
60
        params.put("httpapi-kaggle_responsePropertyDatasetUrl", "datasetUrl");
61
        params.put("httpapi-kaggle_responseBaseDatasetUrl", "https://www.kaggle.com");
62

    
63
        InterfaceDescriptor descriptor = new InterfaceDescriptor();
64
        descriptor.setId("schema.org - kaggle");
65
        descriptor.setBaseUrl("https://www.kaggle.com");
66

    
67
        descriptor.setParams(params);
68

    
69
        SchemaOrgPlugin schemaOrgPlugin = new SchemaOrgPlugin();
70

    
71
        Iterable<String> iterable = schemaOrgPlugin.collect(descriptor, null, null);
72

    
73
        String outDir = params.get("repositoryAccessType");
74

    
75
        log.info("saving content in " + outDir);
76

    
77
        File directory = new File(outDir);
78
        if (directory.exists()) {
79
            log.info(directory.getAbsolutePath() + " exists, cleaning up");
80
            FileUtils.deleteDirectory(directory);
81
        }
82
        FileUtils.forceMkdir(directory);
83

    
84
        int skipped = 0;
85
        for(String item : iterable) {
86

    
87
            final Document doc = new SAXReader().read(new StringReader(item));
88

    
89
            String id = doc.valueOf("/*[local-name() = 'dataset']/*[local-name() = 'identifier']/text()");
90
            if (StringUtils.isNotBlank(id)) {
91
                log.info(item);
92
                String fileName = outDir + "/" + id;
93
                FileWriter w = new FileWriter(fileName);
94
                w.write(item);
95
                w.close();
96
                log.info("wrote " + fileName);
97
            } else {
98
                skipped++;
99
            }
100
            if (skipped % 100 == 0) {
101
                log.info("skipped so far " + skipped);
102
            }
103
        }
104

    
105
        log.info("Done! skipped " + skipped);
106
    }
107

    
108
}
(8-8/11)