Project

General

Profile

1
package eu.dnetlib.data.collector.plugins.schemaorg;
2

    
3
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
4
import org.apache.commons.io.FileUtils;
5
import org.apache.commons.logging.Log;
6
import org.apache.commons.logging.LogFactory;
7
import org.apache.log4j.ConsoleAppender;
8
import org.apache.log4j.Level;
9
import org.apache.log4j.Logger;
10
import org.apache.log4j.PatternLayout;
11

    
12
import java.io.File;
13
import java.nio.charset.StandardCharsets;
14
import java.util.HashMap;
15
import java.util.concurrent.TimeUnit;
16

    
17
public class SchemaOrgMainKaggle {
18

    
19
    private static final Log log = LogFactory.getLog(SchemaOrgMainKaggle.class);
20

    
21
    public static void main(String[] args) throws Exception {
22

    
23
        ConsoleAppender console = new ConsoleAppender();
24
        console.setLayout(new PatternLayout("%d [%p|%c|%C{1}] %m%n"));
25
        console.setThreshold(Level.DEBUG);
26
        console.activateOptions();
27
        Logger.getLogger("eu.dnetlib.data.collector.plugins").addAppender(console);
28

    
29
        HashMap<String,String> params = new HashMap<>();
30
        params.put("consumerBlockPolling", Boolean.toString(true));
31
        params.put("consumerBlockPollingTimeout", "2");
32
        params.put("consumerBlockPollingTimeoutUnit", TimeUnit.MINUTES.toString());
33
        params.put("endpointCharset", StandardCharsets.UTF_8.name());
34
        params.put("updatedDateFormat", "YYYY-MM-DD");
35
        params.put("createdDateFormat", "YYYY-MM-DD");
36
        params.put("publicationDateFormat", "YYYY-MM-DD");
37
        params.put("contributorFallbackType", DatasetDocument.Contributor.ContributorType.Other.toString());
38
        params.put("identifierFallbackType", DatasetDocument.Identifier.IdentifierType.Handle.toString());
39
        params.put("identifierFallbackURL", Boolean.toString(true));
40
        params.put("identifierMappingARK", "ark, ARK");
41
        params.put("identifierMappingDOI", "doi, DOI");
42
        params.put("identifierMappingHandle", "Handle, HANDLE");
43
        params.put("identifierMappingPURL", "purl, PURL");
44
        params.put("identifierMappingURN", "urn, URN");
45
        params.put("identifierMappingURL", "url, URL");
46

    
47
        params.put("repositoryAccessType", "httpapi-kaggle");
48

    
49
        params.put("httpapi-kaggle_queueSize", "100");
50
        params.put("httpapi-kaggle_APICharset", StandardCharsets.UTF_8.name());
51
        params.put("httpapi-kaggle_queryUrl", "https://www.kaggle.com/datasets_v2.json?sortBy=updated&group=public&page={PAGE}&pageSize=20&size=sizeAll&filetype=fileTypeAll&license=licenseAll");
52
        params.put("httpapi-kaggle_queryPagePlaceholder", "{PAGE}");
53
        params.put("httpapi-kaggle_responsePropertyTotalDataset", "totalDatasetListItems");
54
        params.put("httpapi-kaggle_responsePropertyDatasetList", "datasetListItems");
55
        params.put("httpapi-kaggle_responsePropertyDatasetUrl", "datasetUrl");
56
        params.put("httpapi-kaggle_responseBaseDatasetUrl", "https://www.kaggle.com");
57
        params.put("httpapi-kaggle_producerBlockPollingTimeout", "2");
58
        params.put("httpapi-kaggle_producerBlockPollingTimeoutUnit", TimeUnit.MINUTES.toString());
59

    
60
        InterfaceDescriptor descriptor = new InterfaceDescriptor();
61
        descriptor.setId("schema.org - kaggle");
62
        descriptor.setBaseUrl("https://www.kaggle.com");
63

    
64
        descriptor.setParams(params);
65

    
66
        SchemaOrgPlugin schemaOrgPlugin = new SchemaOrgPlugin();
67

    
68
        Iterable<String> iterable = schemaOrgPlugin.collect(descriptor, null, null);
69

    
70
        String outDir = params.get("repositoryAccessType");
71

    
72
        log.info("saving content in " + outDir);
73

    
74
        File directory = new File(outDir);
75
        if (directory.exists()) {
76
            log.info(directory.getAbsolutePath() + " exists, cleaning up");
77
            FileUtils.deleteDirectory(directory);
78
        }
79
        FileUtils.forceMkdir(directory);
80
        Utils.writeFiles(iterable, outDir);
81

    
82
    }
83

    
84
}
(8-8/11)