Project

General

Profile

« Previous | Next » 

Revision 53663

added main classes to verify the content collected from Kaggle and Reactome

View differences:

modules/dnet-collector-plugins/trunk/src/test/java/eu/dnetlib/data/collector/plugins/schemaorg/SchemaOrgMainKaggle.java
1
package eu.dnetlib.data.collector.plugins.schemaorg;
2

  
3
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
4
import org.apache.commons.io.FileUtils;
5
import org.apache.commons.lang3.StringUtils;
6
import org.apache.commons.logging.Log;
7
import org.apache.commons.logging.LogFactory;
8
import org.apache.log4j.ConsoleAppender;
9
import org.apache.log4j.Level;
10
import org.apache.log4j.Logger;
11
import org.apache.log4j.PatternLayout;
12
import org.dom4j.Document;
13
import org.dom4j.io.SAXReader;
14

  
15
import java.io.File;
16
import java.io.FileWriter;
17
import java.io.StringReader;
18
import java.nio.charset.StandardCharsets;
19
import java.util.HashMap;
20
import java.util.concurrent.TimeUnit;
21

  
22
public class SchemaOrgMainKaggle {
23

  
24
    private static final Log log = LogFactory.getLog(SchemaOrgMainKaggle.class);
25

  
26
    public static void main(String[] args) throws Exception {
27

  
28
        ConsoleAppender console = new ConsoleAppender();
29
        console.setLayout(new PatternLayout("%d [%p|%c|%C{1}] %m%n"));
30
        console.setThreshold(Level.DEBUG);
31
        console.activateOptions();
32
        Logger.getLogger("eu.dnetlib.data.collector.plugins").addAppender(console);
33

  
34
        HashMap<String,String> params = new HashMap<>();
35
        params.put("consumerBlockPolling", Boolean.toString(true));
36
        params.put("consumerBlockPollingTimeout", "2");
37
        params.put("consumerBlockPollingTimeoutUnit", TimeUnit.MINUTES.toString());
38
        params.put("endpointCharset", StandardCharsets.UTF_8.name());
39
        params.put("updatedDateFormat", "YYYY-MM-DD");
40
        params.put("createdDateFormat", "YYYY-MM-DD");
41
        params.put("publicationDateFormat", "YYYY-MM-DD");
42
        params.put("contributorFallbackType", DatasetDocument.Contributor.ContributorType.Other.toString());
43
        params.put("identifierFallbackType", "Handle");
44
        params.put("identifierFallbackURL", Boolean.toString(true));
45
        params.put("identifierMappingARK", "ark, ARK");
46
        params.put("identifierMappingDOI", "doi, DOI");
47
        params.put("identifierMappingHandle", "Handle, HANDLE");
48
        params.put("identifierMappingPURL", "purl, PURL");
49
        params.put("identifierMappingURN", "urn, URN");
50
        params.put("identifierMappingURL", "url, URL");
51

  
52
        params.put("repositoryAccessType", "httpapi-kaggle");
53

  
54
        params.put("httpapi-kaggle_queueSize", "100");
55
        params.put("httpapi-kaggle_APICharset", StandardCharsets.UTF_8.name());
56
        params.put("httpapi-kaggle_queryUrl", "https://www.kaggle.com/datasets_v2.json?sortBy=updated&group=public&page={PAGE}&pageSize=20&size=sizeAll&filetype=fileTypeAll&license=licenseAll");
57
        params.put("httpapi-kaggle_queryPagePlaceholder", "{PAGE}");
58
        params.put("httpapi-kaggle_responsePropertyTotalDataset", "totalDatasetListItems");
59
        params.put("httpapi-kaggle_responsePropertyDatasetList", "datasetListItems");
60
        params.put("httpapi-kaggle_responsePropertyDatasetUrl", "datasetUrl");
61
        params.put("httpapi-kaggle_responseBaseDatasetUrl", "https://www.kaggle.com");
62

  
63
        InterfaceDescriptor descriptor = new InterfaceDescriptor();
64
        descriptor.setId("schema.org - kaggle");
65
        descriptor.setBaseUrl("https://www.kaggle.com");
66

  
67
        descriptor.setParams(params);
68

  
69
        SchemaOrgPlugin schemaOrgPlugin = new SchemaOrgPlugin();
70

  
71
        Iterable<String> iterable = schemaOrgPlugin.collect(descriptor, null, null);
72

  
73
        String outDir = params.get("repositoryAccessType");
74

  
75
        log.info("saving content in " + outDir);
76

  
77
        File directory = new File(outDir);
78
        if (directory.exists()) {
79
            log.info(directory.getAbsolutePath() + " exists, cleaning up");
80
            FileUtils.deleteDirectory(directory);
81
        }
82
        FileUtils.forceMkdir(directory);
83

  
84
        int skipped = 0;
85
        for(String item : iterable) {
86

  
87
            final Document doc = new SAXReader().read(new StringReader(item));
88

  
89
            String id = doc.valueOf("/*[local-name() = 'dataset']/*[local-name() = 'identifier']/text()");
90
            if (StringUtils.isNotBlank(id)) {
91
                log.info(item);
92
                String fileName = outDir + "/" + id;
93
                FileWriter w = new FileWriter(fileName);
94
                w.write(item);
95
                w.close();
96
                log.info("wrote " + fileName);
97
            } else {
98
                skipped++;
99
            }
100
            if (skipped % 100 == 0) {
101
                log.info("skipped so far " + skipped);
102
            }
103
        }
104

  
105
        log.info("Done! skipped " + skipped);
106
    }
107

  
108
}
modules/dnet-collector-plugins/trunk/src/test/java/eu/dnetlib/data/collector/plugins/schemaorg/SchemaOrgMainReactome.java
1
package eu.dnetlib.data.collector.plugins.schemaorg;
2

  
3
import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapFileIterator;
4
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
5
import org.apache.commons.io.FileUtils;
6
import org.apache.commons.lang3.StringUtils;
7
import org.apache.commons.logging.Log;
8
import org.apache.commons.logging.LogFactory;
9
import org.apache.log4j.ConsoleAppender;
10
import org.apache.log4j.Level;
11
import org.apache.log4j.Logger;
12
import org.apache.log4j.PatternLayout;
13
import org.dom4j.Document;
14
import org.dom4j.io.SAXReader;
15

  
16
import java.io.File;
17
import java.io.FileWriter;
18
import java.io.StringReader;
19
import java.nio.charset.StandardCharsets;
20
import java.util.HashMap;
21
import java.util.concurrent.TimeUnit;
22

  
23
public class SchemaOrgMainReactome {
24

  
25
    private static final Log log = LogFactory.getLog(SchemaOrgMainReactome.class);
26

  
27
    public static void main(String[] args) throws Exception {
28

  
29
        ConsoleAppender console = new ConsoleAppender();
30
        console.setLayout(new PatternLayout("%d [%p|%c|%C{1}] %m%n"));
31
        console.setThreshold(Level.DEBUG);
32
        console.activateOptions();
33
        Logger.getLogger("eu.dnetlib.data.collector.plugins").addAppender(console);
34

  
35
        HashMap<String,String> params = new HashMap<>();
36
        params.put("consumerBlockPolling", Boolean.toString(true));
37
        params.put("consumerBlockPollingTimeout", "2");
38
        params.put("consumerBlockPollingTimeoutUnit", TimeUnit.MINUTES.toString());
39
        params.put("endpointCharset", StandardCharsets.UTF_8.name());
40
        params.put("updatedDateFormat", "YYYY-MM-DD");
41
        params.put("createdDateFormat", "YYYY-MM-DD");
42
        params.put("publicationDateFormat", "YYYY-MM-DD");
43
        params.put("contributorFallbackType", DatasetDocument.Contributor.ContributorType.Other.toString());
44
        params.put("identifierFallbackType", "Handle");
45
        params.put("identifierFallbackURL", Boolean.toString(true));
46
        params.put("identifierMappingARK", "ark, ARK");
47
        params.put("identifierMappingDOI", "doi, DOI");
48
        params.put("identifierMappingHandle", "Handle, HANDLE");
49
        params.put("identifierMappingPURL", "purl, PURL");
50
        params.put("identifierMappingURN", "urn, URN");
51
        params.put("identifierMappingURL", "url, URL");
52

  
53
        params.put("repositoryAccessType", "sitemapindex");
54
        params.put("sitemap_queueSize", "100");
55
        params.put("sitemap_IndexCharset", StandardCharsets.UTF_8.name());
56
        params.put("sitemap_FileCharset", StandardCharsets.UTF_8.name());
57
        params.put("sitemap_FileSchema", SitemapFileIterator.Options.SitemapSchemaType.Text.toString());
58
        params.put("sitemap_FileType", SitemapFileIterator.Options.SitemapFileType.GZ.toString());
59

  
60

  
61

  
62
        InterfaceDescriptor descriptor = new InterfaceDescriptor();
63
        descriptor.setId("schema.org - reactome");
64
        descriptor.setBaseUrl("https://reactome.org/sitemapindex.xml");
65

  
66
        descriptor.setParams(params);
67

  
68
        SchemaOrgPlugin schemaOrgPlugin = new SchemaOrgPlugin();
69

  
70
        Iterable<String> iterable = schemaOrgPlugin.collect(descriptor, null, null);
71

  
72
        String outDir = params.get("repositoryAccessType");
73

  
74
        log.info("saving content in " + outDir);
75

  
76
        File directory = new File(outDir);
77
        if (directory.exists()) {
78
            log.info(directory.getAbsolutePath() + " exists, cleaning up");
79
            FileUtils.deleteDirectory(directory);
80
        }
81
        FileUtils.forceMkdir(directory);
82

  
83
        int skipped = 0;
84
        for(String item : iterable) {
85

  
86
            final Document doc = new SAXReader().read(new StringReader(item));
87

  
88
            String id = doc.valueOf("/*[local-name() = 'dataset']/*[local-name() = 'identifier']/text()");
89
            if (StringUtils.isNotBlank(id)) {
90
                log.info(item);
91
                String fileName = outDir + "/" + id;
92
                FileWriter w = new FileWriter(fileName);
93
                w.write(item);
94
                w.close();
95
                log.info("wrote " + fileName);
96
            } else {
97
                skipped++;
98
            }
99
            if (skipped % 100 == 0) {
100
                log.info("skipped so far " + skipped);
101
            }
102
        }
103

  
104
        log.info("Done! skipped " + skipped);
105
    }
106

  
107
}
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/SchemaOrgMainKaggle.java
1
package eu.dnetlib.data.collector.plugins.schemaorg;
2

  
3
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
4
import org.apache.commons.io.FileUtils;
5
import org.apache.commons.lang3.StringUtils;
6
import org.apache.commons.logging.Log;
7
import org.apache.commons.logging.LogFactory;
8
import org.apache.log4j.ConsoleAppender;
9
import org.apache.log4j.Level;
10
import org.apache.log4j.Logger;
11
import org.apache.log4j.PatternLayout;
12
import org.dom4j.Document;
13
import org.dom4j.io.SAXReader;
14

  
15
import java.io.File;
16
import java.io.FileWriter;
17
import java.io.StringReader;
18
import java.nio.charset.StandardCharsets;
19
import java.util.HashMap;
20
import java.util.concurrent.TimeUnit;
21

  
22
public class SchemaOrgMainKaggle {
23

  
24
    private static final Log log = LogFactory.getLog(SchemaOrgMainKaggle.class);
25

  
26
    public static void main(String[] args) throws Exception {
27

  
28
        ConsoleAppender console = new ConsoleAppender();
29
        console.setLayout(new PatternLayout("%d [%p|%c|%C{1}] %m%n"));
30
        console.setThreshold(Level.DEBUG);
31
        console.activateOptions();
32
        Logger.getLogger("eu.dnetlib.data.collector.plugins").addAppender(console);
33

  
34
        HashMap<String,String> params = new HashMap<>();
35
        params.put("consumerBlockPolling", Boolean.toString(true));
36
        params.put("consumerBlockPollingTimeout", "2");
37
        params.put("consumerBlockPollingTimeoutUnit", TimeUnit.MINUTES.toString());
38
        params.put("endpointCharset", StandardCharsets.UTF_8.name());
39
        params.put("updatedDateFormat", "YYYY-MM-DD");
40
        params.put("createdDateFormat", "YYYY-MM-DD");
41
        params.put("publicationDateFormat", "YYYY-MM-DD");
42
        params.put("contributorFallbackType", DatasetDocument.Contributor.ContributorType.Other.toString());
43
        params.put("identifierFallbackType", "Handle");
44
        params.put("identifierFallbackURL", Boolean.toString(true));
45
        params.put("identifierMappingARK", "ark, ARK");
46
        params.put("identifierMappingDOI", "doi, DOI");
47
        params.put("identifierMappingHandle", "Handle, HANDLE");
48
        params.put("identifierMappingPURL", "purl, PURL");
49
        params.put("identifierMappingURN", "urn, URN");
50
        params.put("identifierMappingURL", "url, URL");
51

  
52
        params.put("repositoryAccessType", "httpapi-kaggle");
53

  
54
        params.put("httpapi-kaggle_queueSize", "100");
55
        params.put("httpapi-kaggle_APICharset", StandardCharsets.UTF_8.name());
56
        params.put("httpapi-kaggle_queryUrl", "https://www.kaggle.com/datasets_v2.json?sortBy=updated&group=public&page={PAGE}&pageSize=20&size=sizeAll&filetype=fileTypeAll&license=licenseAll");
57
        params.put("httpapi-kaggle_queryPagePlaceholder", "{PAGE}");
58
        params.put("httpapi-kaggle_responsePropertyTotalDataset", "totalDatasetListItems");
59
        params.put("httpapi-kaggle_responsePropertyDatasetList", "datasetListItems");
60
        params.put("httpapi-kaggle_responsePropertyDatasetUrl", "datasetUrl");
61
        params.put("httpapi-kaggle_responseBaseDatasetUrl", "https://www.kaggle.com");
62

  
63
        InterfaceDescriptor descriptor = new InterfaceDescriptor();
64
        descriptor.setId("schema.org - kaggle");
65
        descriptor.setBaseUrl("https://www.kaggle.com");
66

  
67
        descriptor.setParams(params);
68

  
69
        SchemaOrgPlugin schemaOrgPlugin = new SchemaOrgPlugin();
70

  
71
        Iterable<String> iterable = schemaOrgPlugin.collect(descriptor, null, null);
72

  
73
        String outDir = params.get("repositoryAccessType");
74

  
75
        log.info("saving content in " + outDir);
76

  
77
        File directory = new File(outDir);
78
        if (directory.exists()) {
79
            log.info(directory.getAbsolutePath() + " exists, cleaning up");
80
            FileUtils.deleteDirectory(directory);
81
        }
82
        FileUtils.forceMkdir(directory);
83

  
84
        int skipped = 0;
85
        for(String item : iterable) {
86

  
87
            final Document doc = new SAXReader().read(new StringReader(item));
88

  
89
            String id = doc.valueOf("/*[local-name() = 'dataset']/*[local-name() = 'identifier']/text()");
90
            if (StringUtils.isNotBlank(id)) {
91
                log.info(item);
92
                String fileName = outDir + "/" + id;
93
                FileWriter w = new FileWriter(fileName);
94
                w.write(item);
95
                w.close();
96
                log.info("wrote " + fileName);
97
            } else {
98
                skipped++;
99
            }
100
            if (skipped % 100 == 0) {
101
                log.info("skipped so far " + skipped);
102
            }
103
        }
104

  
105
        log.info("Done! skipped " + skipped);
106
    }
107

  
108
}
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/SchemaOrgMainReactome.java
1
package eu.dnetlib.data.collector.plugins.schemaorg;
2

  
3
import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapFileIterator;
4
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
5
import org.apache.commons.io.FileUtils;
6
import org.apache.commons.lang3.StringUtils;
7
import org.apache.commons.logging.Log;
8
import org.apache.commons.logging.LogFactory;
9
import org.apache.log4j.ConsoleAppender;
10
import org.apache.log4j.Level;
11
import org.apache.log4j.Logger;
12
import org.apache.log4j.PatternLayout;
13
import org.dom4j.Document;
14
import org.dom4j.io.SAXReader;
15

  
16
import java.io.File;
17
import java.io.FileWriter;
18
import java.io.StringReader;
19
import java.nio.charset.StandardCharsets;
20
import java.util.HashMap;
21
import java.util.concurrent.TimeUnit;
22

  
23
public class SchemaOrgMainReactome {
24

  
25
    private static final Log log = LogFactory.getLog(SchemaOrgMainReactome.class);
26

  
27
    public static void main(String[] args) throws Exception {
28

  
29
        ConsoleAppender console = new ConsoleAppender();
30
        console.setLayout(new PatternLayout("%d [%p|%c|%C{1}] %m%n"));
31
        console.setThreshold(Level.DEBUG);
32
        console.activateOptions();
33
        Logger.getLogger("eu.dnetlib.data.collector.plugins").addAppender(console);
34

  
35
        HashMap<String,String> params = new HashMap<>();
36
        params.put("consumerBlockPolling", Boolean.toString(true));
37
        params.put("consumerBlockPollingTimeout", "2");
38
        params.put("consumerBlockPollingTimeoutUnit", TimeUnit.MINUTES.toString());
39
        params.put("endpointCharset", StandardCharsets.UTF_8.name());
40
        params.put("updatedDateFormat", "YYYY-MM-DD");
41
        params.put("createdDateFormat", "YYYY-MM-DD");
42
        params.put("publicationDateFormat", "YYYY-MM-DD");
43
        params.put("contributorFallbackType", DatasetDocument.Contributor.ContributorType.Other.toString());
44
        params.put("identifierFallbackType", "Handle");
45
        params.put("identifierFallbackURL", Boolean.toString(true));
46
        params.put("identifierMappingARK", "ark, ARK");
47
        params.put("identifierMappingDOI", "doi, DOI");
48
        params.put("identifierMappingHandle", "Handle, HANDLE");
49
        params.put("identifierMappingPURL", "purl, PURL");
50
        params.put("identifierMappingURN", "urn, URN");
51
        params.put("identifierMappingURL", "url, URL");
52

  
53
        params.put("repositoryAccessType", "sitemapindex");
54
        params.put("sitemap_queueSize", "100");
55
        params.put("sitemap_IndexCharset", StandardCharsets.UTF_8.name());
56
        params.put("sitemap_FileCharset", StandardCharsets.UTF_8.name());
57
        params.put("sitemap_FileSchema", SitemapFileIterator.Options.SitemapSchemaType.Text.toString());
58
        params.put("sitemap_FileType", SitemapFileIterator.Options.SitemapFileType.GZ.toString());
59

  
60

  
61

  
62
        InterfaceDescriptor descriptor = new InterfaceDescriptor();
63
        descriptor.setId("schema.org - reactome");
64
        descriptor.setBaseUrl("https://reactome.org/sitemapindex.xml");
65

  
66
        descriptor.setParams(params);
67

  
68
        SchemaOrgPlugin schemaOrgPlugin = new SchemaOrgPlugin();
69

  
70
        Iterable<String> iterable = schemaOrgPlugin.collect(descriptor, null, null);
71

  
72
        String outDir = params.get("repositoryAccessType");
73

  
74
        log.info("saving content in " + outDir);
75

  
76
        File directory = new File(outDir);
77
        if (directory.exists()) {
78
            log.info(directory.getAbsolutePath() + " exists, cleaning up");
79
            FileUtils.deleteDirectory(directory);
80
        }
81
        FileUtils.forceMkdir(directory);
82

  
83
        int skipped = 0;
84
        for(String item : iterable) {
85

  
86
            final Document doc = new SAXReader().read(new StringReader(item));
87

  
88
            String id = doc.valueOf("/*[local-name() = 'dataset']/*[local-name() = 'identifier']/text()");
89
            if (StringUtils.isNotBlank(id)) {
90
                log.info(item);
91
                String fileName = outDir + "/" + id;
92
                FileWriter w = new FileWriter(fileName);
93
                w.write(item);
94
                w.close();
95
                log.info("wrote " + fileName);
96
            } else {
97
                skipped++;
98
            }
99
            if (skipped % 100 == 0) {
100
                log.info("skipped so far " + skipped);
101
            }
102
        }
103

  
104
        log.info("Done! skipped " + skipped);
105
    }
106

  
107
}

Also available in: Unified diff