Project

General

Profile

« Previous | Next » 

Revision 52062

Added an integration test for the VTD parser: it reads the datacite collection and provides statistics about the parser's performance.

View differences:

modules/dnet-openaireplus-mapping-utils/trunk/src/test/java/eu/dnetlib/data/transform/xml/PublicationToProtoTest.java
1
package eu.dnetlib.data.transform.xml;
2

  
3
import java.io.InputStream;
4

  
5
import com.ximpleware.AutoPilot;
6
import com.ximpleware.VTDGen;
7
import com.ximpleware.VTDNav;
8
import org.apache.commons.io.IOUtils;
9
import org.junit.Test;
10

  
11
import static eu.dnetlib.data.transform.xml.vtd.VtdUtilityParser.parseXml;
12

  
13
public class PublicationToProtoTest {
14

  
15

  
16
    @Test
17
    public void testPartser() throws Exception {
18
        final InputStream resource = this.getClass().getResourceAsStream("/eu/dnetlib/data/transform/record.xml");
19
        final String record =IOUtils.toString(resource);
20
        final VTDGen vg = parseXml(record);
21
        final VTDNav vn = vg.getNav();
22
        final AutoPilot ap = new AutoPilot(vn);
23

  
24

  
25

  
26

  
27
    }
28
}
modules/dnet-openaireplus-mapping-utils/trunk/src/test/java/eu/dnetlib/pace/distance/DetectorTest.java
209 209
		double d = sr.getScore();
210 210
		System.out.println(String.format(" d ---> %s", d));
211 211

  
212
		assertTrue("different DOIs will drop the score to 0, regardless of the other fields", d == 0.0);
212
		assertTrue("different DOIs will NOT drop the score to 0, then evaluate other fields", d == 1.0);
213 213
	}
214 214

  
215 215
	@Test
modules/dnet-openaireplus-mapping-utils/trunk/src/test/java/eu/dnetlib/data/transform/xml/vtd/VtdParserToProtoTest.java
25 25
		doTest("/eu/dnetlib/data/transform/dataset.xml", new DatasetToProto());
26 26
	}
27 27

  
28
	@Test
29
	public void testParseDataset2() throws IOException {
30
		doTest("/eu/dnetlib/data/transform/dataset2.xml", new DatasetToProto());
31
	}
32

  
28 33
	private void doTest(final String filePath, Function<String, Oaf> f) throws IOException {
29 34
		final String xml = IOUtils.toString(getClass().getResourceAsStream(filePath));
30 35

  
modules/dnet-openaireplus-mapping-utils/trunk/src/test/java/eu/dnetlib/data/transform/xml/vtd/ConfigurationTestConfig.java
1
package eu.dnetlib.data.transform.xml.vtd;
2

  
3
import java.io.IOException;
4
import java.io.InputStream;
5
import java.util.Properties;
6

  
7
import com.mongodb.MongoClient;
8
import com.mongodb.client.MongoDatabase;
9
import org.springframework.context.annotation.Bean;
10
import org.springframework.context.annotation.Configuration;
11
import org.springframework.core.io.ClassPathResource;
12

  
13
@Configuration
14
public class ConfigurationTestConfig {
15

  
16
	@Bean
17
	public MongoDatabase db() throws IOException {
18

  
19
		final Properties p = new Properties();
20
		final ClassPathResource cp = new ClassPathResource("/eu/dnetlib/data/transform/xml/vtd/test.properties");
21
		try (final InputStream stream = cp.getInputStream()) {
22
			p.load(stream);
23
		}
24

  
25
		final MongoClient mongo = new MongoClient(
26
				p.getProperty("mongodb.host"),
27
				Integer.valueOf(p.getProperty("mongodb.port")));
28
		return mongo.getDatabase(p.getProperty("mongodb.dbname"));
29
	}
30

  
31
}
modules/dnet-openaireplus-mapping-utils/trunk/src/test/java/eu/dnetlib/data/transform/xml/vtd/VtdUtilityParserTest.java
1 1
package eu.dnetlib.data.transform.xml.vtd;
2 2

  
3
import java.io.InputStream;
4
import java.util.List;
5

  
3 6
import com.ximpleware.AutoPilot;
4 7
import com.ximpleware.VTDGen;
5 8
import com.ximpleware.VTDNav;
6
import eu.dnetlib.data.transform.xml.vtd.VtdUtilityParser;
7 9
import org.apache.commons.io.IOUtils;
8 10
import org.junit.Assert;
9 11
import org.junit.Test;
10 12

  
11
import java.io.InputStream;
12
import java.util.List;
13

  
14 13
import static eu.dnetlib.data.transform.xml.vtd.VtdUtilityParser.parseXml;
15 14

  
16 15
public class VtdUtilityParserTest {
......
23 22

  
24 23
    @Test
25 24
    public void testPartser() throws Exception {
26
        final InputStream resource = this.getClass().getResourceAsStream("/eu/dnetlib/data/transform/record.xml");
25
        final InputStream resource = this.getClass().getResourceAsStream("/eu/dnetlib/data/transform/publication.xml");
27 26
        final String record =IOUtils.toString(resource);
28 27
        final VTDGen vg = parseXml(record);
29 28
        final VTDNav vn = vg.getNav();
modules/dnet-openaireplus-mapping-utils/trunk/src/test/java/eu/dnetlib/data/transform/xml/vtd/VtdParserToProtoIT.java
1
package eu.dnetlib.data.transform.xml.vtd;
2

  
3
import java.io.IOException;
4
import java.util.Objects;
5
import java.util.Timer;
6
import java.util.concurrent.atomic.AtomicInteger;
7
import java.util.stream.StreamSupport;
8

  
9
import com.google.common.collect.Streams;
10
import com.mongodb.client.MongoCollection;
11
import com.mongodb.client.MongoDatabase;
12
import eu.dnetlib.miscutils.collections.Pair;
13
import org.apache.commons.lang3.time.StopWatch;
14
import org.apache.commons.logging.Log;
15
import org.apache.commons.logging.LogFactory;
16
import org.apache.commons.math.stat.descriptive.DescriptiveStatistics;
17
import org.bson.Document;
18
import org.junit.Test;
19
import org.junit.runner.RunWith;
20
import org.springframework.beans.factory.annotation.Autowired;
21
import org.springframework.test.context.ContextConfiguration;
22
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
23

  
24
import static org.junit.Assert.assertNotNull;
25
import static org.junit.Assert.assertTrue;
26

  
27
@RunWith(SpringJUnit4ClassRunner.class)
28
@ContextConfiguration(classes = { ConfigurationTestConfig.class })
29
public class VtdParserToProtoIT {
30

  
31
	private static final Log log = LogFactory.getLog(VtdParserToProtoIT.class);
32
	public static final String COLLECTION_NAME = "datacite";
33
	private static final int BATCH_SIZE = 10000;
34
	public static final int LOG_FREQ = 5000;
35

  
36
	@Autowired
37
	private MongoDatabase db;
38

  
39
	@Test
40
	public void testParseDatacite() throws IOException {
41

  
42
		final MongoCollection<Document> collection = db.getCollection(COLLECTION_NAME);
43

  
44
		final long collectionSize = collection.count();
45
		log.info(String.format("found %s records in collection '%s'", collectionSize, COLLECTION_NAME));
46

  
47
		final AtomicInteger read = new AtomicInteger(0);
48
		final DescriptiveStatistics stats = new DescriptiveStatistics();
49

  
50
		final StopWatch timer = new StopWatch();
51

  
52
		final DatasetToProto mapper = new DatasetToProto();
53
		StreamSupport.stream(collection.find().batchSize(BATCH_SIZE).spliterator(), false)
54
				.peek(d -> {
55
					if (read.addAndGet(1) % LOG_FREQ == 0) {
56
						log.info(String.format("records read so far %s", read.get()));
57
						log.info(String.format("stats so far %s", stats.toString()));
58
					}
59
				})
60
				.map(d -> (String) d.get("body"))
61
				.filter(Objects::nonNull)
62
				.peek(s -> timer.start())
63
				.map(mapper)
64
				.forEach(oaf -> {
65
					assertNotNull(oaf);
66
					assertTrue(oaf.hasEntity());
67

  
68
					timer.stop();
69
					stats.addValue(timer.getTime());
70
					timer.reset();
71
				});
72

  
73
		log.info(String.format("processed %s/%s records", read.get(), collectionSize));
74
	}
75

  
76

  
77
}
modules/dnet-openaireplus-mapping-utils/trunk/src/test/resources/eu/dnetlib/data/transform/xml/vtd/test.properties
1
mongodb.host    =   node5.t.openaire.research-infrastructures.eu
2
mongodb.port    =   27017
3
mongodb.dbname  =   test_ci
modules/dnet-openaireplus-mapping-utils/trunk/src/test/resources/eu/dnetlib/data/transform/dataset2.xml
1
<?xml version="1.0" encoding="UTF-8"?>
2
<record xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:oaf="http://namespace.openaire.eu/oaf" xmlns:dr="http://www.driver-repository.eu/namespace/dr">
3
	<oai:header xmlns:oai="http://www.openarchives.org/OAI/2.0/" xmlns:dri="http://www.driver-repository.eu/namespace/dri" xmlns="http://namespace.openaire.eu/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance">
4
		<dri:objIdentifier>datacite____::002bdf21f586e0ea61e992e4724cb8f5</dri:objIdentifier>
5
		<dri:recordIdentifier>oai:oai.datacite.org:6750738</dri:recordIdentifier>
6
		<dri:dateOfCollection>2018-03-24T20:16:13.537Z</dri:dateOfCollection>
7
		<oaf:datasourceprefix>datacite____</oaf:datasourceprefix>
8
		<identifier xmlns="http://www.openarchives.org/OAI/2.0/">oai:oai.datacite.org:6750738</identifier>
9
		<datestamp xmlns="http://www.openarchives.org/OAI/2.0/">2018-03-09T23:51:56Z</datestamp>
10
		<setSpec xmlns="http://www.openarchives.org/OAI/2.0/">CDL</setSpec>
11
		<setSpec xmlns="http://www.openarchives.org/OAI/2.0/">CDL.CULIS</setSpec>
12
		<dr:dateOfTransformation>2018-05-12T21:23:38.227Z</dr:dateOfTransformation>
13
	</oai:header>
14
	<metadata>
15
		<resource xmlns="http://datacite.org/schema/kernel-3" xmlns:dri="http://www.driver-repository.eu/namespace/dri" xmlns:oai="http://www.openarchives.org/OAI/2.0/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://datacite.org/schema/kernel-3 http://schema.datacite.org/meta/kernel-3/metadata.xsd">
16
			<identifier identifierType="DOI">10.7916/D8BV7FXW</identifier>
17
			<alternateIdentifiers xmlns="http://www.openarchives.org/OAI/2.0/">
18
				<alternateIdentifier alternateIdentifierType="URL">http://dx.doi.org/10.7916/D8BV7FXW</alternateIdentifier>
19
			</alternateIdentifiers>
20
			<titles>
21
				<title> Is H+3 cooling ever important in primordial gas?</title>
22
			</titles>
23
			<publisher>Columbia University</publisher>
24
			<publicationYear>2009</publicationYear>
25
			<dates>
26
				<date dateType="Created">2017-06-27</date>
27
				<date dateType="Updated">2018-02-17</date>
28
			</dates>
29
			<creators>
30
				<creator>
31
					<creatorName>Glover, S. C. O.</creatorName>
32
				</creator>
33
				<creator>
34
					<creatorName>Savin, Daniel Wolf</creatorName>
35
				</creator>
36
			</creators>
37
			<subjects>
38
				<subject>Astrophysics</subject>
39
				<subject>Plasma (Ionized gases)</subject>
40
				<subject>Physics</subject>
41
				<subject>Microphysics</subject>
42
			</subjects>
43
			<resourceType resourceTypeGeneral="Dataset">Data</resourceType>
44
			<descriptions>
45
				<description descriptionType="Abstract">Full tables of " Is H+3 cooling ever important in primordial gas?".</description>
46
			</descriptions>
47
			<relatedIdentifiers>
48
				<relatedIdentifier relatedIdentifierType="ISSN" relationType="IsPartOf">1365-2966</relatedIdentifier>
49
				<relatedIdentifier relatedIdentifierType="ISBN" relationType="IsPartOf"/>
50
				<relatedIdentifier relatedIdentifierType="DOI" relationType="IsVariantFormOf">10.1111/j.1365-2966.2008.14156.x</relatedIdentifier>
51
			</relatedIdentifiers>
52
		</resource>
53
		<dr:CobjCategory>0021</dr:CobjCategory>
54
		<oaf:dateAccepted>2009-01-01</oaf:dateAccepted>
55
		<oaf:accessrights>UNKNOWN</oaf:accessrights>
56
		<oaf:language>und</oaf:language>
57
		<oaf:hostedBy name="Unknown Repository" id="openaire____::1256f046-bf1f-4afc-8b47-d0b147148b18"/>
58
		<oaf:collectedFrom name="Datacite" id="openaire____::datacite"/>
59
	</metadata>
60
	<about xmlns:dri="http://www.driver-repository.eu/namespace/dri" xmlns:oai="http://www.openarchives.org/OAI/2.0/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance">
61
		<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
62
			<originDescription harvestDate="2018-03-24T20:16:13.537Z" altered="true">
63
				<baseURL>https://oai.datacite.org/oai</baseURL>
64
				<identifier>oai:oai.datacite.org:6750738</identifier>
65
				<datestamp>2018-03-09T23:51:56Z</datestamp>
66
				<metadataNamespace/>
67
			</originDescription>
68
		</provenance>
69
		<oaf:datainfo>
70
			<oaf:inferred>false</oaf:inferred>
71
			<oaf:deletedbyinference>false</oaf:deletedbyinference>
72
			<oaf:trust>0.9</oaf:trust>
73
			<oaf:inferenceprovenance/>
74
			<oaf:provenanceaction schemename="dnet:provenanceActions" schemeid="dnet:provenanceActions" classname="sysimport:crosswalk:datasetarchive" classid="sysimport:crosswalk:datasetarchive"/>
75
		</oaf:datainfo>
76
	</about>
77
</record>
modules/dnet-openaireplus-mapping-utils/trunk/src/main/java/eu/dnetlib/data/transform/xml/vtd/DatasetToProto.java
8 8

  
9 9
public class DatasetToProto extends AbstractResultVtdParser {
10 10

  
11
	public static DatasetToProto newInstance() {
12
		return new DatasetToProto();
13
	}
14

  
11 15
	public DatasetToProto() {}
12 16

  
13 17
	public DatasetToProto(final boolean invisible, final String provenance, final String trust) {
modules/dnet-openaireplus-mapping-utils/trunk/src/main/java/eu/dnetlib/data/transform/xml/vtd/VtdException.java
12 12
	public VtdException(final Throwable e) {
13 13
		super(e);
14 14
	}
15

  
16
	public VtdException(final String msg, final Exception e) {
17
		super(msg, e);
18
	}
19

  
20
	public VtdException(final String msg, final Throwable e) {
21
		super(msg, e);
22
	}
15 23
}
modules/dnet-openaireplus-mapping-utils/trunk/src/main/java/eu/dnetlib/data/transform/xml/vtd/AbstractResultVtdParser.java
84 84

  
85 85
	// publication
86 86
	public static final String PROJECTID = "projectid";
87
	public static final String RELATED_DATASET = "relateddataset";
88
	public static final String RELATED_PUBLICATION = "relatedpublication";
89
	public static final String RELATED_IDENTIFIER = "relatedidentifier";
87
	public static final String RELATED_DATASET = "relatedDataSet";
88
	public static final String RELATED_PUBLICATION = "relatedPublication";
89
	public static final String RELATED_IDENTIFIER = "relatedIdentifier";
90 90

  
91 91
	protected static Map<String, String> mappingAccess = Maps.newHashMap();
92 92

  
......
136 136

  
137 137
			return transform(ap, vn, objIdentifier, getFields());
138 138
		} catch (Throwable e) {
139
			log.error(e.getMessage());
139
			log.error(xml);
140 140
			log.error(ExceptionUtils.getStackTrace(e));
141 141
			return null;
142 142
		}
......
370 370
				final Pair<String, Function<List<Node>, Object>> pair = entry.getValue();
371 371
				final String xpath = pair.getKey();
372 372
				final Function<List<Node>, Object> function = pair.getValue();
373
				addField(md.getBuilder(), d.findFieldByName(fieldName), function.apply(getNodes(ap, vn, xpath)));
373
				try {
374
					addField(md.getBuilder(), d.findFieldByName(fieldName), function.apply(getNodes(ap, vn, xpath)));
375
				} catch (Throwable e) {
376
					throw new VtdException(String.format("Error mapping field '%s' from xpath '%s' for record '%s'", fieldName, xpath, objIdentifier), e);
377
				}
374 378
			}
375 379
		}
376 380

  
......
393 397
	private OafRel.Builder getOafRel(final String objIdentifier, final Node node, final OafRel.Builder oafRel) {
394 398
		final Map<String, String> a = node.getAttributes();
395 399

  
396
		switch (node.getName().toLowerCase()) {
400
		switch (node.getName()) {
397 401
		case PROJECTID:
402
			if (StringUtils.isBlank(node.getTextValue())) {
403
				return null;
404
			}
398 405
			return oafRel
399 406
					.setTarget(oafSplitId(Type.project.name(), StringUtils.trim(node.getTextValue())))
400 407
					.setRelType(RelType.resultProject)
......
403 410

  
404 411
		case RELATED_PUBLICATION:
405 412
		case RELATED_DATASET:
413
			if (StringUtils.isBlank(a.get("id"))) {
414
				return null;
415
			}
406 416
			return oafRel
407 417
					.setTarget(oafSimpleId(Type.result.name(), StringUtils.trim(a.get("id"))))
408 418
					.setRelType(RelType.resultResult)
......
410 420
					.setRelClass("isRelatedTo");
411 421

  
412 422
		case RELATED_IDENTIFIER:
423
			if (StringUtils.isBlank(node.getTextValue())) {
424
				return null;
425
			}
413 426
			return oafRel
414 427
					.setTarget(node.getTextValue())
415 428
					.setRelType(RelType.resultResult)
modules/dnet-openaireplus-mapping-utils/trunk/src/main/java/eu/dnetlib/data/transform/xml/vtd/VtdUtilityParser.java
77 77

  
78 78
	private static Node asNode(final VTDNav vn) throws NavException {
79 79
		final Node currentNode = new Node();
80
		final String name = vn.toRawString(vn.getCurrentIndex());
81
		currentNode.setName(name.contains(NS_SEPARATOR) ? StringUtils.substringAfter(name, NS_SEPARATOR) : name);
82

  
80 83
		int t = vn.getText();
81 84
		if (t >= 0) {
82
			final String name = vn.toRawString(vn.getCurrentIndex());
83
			currentNode.setName(name.contains(NS_SEPARATOR) ? StringUtils.substringAfter(name, NS_SEPARATOR) : name);
84 85
			currentNode.setTextValue(vn.toNormalizedString(t));
85 86
		}
86 87
		currentNode.setAttributes(getAttributes(vn));
modules/dnet-openaireplus-mapping-utils/trunk/pom.xml
14 14
	<scm>
15 15
		<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet45/modules/dnet-openaireplus-mapping-utils/trunk</developerConnection>
16 16
	</scm>
17

  
18
	<build>
19
		<plugins>
20
			<plugin>
21
				<groupId>org.apache.maven.plugins</groupId>
22
				<artifactId>maven-failsafe-plugin</artifactId>
23
				<version>2.19.1</version>
24
				<executions>
25
					<execution>
26
						<id>integration-test</id>
27
						<goals>
28
							<goal>integration-test</goal>
29
						</goals>
30
					</execution>
31
					<execution>
32
						<id>verify</id>
33
						<goals>
34
							<goal>verify</goal>
35
						</goals>
36
					</execution>
37
				</executions>
38
			</plugin>
39
		</plugins>
40
	</build>
41

  
17 42
	<dependencies>
18 43
		<dependency>
19 44
			<groupId>com.google.guava</groupId>
......
86 111
			<version>1.2</version>
87 112
		</dependency>
88 113
		<dependency>
114
			<groupId>org.apache.commons</groupId>
115
			<artifactId>commons-lang3</artifactId>
116
			<version>3.5</version>
117
		</dependency>
118

  
119
		<!-- test deps -->
120
		<dependency>
89 121
			<groupId>eu.dnetlib</groupId>
90 122
			<artifactId>dnet-openaireplus-profiles</artifactId>
91 123
			<version>[1.0.0,2.0.0)</version>
92 124
			<scope>test</scope>
93 125
		</dependency>
94 126
		<dependency>
95
			<groupId>org.apache.commons</groupId>
96
			<artifactId>commons-lang3</artifactId>
97
			<version>3.5</version>
127
			<groupId>org.mongodb</groupId>
128
			<artifactId>mongo-java-driver</artifactId>
129
			<version>${mongodb.driver.version}</version>
130
			<scope>test</scope>
98 131
		</dependency>
132
		<dependency>
133
			<groupId>org.springframework</groupId>
134
			<artifactId>spring-context</artifactId>
135
			<version>${spring.version}</version>
136
			<scope>test</scope>
137
		</dependency>
138
		<dependency>
139
			<groupId>org.springframework</groupId>
140
			<artifactId>spring-core</artifactId>
141
			<version>${spring.version}</version>
142
			<scope>test</scope>
143
		</dependency>
144
		<dependency>
145
			<groupId>org.springframework</groupId>
146
			<artifactId>spring-test</artifactId>
147
			<version>${spring.version}</version>
148
			<scope>test</scope>
149
		</dependency>
99 150

  
100 151
	</dependencies>
101 152
</project>

Also available in: Unified diff