1 |
55817
|
claudio.at
|
package eu.dnetlib.data.transform.xml.vtd;
|
2 |
|
|
|
3 |
|
|
import com.mongodb.client.MongoCollection;
|
4 |
|
|
import com.mongodb.client.MongoDatabase;
|
5 |
|
|
import eu.dnetlib.data.transform.xml2.DatasetToProto;
|
6 |
|
|
import org.apache.commons.lang3.time.StopWatch;
|
7 |
|
|
import org.apache.commons.logging.Log;
|
8 |
|
|
import org.apache.commons.logging.LogFactory;
|
9 |
|
|
import org.apache.commons.math.stat.descriptive.DescriptiveStatistics;
|
10 |
|
|
import org.bson.Document;
|
11 |
|
|
import org.junit.Test;
|
12 |
|
|
import org.junit.runner.RunWith;
|
13 |
|
|
import org.springframework.beans.factory.annotation.Autowired;
|
14 |
|
|
import org.springframework.test.context.ContextConfiguration;
|
15 |
|
|
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
|
16 |
|
|
|
17 |
|
|
import java.io.IOException;
|
18 |
|
|
import java.util.Objects;
|
19 |
|
|
import java.util.concurrent.atomic.AtomicInteger;
|
20 |
|
|
import java.util.stream.StreamSupport;
|
21 |
|
|
|
22 |
|
|
import static org.junit.Assert.assertNotNull;
|
23 |
|
|
import static org.junit.Assert.assertTrue;
|
24 |
|
|
|
25 |
|
|
@RunWith(SpringJUnit4ClassRunner.class)
|
26 |
|
|
@ContextConfiguration(classes = { ConfigurationTestConfig.class })
|
27 |
|
|
public class VtdParserToProtoIT {
|
28 |
|
|
|
29 |
|
|
private static final Log log = LogFactory.getLog(VtdParserToProtoIT.class);
|
30 |
|
|
public static final String COLLECTION_NAME = "datacite";
|
31 |
|
|
private static final int BATCH_SIZE = 10000;
|
32 |
|
|
public static final int LOG_FREQ = 5000;
|
33 |
|
|
|
34 |
|
|
@Autowired
|
35 |
|
|
private MongoDatabase db;
|
36 |
|
|
|
37 |
|
|
@Test
|
38 |
|
|
public void testParseDatacite() throws IOException {
|
39 |
|
|
|
40 |
|
|
final MongoCollection<Document> collection = db.getCollection(COLLECTION_NAME);
|
41 |
|
|
|
42 |
|
|
final long collectionSize = collection.count();
|
43 |
|
|
log.info(String.format("found %s records in collection '%s'", collectionSize, COLLECTION_NAME));
|
44 |
|
|
|
45 |
|
|
final AtomicInteger read = new AtomicInteger(0);
|
46 |
|
|
final DescriptiveStatistics stats = new DescriptiveStatistics();
|
47 |
|
|
|
48 |
|
|
final StopWatch timer = new StopWatch();
|
49 |
|
|
|
50 |
|
|
final DatasetToProto mapper = new DatasetToProto();
|
51 |
|
|
StreamSupport.stream(collection.find().batchSize(BATCH_SIZE).spliterator(), false)
|
52 |
|
|
.peek(d -> {
|
53 |
|
|
if (read.addAndGet(1) % LOG_FREQ == 0) {
|
54 |
|
|
log.info(String.format("records read so far %s", read.get()));
|
55 |
|
|
log.info(String.format("stats so far %s", stats.toString()));
|
56 |
|
|
}
|
57 |
|
|
})
|
58 |
|
|
.map(d -> (String) d.get("body"))
|
59 |
|
|
.filter(Objects::nonNull)
|
60 |
|
|
.peek(s -> timer.start())
|
61 |
|
|
.map(mapper)
|
62 |
|
|
.forEach(oaf -> {
|
63 |
|
|
assertNotNull(oaf);
|
64 |
|
|
assertTrue(oaf.hasEntity());
|
65 |
|
|
|
66 |
|
|
timer.stop();
|
67 |
|
|
stats.addValue(timer.getTime());
|
68 |
|
|
timer.reset();
|
69 |
|
|
});
|
70 |
|
|
|
71 |
|
|
log.info(String.format("processed %s/%s records", read.get(), collectionSize));
|
72 |
|
|
}
|
73 |
|
|
|
74 |
|
|
|
75 |
|
|
}
|