1 |
28303
|
alessia.ba
|
package eu.dnetlib.data.mapreduce.hbase.oai;
|
2 |
|
|
|
3 |
|
|
import java.io.IOException;
|
4 |
|
|
import java.text.ParseException;
|
5 |
|
|
import java.util.Date;
|
6 |
|
|
|
7 |
46586
|
alessia.ba
|
import com.google.common.collect.ArrayListMultimap;
|
8 |
|
|
import com.google.common.collect.Lists;
|
9 |
|
|
import com.google.common.collect.Multimap;
|
10 |
|
|
import com.mongodb.DBObject;
|
11 |
|
|
import com.mongodb.client.MongoCollection;
|
12 |
|
|
import eu.dnetlib.data.mapreduce.hbase.oai.config.OAIConfiguration;
|
13 |
|
|
import eu.dnetlib.data.mapreduce.hbase.oai.config.OAIConfigurationReader;
|
14 |
|
|
import eu.dnetlib.data.mapreduce.hbase.oai.config.OAIConfigurationStringReader;
|
15 |
|
|
import eu.dnetlib.data.mapreduce.hbase.oai.utils.MongoSetCollection;
|
16 |
|
|
import eu.dnetlib.data.mapreduce.hbase.oai.utils.RecordFieldsExtractor;
|
17 |
29009
|
claudio.at
|
import org.apache.commons.io.IOUtils;
|
18 |
29014
|
alessia.ba
|
import org.apache.hadoop.mapreduce.Counter;
|
19 |
28303
|
alessia.ba
|
import org.dom4j.DocumentException;
|
20 |
|
|
import org.junit.Before;
|
21 |
|
|
import org.junit.Test;
|
22 |
28309
|
alessia.ba
|
import org.mockito.Matchers;
|
23 |
28303
|
alessia.ba
|
import org.mockito.Mock;
|
24 |
|
|
import org.mockito.Mockito;
|
25 |
|
|
import org.mockito.MockitoAnnotations;
|
26 |
29009
|
claudio.at
|
import org.springframework.core.io.ClassPathResource;
|
27 |
|
|
import org.springframework.core.io.Resource;
|
28 |
28303
|
alessia.ba
|
|
29 |
46586
|
alessia.ba
|
import static org.junit.Assert.*;
|
30 |
28303
|
alessia.ba
|
|
31 |
|
|
public class OAIFeedMapperTest {
|
32 |
|
|
|
33 |
|
|
/** Mapper under test; a fresh instance is created in setUp(). */
private OaiFeedMapper oaiFeedMapper;
|
34 |
|
|
|
35 |
29009
|
claudio.at
|
/** Classpath location of the XML configuration profile that setUp() loads through a ClassPathResource. */
|
36 |
|
|
private String pathToProfile = "eu/dnetlib/data/mapreduce/hbase/oai/config/OAIPublisherConfiguration-1.xml";
|
37 |
|
|
|
38 |
28303
|
alessia.ba
|
/** Reader built in setUp() from the profile content read as a string. */
private OAIConfigurationStringReader oaiConfigurationReader;
|
39 |
|
|
/** Configuration obtained from oaiConfigurationReader in setUp(); the tests query it for fields and enrichment XPaths. */
private OAIConfiguration oaiConfiguration;
|
40 |
|
|
|
41 |
28309
|
alessia.ba
|
/** Mock set collection; setUp() stubs normalizeSetSpec to return "aNormalisedSetName". */
@Mock
|
42 |
28303
|
alessia.ba
|
private MongoSetCollection mongoSetCollection;
|
43 |
34374
|
alessia.ba
|
/** Mock collection injected into the mapper through setDiscardedCollection in setUp(). */
@Mock
|
44 |
46586
|
alessia.ba
|
private MongoCollection<DBObject> discardedCollection;
|
45 |
29014
|
alessia.ba
|
/** Mock Hadoop mapper context (raw type); setUp() stubs getCounter to return the counter mock. */
@SuppressWarnings("rawtypes")
|
46 |
|
|
@Mock
|
47 |
|
|
private org.apache.hadoop.mapreduce.Mapper.Context context;
|
48 |
|
|
/** Mock Hadoop counter handed out by the mocked context. */
@Mock
|
49 |
|
|
private Counter counter;
|
50 |
28303
|
alessia.ba
|
|
51 |
|
|
/** Feed date parsed in setUp() from a fixed timestamp and passed to the mapper via setFeedDate. */
private Date feedDate;
|
52 |
|
|
/** Record identifier passed to createBasicObject and checkRecordFields by the tests. */
private String objId1 = "oai:dnet:openaire____::2fa6b215ace86e409dde3ba4b2a6b504";
|
53 |
49891
|
alessia.ba
|
private String goodRecord = "<?xml version=\"1.0\"?>\n<record>\n <result xmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n xmlns:dnet=\"eu.dnetlib.miscutils.functional.xml.DnetXsltFunctions\"\n xmlns:dr=\"http://www.driver-repository.eu/namespace/dr\"\n xmlns:dri=\"http://www.driver-repository.eu/namespace/dri\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\">\n <header>\n <dri:objIdentifier>openaire____::2fa6b215ace86e409dde3ba4b2a6b504</dri:objIdentifier>\n <dri:repositoryId/>\n <dri:dateOfCollection>2013-10-09</dri:dateOfCollection>\n <dri:dateOfTransformation>2013-10-09</dri:dateOfTransformation> </header>\n <metadata>\n <oaf:entity xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" \n\t\t xmlns:oaf=\"http://namespace.openaire.eu/oaf\" \n\t\t xsi:schemaLocation=\"http://namespace.openaire.eu/oaf http://www.openaire.eu/schema/0.1/oaf-0.1.xsd\">\n\t\t<oaf:datasource>\n\t\t\t<officialname>The Internet Journal of Orthopedic Surgery</officialname><englishname>The Internet Journal of Orthopedic Surgery</englishname><websiteurl>http://www.ispub.com/journal/the-internet-journal-of-orthopedic-surgery/</websiteurl><accessinfopackage/><namespaceprefix>issn15312968</namespaceprefix><datasourcetypeui classid=\"pubsrepository::journal\" classname=\"pubsrepository::journal\" schemeid=\"dnet:datasource_typologies\" schemename=\"dnet:datasource_typologies\"/><datasourcetype classid=\"pubsrepository::journal\" classname=\"pubsrepository::journal\" schemeid=\"dnet:datasource_typologies\" schemename=\"dnet:datasource_typologies\"/><openairecompatibility classid=\"notCompatible\" classname=\"notCompatible\" schemeid=\"dnet:compatibilityLevel\" schemename=\"dnet:compatibilityLevel\"/><latitude>0.0</latitude><longitude>0.0</longitude><subjects/><policies name=\"\" 
id=\"\"/><logourl/><contactemail/><dateofvalidation/><description/><odnumberofitems/><odnumberofitemsdate/><odpolicies/><odlanguages/><odcontenttypes/><releasestartdate/><releaseenddate/><missionstatementurl/><dataprovider>false</dataprovider><serviceprovider>false</serviceprovider><databaseaccesstype/><datauploadtype/><databaseaccessrestriction/><datauploadrestriction/><versioning>false</versioning><citationguidelineurl/><qualitymanagementkind/><pidsystems/><certificates/><originalId>openaire____::issn15312968</originalId><collectedfrom name=\"DOAJ-Articles\" id=\"driver______::bee53aa31dc2cbb538c10c2b65fa5824\"/><pid/><datainfo><inferred>false</inferred><deletedbyinference>false</deletedbyinference><trust>0.9</trust><inferenceprovenance/><provenanceaction classid=\"UNKNOWN\" classname=\"UNKNOWN\" schemeid=\"dnet:provenanceActions\" schemename=\"dnet:provenanceActions\"/></datainfo>\n\t\t <rels>\n\t\t </rels>\n\t\t <children>\n\t\t </children>\n\t\t</oaf:datasource>\n </oaf:entity>\n </metadata>\n </result>\n</record>";
|
54 |
34374
|
alessia.ba
|
/** Name of the XML resource the deduplication tests load with getClass().getResourceAsStream. */
private String dedupedRecord = "dedupedRecord.xml";
|
55 |
37821
|
alessia.ba
|
/** Name of the XML resource that testCreateBasicObjectRep loads with getClass().getResourceAsStream. */
private String representativeRecord = "representativeRecord.xml";
|
56 |
28303
|
alessia.ba
|
|
57 |
|
|
@Before
|
58 |
|
|
public void setUp() throws Exception {
|
59 |
|
|
MockitoAnnotations.initMocks(this);
|
60 |
|
|
oaiFeedMapper = new OaiFeedMapper();
|
61 |
29000
|
alessia.ba
|
|
62 |
29009
|
claudio.at
|
Resource resource = new ClassPathResource(pathToProfile);
|
63 |
|
|
// setting up the parser and the profile as a string
|
64 |
|
|
String configurationProfile = IOUtils.toString(resource.getInputStream());
|
65 |
|
|
|
66 |
|
|
System.out.println("oaiConfiguration:\n" + configurationProfile);
|
67 |
|
|
oaiConfigurationReader = new OAIConfigurationStringReader(configurationProfile);
|
68 |
|
|
|
69 |
28303
|
alessia.ba
|
oaiConfiguration = oaiConfigurationReader.getOaiConfiguration();
|
70 |
|
|
|
71 |
50157
|
alessia.ba
|
//String feedDateString = DateUtils.now_ISO8601();
|
72 |
|
|
String feedDateString = "2017-12-18T12:00:04+00:00";
|
73 |
28303
|
alessia.ba
|
try {
|
74 |
48892
|
claudio.at
|
feedDate = org.apache.commons.lang.time.DateUtils.parseDate(
|
75 |
|
|
feedDateString,
|
76 |
|
|
new String[]{ "yyyy-MM-dd'T'HH:mm:ssXXX", "yyyy-MM-dd'T'HH:mm:ssZ" });
|
77 |
28303
|
alessia.ba
|
} catch (ParseException e) {
|
78 |
|
|
e.printStackTrace(System.err);
|
79 |
|
|
throw new RuntimeException(e);
|
80 |
|
|
}
|
81 |
|
|
|
82 |
|
|
oaiFeedMapper.setFeedDate(feedDate);
|
83 |
|
|
oaiFeedMapper.setMongoSetCollection(mongoSetCollection);
|
84 |
|
|
oaiFeedMapper.setOaiConfiguration(oaiConfiguration);
|
85 |
|
|
oaiFeedMapper.setOaiConfigurationReader(oaiConfigurationReader);
|
86 |
34374
|
alessia.ba
|
oaiFeedMapper.setDiscardedCollection(discardedCollection);
|
87 |
|
|
oaiFeedMapper.setSkipDuplicates(true);
|
88 |
|
|
oaiFeedMapper.setDuplicateXPath("//entity//datainfo/deletedbyinference[./text() = 'true']");
|
89 |
28303
|
alessia.ba
|
|
90 |
28309
|
alessia.ba
|
Mockito.when(mongoSetCollection.normalizeSetSpec(Matchers.anyString())).thenReturn("aNormalisedSetName");
|
91 |
29014
|
alessia.ba
|
Mockito.doNothing().when(counter).increment(Matchers.anyLong());
|
92 |
|
|
Mockito.when(context.getCounter(Matchers.anyString(), Matchers.anyString())).thenReturn(counter);
|
93 |
46586
|
alessia.ba
|
//TODO: fix mock
|
94 |
|
|
//Mockito.when(discardedCollection.insertOne((DBObject) Matchers.any());).thenReturn(null);
|
95 |
29014
|
alessia.ba
|
|
96 |
28303
|
alessia.ba
|
}
|
97 |
|
|
|
98 |
|
|
@Test
|
99 |
28309
|
alessia.ba
|
public void testCreateBasicObject() throws DocumentException, IOException, InterruptedException {
|
100 |
29014
|
alessia.ba
|
RecordFieldsExtractor parser = new RecordFieldsExtractor(Lists.newArrayList(oaiConfiguration.getFieldsFor("oaf", "index", "openaire")));
|
101 |
37821
|
alessia.ba
|
Multimap<String, String> parsedRecord = parser.extractFields(goodRecord, oaiConfiguration.getEnrichmentXPathsFor("oaf", "index", "openaire"));
|
102 |
29014
|
alessia.ba
|
DBObject obj = oaiFeedMapper.createBasicObject(objId1, goodRecord, parsedRecord, context);
|
103 |
28309
|
alessia.ba
|
// NOTE that LAST_COLLECTION_DATE_FIELD, DATESTAMP_FIELD,UPDATED_FIELD are not set by the method we are testing, but by the caller
|
104 |
|
|
// method (handleRecord) because they values to set depend on the record status (NEW|UPDATED|UNCHANGED)
|
105 |
|
|
System.out.println(obj);
|
106 |
|
|
assertNotNull(obj);
|
107 |
28303
|
alessia.ba
|
}
|
108 |
34374
|
alessia.ba
|
|
109 |
|
|
@Test
|
110 |
37821
|
alessia.ba
|
public void testCreateBasicObjectRep() throws DocumentException, IOException, InterruptedException {
|
111 |
|
|
RecordFieldsExtractor parser = new RecordFieldsExtractor(Lists.newArrayList(oaiConfiguration.getFieldsFor("oaf", "index", "openaire")));
|
112 |
|
|
String repRecordString = IOUtils.toString(this.getClass().getResourceAsStream(representativeRecord));
|
113 |
|
|
Multimap<String, String> parsedRecord = parser.extractFields(repRecordString, oaiConfiguration.getEnrichmentXPathsFor("oaf", "index", "openaire"));
|
114 |
|
|
DBObject obj = oaiFeedMapper.createBasicObject(objId1, repRecordString, parsedRecord, context);
|
115 |
|
|
// NOTE that LAST_COLLECTION_DATE_FIELD, DATESTAMP_FIELD,UPDATED_FIELD are not set by the method we are testing, but by the caller
|
116 |
|
|
// method (handleRecord) because they values to set depend on the record status (NEW|UPDATED|UNCHANGED)
|
117 |
|
|
System.out.println(obj);
|
118 |
|
|
assertNotNull(obj);
|
119 |
|
|
}
|
120 |
|
|
|
121 |
|
|
@Test
|
122 |
34374
|
alessia.ba
|
public void testParseDeduplicated() throws IOException {
|
123 |
|
|
RecordFieldsExtractor parser = new RecordFieldsExtractor(Lists.newArrayList(oaiConfiguration.getFieldsFor("oaf", "index", "openaire")));
|
124 |
|
|
String dedupedRecordString = IOUtils.toString(this.getClass().getResourceAsStream(dedupedRecord));
|
125 |
|
|
parser.setSkipDuplicates(true);
|
126 |
|
|
parser.setDuplicateXPath("//*[local-name()='entity']//*[local-name()='datainfo']/*[local-name()='deletedbyinference'][./text() = 'true']");
|
127 |
37821
|
alessia.ba
|
Multimap<String, String> parsedRecord = parser.extractFields(dedupedRecordString, oaiConfiguration.getEnrichmentXPathsFor("oaf", "index", "openaire"));
|
128 |
34374
|
alessia.ba
|
assertFalse(oaiFeedMapper.checkRecordFields(parsedRecord, context, "x", dedupedRecordString));
|
129 |
|
|
}
|
130 |
|
|
|
131 |
|
|
@Test
|
132 |
|
|
public void testParseDeduplicated2() throws IOException {
|
133 |
|
|
RecordFieldsExtractor parser = new RecordFieldsExtractor(Lists.newArrayList(oaiConfiguration.getFieldsFor("oaf", "index", "openaire")));
|
134 |
|
|
String dedupedRecordString = IOUtils.toString(this.getClass().getResourceAsStream(dedupedRecord));
|
135 |
|
|
parser.setSkipDuplicates(true);
|
136 |
|
|
parser.setDuplicateXPath("//x");
|
137 |
37821
|
alessia.ba
|
Multimap<String, String> parsedRecord = parser.extractFields(dedupedRecordString, oaiConfiguration.getEnrichmentXPathsFor("oaf", "index", "openaire"));
|
138 |
34374
|
alessia.ba
|
assertTrue(oaiFeedMapper.checkRecordFields(parsedRecord, context, "x", dedupedRecordString));
|
139 |
|
|
}
|
140 |
|
|
|
141 |
|
|
@Test
|
142 |
|
|
public void testCheckRecordFieldsDeduplicated() {
|
143 |
|
|
Multimap<String, String> recordProps = ArrayListMultimap.create();
|
144 |
|
|
recordProps.put("duplicate", "true");
|
145 |
|
|
assertFalse(oaiFeedMapper.checkRecordFields(recordProps, context, objId1, goodRecord));
|
146 |
|
|
}
|
147 |
|
|
|
148 |
|
|
@Test
|
149 |
|
|
public void testCheckRecordFieldsNotDeduplicated() {
|
150 |
|
|
Multimap<String, String> recordProps = ArrayListMultimap.create();
|
151 |
|
|
recordProps.put("duplicate", "false");
|
152 |
|
|
recordProps.put(OAIConfigurationReader.ID_FIELD, "10|xxxx");
|
153 |
|
|
assertTrue(oaiFeedMapper.checkRecordFields(recordProps, context, objId1, goodRecord));
|
154 |
|
|
}
|
155 |
|
|
|
156 |
|
|
@Test
|
157 |
|
|
public void testCheckRecordFieldsEmpty() {
|
158 |
|
|
Multimap<String, String> recordProps = ArrayListMultimap.create();
|
159 |
|
|
assertFalse(oaiFeedMapper.checkRecordFields(recordProps, context, objId1, goodRecord));
|
160 |
|
|
}
|
161 |
|
|
|
162 |
|
|
@Test
|
163 |
|
|
public void testCheckRecordFieldsNull() {
|
164 |
|
|
Multimap<String, String> recordProps = null;
|
165 |
|
|
assertFalse(oaiFeedMapper.checkRecordFields(recordProps, context, objId1, goodRecord));
|
166 |
|
|
}
|
167 |
49891
|
alessia.ba
|
|
168 |
|
|
@Test
|
169 |
|
|
public void testTransformationDate(){
|
170 |
|
|
RecordFieldsExtractor parser = new RecordFieldsExtractor(Lists.newArrayList(oaiConfiguration.getFieldsFor("oaf", "index", "openaire")));
|
171 |
|
|
parser.setSkipDuplicates(true);
|
172 |
|
|
parser.setDuplicateXPath("//*[local-name()='entity']//*[local-name()='datainfo']/*[local-name()='deletedbyinference'][./text() = 'true']");
|
173 |
|
|
Multimap<String, String> parsedRecord = parser.extractFields(goodRecord, oaiConfiguration.getEnrichmentXPathsFor("oaf", "index", "openaire"));
|
174 |
|
|
System.out.println(parsedRecord);
|
175 |
|
|
assertTrue(parsedRecord.containsKey("dateOfTransformation"));
|
176 |
|
|
}
|
177 |
28303
|
alessia.ba
|
}
|