Revision 37821
Added by Alessia Bardi almost 9 years ago
modules/dnet-mapreduce-jobs/trunk/src/test/java/eu/dnetlib/data/mapreduce/hbase/oai/OAIFeedMapperTest.java | ||
---|---|---|
58 | 58 |
private String objId1 = "oai:dnet:openaire____::2fa6b215ace86e409dde3ba4b2a6b504"; |
59 | 59 |
private String goodRecord = "<?xml version=\"1.0\"?>\n<record>\n <result xmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n xmlns:dnet=\"eu.dnetlib.miscutils.functional.xml.DnetXsltFunctions\"\n xmlns:dr=\"http://www.driver-repository.eu/namespace/dr\"\n xmlns:dri=\"http://www.driver-repository.eu/namespace/dri\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\">\n <header>\n <dri:objIdentifier>openaire____::2fa6b215ace86e409dde3ba4b2a6b504</dri:objIdentifier>\n <dri:repositoryId/>\n <dri:dateOfCollection>2013-10-09</dri:dateOfCollection>\n </header>\n <metadata>\n <oaf:entity xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" \n\t\t xmlns:oaf=\"http://namespace.openaire.eu/oaf\" \n\t\t xsi:schemaLocation=\"http://namespace.openaire.eu/oaf http://www.openaire.eu/schema/0.1/oaf-0.1.xsd\">\n\t\t<oaf:datasource>\n\t\t\t<officialname>The Internet Journal of Orthopedic Surgery</officialname><englishname>The Internet Journal of Orthopedic Surgery</englishname><websiteurl>http://www.ispub.com/journal/the-internet-journal-of-orthopedic-surgery/</websiteurl><accessinfopackage/><namespaceprefix>issn15312968</namespaceprefix><datasourcetypeui classid=\"pubsrepository::journal\" classname=\"pubsrepository::journal\" schemeid=\"dnet:datasource_typologies\" schemename=\"dnet:datasource_typologies\"/><datasourcetype classid=\"pubsrepository::journal\" classname=\"pubsrepository::journal\" schemeid=\"dnet:datasource_typologies\" schemename=\"dnet:datasource_typologies\"/><openairecompatibility classid=\"notCompatible\" classname=\"notCompatible\" schemeid=\"dnet:compatibilityLevel\" schemename=\"dnet:compatibilityLevel\"/><latitude>0.0</latitude><longitude>0.0</longitude><subjects/><policies name=\"\" 
id=\"\"/><logourl/><contactemail/><dateofvalidation/><description/><odnumberofitems/><odnumberofitemsdate/><odpolicies/><odlanguages/><odcontenttypes/><releasestartdate/><releaseenddate/><missionstatementurl/><dataprovider>false</dataprovider><serviceprovider>false</serviceprovider><databaseaccesstype/><datauploadtype/><databaseaccessrestriction/><datauploadrestriction/><versioning>false</versioning><citationguidelineurl/><qualitymanagementkind/><pidsystems/><certificates/><originalId>openaire____::issn15312968</originalId><collectedfrom name=\"DOAJ-Articles\" id=\"driver______::bee53aa31dc2cbb538c10c2b65fa5824\"/><pid/><datainfo><inferred>false</inferred><deletedbyinference>false</deletedbyinference><trust>0.9</trust><inferenceprovenance/><provenanceaction classid=\"UNKNOWN\" classname=\"UNKNOWN\" schemeid=\"dnet:provenanceActions\" schemename=\"dnet:provenanceActions\"/></datainfo>\n\t\t <rels>\n\t\t </rels>\n\t\t <children>\n\t\t </children>\n\t\t</oaf:datasource>\n </oaf:entity>\n </metadata>\n </result>\n</record>"; |
60 | 60 |
private String dedupedRecord = "dedupedRecord.xml"; |
61 |
private String representativeRecord = "representativeRecord.xml"; |
|
61 | 62 |
|
62 | 63 |
@Before |
63 | 64 |
public void setUp() throws Exception { |
... | ... | |
99 | 100 |
@Test |
100 | 101 |
public void testCreateBasicObject() throws DocumentException, IOException, InterruptedException { |
101 | 102 |
RecordFieldsExtractor parser = new RecordFieldsExtractor(Lists.newArrayList(oaiConfiguration.getFieldsFor("oaf", "index", "openaire"))); |
102 |
Multimap<String, String> parsedRecord = parser.extractFields(goodRecord); |
|
103 |
Multimap<String, String> parsedRecord = parser.extractFields(goodRecord, oaiConfiguration.getEnrichmentXPathsFor("oaf", "index", "openaire"));
|
|
103 | 104 |
DBObject obj = oaiFeedMapper.createBasicObject(objId1, goodRecord, parsedRecord, context); |
104 | 105 |
// NOTE that LAST_COLLECTION_DATE_FIELD, DATESTAMP_FIELD,UPDATED_FIELD are not set by the method we are testing, but by the caller |
105 | 106 |
// method (handleRecord) because the values to set depend on the record status (NEW|UPDATED|UNCHANGED) |
... | ... | |
108 | 109 |
} |
109 | 110 |
|
110 | 111 |
@Test |
112 |
public void testCreateBasicObjectRep() throws DocumentException, IOException, InterruptedException { |
|
113 |
RecordFieldsExtractor parser = new RecordFieldsExtractor(Lists.newArrayList(oaiConfiguration.getFieldsFor("oaf", "index", "openaire"))); |
|
114 |
String repRecordString = IOUtils.toString(this.getClass().getResourceAsStream(representativeRecord)); |
|
115 |
Multimap<String, String> parsedRecord = parser.extractFields(repRecordString, oaiConfiguration.getEnrichmentXPathsFor("oaf", "index", "openaire")); |
|
116 |
DBObject obj = oaiFeedMapper.createBasicObject(objId1, repRecordString, parsedRecord, context); |
|
117 |
// NOTE that LAST_COLLECTION_DATE_FIELD, DATESTAMP_FIELD,UPDATED_FIELD are not set by the method we are testing, but by the caller |
|
118 |
// method (handleRecord) because the values to set depend on the record status (NEW|UPDATED|UNCHANGED) |
|
119 |
System.out.println(obj); |
|
120 |
assertNotNull(obj); |
|
121 |
} |
|
122 |
|
|
123 |
@Test |
|
111 | 124 |
public void testParseDeduplicated() throws IOException { |
112 | 125 |
RecordFieldsExtractor parser = new RecordFieldsExtractor(Lists.newArrayList(oaiConfiguration.getFieldsFor("oaf", "index", "openaire"))); |
113 | 126 |
String dedupedRecordString = IOUtils.toString(this.getClass().getResourceAsStream(dedupedRecord)); |
114 | 127 |
parser.setSkipDuplicates(true); |
115 | 128 |
parser.setDuplicateXPath("//*[local-name()='entity']//*[local-name()='datainfo']/*[local-name()='deletedbyinference'][./text() = 'true']"); |
116 |
Multimap<String, String> parsedRecord = parser.extractFields(dedupedRecordString); |
|
129 |
Multimap<String, String> parsedRecord = parser.extractFields(dedupedRecordString, oaiConfiguration.getEnrichmentXPathsFor("oaf", "index", "openaire"));
|
|
117 | 130 |
assertFalse(oaiFeedMapper.checkRecordFields(parsedRecord, context, "x", dedupedRecordString)); |
118 | 131 |
} |
119 | 132 |
|
... | ... | |
123 | 136 |
String dedupedRecordString = IOUtils.toString(this.getClass().getResourceAsStream(dedupedRecord)); |
124 | 137 |
parser.setSkipDuplicates(true); |
125 | 138 |
parser.setDuplicateXPath("//x"); |
126 |
Multimap<String, String> parsedRecord = parser.extractFields(dedupedRecordString); |
|
139 |
Multimap<String, String> parsedRecord = parser.extractFields(dedupedRecordString, oaiConfiguration.getEnrichmentXPathsFor("oaf", "index", "openaire"));
|
|
127 | 140 |
assertTrue(oaiFeedMapper.checkRecordFields(parsedRecord, context, "x", dedupedRecordString)); |
128 | 141 |
} |
129 | 142 |
|
modules/dnet-mapreduce-jobs/trunk/src/test/java/eu/dnetlib/data/mapreduce/hbase/oai/RecordFieldsExtractorTest.java | ||
---|---|---|
1 |
package eu.dnetlib.data.mapreduce.hbase.oai; |
|
2 |
|
|
3 |
import static org.junit.Assert.assertTrue; |
|
4 |
|
|
5 |
import java.io.IOException; |
|
6 |
import java.io.StringReader; |
|
7 |
import java.util.Collection; |
|
8 |
import java.util.List; |
|
9 |
|
|
10 |
import org.apache.commons.io.IOUtils; |
|
11 |
import org.dom4j.Document; |
|
12 |
import org.dom4j.DocumentException; |
|
13 |
import org.dom4j.io.SAXReader; |
|
14 |
import org.junit.Before; |
|
15 |
import org.junit.Test; |
|
16 |
|
|
17 |
import com.google.common.collect.Lists; |
|
18 |
|
|
19 |
import eu.dnetlib.data.mapreduce.hbase.oai.utils.RecordFieldsExtractor; |
|
20 |
|
|
21 |
public class RecordFieldsExtractorTest { |
|
22 |
|
|
23 |
private String record = "representativeRecord.xml"; |
|
24 |
private String record2 = "dedupedRecord.xml"; |
|
25 |
private String record3 = "originalRecord.xml"; |
|
26 |
private RecordFieldsExtractor fieldExtractor; |
|
27 |
private List<String> enrichmentXPaths = Lists.newArrayList("//subject[./@inferred='true']", "//result/datainfo[./inferenceprovenance='dedup']", |
|
28 |
"//rel[./@inferred='true' and ./@inferenceprovenance != 'dedup']"); |
|
29 |
|
|
30 |
@Before |
|
31 |
public void setUp() throws Exception { |
|
32 |
fieldExtractor = new RecordFieldsExtractor(); |
|
33 |
} |
|
34 |
|
|
35 |
@Test |
|
36 |
public void testEnhanced() throws IOException, DocumentException { |
|
37 |
|
|
38 |
String recordString = IOUtils.toString(this.getClass().getResourceAsStream(record)); |
|
39 |
Document doc = new SAXReader().read(new StringReader(recordString)); |
|
40 |
Collection<String> sets = fieldExtractor.getEnrichedSets(doc, enrichmentXPaths, Lists.newArrayList("set1", "set2")); |
|
41 |
System.out.println(sets); |
|
42 |
assertTrue(sets.contains("set1_enriched") && sets.contains("set2_enriched")); |
|
43 |
} |
|
44 |
|
|
45 |
@Test |
|
46 |
public void testEnhancedDeduped() throws IOException, DocumentException { |
|
47 |
|
|
48 |
String recordString = IOUtils.toString(this.getClass().getResourceAsStream(record2)); |
|
49 |
Document doc = new SAXReader().read(new StringReader(recordString)); |
|
50 |
Collection<String> sets = fieldExtractor.getEnrichedSets(doc, enrichmentXPaths, Lists.newArrayList("set1", "set2")); |
|
51 |
System.out.println(sets); |
|
52 |
assertTrue(sets.contains("set1_enriched") && sets.contains("set2_enriched")); |
|
53 |
} |
|
54 |
|
|
55 |
@Test |
|
56 |
public void testNotEnhanced() throws IOException, DocumentException { |
|
57 |
|
|
58 |
String recordString = IOUtils.toString(this.getClass().getResourceAsStream(record3)); |
|
59 |
Document doc = new SAXReader().read(new StringReader(recordString)); |
|
60 |
Collection<String> sets = fieldExtractor.getEnrichedSets(doc, enrichmentXPaths, Lists.newArrayList("set1", "set2")); |
|
61 |
System.out.println(sets); |
|
62 |
assertTrue(sets.isEmpty()); |
|
63 |
} |
|
64 |
} |
modules/dnet-mapreduce-jobs/trunk/src/test/resources/eu/dnetlib/data/mapreduce/hbase/oai/originalRecord.xml | ||
---|---|---|
1 |
<record> |
|
2 |
<result> |
|
3 |
<header> |
|
4 |
<objIdentifier>od______1690::90424108bff748150b567528b93894ea |
|
5 |
</objIdentifier> |
|
6 |
<dateOfCollection>2013-05-24T06:59:35Z</dateOfCollection> |
|
7 |
<counters></counters> |
|
8 |
</header> |
|
9 |
<metadata> |
|
10 |
<entity |
|
11 |
schemaLocation="http://namespace.openaire.eu/oaf http://www.openaire.eu/schema/0.2/oaf-0.2.xsd"> |
|
12 |
<result> |
|
13 |
<title classid="main title" classname="main title" schemeid="dnet:dataCite_title" |
|
14 |
schemename="dnet:dataCite_title">9633684277</title> |
|
15 |
<dateofacceptance>2008-12-10</dateofacceptance> |
|
16 |
<resulttype classid="publication" classname="publication" |
|
17 |
schemeid="dnet:result_typologies" schemename="dnet:result_typologies" /> |
|
18 |
<language classid="und" classname="Undetermined" schemeid="dnet:languages" |
|
19 |
schemename="dnet:languages" /> |
|
20 |
<description>Amerika régóta nem váltja be a New Yorkban lakó |
|
21 |
Michael |
|
22 |
Rutkowsky reményeit. Az alkalmi munkákból élő fiatal, lengyel |
|
23 |
bevándorló elhatározza, hogy elébe megy a szerencséjének. Floridába |
|
24 |
utazik, mert úgy tudja, hogy ott unatkozó milliomos hölgyek várják |
|
25 |
a hozzá hasonló férfiakat. Florida, az álmok félszigete azonban |
|
26 |
milliomos hölgyek helyett életveszélyes kalandokkal fogadja |
|
27 |
Michaelt, akit nem csak a maffia, hanem az FBI emberei is üldözőbe |
|
28 |
vesznek. Csak úgy mentheti a bőrét, ha vállalja egy kábítószerrel |
|
29 |
üzletelő, a maffiát is kijátszó bűnöző szerepét. |
|
30 |
</description> |
|
31 |
<subject classid="" classname="" schemeid="" schemename="" /> |
|
32 |
<relevantdate classid="" classname="" schemeid="" |
|
33 |
schemename="" /> |
|
34 |
<publisher /> |
|
35 |
<embargoenddate /> |
|
36 |
<storagedate /> |
|
37 |
<source /> |
|
38 |
<fulltext /> |
|
39 |
<format /> |
|
40 |
<resourcetype classid="" classname="" schemeid="" |
|
41 |
schemename="" /> |
|
42 |
<device /> |
|
43 |
<size /> |
|
44 |
<version /> |
|
45 |
<lastmetadataupdate /> |
|
46 |
<metadataversionnumber /> |
|
47 |
<originalId>oai:ganymedes.lib.unideb.hu:2437/52836</originalId> |
|
48 |
<collectedfrom name="University of Debrecen Electronic Archive" |
|
49 |
id="opendoar____::d43ab110ab2489d6b9b2caa394bf920f" /> |
|
50 |
<pid classid="oai" classname="oai" schemeid="dnet:pid_types" |
|
51 |
schemename="dnet:pid_types">oai:ganymedes.lib.unideb.hu:2437/52836</pid> |
|
52 |
<bestlicense classid="OPEN" classname="Open Access" |
|
53 |
schemeid="dnet:access_modes" schemename="dnet:access_modes" /> |
|
54 |
<datainfo> |
|
55 |
<inferred>false</inferred> |
|
56 |
<deletedbyinference>false</deletedbyinference> |
|
57 |
<trust>0.9</trust> |
|
58 |
<inferenceprovenance /> |
|
59 |
<provenanceaction classid="sysimport:crosswalk:repository" |
|
60 |
classname="sysimport:crosswalk:repository" schemeid="dnet:provenanceActions" |
|
61 |
schemename="dnet:provenanceActions" /> |
|
62 |
</datainfo> |
|
63 |
<rels></rels> |
|
64 |
<children> |
|
65 |
<instance id="opendoar____::d43ab110ab2489d6b9b2caa394bf920f"> |
|
66 |
<licence classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" |
|
67 |
schemename="dnet:access_modes" /> |
|
68 |
<hostedby name="University of Debrecen Electronic Archive" |
|
69 |
id="opendoar____::d43ab110ab2489d6b9b2caa394bf920f" /> |
|
70 |
<webresource> |
|
71 |
<url>http://hdl.handle.net/2437/52836</url> |
|
72 |
</webresource> |
|
73 |
<webresource> |
|
74 |
<url>http://webpac.lib.unideb.hu:8082/WebPac/CorvinaWeb?action=cclfind&resultview=long&ccltext=idno+bibFSZ874984 |
|
75 |
</url> |
|
76 |
</webresource> |
|
77 |
</instance> |
|
78 |
</children> |
|
79 |
</result> |
|
80 |
</entity> |
|
81 |
</metadata> |
|
82 |
</result> |
|
83 |
</record> |
modules/dnet-mapreduce-jobs/trunk/src/test/resources/eu/dnetlib/data/mapreduce/hbase/oai/config/OAIPublisherConfiguration-1.xml | ||
---|---|---|
46 | 46 |
<SOURCE interpretation="driver" layout="store" name="oai_dc" path="//*[local-name() ='objIdentifier']" /> |
47 | 47 |
</INDEX> |
48 | 48 |
<INDEX name="set"> |
49 |
<SOURCE interpretation="openaire" layout="index" name="oaf" path="//*[local-name() ='repositoryId']" />
|
|
49 |
<SOURCE interpretation="openaire" layout="index" name="oaf" path="//*[local-name() ='collectedfrom']/@name" />
|
|
50 | 50 |
<SOURCE interpretation="driver" layout="store" name="oai_dc" path="//*[local-name() ='repositoryId']" /> |
51 | 51 |
</INDEX> |
52 | 52 |
</INDICES> |
53 |
<ENRICHMENT> |
|
54 |
<XPATH interpretation="openaire" layout="index" name="oaf" path="//subject[./@inferred='true']" /> |
|
55 |
<XPATH interpretation="openaire" layout="index" name="oaf" path="//result/datainfo[./inferenceprovenance='dedup']" /> |
|
56 |
<XPATH interpretation="openaire" layout="index" name="oaf" path="//rel[./@inferred='true' and ./@inferenceprovenance != 'dedup']" /> |
|
57 |
</ENRICHMENT> |
|
53 | 58 |
</CONFIGURATION> |
54 | 59 |
<STATUS> |
55 | 60 |
<LAST_UPDATE value="2013-12-20T10:50:00" /> |
modules/dnet-mapreduce-jobs/trunk/src/test/resources/eu/dnetlib/data/mapreduce/hbase/oai/representativeRecord.xml | ||
---|---|---|
1 |
<record> |
|
2 |
<result xmlns:dri="http://www.driver-repository.eu/namespace/dri" |
|
3 |
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> |
|
4 |
<header> |
|
5 |
<objIdentifier>dedup_wf_001::0472bc7a1f3c9afeab815dfc50137cdc |
|
6 |
</objIdentifier> |
|
7 |
<dateOfCollection>2015-02-06T16:52:05Z</dateOfCollection> |
|
8 |
<counters> |
|
9 |
<counter_authorship value="6" /> |
|
10 |
<counter_dedup value="2" /> |
|
11 |
</counters> |
|
12 |
</header> |
|
13 |
<metadata> |
|
14 |
<oaf:entity xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" |
|
15 |
xmlns:oaf="http://namespace.openaire.eu/oaf" |
|
16 |
xsi:schemaLocation="http://namespace.openaire.eu/oaf http://www.openaire.eu/schema/0.2/oaf-0.2.xsd"> |
|
17 |
<result> |
|
18 |
<subject schemename="dnet:result_subject" classname="keyword" |
|
19 |
schemeid="dnet:result_subject" classid="keyword">DOAJ:Health Sciences |
|
20 |
</subject> |
|
21 |
<subject schemename="dnet:result_subject" classname="keyword" |
|
22 |
schemeid="dnet:result_subject" classid="keyword">Pathology</subject> |
|
23 |
<subject schemename="dnet:result_subject" classname="keyword" |
|
24 |
schemeid="dnet:result_subject" classid="keyword">DOAJ:Pathology</subject> |
|
25 |
<subject schemename="dnet:result_subject" classname="keyword" |
|
26 |
schemeid="dnet:result_subject" classid="keyword">DOAJ:Medicine (General) |
|
27 |
</subject> |
|
28 |
<subject schemename="dnet:result_subject" classname="keyword" |
|
29 |
schemeid="dnet:result_subject" classid="keyword">R</subject> |
|
30 |
<subject schemename="dnet:result_subject" classname="keyword" |
|
31 |
schemeid="dnet:result_subject" classid="keyword">Medicine</subject> |
|
32 |
<subject schemename="dnet:result_subject" classname="keyword" |
|
33 |
schemeid="dnet:result_subject" classid="keyword">Research Article</subject> |
|
34 |
<subject schemename="dnet:result_subject" classname="keyword" |
|
35 |
schemeid="dnet:result_subject" classid="keyword">RB1-214</subject> |
|
36 |
<title schemename="dnet:dataCite_title" classname="main title" |
|
37 |
schemeid="dnet:dataCite_title" classid="main title">Effects of EDTA and Sodium |
|
38 |
Citrate on hormone measurements by fluorometric (FIA) and |
|
39 |
immunofluorometric (IFMA) methods |
|
40 |
</title> |
|
41 |
<dateofacceptance>2002-05-23</dateofacceptance> |
|
42 |
<publisher>BioMed Central</publisher> |
|
43 |
<resulttype schemename="dnet:result_typologies" |
|
44 |
classname="publication" schemeid="dnet:result_typologies" classid="publication" /> |
|
45 |
<language schemename="dnet:languages" classname="English" |
|
46 |
schemeid="dnet:languages" classid="eng" /> |
|
47 |
<journal eissn="1472-6890" issn="1472-6890" lissn="">BMC |
|
48 |
Clinical Pathology |
|
49 |
</journal> |
|
50 |
<description> |
|
51 |
<p>Abstract</p> |
|
52 |
<p>Background</p> |
|
53 |
<p>Measurements of hormonal concentrations by immunoassays using |
|
54 |
fluorescent tracer substance (Eu3+) are susceptible to the action |
|
55 |
of chemical agents that may cause alterations in its original |
|
56 |
structure. Our goal was to verify the effect of two types of |
|
57 |
anticoagulants in the hormone assays performed by fluorometric |
|
58 |
(FIA) or immunofluorometric (IFMA) methods. |
|
59 |
</p> |
|
60 |
<p>Methods</p> |
|
61 |
<p> |
|
62 |
Blood samples were obtained from 30 outpatients and were drawn in |
|
63 |
EDTA, sodium citrate, and serum separation Vacutainer |
|
64 |
<sup>®</sup> |
|
65 |
Blood Collection Tubes. Samples were analyzed in automatized |
|
66 |
equipment AutoDelfia™ (Perkin Elmer Brazil, Wallac, Finland) for |
|
67 |
the following hormones: Luteinizing hormone (LH), Follicle |
|
68 |
stimulating homone (FSH), prolactin (PRL), growth hormone (GH), |
|
69 |
Sex hormone binding globulin (SHBG), thyroid stimulating hormone |
|
70 |
(TSH), insulin, C peptide, total T3, total T4, free T4, |
|
71 |
estradiol, |
|
72 |
progesterone, testosterone, and cortisol. Statistical analysis was |
|
73 |
carried out by Kruskal-Wallis method and Dunn's test. |
|
74 |
</p> |
|
75 |
<p>Results</p> |
|
76 |
<p>No significant differences were seen between samples for LH, |
|
77 |
FSH, PRL and free T4. Results from GH, TSH, insulin, C peptide, |
|
78 |
SHBG, total T3, total T4, estradiol, testosterone, cortisol, and |
|
79 |
progesterone were significant different between serum and |
|
80 |
EDTA-treated samples groups. Differences were also identified |
|
81 |
between serum and sodium citrate-treated samples in the analysis |
|
82 |
for TSH, insulin, total T3, estradiol, testosterone and |
|
83 |
progesterone. |
|
84 |
</p> |
|
85 |
<p>Conclusions</p> |
|
86 |
<p>We conclude that the hormonal analysis carried through by FIA |
|
87 |
or |
|
88 |
IFMA are susceptible to the effects of anticoagulants in the |
|
89 |
biological material collected that vary depending on the type of |
|
90 |
assay. |
|
91 |
</p> |
|
92 |
</description> |
|
93 |
<source>BMC Clinical Pathology, Vol 2, Iss 1, p 2 (2002)</source> |
|
94 |
<relevantdate schemename="" classname="" schemeid="" |
|
95 |
classid="" /> |
|
96 |
<embargoenddate /> |
|
97 |
<storagedate /> |
|
98 |
<fulltext /> |
|
99 |
<format /> |
|
100 |
<resourcetype schemename="" classname="" schemeid="" |
|
101 |
classid="" /> |
|
102 |
<device /> |
|
103 |
<size /> |
|
104 |
<version /> |
|
105 |
<lastmetadataupdate /> |
|
106 |
<metadataversionnumber /> |
|
107 |
<originalId>oai:europepmc.org:730516</originalId> |
|
108 |
<originalId>oai:doaj.org/article:4b30bd17bb054d0591ee019e0bc50058 |
|
109 |
</originalId> |
|
110 |
<collectedfrom id="driver______::bee53aa31dc2cbb538c10c2b65fa5824" |
|
111 |
name="DOAJ-Articles" /> |
|
112 |
<collectedfrom id="opendoar____::8b6dd7db9af49e67306feb59a8bdc52c" |
|
113 |
name="Europe PubMed Central" /> |
|
114 |
<pid schemename="dnet:pid_types" classname="doi" schemeid="dnet:pid_types" |
|
115 |
classid="doi">10.1186/1472-6890-2-2</pid> |
|
116 |
<pid schemename="dnet:pid_types" classname="pmid" schemeid="dnet:pid_types" |
|
117 |
classid="pmid">12033989</pid> |
|
118 |
<pid schemename="dnet:pid_types" classname="oai" schemeid="dnet:pid_types" |
|
119 |
classid="oai">oai:europepmc.org:730516</pid> |
|
120 |
<pid schemename="dnet:pid_types" classname="pmc" schemeid="dnet:pid_types" |
|
121 |
classid="pmc">PMC115861</pid> |
|
122 |
<pid schemename="dnet:pid_types" classname="oai" schemeid="dnet:pid_types" |
|
123 |
classid="oai">oai:doaj.org/article:4b30bd17bb054d0591ee019e0bc50058 |
|
124 |
</pid> |
|
125 |
<bestlicense schemename="dnet:access_modes" classname="Open Access" |
|
126 |
schemeid="dnet:access_modes" classid="OPEN" /> |
|
127 |
<datainfo> |
|
128 |
<inferred>true</inferred> |
|
129 |
<deletedbyinference>false</deletedbyinference> |
|
130 |
<trust>0.9</trust> |
|
131 |
<inferenceprovenance>dedup</inferenceprovenance> |
|
132 |
<provenanceaction schemename="dnet:provenanceActions" |
|
133 |
classname="sysimport:dedup" schemeid="dnet:provenanceActions" |
|
134 |
classid="sysimport:dedup" /> |
|
135 |
</datainfo> |
|
136 |
<rels> |
|
137 |
<rel provenanceaction="sysimport:crosswalk:repository" trust="0.9" |
|
138 |
inferenceprovenance="" inferred="true"> |
|
139 |
<to scheme="dnet:personroles" class="hasAuthor" type="person">doajarticles::5d963eca520d3beaa45badc6b2c6e55d |
|
140 |
</to> |
|
141 |
<ranking>4</ranking> |
|
142 |
<fullname>Leme Cassia</fullname> |
|
143 |
</rel> |
|
144 |
<rel provenanceaction="sysimport:crosswalk:repository" trust="0.9" |
|
145 |
inferenceprovenance="" inferred="true"> |
|
146 |
<to scheme="dnet:personroles" class="hasAuthor" type="person">doajarticles::0de98ac3f0f8f9958315f976f7798feb |
|
147 |
</to> |
|
148 |
<ranking>5</ranking> |
|
149 |
<fullname>Kohek Maria</fullname> |
|
150 |
</rel> |
|
151 |
<rel provenanceaction="sysimport:crosswalk:repository" trust="0.9" |
|
152 |
inferenceprovenance="" inferred="true"> |
|
153 |
<to scheme="dnet:personroles" class="hasAuthor" type="person">doajarticles::945c13a65c53d30ddc6a51fb5abfa796 |
|
154 |
</to> |
|
155 |
<ranking>1</ranking> |
|
156 |
<fullname>Lando Valeria</fullname> |
|
157 |
</rel> |
|
158 |
<rel provenanceaction="sysimport:crosswalk:repository" trust="0.9" |
|
159 |
inferenceprovenance="" inferred="true"> |
|
160 |
<to scheme="dnet:personroles" class="hasAuthor" type="person">doajarticles::e0eaaa1c4d527be05831cb302f3c6a34 |
|
161 |
</to> |
|
162 |
<ranking>3</ranking> |
|
163 |
<fullname>Nakamura Izabel T</fullname> |
|
164 |
</rel> |
|
165 |
<rel provenanceaction="sysimport:crosswalk:repository" trust="0.9" |
|
166 |
inferenceprovenance="" inferred="true"> |
|
167 |
<to scheme="dnet:personroles" class="hasAuthor" type="person">doajarticles::9eba47b17cc5ce235ceb7e8df838bbd0 |
|
168 |
</to> |
|
169 |
<ranking>6</ranking> |
|
170 |
<fullname>Mendonca Berenice B</fullname> |
|
171 |
</rel> |
|
172 |
<rel provenanceaction="sysimport:crosswalk:repository" trust="0.9" |
|
173 |
inferenceprovenance="" inferred="true"> |
|
174 |
<to scheme="dnet:personroles" class="hasAuthor" type="person">doajarticles::b744016d2b09f0be822a53ca6ee3551b |
|
175 |
</to> |
|
176 |
<ranking>2</ranking> |
|
177 |
<fullname>de Oliveira Suzimara A</fullname> |
|
178 |
</rel> |
|
179 |
</rels> |
|
180 |
<children> |
|
181 |
<result objidentifier="doajarticles::0472bc7a1f3c9afeab815dfc50137cdc"> |
|
182 |
<title schemename="dnet:dataCite_title" classname="main title" |
|
183 |
schemeid="dnet:dataCite_title" classid="main title">Effects of EDTA and |
|
184 |
Sodium |
|
185 |
Citrate on hormone measurements by fluorometric (FIA) and |
|
186 |
immunofluorometric (IFMA) methods |
|
187 |
</title> |
|
188 |
<dateofacceptance>2002-05-01</dateofacceptance> |
|
189 |
<publisher>BioMed Central</publisher> |
|
190 |
<resulttype schemename="dnet:result_typologies" |
|
191 |
classname="publication" schemeid="dnet:result_typologies" |
|
192 |
classid="publication" /> |
|
193 |
</result> |
|
194 |
<result objidentifier="od_______908::3562d33283efddc61ee2b289eae686ef"> |
|
195 |
<title schemename="dnet:dataCite_title" classname="main title" |
|
196 |
schemeid="dnet:dataCite_title" classid="main title">Effects of EDTA and |
|
197 |
Sodium |
|
198 |
Citrate on hormone measurements by fluorometric (FIA) and |
|
199 |
immunofluorometric (IFMA) methods |
|
200 |
</title> |
|
201 |
<dateofacceptance>2002-05-23</dateofacceptance> |
|
202 |
<publisher>BioMed Central</publisher> |
|
203 |
<resulttype schemename="dnet:result_typologies" |
|
204 |
classname="publication" schemeid="dnet:result_typologies" |
|
205 |
classid="publication" /> |
|
206 |
</result> |
|
207 |
<instance id="doajarticles::4c534a294bd126266d1fb2292349b84a"> |
|
208 |
<licence schemename="dnet:access_modes" classname="Open Access" |
|
209 |
schemeid="dnet:access_modes" classid="OPEN" /> |
|
210 |
<instancetype schemename="dnet:publication_resource" |
|
211 |
classname="Article" schemeid="dnet:publication_resource" |
|
212 |
classid="0001" /> |
|
213 |
<hostedby id="doajarticles::4c534a294bd126266d1fb2292349b84a" |
|
214 |
name="BMC Clinical Pathology" /> |
|
215 |
<webresource> |
|
216 |
<url>http://www.biomedcentral.com/1472-6890/2/2</url> |
|
217 |
</webresource> |
|
218 |
<webresource> |
|
219 |
<url>https://doaj.org/toc/1472-6890</url> |
|
220 |
</webresource> |
|
221 |
</instance> |
|
222 |
<instance id="opendoar____::8b6dd7db9af49e67306feb59a8bdc52c"> |
|
223 |
<licence schemename="dnet:access_modes" classname="Open Access" |
|
224 |
schemeid="dnet:access_modes" classid="OPEN" /> |
|
225 |
<instancetype schemename="dnet:publication_resource" |
|
226 |
classname="Article" schemeid="dnet:publication_resource" |
|
227 |
classid="0001" /> |
|
228 |
<hostedby id="opendoar____::8b6dd7db9af49e67306feb59a8bdc52c" |
|
229 |
name="Europe PubMed Central" /> |
|
230 |
<webresource> |
|
231 |
<url>http://europepmc.org/articles/PMC115861</url> |
|
232 |
</webresource> |
|
233 |
</instance> |
|
234 |
</children> |
|
235 |
</result> |
|
236 |
</oaf:entity> |
|
237 |
</metadata> |
|
238 |
</result> |
|
239 |
</record> |
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/oai/utils/RecordFieldsExtractor.java | ||
---|---|---|
1 | 1 |
package eu.dnetlib.data.mapreduce.hbase.oai.utils; |
2 | 2 |
|
3 | 3 |
import java.io.StringReader; |
4 |
import java.util.Collection; |
|
4 | 5 |
import java.util.List; |
5 | 6 |
import java.util.Map.Entry; |
6 | 7 |
|
... | ... | |
11 | 12 |
|
12 | 13 |
import com.google.common.base.Function; |
13 | 14 |
import com.google.common.collect.ArrayListMultimap; |
15 |
import com.google.common.collect.Collections2; |
|
14 | 16 |
import com.google.common.collect.Iterables; |
17 |
import com.google.common.collect.Lists; |
|
15 | 18 |
import com.google.common.collect.Multimap; |
16 | 19 |
|
20 |
import eu.dnetlib.data.mapreduce.hbase.oai.config.OAIConfigurationReader; |
|
21 |
|
|
17 | 22 |
/** |
18 | 23 |
* An instance of this class can parse an XML record and extract the information needed to store the record in a publisher store. |
19 | 24 |
* |
... | ... | |
27 | 32 |
* List of the indices of the target store. |
28 | 33 |
*/ |
29 | 34 |
private List<PublisherField> storeIndices; |
35 |
/** |
|
36 |
* XPaths to execute to understand if a record has been enhanced: inferred subjects, deduplicated records, inferred relationships (only |
|
37 |
* those generated from IIS, not those redirected by the dedup) |
|
38 |
*/ |
|
39 |
// private List<String> enrichmentXPaths = Lists.newArrayList("//subject[./@inferred='true']", |
|
40 |
// "//result/datainfo[./inferenceprovenance='dedup']", |
|
41 |
// "//rel[./@inferred='true' and ./@inferenceprovenance != 'dedup']"); |
|
30 | 42 |
|
31 | 43 |
private String duplicateXPath; |
32 | 44 |
private boolean skipDuplicates = false; |
33 | 45 |
|
34 | 46 |
/** |
35 | 47 |
* Parses the record and returns a map where a key is the name of an index, the value is the value in the record at the xpath specified |
36 |
* in this.storeIndices. |
|
48 |
* in this.storeIndices. The enrichment xpaths are used to generate additional OAI sets that can be used by providers to get the subset |
|
49 |
* of records enriched by OpenAIRE. |
|
37 | 50 |
* |
38 | 51 |
* @param record |
39 | 52 |
* the XML string to parse. |
53 |
* @param enrichmentXPaths |
|
54 |
* collection of xpaths that must be satisfied to consider the current record as "enriched by OpenAIRE" |
|
40 | 55 |
* @return a Multimap describing the values to be indexed for this record. |
41 | 56 |
*/ |
42 | 57 |
@SuppressWarnings({ "unchecked", "rawtypes" }) |
43 |
public Multimap<String, String> extractFields(final String record) { |
|
58 |
public Multimap<String, String> extractFields(final String record, final Collection<String> enrichmentXPaths) {
|
|
44 | 59 |
Multimap<String, String> recordProps = ArrayListMultimap.create(); |
45 | 60 |
try { |
46 | 61 |
Document doc = new SAXReader().read(new StringReader(record)); |
... | ... | |
72 | 87 |
} |
73 | 88 |
} |
74 | 89 |
} |
90 |
recordProps.putAll(OAIConfigurationReader.SET_FIELD, getEnrichedSets(doc, enrichmentXPaths, recordProps.get(OAIConfigurationReader.SET_FIELD))); |
|
75 | 91 |
|
76 | 92 |
} catch (DocumentException e) { |
77 | 93 |
recordProps = null; |
... | ... | |
79 | 95 |
return recordProps; |
80 | 96 |
} |
81 | 97 |
|
98 |
public Collection<String> getEnrichedSets(final Document docBody, final Collection<String> enrichmentXPaths, final Collection<String> originalSets) { |
|
99 |
Collection<String> enhancedSets = Lists.newArrayList(); |
|
100 |
if (isEnhanced(docBody, enrichmentXPaths)) { |
|
101 |
enhancedSets.addAll(Collections2.transform(originalSets, new Function<String, String>() { |
|
102 |
|
|
103 |
@Override |
|
104 |
public String apply(final String originalSet) { |
|
105 |
return originalSet + "_enriched"; |
|
106 |
} |
|
107 |
|
|
108 |
})); |
|
109 |
} |
|
110 |
return enhancedSets; |
|
111 |
} |
|
112 |
|
|
113 |
private boolean isEnhanced(final Document docBody, final Collection<String> enrichmentXPaths) { |
|
114 |
for (String xpath : enrichmentXPaths) { |
|
115 |
List xPathResult = docBody.selectNodes(xpath); |
|
116 |
if ((xPathResult != null) && !xPathResult.isEmpty()) return true; |
|
117 |
} |
|
118 |
return false; |
|
119 |
} |
|
120 |
|
|
82 | 121 |
public boolean isDuplicate(final Document doc) { |
83 | 122 |
return doc.selectSingleNode(duplicateXPath) != null; |
84 | 123 |
} |
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/oai/OaiFeedMapper.java | ||
---|---|---|
63 | 63 |
|
64 | 64 |
private MongoClient mongo; |
65 | 65 |
|
66 |
private Collection<String> enrichmentXPaths; |
|
67 |
|
|
66 | 68 |
@Override |
67 | 69 |
protected void setup(final Context context) throws UnknownHostException { |
68 | 70 |
|
... | ... | |
98 | 100 |
duplicateXPath = context.getConfiguration().get("services.publisher.oai.duplicateXPath"); |
99 | 101 |
skipDuplicates = Boolean.parseBoolean(context.getConfiguration().get("services.publisher.oai.skipDuplicates")); |
100 | 102 |
|
103 |
enrichmentXPaths = oaiConfiguration.getEnrichmentXPathsFor(format, layout, interpretation); |
|
101 | 104 |
Collection<PublisherField> indexFields = oaiConfiguration.getFieldsFor(format, layout, interpretation); |
102 | 105 |
extractor = new RecordFieldsExtractor(Lists.newArrayList(indexFields)); |
103 | 106 |
extractor.setDuplicateXPath(duplicateXPath); |
... | ... | |
131 | 134 |
if (StringUtils.isBlank(recordBody)) { |
132 | 135 |
discard(context, recordKey, recordBody, "blank body"); |
133 | 136 |
} else { |
134 |
Multimap<String, String> recordFields = extractor.extractFields(recordBody); |
|
137 |
Multimap<String, String> recordFields = extractor.extractFields(recordBody, enrichmentXPaths);
|
|
135 | 138 |
String id = ""; |
136 | 139 |
String oaiID = ""; |
137 | 140 |
if (checkRecordFields(recordFields, context, recordKey, recordBody)) { |
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/oai/config/OAIConfiguration.java | ||
---|---|---|
8 | 8 |
import com.google.common.base.Joiner; |
9 | 9 |
import com.google.common.base.Predicate; |
10 | 10 |
import com.google.common.collect.Collections2; |
11 |
import com.google.common.collect.Multimap; |
|
11 | 12 |
import com.google.common.collect.Sets; |
12 | 13 |
import com.google.gson.Gson; |
13 | 14 |
import com.google.gson.GsonBuilder; |
... | ... | |
51 | 52 |
|
52 | 53 |
private Set<MDFInfo> sourcesMDF = Sets.newHashSet(); |
53 | 54 |
|
55 |
private Multimap<String, String> enrichmentXPaths; |
|
56 |
|
|
54 | 57 |
public Collection<PublisherField> getFieldsFor(final String format, final String layout, final String interpretation) { |
55 | 58 |
final String mdRef = Joiner.on("-").join(format, layout, interpretation); |
56 | 59 |
return Collections2.filter(this.getFields(), new Predicate<PublisherField>() { |
... | ... | |
64 | 67 |
}); |
65 | 68 |
} |
66 | 69 |
|
70 |
public Collection<String> getEnrichmentXPathsFor(final String format, final String layout, final String interpretation) { |
|
71 |
final String mdRef = Joiner.on("-").join(format, layout, interpretation); |
|
72 |
return enrichmentXPaths.get(mdRef); |
|
73 |
} |
|
74 |
|
|
67 | 75 |
public Map<String, SetInfo> getSetsMap() { |
68 | 76 |
return setsMap; |
69 | 77 |
} |
... | ... | |
120 | 128 |
this.idNamespace = idNamespace; |
121 | 129 |
} |
122 | 130 |
|
131 |
public Multimap<String, String> getEnrichmentXPaths() { |
|
132 |
return enrichmentXPaths; |
|
133 |
} |
|
134 |
|
|
135 |
public void setEnrichmentXPaths(final Multimap<String, String> enrichmentXPaths) { |
|
136 |
this.enrichmentXPaths = enrichmentXPaths; |
|
137 |
} |
|
138 |
|
|
123 | 139 |
@Override |
124 | 140 |
public String toString() { |
125 | 141 |
Gson gson = new GsonBuilder().setPrettyPrinting().create(); |
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/oai/config/OAIConfigurationParser.java | ||
---|---|---|
14 | 14 |
import org.apache.commons.logging.Log; |
15 | 15 |
import org.apache.commons.logging.LogFactory; |
16 | 16 |
|
17 |
import com.google.common.base.Joiner; |
|
17 | 18 |
import com.google.common.collect.ArrayListMultimap; |
18 | 19 |
import com.google.common.collect.Lists; |
19 | 20 |
import com.google.common.collect.Maps; |
... | ... | |
50 | 51 |
Map<String, MDFInfo> mdFormatsMap = Maps.newHashMap(); |
51 | 52 |
List<String> indexNames = Lists.newArrayList(); |
52 | 53 |
List<PublisherField> fields = Lists.newArrayList(); |
54 |
Multimap<String, String> enrichmentPaths = ArrayListMultimap.create(); |
|
53 | 55 |
try { |
54 | 56 |
final XMLStreamReader parser = factory.get().createXMLStreamReader(new StreamSource(new StringReader(configurationProfile))); |
55 | 57 |
log.debug("Configuration profile read by " + parser.getClass().getCanonicalName()); |
... | ... | |
130 | 132 |
publisherField.setSources(fieldSources); |
131 | 133 |
fields.add(publisherField); |
132 | 134 |
|
135 |
} else { |
|
136 |
if (localName.equals("ENRICHMENT")) { |
|
137 |
boolean inEnrichment = true; |
|
138 |
while (parser.hasNext() && inEnrichment) { |
|
139 |
event = parser.next(); |
|
140 |
if (event == XMLStreamConstants.START_ELEMENT) { |
|
141 |
MDFInfo mdfInfo = new MDFInfo(); |
|
142 |
this.handleSourceMDF(mdfInfo, parser); |
|
143 |
String mdf = Joiner.on("-").join(mdfInfo.getSourceFormatName(), mdfInfo.getSourceFormatLayout(), |
|
144 |
mdfInfo.getSourceFormatInterpretation()); |
|
145 |
String xpath = parser.getAttributeValue(null, "path"); |
|
146 |
enrichmentPaths.put(mdf, xpath); |
|
147 |
} |
|
148 |
if ((event == XMLStreamConstants.END_ELEMENT) && parser.getLocalName().equals("ENRICHMENT")) { |
|
149 |
inEnrichment = false; |
|
150 |
} |
|
151 |
} |
|
152 |
} |
|
133 | 153 |
} |
134 | 154 |
} |
135 | 155 |
} |
... | ... | |
139 | 159 |
config.setFieldNames(indexNames); |
140 | 160 |
config.setMdFormatsMap(mdFormatsMap); |
141 | 161 |
config.setSetsMap(setsMap); |
162 |
config.setEnrichmentXPaths(enrichmentPaths); |
|
142 | 163 |
return config; |
143 | 164 |
} catch (final XMLStreamException e) { |
144 | 165 |
throw new OaiPublisherRuntimeException(e); |
Also available in: Unified diff
The OAI feed generates "enriched sets" for each content provider by applying a set of XPaths to the records to determine whether they have been enriched. The XPaths are defined in the OAI configuration profile.