Project

General

Profile

« Previous | Next » 

Revision 37821

The OAI feed generates "enriched sets" for each content provider by applying a set of XPaths to its records to determine whether they have been enriched. The XPaths are defined in the OAI configuration profile.

View differences:

modules/dnet-mapreduce-jobs/trunk/src/test/java/eu/dnetlib/data/mapreduce/hbase/oai/OAIFeedMapperTest.java
58 58
	private String objId1 = "oai:dnet:openaire____::2fa6b215ace86e409dde3ba4b2a6b504";
59 59
	private String goodRecord = "<?xml version=\"1.0\"?>\n<record>\n  <result xmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n    xmlns:dnet=\"eu.dnetlib.miscutils.functional.xml.DnetXsltFunctions\"\n    xmlns:dr=\"http://www.driver-repository.eu/namespace/dr\"\n    xmlns:dri=\"http://www.driver-repository.eu/namespace/dri\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\">\n    <header>\n      <dri:objIdentifier>openaire____::2fa6b215ace86e409dde3ba4b2a6b504</dri:objIdentifier>\n      <dri:repositoryId/>\n      <dri:dateOfCollection>2013-10-09</dri:dateOfCollection>\n    </header>\n    <metadata>\n      <oaf:entity xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" \n\t\t    xmlns:oaf=\"http://namespace.openaire.eu/oaf\" \n\t\t    xsi:schemaLocation=\"http://namespace.openaire.eu/oaf http://www.openaire.eu/schema/0.1/oaf-0.1.xsd\">\n\t\t<oaf:datasource>\n\t\t\t<officialname>The Internet Journal of Orthopedic Surgery</officialname><englishname>The Internet Journal of Orthopedic Surgery</englishname><websiteurl>http://www.ispub.com/journal/the-internet-journal-of-orthopedic-surgery/</websiteurl><accessinfopackage/><namespaceprefix>issn15312968</namespaceprefix><datasourcetypeui classid=\"pubsrepository::journal\" classname=\"pubsrepository::journal\" schemeid=\"dnet:datasource_typologies\" schemename=\"dnet:datasource_typologies\"/><datasourcetype classid=\"pubsrepository::journal\" classname=\"pubsrepository::journal\" schemeid=\"dnet:datasource_typologies\" schemename=\"dnet:datasource_typologies\"/><openairecompatibility classid=\"notCompatible\" classname=\"notCompatible\" schemeid=\"dnet:compatibilityLevel\" schemename=\"dnet:compatibilityLevel\"/><latitude>0.0</latitude><longitude>0.0</longitude><subjects/><policies name=\"\" 
id=\"\"/><logourl/><contactemail/><dateofvalidation/><description/><odnumberofitems/><odnumberofitemsdate/><odpolicies/><odlanguages/><odcontenttypes/><releasestartdate/><releaseenddate/><missionstatementurl/><dataprovider>false</dataprovider><serviceprovider>false</serviceprovider><databaseaccesstype/><datauploadtype/><databaseaccessrestriction/><datauploadrestriction/><versioning>false</versioning><citationguidelineurl/><qualitymanagementkind/><pidsystems/><certificates/><originalId>openaire____::issn15312968</originalId><collectedfrom name=\"DOAJ-Articles\" id=\"driver______::bee53aa31dc2cbb538c10c2b65fa5824\"/><pid/><datainfo><inferred>false</inferred><deletedbyinference>false</deletedbyinference><trust>0.9</trust><inferenceprovenance/><provenanceaction classid=\"UNKNOWN\" classname=\"UNKNOWN\" schemeid=\"dnet:provenanceActions\" schemename=\"dnet:provenanceActions\"/></datainfo>\n\t\t  <rels>\n\t\t  </rels>\n\t\t  <children>\n\t\t  </children>\n\t\t</oaf:datasource>\n      </oaf:entity>\n    </metadata>\n  </result>\n</record>";
60 60
	private String dedupedRecord = "dedupedRecord.xml";
61
	private String representativeRecord = "representativeRecord.xml";
61 62

  
62 63
	@Before
63 64
	public void setUp() throws Exception {
......
99 100
	@Test
100 101
	public void testCreateBasicObject() throws DocumentException, IOException, InterruptedException {
101 102
		RecordFieldsExtractor parser = new RecordFieldsExtractor(Lists.newArrayList(oaiConfiguration.getFieldsFor("oaf", "index", "openaire")));
102
		Multimap<String, String> parsedRecord = parser.extractFields(goodRecord);
103
		Multimap<String, String> parsedRecord = parser.extractFields(goodRecord, oaiConfiguration.getEnrichmentXPathsFor("oaf", "index", "openaire"));
103 104
		DBObject obj = oaiFeedMapper.createBasicObject(objId1, goodRecord, parsedRecord, context);
104 105
		// NOTE that LAST_COLLECTION_DATE_FIELD, DATESTAMP_FIELD,UPDATED_FIELD are not set by the method we are testing, but by the caller
105 106
		// method (handleRecord) because the values to set depend on the record status (NEW|UPDATED|UNCHANGED)
......
108 109
	}
109 110

  
110 111
	/**
	 * Builds the basic OAI object for the representative record fixture, passing the enrichment XPaths configured for
	 * oaf/index/openaire to the field extractor. The only assertion is that the created object is not null; it is also
	 * printed to stdout for manual inspection.
	 */
	@Test
112
	public void testCreateBasicObjectRep() throws DocumentException, IOException, InterruptedException {
113
		RecordFieldsExtractor parser = new RecordFieldsExtractor(Lists.newArrayList(oaiConfiguration.getFieldsFor("oaf", "index", "openaire")));
114
		String repRecordString = IOUtils.toString(this.getClass().getResourceAsStream(representativeRecord));
115
		Multimap<String, String> parsedRecord = parser.extractFields(repRecordString, oaiConfiguration.getEnrichmentXPathsFor("oaf", "index", "openaire"));
116
		DBObject obj = oaiFeedMapper.createBasicObject(objId1, repRecordString, parsedRecord, context);
117
		// NOTE that LAST_COLLECTION_DATE_FIELD, DATESTAMP_FIELD, UPDATED_FIELD are not set by the method we are testing, but by the caller
118
		// method (handleRecord) because the values to set depend on the record status (NEW|UPDATED|UNCHANGED)
119
		System.out.println(obj);
120
		assertNotNull(obj);
121
	}
122

  
123
	@Test
111 124
	public void testParseDeduplicated() throws IOException {
112 125
		RecordFieldsExtractor parser = new RecordFieldsExtractor(Lists.newArrayList(oaiConfiguration.getFieldsFor("oaf", "index", "openaire")));
113 126
		String dedupedRecordString = IOUtils.toString(this.getClass().getResourceAsStream(dedupedRecord));
114 127
		parser.setSkipDuplicates(true);
115 128
		parser.setDuplicateXPath("//*[local-name()='entity']//*[local-name()='datainfo']/*[local-name()='deletedbyinference'][./text() = 'true']");
116
		Multimap<String, String> parsedRecord = parser.extractFields(dedupedRecordString);
129
		Multimap<String, String> parsedRecord = parser.extractFields(dedupedRecordString, oaiConfiguration.getEnrichmentXPathsFor("oaf", "index", "openaire"));
117 130
		assertFalse(oaiFeedMapper.checkRecordFields(parsedRecord, context, "x", dedupedRecordString));
118 131
	}
119 132

  
......
123 136
		String dedupedRecordString = IOUtils.toString(this.getClass().getResourceAsStream(dedupedRecord));
124 137
		parser.setSkipDuplicates(true);
125 138
		parser.setDuplicateXPath("//x");
126
		Multimap<String, String> parsedRecord = parser.extractFields(dedupedRecordString);
139
		Multimap<String, String> parsedRecord = parser.extractFields(dedupedRecordString, oaiConfiguration.getEnrichmentXPathsFor("oaf", "index", "openaire"));
127 140
		assertTrue(oaiFeedMapper.checkRecordFields(parsedRecord, context, "x", dedupedRecordString));
128 141
	}
129 142

  
modules/dnet-mapreduce-jobs/trunk/src/test/java/eu/dnetlib/data/mapreduce/hbase/oai/RecordFieldsExtractorTest.java
1
package eu.dnetlib.data.mapreduce.hbase.oai;
2

  
3
import static org.junit.Assert.assertTrue;
4

  
5
import java.io.IOException;
6
import java.io.StringReader;
7
import java.util.Collection;
8
import java.util.List;
9

  
10
import org.apache.commons.io.IOUtils;
11
import org.dom4j.Document;
12
import org.dom4j.DocumentException;
13
import org.dom4j.io.SAXReader;
14
import org.junit.Before;
15
import org.junit.Test;
16

  
17
import com.google.common.collect.Lists;
18

  
19
import eu.dnetlib.data.mapreduce.hbase.oai.utils.RecordFieldsExtractor;
20

  
21
/**
 * Tests for {@link RecordFieldsExtractor#getEnrichedSets}: each test loads an XML fixture from the classpath, parses it with dom4j
 * and checks which "_enriched" set names are produced for the original sets "set1" and "set2".
 */
public class RecordFieldsExtractorTest {
22

  
23
	// Classpath resource names of the XML fixtures, resolved relative to this test class.
	private String record = "representativeRecord.xml";
24
	private String record2 = "dedupedRecord.xml";
25
	private String record3 = "originalRecord.xml";
26
	private RecordFieldsExtractor fieldExtractor;
27
	// XPaths passed to getEnrichedSets in every test: subjects flagged as inferred, result datainfo whose inferenceprovenance is
	// 'dedup', and rels flagged as inferred whose inferenceprovenance is not 'dedup'.
	private List<String> enrichmentXPaths = Lists.newArrayList("//subject[./@inferred='true']", "//result/datainfo[./inferenceprovenance='dedup']",
28
			"//rel[./@inferred='true' and ./@inferenceprovenance != 'dedup']");
29

  
30
	/** Creates a fresh extractor (no-argument constructor) before each test. */
	@Before
31
	public void setUp() throws Exception {
32
		fieldExtractor = new RecordFieldsExtractor();
33
	}
34

  
35
	/** The representative record fixture must yield an "_enriched" set for each of the two original sets. */
	@Test
36
	public void testEnhanced() throws IOException, DocumentException {
37

  
38
		String recordString = IOUtils.toString(this.getClass().getResourceAsStream(record));
39
		Document doc = new SAXReader().read(new StringReader(recordString));
40
		Collection<String> sets = fieldExtractor.getEnrichedSets(doc, enrichmentXPaths, Lists.newArrayList("set1", "set2"));
41
		// Printed for manual inspection only; the assertion below is the actual check.
		System.out.println(sets);
42
		assertTrue(sets.contains("set1_enriched") && sets.contains("set2_enriched"));
43
	}
44

  
45
	/**
	 * The deduped record fixture must also yield an "_enriched" set for each original set.
	 * NOTE(review): which of the enrichment XPaths this fixture is expected to match is not stated here -- confirm against dedupedRecord.xml.
	 */
	@Test
46
	public void testEnhancedDeduped() throws IOException, DocumentException {
47

  
48
		String recordString = IOUtils.toString(this.getClass().getResourceAsStream(record2));
49
		Document doc = new SAXReader().read(new StringReader(recordString));
50
		Collection<String> sets = fieldExtractor.getEnrichedSets(doc, enrichmentXPaths, Lists.newArrayList("set1", "set2"));
51
		System.out.println(sets);
52
		assertTrue(sets.contains("set1_enriched") && sets.contains("set2_enriched"));
53
	}
54

  
55
	/** The original record fixture must not produce any enriched set: the returned collection is expected to be empty. */
	@Test
56
	public void testNotEnhanced() throws IOException, DocumentException {
57

  
58
		String recordString = IOUtils.toString(this.getClass().getResourceAsStream(record3));
59
		Document doc = new SAXReader().read(new StringReader(recordString));
60
		Collection<String> sets = fieldExtractor.getEnrichedSets(doc, enrichmentXPaths, Lists.newArrayList("set1", "set2"));
61
		System.out.println(sets);
62
		assertTrue(sets.isEmpty());
63
	}
64
}
modules/dnet-mapreduce-jobs/trunk/src/test/resources/eu/dnetlib/data/mapreduce/hbase/oai/originalRecord.xml
1
<record>
2
	<result>
3
		<header>
4
			<objIdentifier>od______1690::90424108bff748150b567528b93894ea
5
			</objIdentifier>
6
			<dateOfCollection>2013-05-24T06:59:35Z</dateOfCollection>
7
			<counters></counters>
8
		</header>
9
		<metadata>
10
			<entity
11
				schemaLocation="http://namespace.openaire.eu/oaf http://www.openaire.eu/schema/0.2/oaf-0.2.xsd">
12
				<result>
13
					<title classid="main title" classname="main title" schemeid="dnet:dataCite_title"
14
						schemename="dnet:dataCite_title">9633684277</title>
15
					<dateofacceptance>2008-12-10</dateofacceptance>
16
					<resulttype classid="publication" classname="publication"
17
						schemeid="dnet:result_typologies" schemename="dnet:result_typologies" />
18
					<language classid="und" classname="Undetermined" schemeid="dnet:languages"
19
						schemename="dnet:languages" />
20
					<description>Amerika régóta nem váltja be a New Yorkban lakó
21
						Michael
22
						Rutkowsky reményeit. Az alkalmi munkákból élő fiatal, lengyel
23
						bevándorló elhatározza, hogy elébe megy a szerencséjének. Floridába
24
						utazik, mert úgy tudja, hogy ott unatkozó milliomos hölgyek várják
25
						a hozzá hasonló férfiakat. Florida, az álmok félszigete azonban
26
						milliomos hölgyek helyett életveszélyes kalandokkal fogadja
27
						Michaelt, akit nem csak a maffia, hanem az FBI emberei is üldözőbe
28
						vesznek. Csak úgy mentheti a bőrét, ha vállalja egy kábítószerrel
29
						üzletelő, a maffiát is kijátszó bűnöző szerepét.
30
					</description>
31
					<subject classid="" classname="" schemeid="" schemename="" />
32
					<relevantdate classid="" classname="" schemeid=""
33
						schemename="" />
34
					<publisher />
35
					<embargoenddate />
36
					<storagedate />
37
					<source />
38
					<fulltext />
39
					<format />
40
					<resourcetype classid="" classname="" schemeid=""
41
						schemename="" />
42
					<device />
43
					<size />
44
					<version />
45
					<lastmetadataupdate />
46
					<metadataversionnumber />
47
					<originalId>oai:ganymedes.lib.unideb.hu:2437/52836</originalId>
48
					<collectedfrom name="University of Debrecen Electronic Archive"
49
						id="opendoar____::d43ab110ab2489d6b9b2caa394bf920f" />
50
					<pid classid="oai" classname="oai" schemeid="dnet:pid_types"
51
						schemename="dnet:pid_types">oai:ganymedes.lib.unideb.hu:2437/52836</pid>
52
					<bestlicense classid="OPEN" classname="Open Access"
53
						schemeid="dnet:access_modes" schemename="dnet:access_modes" />
54
					<datainfo>
55
						<inferred>false</inferred>
56
						<deletedbyinference>false</deletedbyinference>
57
						<trust>0.9</trust>
58
						<inferenceprovenance />
59
						<provenanceaction classid="sysimport:crosswalk:repository"
60
							classname="sysimport:crosswalk:repository" schemeid="dnet:provenanceActions"
61
							schemename="dnet:provenanceActions" />
62
					</datainfo>
63
					<rels></rels>
64
					<children>
65
						<instance id="opendoar____::d43ab110ab2489d6b9b2caa394bf920f">
66
							<licence classid="OPEN" classname="Open Access" schemeid="dnet:access_modes"
67
								schemename="dnet:access_modes" />
68
							<hostedby name="University of Debrecen Electronic Archive"
69
								id="opendoar____::d43ab110ab2489d6b9b2caa394bf920f" />
70
							<webresource>
71
								<url>http://hdl.handle.net/2437/52836</url>
72
							</webresource>
73
							<webresource>
74
								<url>http://webpac.lib.unideb.hu:8082/WebPac/CorvinaWeb?action=cclfind&amp;resultview=long&amp;ccltext=idno+bibFSZ874984
75
								</url>
76
							</webresource>
77
						</instance>
78
					</children>
79
				</result>
80
			</entity>
81
		</metadata>
82
	</result>
83
</record>
modules/dnet-mapreduce-jobs/trunk/src/test/resources/eu/dnetlib/data/mapreduce/hbase/oai/config/OAIPublisherConfiguration-1.xml
46 46
					<SOURCE interpretation="driver" layout="store" name="oai_dc" path="//*[local-name() ='objIdentifier']" />
47 47
				</INDEX>
48 48
				<INDEX name="set">
49
					<SOURCE interpretation="openaire" layout="index" name="oaf" path="//*[local-name() ='repositoryId']" />
49
					<SOURCE interpretation="openaire" layout="index" name="oaf" path="//*[local-name() ='collectedfrom']/@name" />
50 50
					<SOURCE interpretation="driver" layout="store" name="oai_dc" path="//*[local-name() ='repositoryId']" />
51 51
				</INDEX>
52 52
			</INDICES>
53
			<ENRICHMENT>
54
				<XPATH interpretation="openaire" layout="index" name="oaf" path="//subject[./@inferred='true']" />
55
				<XPATH interpretation="openaire" layout="index" name="oaf" path="//result/datainfo[./inferenceprovenance='dedup']" />
56
				<XPATH interpretation="openaire" layout="index" name="oaf" path="//rel[./@inferred='true' and ./@inferenceprovenance != 'dedup']" />
57
			</ENRICHMENT>
53 58
		</CONFIGURATION>
54 59
		<STATUS>
55 60
			<LAST_UPDATE value="2013-12-20T10:50:00" />
modules/dnet-mapreduce-jobs/trunk/src/test/resources/eu/dnetlib/data/mapreduce/hbase/oai/representativeRecord.xml
1
<record>
2
	<result xmlns:dri="http://www.driver-repository.eu/namespace/dri"
3
		xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
4
		<header>
5
			<objIdentifier>dedup_wf_001::0472bc7a1f3c9afeab815dfc50137cdc
6
			</objIdentifier>
7
			<dateOfCollection>2015-02-06T16:52:05Z</dateOfCollection>
8
			<counters>
9
				<counter_authorship value="6" />
10
				<counter_dedup value="2" />
11
			</counters>
12
		</header>
13
		<metadata>
14
			<oaf:entity xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
15
				xmlns:oaf="http://namespace.openaire.eu/oaf"
16
				xsi:schemaLocation="http://namespace.openaire.eu/oaf http://www.openaire.eu/schema/0.2/oaf-0.2.xsd">
17
				<result>
18
					<subject schemename="dnet:result_subject" classname="keyword"
19
						schemeid="dnet:result_subject" classid="keyword">DOAJ:Health Sciences
20
					</subject>
21
					<subject schemename="dnet:result_subject" classname="keyword"
22
						schemeid="dnet:result_subject" classid="keyword">Pathology</subject>
23
					<subject schemename="dnet:result_subject" classname="keyword"
24
						schemeid="dnet:result_subject" classid="keyword">DOAJ:Pathology</subject>
25
					<subject schemename="dnet:result_subject" classname="keyword"
26
						schemeid="dnet:result_subject" classid="keyword">DOAJ:Medicine (General)
27
					</subject>
28
					<subject schemename="dnet:result_subject" classname="keyword"
29
						schemeid="dnet:result_subject" classid="keyword">R</subject>
30
					<subject schemename="dnet:result_subject" classname="keyword"
31
						schemeid="dnet:result_subject" classid="keyword">Medicine</subject>
32
					<subject schemename="dnet:result_subject" classname="keyword"
33
						schemeid="dnet:result_subject" classid="keyword">Research Article</subject>
34
					<subject schemename="dnet:result_subject" classname="keyword"
35
						schemeid="dnet:result_subject" classid="keyword">RB1-214</subject>
36
					<title schemename="dnet:dataCite_title" classname="main title"
37
						schemeid="dnet:dataCite_title" classid="main title">Effects of EDTA and Sodium
38
						Citrate on hormone measurements by fluorometric (FIA) and
39
						immunofluorometric (IFMA) methods
40
					</title>
41
					<dateofacceptance>2002-05-23</dateofacceptance>
42
					<publisher>BioMed Central</publisher>
43
					<resulttype schemename="dnet:result_typologies"
44
						classname="publication" schemeid="dnet:result_typologies" classid="publication" />
45
					<language schemename="dnet:languages" classname="English"
46
						schemeid="dnet:languages" classid="eng" />
47
					<journal eissn="1472-6890" issn="1472-6890" lissn="">BMC
48
						Clinical Pathology
49
					</journal>
50
					<description>
51
						<p>Abstract</p>
52
						<p>Background</p>
53
						<p>Measurements of hormonal concentrations by immunoassays using
54
							fluorescent tracer substance (Eu3+) are susceptible to the action
55
							of chemical agents that may cause alterations in its original
56
							structure. Our goal was to verify the effect of two types of
57
							anticoagulants in the hormone assays performed by fluorometric
58
							(FIA) or immunofluorometric (IFMA) methods.
59
						</p>
60
						<p>Methods</p>
61
						<p>
62
							Blood samples were obtained from 30 outpatients and were drawn in
63
							EDTA, sodium citrate, and serum separation Vacutainer
64
							<sup>®</sup>
65
							Blood Collection Tubes. Samples were analyzed in automatized
66
							equipment AutoDelfia™ (Perkin Elmer Brazil, Wallac, Finland) for
67
							the following hormones: Luteinizing hormone (LH), Follicle
68
							stimulating homone (FSH), prolactin (PRL), growth hormone (GH),
69
							Sex hormone binding globulin (SHBG), thyroid stimulating hormone
70
							(TSH), insulin, C peptide, total T3, total T4, free T4,
71
							estradiol,
72
							progesterone, testosterone, and cortisol. Statistical analysis was
73
							carried out by Kruskal-Wallis method and Dunn's test.
74
						</p>
75
						<p>Results</p>
76
						<p>No significant differences were seen between samples for LH,
77
							FSH, PRL and free T4. Results from GH, TSH, insulin, C peptide,
78
							SHBG, total T3, total T4, estradiol, testosterone, cortisol, and
79
							progesterone were significant different between serum and
80
							EDTA-treated samples groups. Differences were also identified
81
							between serum and sodium citrate-treated samples in the analysis
82
							for TSH, insulin, total T3, estradiol, testosterone and
83
							progesterone.
84
						</p>
85
						<p>Conclusions</p>
86
						<p>We conclude that the hormonal analysis carried through by FIA
87
							or
88
							IFMA are susceptible to the effects of anticoagulants in the
89
							biological material collected that vary depending on the type of
90
							assay.
91
						</p>
92
					</description>
93
					<source>BMC Clinical Pathology, Vol 2, Iss 1, p 2 (2002)</source>
94
					<relevantdate schemename="" classname="" schemeid=""
95
						classid="" />
96
					<embargoenddate />
97
					<storagedate />
98
					<fulltext />
99
					<format />
100
					<resourcetype schemename="" classname="" schemeid=""
101
						classid="" />
102
					<device />
103
					<size />
104
					<version />
105
					<lastmetadataupdate />
106
					<metadataversionnumber />
107
					<originalId>oai:europepmc.org:730516</originalId>
108
					<originalId>oai:doaj.org/article:4b30bd17bb054d0591ee019e0bc50058
109
					</originalId>
110
					<collectedfrom id="driver______::bee53aa31dc2cbb538c10c2b65fa5824"
111
						name="DOAJ-Articles" />
112
					<collectedfrom id="opendoar____::8b6dd7db9af49e67306feb59a8bdc52c"
113
						name="Europe PubMed Central" />
114
					<pid schemename="dnet:pid_types" classname="doi" schemeid="dnet:pid_types"
115
						classid="doi">10.1186/1472-6890-2-2</pid>
116
					<pid schemename="dnet:pid_types" classname="pmid" schemeid="dnet:pid_types"
117
						classid="pmid">12033989</pid>
118
					<pid schemename="dnet:pid_types" classname="oai" schemeid="dnet:pid_types"
119
						classid="oai">oai:europepmc.org:730516</pid>
120
					<pid schemename="dnet:pid_types" classname="pmc" schemeid="dnet:pid_types"
121
						classid="pmc">PMC115861</pid>
122
					<pid schemename="dnet:pid_types" classname="oai" schemeid="dnet:pid_types"
123
						classid="oai">oai:doaj.org/article:4b30bd17bb054d0591ee019e0bc50058
124
					</pid>
125
					<bestlicense schemename="dnet:access_modes" classname="Open Access"
126
						schemeid="dnet:access_modes" classid="OPEN" />
127
					<datainfo>
128
						<inferred>true</inferred>
129
						<deletedbyinference>false</deletedbyinference>
130
						<trust>0.9</trust>
131
						<inferenceprovenance>dedup</inferenceprovenance>
132
						<provenanceaction schemename="dnet:provenanceActions"
133
							classname="sysimport:dedup" schemeid="dnet:provenanceActions"
134
							classid="sysimport:dedup" />
135
					</datainfo>
136
					<rels>
137
						<rel provenanceaction="sysimport:crosswalk:repository" trust="0.9"
138
							inferenceprovenance="" inferred="true">
139
							<to scheme="dnet:personroles" class="hasAuthor" type="person">doajarticles::5d963eca520d3beaa45badc6b2c6e55d
140
							</to>
141
							<ranking>4</ranking>
142
							<fullname>Leme Cassia</fullname>
143
						</rel>
144
						<rel provenanceaction="sysimport:crosswalk:repository" trust="0.9"
145
							inferenceprovenance="" inferred="true">
146
							<to scheme="dnet:personroles" class="hasAuthor" type="person">doajarticles::0de98ac3f0f8f9958315f976f7798feb
147
							</to>
148
							<ranking>5</ranking>
149
							<fullname>Kohek Maria</fullname>
150
						</rel>
151
						<rel provenanceaction="sysimport:crosswalk:repository" trust="0.9"
152
							inferenceprovenance="" inferred="true">
153
							<to scheme="dnet:personroles" class="hasAuthor" type="person">doajarticles::945c13a65c53d30ddc6a51fb5abfa796
154
							</to>
155
							<ranking>1</ranking>
156
							<fullname>Lando Valeria</fullname>
157
						</rel>
158
						<rel provenanceaction="sysimport:crosswalk:repository" trust="0.9"
159
							inferenceprovenance="" inferred="true">
160
							<to scheme="dnet:personroles" class="hasAuthor" type="person">doajarticles::e0eaaa1c4d527be05831cb302f3c6a34
161
							</to>
162
							<ranking>3</ranking>
163
							<fullname>Nakamura Izabel T</fullname>
164
						</rel>
165
						<rel provenanceaction="sysimport:crosswalk:repository" trust="0.9"
166
							inferenceprovenance="" inferred="true">
167
							<to scheme="dnet:personroles" class="hasAuthor" type="person">doajarticles::9eba47b17cc5ce235ceb7e8df838bbd0
168
							</to>
169
							<ranking>6</ranking>
170
							<fullname>Mendonca Berenice B</fullname>
171
						</rel>
172
						<rel provenanceaction="sysimport:crosswalk:repository" trust="0.9"
173
							inferenceprovenance="" inferred="true">
174
							<to scheme="dnet:personroles" class="hasAuthor" type="person">doajarticles::b744016d2b09f0be822a53ca6ee3551b
175
							</to>
176
							<ranking>2</ranking>
177
							<fullname>de Oliveira Suzimara A</fullname>
178
						</rel>
179
					</rels>
180
					<children>
181
						<result objidentifier="doajarticles::0472bc7a1f3c9afeab815dfc50137cdc">
182
							<title schemename="dnet:dataCite_title" classname="main title"
183
								schemeid="dnet:dataCite_title" classid="main title">Effects of EDTA and
184
								Sodium
185
								Citrate on hormone measurements by fluorometric (FIA) and
186
								immunofluorometric (IFMA) methods
187
							</title>
188
							<dateofacceptance>2002-05-01</dateofacceptance>
189
							<publisher>BioMed Central</publisher>
190
							<resulttype schemename="dnet:result_typologies"
191
								classname="publication" schemeid="dnet:result_typologies"
192
								classid="publication" />
193
						</result>
194
						<result objidentifier="od_______908::3562d33283efddc61ee2b289eae686ef">
195
							<title schemename="dnet:dataCite_title" classname="main title"
196
								schemeid="dnet:dataCite_title" classid="main title">Effects of EDTA and
197
								Sodium
198
								Citrate on hormone measurements by fluorometric (FIA) and
199
								immunofluorometric (IFMA) methods
200
							</title>
201
							<dateofacceptance>2002-05-23</dateofacceptance>
202
							<publisher>BioMed Central</publisher>
203
							<resulttype schemename="dnet:result_typologies"
204
								classname="publication" schemeid="dnet:result_typologies"
205
								classid="publication" />
206
						</result>
207
						<instance id="doajarticles::4c534a294bd126266d1fb2292349b84a">
208
							<licence schemename="dnet:access_modes" classname="Open Access"
209
								schemeid="dnet:access_modes" classid="OPEN" />
210
							<instancetype schemename="dnet:publication_resource"
211
								classname="Article" schemeid="dnet:publication_resource"
212
								classid="0001" />
213
							<hostedby id="doajarticles::4c534a294bd126266d1fb2292349b84a"
214
								name="BMC Clinical Pathology" />
215
							<webresource>
216
								<url>http://www.biomedcentral.com/1472-6890/2/2</url>
217
							</webresource>
218
							<webresource>
219
								<url>https://doaj.org/toc/1472-6890</url>
220
							</webresource>
221
						</instance>
222
						<instance id="opendoar____::8b6dd7db9af49e67306feb59a8bdc52c">
223
							<licence schemename="dnet:access_modes" classname="Open Access"
224
								schemeid="dnet:access_modes" classid="OPEN" />
225
							<instancetype schemename="dnet:publication_resource"
226
								classname="Article" schemeid="dnet:publication_resource"
227
								classid="0001" />
228
							<hostedby id="opendoar____::8b6dd7db9af49e67306feb59a8bdc52c"
229
								name="Europe PubMed Central" />
230
							<webresource>
231
								<url>http://europepmc.org/articles/PMC115861</url>
232
							</webresource>
233
						</instance>
234
					</children>
235
				</result>
236
			</oaf:entity>
237
		</metadata>
238
	</result>
239
</record>
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/oai/utils/RecordFieldsExtractor.java
1 1
package eu.dnetlib.data.mapreduce.hbase.oai.utils;
2 2

  
3 3
import java.io.StringReader;
4
import java.util.Collection;
4 5
import java.util.List;
5 6
import java.util.Map.Entry;
6 7

  
......
11 12

  
12 13
import com.google.common.base.Function;
13 14
import com.google.common.collect.ArrayListMultimap;
15
import com.google.common.collect.Collections2;
14 16
import com.google.common.collect.Iterables;
17
import com.google.common.collect.Lists;
15 18
import com.google.common.collect.Multimap;
16 19

  
20
import eu.dnetlib.data.mapreduce.hbase.oai.config.OAIConfigurationReader;
21

  
17 22
/**
18 23
 * An instance of this class can parse an XML record and extract the information needed to store the record in a publisher store.
19 24
 * 
......
27 32
	 * List of the indices of the target store.
28 33
	 */
29 34
	private List<PublisherField> storeIndices;
35
	/**
36
	 * XPaths to execute to understand if a record has been enhanced: inferred subjects, deduplicated records, inferred relationships (only
37
	 * those generated from IIS, not those redirected by the dedup)
38
	 */
39
	// private List<String> enrichmentXPaths = Lists.newArrayList("//subject[./@inferred='true']",
40
	// "//result/datainfo[./inferenceprovenance='dedup']",
41
	// "//rel[./@inferred='true' and ./@inferenceprovenance != 'dedup']");
30 42

  
31 43
	private String duplicateXPath;
32 44
	private boolean skipDuplicates = false;
33 45

  
34 46
	/**
35 47
	 * Parses the record and returns a map where a key is the name of an index, the value is the value in the record at the xpath specified
36
	 * in this.storeIndices.
48
	 * in this.storeIndices. The enrichment xpaths are used to generate additional OAI sets that can be used by providers to get the subset
49
	 * of records enriched by OpenAIRE.
37 50
	 * 
38 51
	 * @param record
39 52
	 *            the XML string to parse.
53
	 * @param enrichmentXPaths
54
	 *            collection of xpaths that must be satisfied to consider the current record as "enriched by OpenAIRE"
40 55
	 * @return a Multimap describing the values to be indexed for this record.
41 56
	 */
42 57
	@SuppressWarnings({ "unchecked", "rawtypes" })
43
	public Multimap<String, String> extractFields(final String record) {
58
	public Multimap<String, String> extractFields(final String record, final Collection<String> enrichmentXPaths) {
44 59
		Multimap<String, String> recordProps = ArrayListMultimap.create();
45 60
		try {
46 61
			Document doc = new SAXReader().read(new StringReader(record));
......
72 87
					}
73 88
				}
74 89
			}
90
			recordProps.putAll(OAIConfigurationReader.SET_FIELD, getEnrichedSets(doc, enrichmentXPaths, recordProps.get(OAIConfigurationReader.SET_FIELD)));
75 91

  
76 92
		} catch (DocumentException e) {
77 93
			recordProps = null;
......
79 95
		return recordProps;
80 96
	}
81 97

  
98
	/**
	 * Computes the additional "enriched" set names for a record.
	 * 
	 * @param docBody
	 *            the parsed XML record.
	 * @param enrichmentXPaths
	 *            xpaths evaluated against the record; the record counts as enriched as soon as one of them selects at least one node.
	 * @param originalSets
	 *            the set names the record already belongs to.
	 * @return a new list holding every original set name with the suffix "_enriched" appended when the record is enriched; an empty
	 *         list otherwise.
	 */
	public Collection<String> getEnrichedSets(final Document docBody, final Collection<String> enrichmentXPaths, final Collection<String> originalSets) {
99
		Collection<String> enhancedSets = Lists.newArrayList();
100
		if (isEnhanced(docBody, enrichmentXPaths)) {
101
			// The transformed collection is copied into enhancedSets by addAll, so the result is a plain list.
			enhancedSets.addAll(Collections2.transform(originalSets, new Function<String, String>() {
102

  
103
				@Override
104
				public String apply(final String originalSet) {
105
					return originalSet + "_enriched";
106
				}
107

  
108
			}));
109
		}
110
		return enhancedSets;
111
	}
112

  
113
	/**
	 * Tells whether at least one of the given xpaths selects a node in the record.
	 * 
	 * @param docBody
	 *            the parsed XML record.
	 * @param enrichmentXPaths
	 *            the xpaths to evaluate, in iteration order.
	 * @return true as soon as one xpath yields a non-empty result, false if none does (including when the collection is empty).
	 */
	// NOTE(review): enrichmentXPaths is iterated without a null check -- confirm that callers never pass null.
	private boolean isEnhanced(final Document docBody, final Collection<String> enrichmentXPaths) {
114
		for (String xpath : enrichmentXPaths) {
115
			// NOTE(review): raw List; only null/emptiness of the result is inspected, the selected nodes themselves are not used.
			List xPathResult = docBody.selectNodes(xpath);
116
			if ((xPathResult != null) && !xPathResult.isEmpty()) return true;
117
		}
118
		return false;
119
	}
120

  
82 121
	/**
	 * Tells whether the record is a duplicate, i.e. whether the configured duplicateXPath selects a node in it.
	 * 
	 * @param doc
	 *            the parsed XML record.
	 * @return true if evaluating duplicateXPath on the document returns a node, false otherwise.
	 */
	// NOTE(review): duplicateXPath is declared without an initial value -- presumably it must be set before this is called; confirm.
	public boolean isDuplicate(final Document doc) {
83 122
		return doc.selectSingleNode(duplicateXPath) != null;
84 123
	}
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/oai/OaiFeedMapper.java
63 63

  
64 64
	private MongoClient mongo;
65 65

  
66
	private Collection<String> enrichmentXPaths;
67

  
66 68
	@Override
67 69
	protected void setup(final Context context) throws UnknownHostException {
68 70

  
......
98 100
		duplicateXPath = context.getConfiguration().get("services.publisher.oai.duplicateXPath");
99 101
		skipDuplicates = Boolean.parseBoolean(context.getConfiguration().get("services.publisher.oai.skipDuplicates"));
100 102

  
103
		enrichmentXPaths = oaiConfiguration.getEnrichmentXPathsFor(format, layout, interpretation);
101 104
		Collection<PublisherField> indexFields = oaiConfiguration.getFieldsFor(format, layout, interpretation);
102 105
		extractor = new RecordFieldsExtractor(Lists.newArrayList(indexFields));
103 106
		extractor.setDuplicateXPath(duplicateXPath);
......
131 134
			if (StringUtils.isBlank(recordBody)) {
132 135
				discard(context, recordKey, recordBody, "blank body");
133 136
			} else {
134
				Multimap<String, String> recordFields = extractor.extractFields(recordBody);
137
				Multimap<String, String> recordFields = extractor.extractFields(recordBody, enrichmentXPaths);
135 138
				String id = "";
136 139
				String oaiID = "";
137 140
				if (checkRecordFields(recordFields, context, recordKey, recordBody)) {
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/oai/config/OAIConfiguration.java
8 8
import com.google.common.base.Joiner;
9 9
import com.google.common.base.Predicate;
10 10
import com.google.common.collect.Collections2;
11
import com.google.common.collect.Multimap;
11 12
import com.google.common.collect.Sets;
12 13
import com.google.gson.Gson;
13 14
import com.google.gson.GsonBuilder;
......
51 52

  
52 53
	private Set<MDFInfo> sourcesMDF = Sets.newHashSet();
53 54

  
55
	/** Enrichment xpaths grouped by metadata reference, i.e. the "format-layout-interpretation" key built in getEnrichmentXPathsFor. Not initialized here: stays null until setEnrichmentXPaths is called. */
	private Multimap<String, String> enrichmentXPaths;
56

  
54 57
	public Collection<PublisherField> getFieldsFor(final String format, final String layout, final String interpretation) {
55 58
		final String mdRef = Joiner.on("-").join(format, layout, interpretation);
56 59
		return Collections2.filter(this.getFields(), new Predicate<PublisherField>() {
......
64 67
		});
65 68
	}
66 69

  
70
	public Collection<String> getEnrichmentXPathsFor(final String format, final String layout, final String interpretation) {
71
		final String mdRef = Joiner.on("-").join(format, layout, interpretation);
72
		return enrichmentXPaths.get(mdRef);
73
	}
74

  
67 75
	public Map<String, SetInfo> getSetsMap() {
68 76
		return setsMap;
69 77
	}
......
120 128
		this.idNamespace = idNamespace;
121 129
	}
122 130

  
131
	public Multimap<String, String> getEnrichmentXPaths() {
132
		return enrichmentXPaths;
133
	}
134

  
135
	public void setEnrichmentXPaths(final Multimap<String, String> enrichmentXPaths) {
136
		this.enrichmentXPaths = enrichmentXPaths;
137
	}
138

  
123 139
	@Override
124 140
	public String toString() {
125 141
		Gson gson = new GsonBuilder().setPrettyPrinting().create();
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/oai/config/OAIConfigurationParser.java
14 14
import org.apache.commons.logging.Log;
15 15
import org.apache.commons.logging.LogFactory;
16 16

  
17
import com.google.common.base.Joiner;
17 18
import com.google.common.collect.ArrayListMultimap;
18 19
import com.google.common.collect.Lists;
19 20
import com.google.common.collect.Maps;
......
50 51
		Map<String, MDFInfo> mdFormatsMap = Maps.newHashMap();
51 52
		List<String> indexNames = Lists.newArrayList();
52 53
		List<PublisherField> fields = Lists.newArrayList();
54
		Multimap<String, String> enrichmentPaths = ArrayListMultimap.create();
53 55
		try {
54 56
			final XMLStreamReader parser = factory.get().createXMLStreamReader(new StreamSource(new StringReader(configurationProfile)));
55 57
			log.debug("Configuration profile read by " + parser.getClass().getCanonicalName());
......
130 132
								publisherField.setSources(fieldSources);
131 133
								fields.add(publisherField);
132 134

  
135
							} else {
136
								if (localName.equals("ENRICHMENT")) {
137
									boolean inEnrichment = true;
138
									while (parser.hasNext() && inEnrichment) {
139
										event = parser.next();
140
										if (event == XMLStreamConstants.START_ELEMENT) {
141
											MDFInfo mdfInfo = new MDFInfo();
142
											this.handleSourceMDF(mdfInfo, parser);
143
											String mdf = Joiner.on("-").join(mdfInfo.getSourceFormatName(), mdfInfo.getSourceFormatLayout(),
144
													mdfInfo.getSourceFormatInterpretation());
145
											String xpath = parser.getAttributeValue(null, "path");
146
											enrichmentPaths.put(mdf, xpath);
147
										}
148
										if ((event == XMLStreamConstants.END_ELEMENT) && parser.getLocalName().equals("ENRICHMENT")) {
149
											inEnrichment = false;
150
										}
151
									}
152
								}
133 153
							}
134 154
						}
135 155
					}
......
139 159
			config.setFieldNames(indexNames);
140 160
			config.setMdFormatsMap(mdFormatsMap);
141 161
			config.setSetsMap(setsMap);
162
			config.setEnrichmentXPaths(enrichmentPaths);
142 163
			return config;
143 164
		} catch (final XMLStreamException e) {
144 165
			throw new OaiPublisherRuntimeException(e);

Also available in: Unified diff