Project

General

Profile

« Previous | Next » 

Revision 57188

Removed the project reference from src/test/resources/eu/dnetlib/data/transform/odf.xml; the test did not include any check against it.

View differences:

modules/dnet-mapreduce-jobs/trunk/src/test/java/eu/dnetlib/data/mapreduce/hbase/dataexport/RecordFilterTest.java
1
package eu.dnetlib.data.mapreduce.hbase.dataexport;
2

  
3
import com.google.gson.Gson;
4
import eu.dnetlib.data.mapreduce.hbase.bulktag.ProtoMap;
5
import org.dom4j.Document;
6
import org.dom4j.DocumentException;
7
import org.dom4j.io.SAXReader;
8
import org.junit.Assert;
9
import org.junit.Test;
10

  
11
import java.text.ParseException;
12
import java.util.Map;
13

  
14
/**
 * Tests for {@code RecordFilter}: builds filters from JSON criteria (XPath expression mapped to an
 * expected value or regular expression) and checks their verdict on the two sample records
 * {@code non_match_record.xml} and {@code match_record.xml} loaded from the test resources.
 */
public class RecordFilterTest {
15

  
16
    // Regular expression used as the expected value in CRITERIA: whitespace, then text starting with
    // "interdisciplinar" or "transdisciplinary", then whitespace somewhere after it.
    // The backslash is escaped twice: once for the Java literal and once more because the value is
    // embedded in a JSON string, so the JSON parser turns the doubled backslash into a single one.
    // NOTE(review): match_record.xml spells the keyword in upper case while this pattern is lower case;
    // presumably RecordFilter normalises case or matches case-insensitively - confirm in RecordFilter.
    private static final String REGEX = ".*\\\\s(interdisciplinar.*|transdisciplinary.*)\\\\s.*";
17

  
18
    // JSON object with two XPath criteria: the record's deletedbyinference text mapped to "false",
    // and the local name of the entity's child element mapped to "result".
    private static final String DEFAULT_CRITERIA = "{ " +
19
            "\"/*[local-name() ='record']/*[local-name() ='result']/*[local-name() ='metadata']/*[local-name() ='entity']/*[local-name() ='result']/*[local-name() ='datainfo']/*[local-name() ='deletedbyinference']/text()\" : \"false\", " +
20
            "\"local-name(//*[local-name()='entity']/*)\" : \"result\" " +
21
            "}";
22

  
23
    // JSON object mapping the XPaths of the result's title, description and subject text to REGEX.
    private static final String CRITERIA = "{ " +
24
            "\"/*[local-name() ='record']/*[local-name() ='result']/*[local-name() ='metadata']/*[local-name() ='entity']/*[local-name() ='result']/*[local-name() ='title']/text()\" : \""+REGEX+"\", " +
25
            "\"/*[local-name() ='record']/*[local-name() ='result']/*[local-name() ='metadata']/*[local-name() ='entity']/*[local-name() ='result']/*[local-name() ='description']/text()\" : \""+REGEX+"\", " +
26
            "\"/*[local-name() ='record']/*[local-name() ='result']/*[local-name() ='metadata']/*[local-name() ='entity']/*[local-name() ='result']/*[local-name() ='subject']/text()\" : \""+REGEX+"\" " +
27
            "}";
28

  
29
    // XPath selecting the result's dateofacceptance element; handed to every RecordFilter below
    // together with 1990 and 2019 (presumably the accepted year range - confirm in RecordFilter).
    private static final String YEAR_XPATH = "/*[local-name() ='record']/*[local-name() ='result']/*[local-name() ='metadata']/*[local-name() ='entity']/*[local-name() ='result']/*[local-name() ='dateofacceptance']";
30

  
31
    /**
     * Loads {@code non_match_record.xml} and expects the DEFAULT_CRITERIA filter to accept it
     * while the CRITERIA (keyword regex) filter rejects it.
     */
    @Test
32
    public void recordNonMatchFilterTest() throws DocumentException, ParseException {
33

  
34
        // Gson deserialises the JSON criteria into ProtoMap, which is assigned to Map<String, String>.
        final Map<String, String> defaultCriteria = new Gson().fromJson(DEFAULT_CRITERIA, ProtoMap.class);
35

  
36
        final RecordFilter defaultFilter = new RecordFilter(defaultCriteria, YEAR_XPATH, 1990, 2019);
37

  
38
        // The fixture is resolved relative to this test class' package on the classpath.
        final Document doc = new SAXReader().read(getClass().getResourceAsStream("non_match_record.xml"));
39
        // NOTE(review): the meaning of the boolean flag of matches() is not visible here
        // (true for the default criteria, false for the keyword criteria) - confirm in RecordFilter.
        Assert.assertTrue(defaultFilter.matches(doc, true));
40

  
41
        final RecordFilter filter = new RecordFilter(new Gson().fromJson(CRITERIA, ProtoMap.class), YEAR_XPATH, 1990, 2019);
42
        Assert.assertFalse(filter.matches(doc, false));
43
    }
44

  
45
    /**
     * Loads {@code match_record.xml} and expects both the DEFAULT_CRITERIA filter and the
     * CRITERIA (keyword regex) filter to accept it.
     */
    @Test
46
    public void recordMatchFilterTest() throws DocumentException, ParseException {
47

  
48
        final Map<String, String> defaultCriteria = new Gson().fromJson(DEFAULT_CRITERIA, ProtoMap.class);
49

  
50
        final RecordFilter defaultFilter = new RecordFilter(defaultCriteria, YEAR_XPATH, 1990, 2019);
51

  
52
        // The fixture is resolved relative to this test class' package on the classpath.
        final Document doc = new SAXReader().read(getClass().getResourceAsStream("match_record.xml"));
53
        Assert.assertTrue(defaultFilter.matches(doc, true));
54

  
55
        final RecordFilter filter = new RecordFilter(new Gson().fromJson(CRITERIA, ProtoMap.class), YEAR_XPATH, 1990, 2019);
56
        Assert.assertTrue(filter.matches(doc, false));
57
    }
58

  
59
}
modules/dnet-mapreduce-jobs/trunk/src/test/resources/eu/dnetlib/data/mapreduce/hbase/dataexport/non_match_record.xml
1
<record rank="null">
2
    <result xmlns:dri="http://www.driver-repository.eu/namespace/dri">
3
        <header xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
4
            <dri:objIdentifier>dedup_wf_001::3ed91398db93e83c7f2a2f09c1e229ce</dri:objIdentifier>
5
            <dri:dateOfCollection>2019-03-21T11:39:15.746Z</dri:dateOfCollection>
6
            <dri:dateOfTransformation>2019-03-21T11:46:25.499Z</dri:dateOfTransformation>
7
            <counters>
8
                <counter_similarity_inferred value="6"/>
9
                <counter_similarity value="6"/>
10
                <counter_dedup value="2"/>
11
                <counter_doi value="1"/>
12
            </counters>
13
        </header>
14
        <metadata>
15
            <oaf:entity xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:oaf="http://namespace.openaire.eu/oaf" xsi:schemaLocation="http://namespace.openaire.eu/oaf https://www.openaire.eu/schema/1.0/oaf-1.0.xsd">
16
                <oaf:result>
17
                    <subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies">T</subject>
18
                    <subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies">G</subject>
19
                    <subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies">GE1-350</subject>
20
                    <subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies">Geography. Anthropology. Recreation</subject>
21
                    <subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies">Environmental technology. Sanitary engineering</subject>
22
                    <subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies">Environmental sciences</subject>
23
                    <subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies">Technology</subject>
24
                    <subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies">TD1-1066</subject>
25
                    <title classid="main title" classname="main title" schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title">
26
                        Impacts of climate change under CMIP5 RCP scenarios on the streamflow in the Dinder River and ecosystem habitats in Dinder National Park, Sudan
27
                    </title>
28
                    <publisher>Copernicus Publications</publisher>
29
                    <journal issn="1607-7938" eissn="1607-7938" lissn="" ep="" iss="" sp="" vol=""/>
30
                    <dateofacceptance>2018-09-27</dateofacceptance>
31
                    <language classid="eng" classname="English" schemeid="dnet:languages" schemename="dnet:languages"/>
32
                    <resulttype classid="publication" classname="publication" schemeid="dnet:result_typologies" schemename="dnet:result_typologies"/>
33
                    <creator rank="1" name="Amir K." surname="Basheer">Basheer, Amir K.</creator>
34
                    <creator rank="2" name="Haishen" surname="Lu">Lu, Haishen</creator>
35
                    <creator rank="3" name="Abubaker" surname="Omer">Omer, Abubaker</creator>
36
                    <creator rank="4" name="Abubaker B." surname="Ali">Ali, Abubaker B.</creator>
37
                    <creator rank="5" name="Abdeldime M. S." surname="Abdelgader">Abdelgader, Abdeldime M. S.</creator>
38
                    <fulltext>
39
                        file:///mnt/uploaded_dumps/copernicus/upload/hess-20-1331-2016.pdf
40
                    </fulltext>
41
                    <description>
42
                        The fate of seasonal river ecosystem habitats under climate change essentially depends on the changes in annual recharge of the river, which are related to alterations in precipitation and evaporation over the river basin. Therefore, the change in climate conditions is expected to significantly affect hydrological and ecological components, particularly in fragmented ecosystems. This study aims to assess the impacts of climate change on the streamflow in the Dinder River basin (DRB) and to infer its relative possible effects on the Dinder National Park (DNP) ecosystem habitats in Sudan. Four global circulation models (GCMs) from Coupled Model Intercomparison Project Phase 5 and two statistical downscaling approaches combined with a hydrological model (SWAT ndash; the Soil and Water Assessment Tool) were used to project the climate change conditions over the study periods 2020s, 2050s, and 2080s. The results indicated that the climate over the DRB will become warmer and wetter under most scenarios. The projected precipitation variability mainly depends on the selected GCM and downscaling approach. Moreover, the projected streamflow is quite sensitive to rainfall and temperature variation, and will likely increase in this century. In contrast to drought periods during the 1960s, 1970s, and 1980s, the predicted climate change is likely to affect ecosystems in DNP positively and promote the ecological restoration for the habitats of flora and fauna.
43
                    </description>
44
                    <format>application/pdf</format>
45
                    <source>
46
                        Hydrology and Earth System Sciences, Vol 20, Iss 4, Pp 1331-1353 (2016)
47
                    </source>
48
                    <source>eISSN: 1607-7938</source>
49
                    <country classid="" classname="" schemeid="" schemename=""/>
50
                    <relevantdate classid="" classname="" schemeid="" schemename=""/>
51
                    <embargoenddate/>
52
                    <contributor/>
53
                    <resourcetype classid="" classname="" schemeid="" schemename=""/>
54
                    <coverage/>
55
                    <refereed/>
56
                    <storagedate/>
57
                    <device/>
58
                    <size/>
59
                    <version/>
60
                    <lastmetadataupdate/>
61
                    <metadataversionnumber/>
62
                    <documentationUrl/>
63
                    <codeRepositoryUrl/>
64
                    <programmingLanguage classid="" classname="" schemeid="" schemename=""/>
65
                    <contactperson/>
66
                    <contactgroup/>
67
                    <tool/>
68
                    <originalId>
69
                        oai:doaj.org/article:8956bc6de28f4705af47f57ae01a0d6e
70
                    </originalId>
71
                    <originalId>oai:publications.copernicus.org:hess32404</originalId>
72
                    <collectedfrom name="DOAJ-Articles" id="driver______::bee53aa31dc2cbb538c10c2b65fa5824"/>
73
                    <collectedfrom name="Copernicus Publications" id="openaire____::5a38cb462ac487bf26bdb86009fe3e74"/>
74
                    <pid classid="doi" classname="doi" schemeid="dnet:pid_types" schemename="dnet:pid_types">10.5194/hess-20-1331-2016</pid>
75
                    <bestaccessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes"/>
76
                    <datainfo>
77
                        <inferred>true</inferred>
78
                        <deletedbyinference>false</deletedbyinference>
79
                        <trust>0.9</trust>
80
                        <inferenceprovenance>dedup-similarity-result-levenstein</inferenceprovenance>
81
                        <provenanceaction classid="sysimport:dedup" classname="sysimport:dedup" schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
82
                    </datainfo>
83
                    <rels>
84
                        <rel inferred="true" trust="0.9" inferenceprovenance="iis::document_similarities_standard" provenanceaction="iis">
85
                            <to class="hasAmongTopNSimilarDocuments" scheme="dnet:result_result_relations" type="result">copernicuspu::86069783ef5f47bf6b4b23a26f9b632c</to>
86
                            <title classid="main title" classname="main title" schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title">
87
                                Investigation of the long-term variations in hydro-climatology of the Dinder and Rahad basins and its implications on ecosystems of the Dinder National Park, Sudan
88
                            </title>
89
                            <resulttype classid="publication" classname="publication" schemeid="dnet:result_typologies" schemename="dnet:result_typologies"/>
90
                            <pid classid="doi" classname="doi" schemeid="dnet:pid_types" schemename="dnet:pid_types">10.5194/hess-2016-407</pid>
91
                            <similarity>0.86955255</similarity>
92
                            <type>STANDARD</type>
93
                            <dateofacceptance>2018-09-27</dateofacceptance>
94
                        </rel>
95
                        <rel inferred="true" trust="0.9" inferenceprovenance="iis::document_similarities_standard" provenanceaction="iis">
96
                            <to class="isAmongTopNSimilarDocuments" scheme="dnet:result_result_relations" type="result">copernicuspu::86069783ef5f47bf6b4b23a26f9b632c</to>
97
                            <title classid="main title" classname="main title" schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title">
98
                                Investigation of the long-term variations in hydro-climatology of the Dinder and Rahad basins and its implications on ecosystems of the Dinder National Park, Sudan
99
                            </title>
100
                            <resulttype classid="publication" classname="publication" schemeid="dnet:result_typologies" schemename="dnet:result_typologies"/>
101
                            <pid classid="doi" classname="doi" schemeid="dnet:pid_types" schemename="dnet:pid_types">10.5194/hess-2016-407</pid>
102
                            <similarity>0.86955255</similarity>
103
                            <type>STANDARD</type>
104
                            <dateofacceptance>2018-09-27</dateofacceptance>
105
                        </rel>
106
                        <rel inferred="true" trust="0.9" inferenceprovenance="iis::document_similarities_standard" provenanceaction="iis">
107
                            <to class="hasAmongTopNSimilarDocuments" scheme="dnet:result_result_relations" type="result">dedup_wf_001::959d58f74f5139ae3e8dd4b02c7314c0</to>
108
                            <similarity>0.7040297</similarity>
109
                            <resulttype classid="publication" classname="publication" schemeid="dnet:result_typologies" schemename="dnet:result_typologies"/>
110
                            <publisher>Copernicus Publications</publisher>
111
                            <title classid="main title" classname="main title" schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title">
112
                                Analysis of streamflow response to land use and land cover changes using satellite data and hydrological modelling: case study of Dinder and Rahad tributaries of the Blue Nile (Ethiopia–Sudan)
113
                            </title>
114
                            <type>STANDARD</type>
115
                            <dateofacceptance>2018-09-27</dateofacceptance>
116
                            <pid classid="doi" classname="doi" schemeid="dnet:pid_types" schemename="dnet:pid_types">10.5194/hess-21-5217-2017</pid>
117
                            <dateofacceptance>2017-10-01</dateofacceptance>
118
                        </rel>
119
                        <rel inferred="true" trust="0.9" inferenceprovenance="iis::document_similarities_standard" provenanceaction="iis">
120
                            <to class="isAmongTopNSimilarDocuments" scheme="dnet:result_result_relations" type="result">dedup_wf_001::959d58f74f5139ae3e8dd4b02c7314c0</to>
121
                            <similarity>0.7040297</similarity>
122
                            <resulttype classid="publication" classname="publication" schemeid="dnet:result_typologies" schemename="dnet:result_typologies"/>
123
                            <publisher>Copernicus Publications</publisher>
124
                            <title classid="main title" classname="main title" schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title">
125
                                Analysis of streamflow response to land use and land cover changes using satellite data and hydrological modelling: case study of Dinder and Rahad tributaries of the Blue Nile (Ethiopia–Sudan)
126
                            </title>
127
                            <type>STANDARD</type>
128
                            <dateofacceptance>2018-09-27</dateofacceptance>
129
                            <pid classid="doi" classname="doi" schemeid="dnet:pid_types" schemename="dnet:pid_types">10.5194/hess-21-5217-2017</pid>
130
                            <dateofacceptance>2017-10-01</dateofacceptance>
131
                        </rel>
132
                        <rel inferred="true" trust="0.9" inferenceprovenance="iis::document_similarities_standard" provenanceaction="iis">
133
                            <to class="hasAmongTopNSimilarDocuments" scheme="dnet:result_result_relations" type="result">dedup_wf_001::df4969fc6413f99334a2451f5f4a3221</to>
134
                            <publisher>
135
                                Freie Universität Berlin Universitätsbibliothek, Garystr. 39, 14195 Berlin
136
                            </publisher>
137
                            <dateofacceptance>2010-02-19</dateofacceptance>
138
                            <similarity>0.7040297</similarity>
139
                            <resulttype classid="publication" classname="publication" schemeid="dnet:result_typologies" schemename="dnet:result_typologies"/>
140
                            <type>STANDARD</type>
141
                            <title classid="main title" classname="main title" schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title">
142
                                integration of socio-economic development and conservation as a challange for the protected area management ; the cases of Wadi Howar National Park and Dinder National Park
143
                            </title>
144
                        </rel>
145
                        <rel inferred="true" trust="0.9" inferenceprovenance="iis::document_similarities_standard" provenanceaction="iis">
146
                            <to class="isAmongTopNSimilarDocuments" scheme="dnet:result_result_relations" type="result">dedup_wf_001::df4969fc6413f99334a2451f5f4a3221</to>
147
                            <publisher>
148
                                Freie Universität Berlin Universitätsbibliothek, Garystr. 39, 14195 Berlin
149
                            </publisher>
150
                            <dateofacceptance>2010-02-19</dateofacceptance>
151
                            <similarity>0.7040297</similarity>
152
                            <resulttype classid="publication" classname="publication" schemeid="dnet:result_typologies" schemename="dnet:result_typologies"/>
153
                            <type>STANDARD</type>
154
                            <title classid="main title" classname="main title" schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title">
155
                                integration of socio-economic development and conservation as a challange for the protected area management ; the cases of Wadi Howar National Park and Dinder National Park
156
                            </title>
157
                        </rel>
158
                    </rels>
159
                    <children>
160
                        <result objidentifier="copernicuspu::3ed91398db93e83c7f2a2f09c1e229ce">
161
                            <title classid="main title" classname="main title" schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title">
162
                                Impacts of climate change under CMIP5 RCP scenarios on the streamflow in the Dinder River and ecosystem habitats in Dinder National Park, Sudan
163
                            </title>
164
                            <dateofacceptance>2018-09-27</dateofacceptance>
165
                            <resulttype classid="publication" classname="publication" schemeid="dnet:result_typologies" schemename="dnet:result_typologies"/>
166
                        </result>
167
                        <result objidentifier="doajarticles::7cca16547fba04de50f609f0f7a2b34f">
168
                            <title classid="main title" classname="main title" schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title">
169
                                Impacts of climate change under CMIP5 RCP scenarios on the streamflow in the Dinder River and ecosystem habitats in Dinder National Park, Sudan
170
                            </title>
171
                            <publisher>Copernicus Publications</publisher>
172
                            <dateofacceptance>2016-04-01</dateofacceptance>
173
                            <resulttype classid="publication" classname="publication" schemeid="dnet:result_typologies" schemename="dnet:result_typologies"/>
174
                        </result>
175
                        <instance id="copernicuspu::7e1a08d9f8c386b160529380308fb039">
176
                            <accessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes"/>
177
                            <dateofacceptance>2018-09-27</dateofacceptance>
178
                            <instancetype classid="0038" classname="Other literature type" schemeid="dnet:publication_resource" schemename="dnet:publication_resource"/>
179
                            <hostedby name="Hydrology and Earth System Sciences (HESS)" id="copernicuspu::7e1a08d9f8c386b160529380308fb039"/>
180
                            <collectedfrom name="Copernicus Publications" id="openaire____::5a38cb462ac487bf26bdb86009fe3e74"/>
181
                            <webresource>
182
                                <url>
183
                                    https://www.hydrol-earth-syst-sci.net/20/1331/2016/
184
                                </url>
185
                            </webresource>
186
                        </instance>
187
                        <instance id="doajarticles::e11c3334d13cac1b2a8a89976026f695">
188
                            <accessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes"/>
189
                            <dateofacceptance>2016-04-01</dateofacceptance>
190
                            <instancetype classid="0001" classname="Article" schemeid="dnet:publication_resource" schemename="dnet:publication_resource"/>
191
                            <hostedby name="Hydrology and Earth System Sciences" id="doajarticles::e11c3334d13cac1b2a8a89976026f695"/>
192
                            <collectedfrom name="DOAJ-Articles" id="driver______::bee53aa31dc2cbb538c10c2b65fa5824"/>
193
                            <webresource>
194
                                <url>https://doaj.org/toc/1027-5606</url>
195
                            </webresource>
196
                            <webresource>
197
                                <url>https://doaj.org/toc/1607-7938</url>
198
                            </webresource>
199
                            <webresource>
200
                                <url>
201
                                    http://www.hydrol-earth-syst-sci.net/20/1331/2016/hess-20-1331-2016.pdf
202
                                </url>
203
                            </webresource>
204
                        </instance>
205
                    </children>
206
                </oaf:result>
207
            </oaf:entity>
208
        </metadata>
209
    </result>
210
</record>
modules/dnet-mapreduce-jobs/trunk/src/test/resources/eu/dnetlib/data/mapreduce/hbase/dataexport/match_record.xml
1
<record rank="null">
2
    <result>
3
        <header>
4
            <objIdentifier>od________65::3a1124fd7b91a4c18376ae175b59b4ce</objIdentifier>
5
            <dateOfCollection>2019-05-23T20:02:01.021Z</dateOfCollection>
6
            <dateOfTransformation>2019-06-17T16:16:05.741Z</dateOfTransformation>
7
            <counters></counters>
8
        </header>
9
        <metadata>
10
            <entity schemaLocation="http://namespace.openaire.eu/oaf https://www.openaire.eu/schema/1.0/oaf-1.0.xsd">
11
                <result>
12
                    <subject classid="arxiv" classname="arxiv" schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies" inferred="true" inferenceprovenance="iis::document_classes" provenanceaction="iis" trust="0.7245">Physics::Instrumentation and Detectors</subject>
13
                    <subject classid="keyword" classname="keyword" schemeid="dnet:result_subject" schemename="dnet:result_subject">Particle Physics - Experiment</subject>
14
                    <subject classid="arxiv" classname="arxiv" schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies" inferred="true" inferenceprovenance="iis::document_classes" provenanceaction="iis" trust="0.8055">High Energy Physics::Phenomenology</subject>
15
                    <subject classid="arxiv" classname="arxiv" schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies" inferred="true" inferenceprovenance="iis::document_classes" provenanceaction="iis" trust="0.7542">Nuclear Experiment</subject>
16
                    <subject classid="arxiv" classname="arxiv" schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies" inferred="true" inferenceprovenance="iis::document_classes" provenanceaction="iis" trust="0.882">High Energy Physics::Experiment</subject>
17
                    <resulttype classid="publication" classname="publication" schemeid="dnet:result_typologies" schemename="dnet:result_typologies" />
18
                    <journal issn="" eissn="" lissn="" ep="" iss="" sp="" vol="" />
19
                    <language classid="und" classname="Undetermined" schemeid="dnet:languages" schemename="dnet:languages" />
20
                    <creator rank="1" name="Collaboration" surname="Cms">CMS Collaboration</creator>
21
                    <embargoenddate />
22
                    <dateofacceptance>2010-01-01</dateofacceptance>
23
                    <description>Proton--proton collision INTERDISCIPLINA events collected with the CMS experiment at LHC at a center--of--mass energy of $\sqrt{s} = 7$~TeV in 2010 are used to commission the algorithms for reconstruction and identification of tau lepton hadronic decays. Four different types of algorithms are considered: three based on particle--flow event reconstruction and one based on combinations of tracks and calorimeter clusters. Probabilities for quark and gluon jets to pass the tau identification criteria of the different algorithms are measured in data dominated by QCD multi--jet events and compared to predictions of Monte Carlo simulations.</description>
24
                    <title classid="main title" classname="main title" schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title">Study INTERDISCIPLINARY of tau reconstruction algorithms using pp collisions data collected at sqrt(s) = 7 TeV</title>
25
                    <country classid="" classname="" schemeid="" schemename="" />
26
                    <relevantdate classid="" classname="" schemeid="" schemename="" />
27
                    <publisher />
28
                    <source />
29
                    <fulltext />
30
                    <format />
31
                    <contributor />
32
                    <resourcetype classid="" classname="" schemeid="" schemename="" />
33
                    <coverage />
34
                    <refereed />
35
                    <storagedate />
36
                    <device />
37
                    <size />
38
                    <version />
39
                    <lastmetadataupdate />
40
                    <metadataversionnumber />
41
                    <documentationUrl />
42
                    <codeRepositoryUrl />
43
                    <programmingLanguage classid="" classname="" schemeid="" schemename="" />
44
                    <contactperson />
45
                    <contactgroup />
46
                    <tool />
47
                    <collectedfrom name="CERN Document Server" id="opendoar____::fc490ca45c00b1249bbe3554a4fdf6fb" />
48
                    <originalId>oai:cds.cern.ch:1279358</originalId>
49
                    <pid classid="" classname="" schemeid="" schemename="" />
50
                    <bestaccessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes" />
51
                    <context id="egi" label="EGI Federation" type="community">
52
                        <category id="egi::virtual" label="EGI virtual organizations">
53
                            <concept id="egi::virtual::4" label="cms" />
54
                        </category>
55
                    </context>
56
                    <datainfo>
57
                        <inferred>false</inferred>
58
                        <deletedbyinference>false</deletedbyinference>
59
                        <trust>0.9</trust>
60
                        <inferenceprovenance />
61
                        <provenanceaction classid="sysimport:crosswalk:repository" classname="sysimport:crosswalk:repository" schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions" />
62
                    </datainfo>
63
                    <rels></rels>
64
                    <children>
65
                        <instance id="opendoar____::fc490ca45c00b1249bbe3554a4fdf6fb">
66
                            <collectedfrom name="CERN Document Server" id="opendoar____::fc490ca45c00b1249bbe3554a4fdf6fb" />
67
                            <accessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes" />
68
                            <hostedby name="CERN Document Server" id="opendoar____::fc490ca45c00b1249bbe3554a4fdf6fb" />
69
                            <instancetype classid="0038" classname="Other literature type" schemeid="dnet:publication_resource" schemename="dnet:publication_resource" />
70
                            <dateofacceptance>2010-01-01</dateofacceptance>
71
                            <webresource>
72
                                <url>http://cds.cern.ch/record/1279358</url>
73
                            </webresource>
74
                        </instance>
75
                    </children>
76
                </result>
77
            </entity>
78
        </metadata>
79
    </result>
80
</record>
modules/dnet-mapreduce-jobs/trunk/src/test/resources/eu/dnetlib/data/transform/odf.xml
26 26
        <dc:creator>Corso, Mariano</dc:creator>
27 27
        <dc:title>(Re-)Designing the Business Model of a Digital Ecosystem: An Example in the Socio-Care Context</dc:title>
28 28
        <dc:date>2018</dc:date>
29
        <oaf:projectid>corda__h2020::643588</oaf:projectid>
30 29
    </metadata>
31 30
    <provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
32 31
        <originDescription harvestDate="2019-04-11T14:51:27.828+02:00" altered="true">
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/broker/SoftwareEventFactory.java
44 44

  
45 45
	public static List<EventWrapper> process(final Context context, final Oaf current, final Oaf other, final Float trust, final Map<String, String> baseUrlMap)
46 46
			throws IOException, InterruptedException, DocumentException {
47

  
48
		/*
49
		 * if (!current.getRel().hasCachedOafTarget() || (other != null && !other.getRel().hasCachedOafTarget())) {
50
		 * context.getCounter(COUNTER_GROUP, "events skipped: missing project 2nd step").increment(1); return; }
51
		 */
52

  
53 47
		return new SoftwareEventFactory(baseUrlMap).processSoftware(context, current, other, trust);
54 48
	}
55 49

  
......
67 61

  
68 62
				final String provenance = oafRel.getDataInfo().getProvenanceaction().getClassid();
69 63
				if (inferenceProvenance.contains(provenance)) {
70
					final OafEntity result = oafRel.getRel().getCachedOafTarget().getEntity();
71
					events.add(doProcessSoftware(context, current, current, result, provenance, Topic.ENRICH_MISSING_SOFTWARE, trust(trust, oafRel)));
64
					final Software software = mapRelatedSoftware(oafRel.getRel().getCachedOafTarget().getEntity().getResult());
65
					events.add(doProcessSoftware(context, current, current, software, provenance, Topic.ENRICH_MISSING_SOFTWARE, trust(trust, oafRel)));
72 66
				}
73 67
			}
74 68
		} else {
......
82 76

  
83 77
						final String provenance = otherOafRel.getDataInfo().getProvenanceaction().getClassid();
84 78

  
85
						final OafEntity software = otherOafRel.getRel().getCachedOafTarget().getEntity();
79
						final OafEntity swEntity = otherOafRel.getRel().getCachedOafTarget().getEntity();
80
						final Software software = mapRelatedSoftware(swEntity.getResult());
86 81

  
87 82
						final boolean currentHasSw = Iterables.tryFind(current.getEntity().getCachedOafRelList(), oaf -> {
88 83
							final String currentSwId = oaf.getRel().getCachedOafTarget().getEntity().getId();
89
							// System.out.println(String.format("%s = %s ? %s", currentProjectId, project.getId(),
90
							// currentProjectId.equals(project.getId())));
91
							return currentSwId.equals(software.getId());
84
							return currentSwId.equals(swEntity.getId());
92 85
						}).isPresent();
93 86

  
94 87
						if (!currentHasSw) {
95
							// System.out.println(String.format("generating event for other = %s\n\nproject = %s", other, project));
96
							events.add(doProcessSoftware(context, current, other, software, provenance, Topic.ENRICH_MISSING_PROJECT,
88
							events.add(doProcessSoftware(context, current, other, software, provenance, Topic.ENRICH_MISSING_SOFTWARE,
97 89
									trust(trust, currentOafRel)));
98 90
						}
99 91
					}
......
106 98
	private EventWrapper doProcessSoftware(final Context context,
107 99
			final Oaf current,
108 100
			final Oaf other,
109
			final OafEntity software,
101
			final Software software,
110 102
			final String provenance,
111 103
			final Topic topic,
112 104
			final Float trust)
......
117 109

  
118 110
		final Provenance prov = getProvenance(otherEntity, provenance);
119 111

  
120
		final OpenAireEventPayload payload = addSoftware(OpenAireEventPayloadFactory.fromOAF(currentEntity, trust, prov), software);
112
		final OpenAireEventPayload payload = OpenAireEventPayloadFactory.fromOAF(currentEntity, trust, prov);
121 113

  
122
		final EventMessage event = asEvent(currentEntity, topic, payload, otherEntity, trust);
123
		event.setPayload(HighlightFactory.highlightEnrichSoftware(payload, software, provenance).toJSON());
124
		return EventWrapper.newInstance(event,
125
				payload.getHighlight().getSoftwares().stream().filter(Objects::nonNull).map(s -> s.getName()).sorted()
126
						.collect(Collectors.joining(", ")),
127
				topic.getValue());
128
	}
129

  
130
	private OpenAireEventPayload addSoftware(final OpenAireEventPayload payload, final OafEntity software) {
131 114
		final Map<String, Software> swMap = Maps.newHashMap();
132 115
		for (final Software s : payload.getPublication().getSoftwares()) {
133 116
			swMap.put(s.getLandingPage(), s);
134 117
		}
135
		final Software hlSw = mapRelatedSoftware(software.getResult());
136
		swMap.put(hlSw.getLandingPage(), hlSw);
137 118

  
119
		swMap.put(software.getLandingPage(), software);
120

  
138 121
		payload.getPublication().setSoftwares(Lists.newArrayList(swMap.values()));
122
		payload.getHighlight().setSoftwares(Lists.newArrayList(software));
139 123

  
140
		return payload;
124
		final EventMessage event = asEvent(currentEntity, topic, payload, otherEntity, trust);
125

  
126
		event.setPayload(payload.toJSON());
127
		return EventWrapper.newInstance(event,
128
				payload.getHighlight().getSoftwares().stream().filter(Objects::nonNull).map(s -> s.getName()).sorted()
129
						.collect(Collectors.joining(", ")),
130
				topic.getValue());
141 131
	}
142 132

  
143 133
	private Provenance getProvenance(final OafEntity entity, final String provenance) {
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/broker/enrich/SoftwareEnrichmentReducer.java
26 26

  
27 27
	@Override
28 28
	protected String counterGroup() {
29
		return "Broker Enrichment projects";
29
		return "Broker Enrichment Software";
30 30
	}
31 31

  
32 32
	@Override
......
70 70
				.collect(Collectors.toList());
71 71

  
72 72
		if (valid.isEmpty()) {
73
			context.getCounter(counterGroup(), "Events Skipped - Missing project").increment(1);
73
			context.getCounter(counterGroup(), "Events Skipped - Missing software").increment(1);
74 74
			return;
75 75
		}
76 76

  
......
116 116

  
117 117
	}
118 118

  
119
	private Oaf addSoftware(final Oaf current, final Map<String, Oaf> software) {
119
	private Oaf addSoftware(final Oaf current, final Map<String, Oaf> softwareMap) {
120 120

  
121 121
		final Oaf.Builder oafBuilder = Oaf.newBuilder(current);
122 122
		final List<Oaf> cachedRels = Lists.newArrayList();
123 123

  
124 124
		for (final Oaf.Builder cachedOafRel : oafBuilder.getEntityBuilder().getCachedOafRelBuilderList()) {
125
			final String projectId = cachedOafRel.getRel().getTarget();
125
			final String softwareId = cachedOafRel.getRel().getTarget();
126 126

  
127
			if (software.containsKey(projectId)) {
128
				final Oaf project = software.get(projectId);
127
			if (softwareMap.containsKey(softwareId)) {
128
				final Oaf software = softwareMap.get(softwareId);
129 129

  
130
				cachedOafRel.getRelBuilder().setCachedOafTarget(project);
130
				cachedOafRel.getRelBuilder().setCachedOafTarget(software);
131 131
				cachedRels.add(cachedOafRel.build());
132 132
			}
133 133
		}
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/broker/enrich/EnrichmentReducer.java
72 72

  
73 73
								final float trust = scale(similarity);
74 74
								if (!DedupUtils.isRoot(currentId) && !DedupUtils.isRoot(otherId)) {
75
									events.addAll(PIDEventFactory.process(current, other, trust));
76
									events.addAll(OAVersionEventFactory.process(current, other, trust, untrustedOaDsList));
77
									events.addAll(AbstractEventFactory.process(current, other, trust));
78
									events.addAll(PublicationDateEventFactory.process(current, other, trust));
75
									//events.addAll(PIDEventFactory.process(current, other, trust));
76
									//events.addAll(OAVersionEventFactory.process(current, other, trust, untrustedOaDsList));
77
									//events.addAll(AbstractEventFactory.process(current, other, trust));
78
									//events.addAll(PublicationDateEventFactory.process(current, other, trust));
79 79
									events.addAll(OrcidEventFactory.process(current, other, trust));
80 80
								}
81 81

  
82
								events.addAll(SubjectEventFactory.process(context, current, other, trust));
82
								//events.addAll(SubjectEventFactory.process(context, current, other, trust));
83 83
							} else {
84 84
								context.getCounter(counterGroup(), "d < " + dedupConf.getWf().getThreshold()).increment(1);
85 85
							}
86 86

  
87 87
						} else if (oafList.size() == 1) {
88
							events.addAll(SubjectEventFactory.process(context, current));
88
							//events.addAll(SubjectEventFactory.process(context, current));
89 89
						}
90 90
					}
91 91
					emit(events, context);
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/broker/enrich/SoftwareEnrichmentMapper.java
69 69
				emit(context, key.copyBytes(), oafBuilder.build().toByteArray(), PUBLICATION);
70 70
				break;
71 71
			}
72
			break;
72 73
		default:
73 74
			throw new IllegalArgumentException("invalid type: " + type);
74 75
		}
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/broker/mapping/HighlightFactory.java
68 68
	}
69 69

  
70 70
	public static OpenAireEventPayload highlightEnrichSoftware(final OpenAireEventPayload p, final OafEntity software, final String provenance) {
71
		// TODO: this can wait. Think about generating the openaire string for project links: it will be easier for subscribers to integrate
72
		// it back to their records!
73 71

  
74 72
		p.getHighlight().setSoftwares(Lists.newArrayList(mapRelatedSoftware(software.getResult())));
75 73

  
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/broker/mapping/ProtoMapping.java
100 100
		return p;
101 101
	}
102 102

  
103
	protected static final List<Software> mapRelatedSoftwares(final OafEntity entity) {
104
		final Map<String, Oaf> softwareMap = Maps.newHashMap();
105
		for(Oaf rel : entity.getCachedOafRelList()) {
106
			final OafEntity p = rel.getRel().getCachedOafTarget().getEntity();
107
			softwareMap.put(p.getId(), Oaf.newBuilder(rel).build());
108
		}
109

  
110
		return softwareMap.values().stream()
111
				.map(o -> mapRelatedSoftware(o.getRel().getCachedOafTarget().getEntity().getResult()))
112
				.collect(Collectors.toList());
113
	}
114

  
103 115
	protected static final Software mapRelatedSoftware(final ResultProtos.Result result) {
104 116
		final Software s = new Software();
105 117
		final ResultProtos.Result.Metadata rp = result.getMetadata();
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/dataexport/ExportFilteredResultMapper.java
2 2

  
3 3
import com.google.gson.Gson;
4 4
import eu.dnetlib.data.mapreduce.hbase.bulktag.ProtoMap;
5
import org.apache.commons.logging.Log;
6
import org.apache.commons.logging.LogFactory;
7 5
import org.apache.hadoop.io.Text;
8 6
import org.apache.hadoop.mapreduce.Mapper;
9 7
import org.dom4j.Document;
......
14 12
import java.time.Year;
15 13

  
16 14
/**
17
 * Exports the result matching the criteria found in the confguration.
15
 * Exports the result matching the criteria found in the configuration.
18 16
 *
19 17
 * @author claudio
20 18
 */
21 19
public class ExportFilteredResultMapper extends Mapper<Text, Text, Text, Text> {
22 20

  
23
	/**
24
	 * logger.
25
	 */
26
	private static final Log log = LogFactory.getLog(ExportFilteredResultMapper.class); // NOPMD by marko on 11/24/08 5:02 PM
21
	private final static String RESULT_TYPE_XPATH = "/*[local-name() ='record']/*[local-name() ='result']/*[local-name() ='metadata']/*[local-name() ='entity']/*[local-name() ='result']/*[local-name() ='resulttype']/@classid";
27 22

  
28 23
	private Text keyOut;
29 24

  
......
35 30

  
36 31
	@Override
37 32
	protected void setup(final Context context) throws IOException, InterruptedException {
38
		super.setup(context);
39

  
40 33
		keyOut = new Text("");
41 34
		valueOut = new Text();
42 35

  
......
60 53

  
61 54
			final Document doc = new SAXReader().read(new StringReader(record));
62 55

  
63
			if (defaultFilter.matches(doc)) {
56
			if (defaultFilter.matches(doc, true)) {
64 57

  
65
				if (userFilter.matches(doc)) {
58
				if (userFilter.matches(doc, false)) {
66 59
					keyOut.set(keyIn.toString());
67 60
					valueOut.set(value.toString());
68 61

  
69 62
					context.write(keyOut, valueOut);
70
					context.getCounter("filter", "matched criteria").increment(1);
63
					context.getCounter("filter", "matched criteria " +doc.valueOf(RESULT_TYPE_XPATH)).increment(1);
71 64
				} else {
72 65
					context.getCounter("filter", "filtered by criteria").increment(1);
73 66
				}
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/dataexport/RecordFilter.java
3 3
import eu.dnetlib.miscutils.functional.xml.DnetXsltFunctions;
4 4
import org.apache.commons.lang3.StringUtils;
5 5
import org.dom4j.Document;
6
import org.dom4j.tree.DefaultText;
6 7

  
7 8
import java.text.ParseException;
8
import java.util.List;
9 9
import java.util.Map;
10 10
import java.util.Objects;
11 11

  
......
24 24
        this.toYear = toYear;
25 25
    }
26 26

  
27
    public boolean matches(final Document record) throws ParseException {
27
    public boolean matches(final Document record, final boolean strict) throws ParseException {
28 28

  
29 29
        final String date = record.valueOf(yearXpath);
30 30
        if (StringUtils.isBlank(date)) {
31 31
            return false;
32 32
        }
33 33

  
34
        final Integer year = Integer.valueOf(DnetXsltFunctions.extractYear(date));
34
        final String yyyy = DnetXsltFunctions.extractYear(date);
35
        if (StringUtils.isBlank(yyyy)) {
36
            return false;
37
        }
38
        final Integer year = Integer.valueOf(yyyy);
35 39

  
36 40
        if (year < fromYear | year > toYear) {
37 41
            return false;
38 42
        }
39 43

  
44

  
45
        boolean matched = false;
40 46
        for(final Map.Entry<String, String> c : criteria.entrySet()) {
41 47

  
42
            List<String> nodes = record.selectNodes(c.getKey());
43
            if (nodes != null) {
44
                boolean matches = nodes.stream()
45
                        .filter(Objects::nonNull)
46
                        .map(s -> s.toLowerCase())
47
                        .map(s -> s.trim())
48
                        .anyMatch(s -> s.matches(c.getValue()));
49
                if (matches) {
50
                    return true;
51
                }
48
            boolean matches = matched = record.selectNodes(c.getKey()).stream()
49
                    .filter(Objects::nonNull)
50
                    .map(o -> textOf(o))
51
                    .map(s -> ((String) s).toLowerCase())
52
                    .map(s -> ((String) s).trim())
53
                    .anyMatch(s -> {
54
                        return ((String) s).matches(c.getValue().toLowerCase());
55
                    });
56

  
57
            if (matches && !strict) {
58
                return true;
52 59
            }
53 60
        }
54
        return false;
61
        return matched;
55 62
    }
56 63

  
64
    private String textOf(final Object o) {
65
        if (o instanceof org.dom4j.tree.DefaultText) {
66
            return ((DefaultText) o).getText();
67
        }
68
        return o.toString();
69
    }
70

  
57 71
}
modules/dnet-mapreduce-jobs/trunk/src/main/resources/log4j.properties
27 27
log4j.logger.eu.dnetlib.conf.PropertyFetcher=WARN
28 28
#log4j.logger.eu.dnetlib.data.transform.XsltRowTransformerFactory=DEBUG
29 29

  
30
log4j.logger.org.reflections.Reflections=OFF
31

  
30 32
log4j.logger.eu.dnetlib.enabling.is.sn.ISSNServiceImpl=OFF
31 33
log4j.logger.eu.dnetlib.enabling.datasources.DatasourceManagerClients=FATAL
32 34
log4j.logger.eu.dnetlib.data.mdstore.modular.mongodb.utils.MetadataCheckJob=DEBUG

Also available in: Unified diff