Revision 53588
Added by Sandro La Bruzzo over 5 years ago
modules/dnet-mapreduce-jobs/branches/master/src/test/java/eu/dnetlib/data/mapreduce/actions/DOIBoostToActionsTest.java | ||
---|---|---|
1 | 1 |
package eu.dnetlib.data.mapreduce.actions; |
2 | 2 |
|
3 |
import com.google.common.collect.Lists; |
|
3 | 4 |
import com.google.gson.JsonObject; |
4 | 5 |
import com.google.gson.JsonParser; |
5 | 6 |
import eu.dnetlib.actionmanager.actions.ActionFactory; |
6 | 7 |
import eu.dnetlib.actionmanager.actions.AtomicAction; |
7 | 8 |
import eu.dnetlib.actionmanager.common.Agent; |
8 |
import eu.dnetlib.data.mapreduce.hbase.dataimport.CrossRefToActions; |
|
9 | 9 |
import eu.dnetlib.data.mapreduce.hbase.dataimport.DOIBoostToActions; |
10 |
import eu.dnetlib.data.mapreduce.util.XmlRecordFactory; |
|
11 |
import eu.dnetlib.data.transform.Column; |
|
12 |
import eu.dnetlib.data.transform.Row; |
|
13 |
import eu.dnetlib.data.transform.XsltRowTransformerFactoryTest; |
|
10 | 14 |
import org.apache.commons.lang3.StringUtils; |
15 |
import org.dom4j.Document; |
|
16 |
import org.dom4j.io.DocumentResult; |
|
17 |
import org.dom4j.io.DocumentSource; |
|
18 |
import org.dom4j.io.SAXReader; |
|
11 | 19 |
import org.junit.Before; |
12 | 20 |
import org.junit.Test; |
13 | 21 |
|
14 |
import java.io.BufferedReader; |
|
15 |
import java.io.IOException; |
|
16 |
import java.io.InputStream; |
|
17 |
import java.io.InputStreamReader; |
|
22 |
import javax.xml.transform.Transformer; |
|
23 |
import javax.xml.transform.TransformerFactory; |
|
24 |
import java.io.*; |
|
18 | 25 |
import java.util.List; |
26 |
import java.util.Map; |
|
19 | 27 |
|
20 |
public class DOIBoostToActionsTest { |
|
28 |
public class DOIBoostToActionsTest extends XsltRowTransformerFactoryTest {
|
|
21 | 29 |
private String setName; |
22 | 30 |
private Agent agent; |
23 | 31 |
|
32 |
private final static String xslt = |
|
33 |
"<xsl:stylesheet version=\"1.0\" xmlns:xsl=\"http://www.w3.org/1999/XSL/Transform\">\n" |
|
34 |
+ " <xsl:template match=\"@*|node()\">\n" |
|
35 |
+ " <xsl:copy>\n" |
|
36 |
+ " <xsl:apply-templates select=\"@*|node()\"/>\n" |
|
37 |
+ " </xsl:copy>\n" |
|
38 |
+ " </xsl:template>\n" |
|
39 |
+ "</xsl:stylesheet>"; |
|
40 |
|
|
41 |
private Transformer transformer; |
|
42 |
|
|
24 | 43 |
@Before |
25 | 44 |
public void setup() { |
26 | 45 |
setName = "DLI"; |
... | ... | |
29 | 48 |
|
30 | 49 |
@Test |
31 | 50 |
public void testSingleDOIBoostAction() throws IOException { |
32 |
doTestSingleDOIBoostAction("/eu/dnetlib/data/mapreduce/actions/DOIBoostAction.json");
|
|
51 |
doTestSingleDOIBoostAction("/eu/dnetlib/data/mapreduce/actions/broken");
|
|
33 | 52 |
} |
34 | 53 |
|
35 | 54 |
|
36 | 55 |
@Test |
56 |
public void testDOIBoostActionToXML() throws Exception { |
|
57 |
doTestSingleDOIBoostActionToXML("/eu/dnetlib/data/mapreduce/actions/broken"); |
|
58 |
} |
|
59 |
|
|
60 |
|
|
61 |
|
|
62 |
@Test |
|
37 | 63 |
public void testMultipleDOIBoostAction() throws IOException { |
38 | 64 |
doTestAllDOIBoostAction("/eu/dnetlib/data/mapreduce/actions/part-00070"); |
39 | 65 |
} |
40 | 66 |
|
41 | 67 |
|
68 |
private void doTestSingleDOIBoostActionToXML(final String filePath) throws Exception { |
|
69 |
final List<Row> rows = Lists.newArrayList(); |
|
70 |
final InputStream is = this.getClass().getResourceAsStream(filePath); |
|
71 |
final BufferedReader in = new BufferedReader(new InputStreamReader(is)); |
|
42 | 72 |
|
73 |
String line = in.readLine(); |
|
74 |
|
|
75 |
final JsonParser parser = new JsonParser(); |
|
76 |
JsonObject root = parser.parse(line).getAsJsonObject(); |
|
77 |
List<AtomicAction> actions = DOIBoostToActions.generatePublicationActionsFromDump(root, new ActionFactory(), setName, agent, false, false); |
|
78 |
if (actions!= null) { |
|
79 |
actions.forEach(action-> { |
|
80 |
if (action.getTargetColumn().equals("body") && action.getTargetColumnFamily().equals("result")) |
|
81 |
{ |
|
82 |
Column<String, byte[]> col = new Column<>("body" , action.getTargetValue()); |
|
83 |
rows.add(new Row("result",action.getTargetRowKey() , Lists.newArrayList(col))); |
|
84 |
} |
|
85 |
|
|
86 |
}); |
|
87 |
|
|
88 |
|
|
89 |
|
|
90 |
} |
|
91 |
|
|
92 |
Map<String, XmlRecordFactory> stringXmlRecordFactoryMap = mapAll(buildTable(rows)); |
|
93 |
transformer = TransformerFactory.newInstance().newTransformer(new DocumentSource((new SAXReader()).read(new StringReader(xslt)))); |
|
94 |
|
|
95 |
final SAXReader saxReader = new SAXReader(); |
|
96 |
stringXmlRecordFactoryMap.values().forEach(it -> { |
|
97 |
try { |
|
98 |
final DocumentResult result = new DocumentResult(); |
|
99 |
final Document document = saxReader.read(new StringReader(it.build())); |
|
100 |
transformer.transform(new DocumentSource(document), result); |
|
101 |
System.out.println(result.getDocument().asXML()); |
|
102 |
} |
|
103 |
catch (Exception e) { |
|
104 |
e.printStackTrace(); |
|
105 |
} |
|
106 |
|
|
107 |
}); |
|
108 |
|
|
109 |
} |
|
110 |
|
|
43 | 111 |
private void doTestSingleDOIBoostAction(final String filePath) throws IOException { |
44 | 112 |
final InputStream is = this.getClass().getResourceAsStream(filePath); |
45 | 113 |
final BufferedReader in = new BufferedReader(new InputStreamReader(is)); |
... | ... | |
48 | 116 |
|
49 | 117 |
final JsonParser parser = new JsonParser(); |
50 | 118 |
JsonObject root = parser.parse(line).getAsJsonObject(); |
51 |
List<AtomicAction> actions = DOIBoostToActions.generatePublicationActionsFromDump(root, new ActionFactory(), setName, agent, false, true);
|
|
119 |
List<AtomicAction> actions = DOIBoostToActions.generatePublicationActionsFromDump(root, new ActionFactory(), setName, agent, false, false);
|
|
52 | 120 |
if (actions!= null) { |
53 | 121 |
|
54 | 122 |
actions.forEach(it -> System.out.println(String.format(" RowKey:%s TargetColumnFamily:%s TargetColumn: %s", it.getTargetRowKey(), it.getTargetColumnFamily(), it.getTargetColumn()))); |
modules/dnet-mapreduce-jobs/branches/master/src/test/java/eu/dnetlib/data/transform/XsltRowTransformerFactoryTest.java | ||
---|---|---|
679 | 679 |
|
680 | 680 |
} |
681 | 681 |
|
682 |
private Map<String, Map<String, Map<String, byte[]>>> buildTable(final List<Row> rows) throws UnsupportedEncodingException {
|
|
682 |
protected Map<String, Map<String, Map<String, byte[]>>> buildTable(final List<Row> rows) throws UnsupportedEncodingException {
|
|
683 | 683 |
final Map<String, Map<String, Map<String, byte[]>>> table = Maps.newHashMap(); |
684 | 684 |
|
685 | 685 |
for (final Row row : rows) { |
... | ... | |
705 | 705 |
|
706 | 706 |
} |
707 | 707 |
|
708 |
private Map<String, XmlRecordFactory> mapAll(final Map<String, Map<String, Map<String, byte[]>>> table) throws Exception {
|
|
708 |
protected Map<String, XmlRecordFactory> mapAll(final Map<String, Map<String, Map<String, byte[]>>> table) throws Exception {
|
|
709 | 709 |
|
710 | 710 |
final Map<String, XmlRecordFactory> builders = Maps.newHashMap(); |
711 | 711 |
for (final Entry<String, Map<String, Map<String, byte[]>>> e : table.entrySet()) { |
modules/dnet-mapreduce-jobs/branches/master/src/test/resources/eu/dnetlib/data/mapreduce/actions/broken | ||
---|---|---|
1 |
{"best_oa_location_url": "", "issued": "2012-3-10", "abstract": [], "objIdentifier": "8201506ef2275362065a6b228c560e51", "pissn": "1220-1766", "subject": ["Electrical and Electronic Engineering", "Computer Science(all)"], "eissn": "1841-429X", "author": [{"affiliation": [], "given": "Diego", "family": "Soto"}, {"affiliation": [], "given": "Jaime", "family": "Borquez"}], "dateOfCollection": "2018-08-07 12:23:27Z", "type": {"cobj": "0001", "value": "Article"}, "is_oa": false, "published-online": "2012-3-10", "link": null, "published-print": "2012-3-10", "accepted": null, "publisher": null, "doi": "10.24846/v21i1y201210", "license": null, "url": "http://dx.doi.org/10.24846/v21i1y201210", "issn": [{"type": "print", "value": "1220-1766"}, {"type": "electronic", "value": "1841-429X"}], "collectedFrom": [{"name": "UnpayWall", "id": "openaire____::unpaywall"}, {"name": "CrossRef", "id": "openaire____::crossref"}], "title": ["Control of a Modular Multilevel Matrix Converter for High Power Applications"], "funder": null, "datasourcePrefix": "crossref____"} |
|
1 |
{"publisher": "Elsevier BV", "doi": "10.1016/j.dib.2015.09.011", "license": [{"url": "http://www.elsevier.com/tdm/userlicense/1.0/", "content-version": "tdm", "\"delay-in-days": null, "date-time": "2015-12-01T00:00:00Z"}, {"url": "http://creativecommons.org/licenses/by/4.0/", "content-version": "vor", "\"delay-in-days": null, "date-time": "2015-09-15T00:00:00Z"}], "title": ["Data on individual PCR efficiency values as quality control for circulating miRNAs"], "issued": "2015-12-1", "abstract": [{"provenance": "MAG", "value": "This data article contains data related to the research article entitled âVariability in microRNA recovery from plasma: Comparison of five commercial kits, doi:10.1016/j.ab.2015.07.018â Brunet-Vega (2015) [1]. PCR efficiency, along with RNA and cDNA quality, are the most important factors affecting the quality of qPCR results. Constant amplification efficiency in all compared samples is indispensable when relative quantification is used to measure changes in gene expression. An easy way to measure PCR efficiency, without the need of a standard curve, is LinRegPCR software. Individual PCR efficiency can be determined as a part of qPCR quality control. This is especially important when the initial RNA quantity is so low that cannot be accurately quantified, such as in circulating RNA extractions. This data article reports the Cqs and PCR efficiencies of 5 miRNAs quantified in RNA isolated from 4 patients with colorectal cancer (CRC) and 4 healthy donors using five commercially available kits."}], "issn": [{"type": "print", "value": "2352-3409"}], "doi-url": "http://dx.doi.org/10.1016/j.dib.2015.09.011", "instances": [{"url": "http://api.elsevier.com/content/article/PII:S235234091500205X?httpAccept=text/xml", "provenance": "CrossRef", "access-rights": "UNKNOWN"}, {"url": "http://api.elsevier.com/content/article/PII:S235234091500205X?httpAccept=text/plain", "provenance": "CrossRef", "access-rights": "UNKNOWN"}, {"url": "http://doi.org/10.1016/j.dib.2015.09.011", "provenance": "UnpayWall", "access-rights": "OPEN"}, {"url": "https://academic.microsoft.com/#/detail/2115687396", "provenance": "MAG", "access-rights": "UNKNOWN"}], "published-online": null, "authors": [{"affiliations": [{"official-page": "http://www.uab.edu/", "provenance": "MAG", "value": "University of Alabama at Birmingham", "identifiers": [{"value": "http://en.wikipedia.org/wiki/University_of_Alabama_at_Birmingham", "schema": "wikpedia"}, {"value": "grid.265892.2", "schema": "grid.ac"}, {"value": "https://academic.microsoft.com/#/detail/32389192", "schema": "URL"}]}], "given": "Anna", "identifiers": [{"provenance": "MAG", "value": "https://academic.microsoft.com/#/detail/2335602571", "schema": "URL"}], "fullname": "Anna Brunet-Vega", "family": "Brunet-Vega"}, {"affiliations": [{"official-page": "http://www.uab.edu/", "provenance": "MAG", "value": "University of Alabama at Birmingham", "identifiers": [{"value": "http://en.wikipedia.org/wiki/University_of_Alabama_at_Birmingham", "schema": "wikpedia"}, {"value": "grid.265892.2", "schema": "grid.ac"}, {"value": "https://academic.microsoft.com/#/detail/32389192", "schema": "URL"}]}], "given": "Carles", "identifiers": [{"provenance": "MAG", "value": "https://academic.microsoft.com/#/detail/2692345643", "schema": "URL"}], "fullname": "Carles Pericay", "family": "Pericay"}, {"affiliations": [{"official-page": "http://www.uab.edu/", "provenance": "MAG", "value": "University of Alabama at Birmingham", "identifiers": [{"value": "http://en.wikipedia.org/wiki/University_of_Alabama_at_Birmingham", "schema": "wikpedia"}, {"value": "grid.265892.2", "schema": "grid.ac"}, {"value": "https://academic.microsoft.com/#/detail/32389192", "schema": "URL"}]}], "given": "María Elisa", "identifiers": [{"provenance": "MAG", "value": "https://academic.microsoft.com/#/detail/2160096374", "schema": "URL"}], "fullname": "María Elisa Quílez", "family": "Quílez"}, {"affiliations": [{"official-page": "http://www.uab.edu/", "provenance": "MAG", "value": "University of Alabama at Birmingham", "identifiers": [{"value": "http://en.wikipedia.org/wiki/University_of_Alabama_at_Birmingham", "schema": "wikpedia"}, {"value": "grid.265892.2", "schema": "grid.ac"}, {"value": "https://academic.microsoft.com/#/detail/32389192", "schema": "URL"}]}], "given": "María José", "identifiers": [{"provenance": "MAG", "value": "https://academic.microsoft.com/#/detail/2751674309", "schema": "URL"}], "fullname": "María José Ramírez-Lázaro", "family": "Ramírez-Lázaro"}, {"affiliations": [{"official-page": "http://www.uab.edu/", "provenance": "MAG", "value": "University of Alabama at Birmingham", "identifiers": [{"value": "http://en.wikipedia.org/wiki/University_of_Alabama_at_Birmingham", "schema": "wikpedia"}, {"value": "grid.265892.2", "schema": "grid.ac"}, {"value": "https://academic.microsoft.com/#/detail/32389192", "schema": "URL"}]}], "given": "Xavier", "identifiers": [{"provenance": "MAG", "value": "https://academic.microsoft.com/#/detail/2683017426", "schema": "URL"}], "fullname": "Xavier Calvet", "family": "Calvet"}, {"affiliations": [{"official-page": "http://www.uab.edu/", "provenance": "MAG", "value": "University of Alabama at Birmingham", "identifiers": [{"value": "http://en.wikipedia.org/wiki/University_of_Alabama_at_Birmingham", "schema": "wikpedia"}, {"value": "grid.265892.2", "schema": "grid.ac"}, {"value": "https://academic.microsoft.com/#/detail/32389192", "schema": "URL"}]}], "given": "Sergio", "identifiers": [{"provenance": "CrossRef", "value": "http://orcid.org/0000-0001-9294-1585", "schema": null}, {"provenance": "MAG", "value": "https://academic.microsoft.com/#/detail/2752779186", "schema": "URL"}, {"provenance": "ORCID", "value": "https://orcid.org/0000-0001-9294-1585", "schema": null}], "fullname": "Sergio Lario", "family": "Lario"}], "collectedFrom": ["CrossRef", "MAG", "ORCID", "UnpayWall"], "accepted": null, "type": "journal-article", "published-print": "2015-12-1", "subject": ["Multidisciplinary"]} |
modules/dnet-mapreduce-jobs/branches/master/src/main/java/eu/dnetlib/data/mapreduce/hbase/dataimport/DOIBoostToActions.java | ||
---|---|---|
17 | 17 |
import java.io.InputStream; |
18 | 18 |
import java.util.*; |
19 | 19 |
import java.util.concurrent.atomic.AtomicInteger; |
20 |
import java.util.function.Function; |
|
20 | 21 |
import java.util.stream.Collectors; |
21 | 22 |
|
22 | 23 |
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.*; |
... | ... | |
263 | 264 |
result.setMetadata(metadata.build()); |
264 | 265 |
entity.setResult(result.build()); |
265 | 266 |
oaf.setEntity(entity.build()); |
267 |
System.out.println(JsonFormat.printToString(oaf.build())); |
|
266 | 268 |
final List<AtomicAction> actionList = new ArrayList<>(); |
267 | 269 |
if (!onlyOrganization) |
268 | 270 |
actionList.add(factory.createAtomicAction(setName, agent, oaf.getEntity().getId(), "result", "body", oaf.build().toByteArray())); |
... | ... | |
506 | 508 |
identifiers.stream().map(id -> { |
507 | 509 |
final String value = id.get("value").getAsString(); |
508 | 510 |
return extractIdentifier(value); |
509 |
}).forEach(abuilder::addPid); |
|
511 |
}).collect( |
|
512 |
Collectors.toMap( |
|
513 |
FieldTypeProtos.KeyValue::getKey, |
|
514 |
Function.identity(), |
|
515 |
(a,b) -> a |
|
516 |
)).values().forEach(abuilder::addPid); |
|
510 | 517 |
abuilder.setRank(counter.getAndIncrement()); |
511 | 518 |
|
512 | 519 |
return abuilder.build(); |
modules/dnet-mapreduce-jobs/branches/master/src/main/java/eu/dnetlib/data/mapreduce/hbase/dataexport/ExportDuplicatesMapper.java | ||
---|---|---|
1 |
package eu.dnetlib.data.mapreduce.hbase.dataexport; |
|
2 |
|
|
3 |
import com.googlecode.protobuf.format.JsonFormat; |
|
4 |
import eu.dnetlib.data.mapreduce.util.DedupUtils; |
|
5 |
import eu.dnetlib.data.proto.OafProtos; |
|
6 |
import eu.dnetlib.data.proto.TypeProtos; |
|
7 |
import org.apache.hadoop.hbase.client.Result; |
|
8 |
import org.apache.hadoop.hbase.io.ImmutableBytesWritable; |
|
9 |
import org.apache.hadoop.hbase.mapreduce.TableMapper; |
|
10 |
import org.apache.hadoop.io.Text; |
|
11 |
|
|
12 |
import java.io.IOException; |
|
13 |
import java.util.Map; |
|
14 |
|
|
15 |
public class ExportDuplicatesMapper extends TableMapper<Text, Text> { |
|
16 |
|
|
17 |
private Text keyOut; |
|
18 |
private Text valueOut; |
|
19 |
|
|
20 |
@Override |
|
21 |
protected void setup(Context context) throws IOException, InterruptedException { |
|
22 |
keyOut = new Text(""); |
|
23 |
valueOut = new Text(); |
|
24 |
} |
|
25 |
|
|
26 |
@Override |
|
27 |
protected void map(ImmutableBytesWritable key, Result value, Context context) throws IOException, InterruptedException { |
|
28 |
|
|
29 |
final Map<byte[], byte[]> mergedInMap = value.getFamilyMap(DedupUtils.getDedupCF_mergedInBytes(TypeProtos.Type.result)); |
|
30 |
|
|
31 |
if (mergedInMap != null && !mergedInMap.isEmpty()) { |
|
32 |
final byte[] body = value.getValue("result".getBytes(), DedupUtils.BODY_B); |
|
33 |
|
|
34 |
if (body != null) { |
|
35 |
OafProtos.Oaf oaf = OafProtos.Oaf.parseFrom(body); |
|
36 |
valueOut.set(JsonFormat.printToString(oaf)); |
|
37 |
context.write(keyOut, valueOut); |
|
38 |
} |
|
39 |
} |
|
40 |
|
|
41 |
} |
|
42 |
} |
Also available in: Unified diff
refactored Action