Revision 57397
Added by Michele Artini over 4 years ago
XsltRowTransformerFactoryTest.java | ||
---|---|---|
1 | 1 |
package eu.dnetlib.data.transform; |
2 | 2 |
|
3 |
import java.io.*; |
|
4 |
import java.util.*; |
|
3 |
import static org.junit.Assert.assertFalse; |
|
4 |
import static org.junit.Assert.assertNotNull; |
|
5 |
import static org.junit.Assert.assertTrue; |
|
6 |
|
|
7 |
import java.io.BufferedReader; |
|
8 |
import java.io.IOException; |
|
9 |
import java.io.InputStream; |
|
10 |
import java.io.InputStreamReader; |
|
11 |
import java.io.StringReader; |
|
12 |
import java.io.UnsupportedEncodingException; |
|
13 |
import java.util.ArrayList; |
|
14 |
import java.util.HashMap; |
|
15 |
import java.util.Iterator; |
|
16 |
import java.util.List; |
|
17 |
import java.util.Map; |
|
5 | 18 |
import java.util.Map.Entry; |
19 |
import java.util.Set; |
|
6 | 20 |
import java.util.zip.GZIPInputStream; |
21 |
|
|
7 | 22 |
import javax.xml.transform.TransformerConfigurationException; |
8 | 23 |
import javax.xml.transform.TransformerFactoryConfigurationError; |
9 | 24 |
|
25 |
import org.apache.commons.io.IOUtils; |
|
26 |
import org.apache.commons.lang.StringUtils; |
|
27 |
import org.apache.commons.logging.Log; |
|
28 |
import org.apache.commons.logging.LogFactory; |
|
29 |
import org.dom4j.Document; |
|
30 |
import org.dom4j.DocumentException; |
|
31 |
import org.dom4j.io.SAXReader; |
|
32 |
import org.json.JSONObject; |
|
33 |
import org.junit.Before; |
|
34 |
import org.junit.Ignore; |
|
35 |
import org.junit.Test; |
|
36 |
import org.springframework.core.io.ByteArrayResource; |
|
37 |
import org.springframework.core.io.Resource; |
|
38 |
|
|
10 | 39 |
import com.google.common.base.Function; |
11 | 40 |
import com.google.common.collect.Iterables; |
12 | 41 |
import com.google.common.collect.Lists; |
... | ... | |
15 | 44 |
import com.google.protobuf.InvalidProtocolBufferException; |
16 | 45 |
import com.googlecode.protobuf.format.JsonFormat; |
17 | 46 |
import com.googlecode.protobuf.format.JsonFormat.ParseException; |
47 |
|
|
18 | 48 |
import eu.dnetlib.actionmanager.actions.ActionFactory; |
19 | 49 |
import eu.dnetlib.actionmanager.actions.XsltInfoPackageAction; |
20 | 50 |
import eu.dnetlib.actionmanager.common.Agent; |
21 | 51 |
import eu.dnetlib.actionmanager.common.Operation; |
22 | 52 |
import eu.dnetlib.actionmanager.common.Provenance; |
23 |
import eu.dnetlib.data.mapreduce.hbase.index.config.*; |
|
24 |
import eu.dnetlib.data.mapreduce.util.*; |
|
53 |
import eu.dnetlib.data.mapreduce.hbase.index.config.Context; |
|
54 |
import eu.dnetlib.data.mapreduce.hbase.index.config.ContextMapper; |
|
55 |
import eu.dnetlib.data.mapreduce.hbase.index.config.EntityConfigTable; |
|
56 |
import eu.dnetlib.data.mapreduce.hbase.index.config.IndexConfig; |
|
57 |
import eu.dnetlib.data.mapreduce.hbase.index.config.IndexConfigTest; |
|
58 |
import eu.dnetlib.data.mapreduce.hbase.index.config.LinkDescriptor; |
|
59 |
import eu.dnetlib.data.mapreduce.hbase.index.config.RelClasses; |
|
60 |
import eu.dnetlib.data.mapreduce.hbase.index.config.RelClassesTest; |
|
61 |
import eu.dnetlib.data.mapreduce.util.OafDecoder; |
|
62 |
import eu.dnetlib.data.mapreduce.util.OafRowKeyDecoder; |
|
63 |
import eu.dnetlib.data.mapreduce.util.RelDescriptor; |
|
64 |
import eu.dnetlib.data.mapreduce.util.UpdateMerger; |
|
65 |
import eu.dnetlib.data.mapreduce.util.XmlRecordFactory; |
|
66 |
import eu.dnetlib.data.mapreduce.util.XmlRecordFactoryTest; |
|
25 | 67 |
import eu.dnetlib.data.proto.KindProtos.Kind; |
26 | 68 |
import eu.dnetlib.data.proto.OafProtos.Oaf; |
27 | 69 |
import eu.dnetlib.data.proto.TypeProtos.Type; |
28 | 70 |
import eu.dnetlib.miscutils.functional.xml.IndentXmlString; |
29 |
import org.apache.commons.io.IOUtils; |
|
30 |
import org.apache.commons.lang.StringUtils; |
|
31 |
import org.apache.commons.logging.Log; |
|
32 |
import org.apache.commons.logging.LogFactory; |
|
33 |
import org.dom4j.Document; |
|
34 |
import org.dom4j.DocumentException; |
|
35 |
import org.dom4j.io.SAXReader; |
|
36 |
import org.json.JSONObject; |
|
37 |
import org.junit.Before; |
|
38 |
import org.junit.Ignore; |
|
39 |
import org.junit.Test; |
|
40 |
import org.springframework.core.io.ByteArrayResource; |
|
41 |
import org.springframework.core.io.Resource; |
|
42 | 71 |
|
43 |
import static org.junit.Assert.*; |
|
44 |
|
|
45 | 72 |
public class XsltRowTransformerFactoryTest { |
46 | 73 |
|
47 | 74 |
private static final Log log = LogFactory.getLog(XsltRowTransformerFactoryTest.class); |
... | ... | |
78 | 105 |
doTest(loadFromTransformationProfile("claimRels_2_hbase.xml"), load("recordClaimRel.xml")); |
79 | 106 |
} |
80 | 107 |
|
81 |
|
|
82 | 108 |
@Test |
83 | 109 |
public void testParseFp7IctPUB() throws Exception { |
84 | 110 |
|
... | ... | |
121 | 147 |
doTest(loadFromTransformationProfile("odf2hbase.xml"), load("doecode.xml")); |
122 | 148 |
} |
123 | 149 |
|
124 |
|
|
125 | 150 |
@Test |
126 | 151 |
public void testParseDatasetLindat() throws Exception { |
127 | 152 |
|
... | ... | |
152 | 177 |
doTest(loadFromTransformationProfile("oaf2hbase.xml"), load("recordClaimedDedup.xml")); |
153 | 178 |
} |
154 | 179 |
|
155 |
|
|
156 | 180 |
@Test |
157 | 181 |
public void testParseClaimDataset() throws Exception { |
158 | 182 |
|
159 | 183 |
doTest(loadFromTransformationProfile("odf2hbase.xml"), load("recordClaimDataset.xml")); |
160 | 184 |
} |
161 | 185 |
|
162 |
|
|
163 | 186 |
@Test |
164 | 187 |
public void testParseACM() throws Exception { |
165 | 188 |
|
... | ... | |
184 | 207 |
doTest(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectRecordFCT.xml")); |
185 | 208 |
} |
186 | 209 |
|
187 |
|
|
188 | 210 |
@Test |
189 | 211 |
public void testParseOaf() throws Exception { |
190 | 212 |
|
... | ... | |
244 | 266 |
|
245 | 267 |
final List<Row> rows = Lists.newArrayList(); |
246 | 268 |
rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("odf.xml"))); |
247 |
//printAll(mapAll(buildTable(rows))); |
|
269 |
// printAll(mapAll(buildTable(rows)));
|
|
248 | 270 |
} |
249 | 271 |
|
250 | 272 |
@Test |
... | ... | |
254 | 276 |
rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("pangaeOAF2.xml"))); |
255 | 277 |
printAll(mapAll(buildTable(rows))); |
256 | 278 |
} |
279 |
|
|
257 | 280 |
@Test |
258 | 281 |
public void testZenodo() throws Exception { |
259 | 282 |
|
... | ... | |
310 | 333 |
printAll(mapAll(buildTable(rows))); |
311 | 334 |
} |
312 | 335 |
|
313 |
|
|
314 |
|
|
315 | 336 |
@Test |
316 | 337 |
public void testLinkOrganization() throws Exception { |
317 | 338 |
|
... | ... | |
341 | 362 |
|
342 | 363 |
final Function<Row, Row> f = rowIn -> { |
343 | 364 |
|
344 |
final List<Column<String,byte[]>> cols = Lists.newArrayList(); |
|
345 |
for(Column<String,byte[]> col : rowIn.getColumns()) {
|
|
365 |
final List<Column<String, byte[]>> cols = Lists.newArrayList();
|
|
366 |
for (final Column<String, byte[]> col : rowIn.getColumns()) {
|
|
346 | 367 |
if (col.getName().equals("body")) { |
347 | 368 |
cols.add(new Column(col.getName(), col.getValue())); |
348 | 369 |
|
... | ... | |
360 | 381 |
rows.addAll(puma1); |
361 | 382 |
rows.addAll(puma2); |
362 | 383 |
|
363 |
List<Oaf> duplicates = Lists.newArrayList(); |
|
384 |
final List<Oaf> duplicates = Lists.newArrayList();
|
|
364 | 385 |
duplicates.add(getOafBody(puma1)); |
365 | 386 |
duplicates.add(getOafBody(puma2)); |
366 | 387 |
final Oaf.Builder oafMerge = OafEntityMerger.merge(mergeId, duplicates); |
... | ... | |
373 | 394 |
} |
374 | 395 |
|
375 | 396 |
private Oaf getOafBody(final List<Row> rows) throws InvalidProtocolBufferException { |
376 |
for(Row row : rows) { |
|
377 |
if(StringUtils.startsWith(row.getKey(), "50")) { |
|
378 |
return Oaf.parseFrom(row.getColumn("body").getValue()); |
|
397 |
for (final Row row : rows) { |
|
398 |
if (StringUtils.startsWith(row.getKey(), "50")) { return Oaf.parseFrom(row.getColumn("body").getValue()); |
|
379 | 399 |
|
380 | 400 |
} |
381 | 401 |
} |
... | ... | |
393 | 413 |
|
394 | 414 |
doTest(loadFromTransformationProfile("datasources_2_hbase.xsl"), load("datasourceNative.xml")); |
395 | 415 |
} |
416 |
|
|
396 | 417 |
@Test |
397 | 418 |
public void testParseDatasourcePiwik() throws Exception { |
398 | 419 |
|
... | ... | |
410 | 431 |
doTestJsonGz(loadFromTransformationProfile("oaf2hbase.xml"), load("mdstore_cleaned.json.gz")); |
411 | 432 |
} |
412 | 433 |
|
413 |
|
|
414 | 434 |
@Test |
415 | 435 |
public void testLoadFromTransformationProfile() throws IOException { |
416 |
InputStream in = loadFromTransformationProfile("oaf2hbase.xml"); |
|
436 |
final InputStream in = loadFromTransformationProfile("oaf2hbase.xml");
|
|
417 | 437 |
log.info(IOUtils.toString(in)); |
418 | 438 |
} |
419 | 439 |
|
... | ... | |
441 | 461 |
|
442 | 462 |
doTest(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectWithFunderOriginalName.xml")); |
443 | 463 |
} |
464 |
|
|
444 | 465 |
@Test |
445 | 466 |
public void testLinkFunderOriginalName() throws Exception { |
446 | 467 |
|
... | ... | |
513 | 534 |
final List<Row> rows = Lists.newArrayList(); |
514 | 535 |
rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("oafWithCommunity.xml"))); |
515 | 536 |
|
516 |
ActionFactory actionFactory = new ActionFactory(); |
|
537 |
final ActionFactory actionFactory = new ActionFactory();
|
|
517 | 538 |
|
518 |
Map<String, Resource> xslts = Maps.newHashMap(); |
|
539 |
final Map<String, Resource> xslts = Maps.newHashMap();
|
|
519 | 540 |
|
520 | 541 |
xslts.put("oaf2hbase", new ByteArrayResource(IOUtils.toString(loadFromTransformationProfile("oaf2hbase.xml")).getBytes())); |
521 | 542 |
actionFactory.setXslts(xslts); |
522 | 543 |
|
523 |
XsltInfoPackageAction pa = actionFactory.generateInfoPackageAction( |
|
544 |
final XsltInfoPackageAction pa = actionFactory.generateInfoPackageAction(
|
|
524 | 545 |
"oaf2hbase", |
525 | 546 |
"rawset-id", |
526 | 547 |
new Agent("agent-id", "agent-name", Agent.AGENT_TYPE.algo), |
... | ... | |
533 | 554 |
|
534 | 555 |
IOUtils.readLines(load("country_updates.json")).forEach(line -> { |
535 | 556 |
|
536 |
Oaf.Builder oaf = Oaf.newBuilder(); |
|
557 |
final Oaf.Builder oaf = Oaf.newBuilder();
|
|
537 | 558 |
|
538 | 559 |
try { |
539 | 560 |
JsonFormat.merge(line, oaf); |
540 |
} catch (JsonFormat.ParseException e) { |
|
561 |
} catch (final JsonFormat.ParseException e) {
|
|
541 | 562 |
throw new IllegalArgumentException(e); |
542 | 563 |
} |
543 | 564 |
|
544 |
Column<String, byte[]> col = new Column<>("update_" + System.nanoTime(), oaf.build().toByteArray()); |
|
565 |
final Column<String, byte[]> col = new Column<>("update_" + System.nanoTime(), oaf.build().toByteArray());
|
|
545 | 566 |
rows.add(new Row("result", oaf.getEntity().getId(), Lists.newArrayList(col))); |
546 | 567 |
}); |
547 | 568 |
|
548 | 569 |
pa.asAtomicActions().forEach(a -> { |
549 |
Column<String, byte[]> col = new Column<>("update_" + System.nanoTime(), a.getTargetValue()); |
|
570 |
final Column<String, byte[]> col = new Column<>("update_" + System.nanoTime(), a.getTargetValue());
|
|
550 | 571 |
rows.add(new Row(a.getTargetColumnFamily(), a.getTargetRowKey(), Lists.newArrayList(col))); |
551 | 572 |
}); |
552 | 573 |
|
553 |
|
|
554 | 574 |
/* |
555 |
rows.forEach(r -> { |
|
556 |
log.info(r); |
|
557 |
}); |
|
558 |
*/ |
|
575 |
* rows.forEach(r -> { log.info(r); }); |
|
576 |
*/ |
|
559 | 577 |
|
560 | 578 |
mapAll(buildTable(rows)).entrySet().forEach(b -> { |
561 | 579 |
log.info(b.getKey()); |
... | ... | |
589 | 607 |
doTest(loadFromTransformationProfile("odf2hbase.xml"), load("guidelines4_qeios1.xml")); |
590 | 608 |
} |
591 | 609 |
|
592 |
@Test
|
|
593 |
public void testGuidelines4Aria() throws Exception {
|
|
610 |
@Test
|
|
611 |
public void testGuidelines4Aria() throws Exception {
|
|
594 | 612 |
|
595 |
doTest(loadFromTransformationProfile("odf2hbase.xml"), load("guidelines4_aria.xml"));
|
|
596 |
}
|
|
613 |
doTest(loadFromTransformationProfile("odf2hbase.xml"), load("guidelines4_aria.xml"));
|
|
614 |
}
|
|
597 | 615 |
|
598 | 616 |
@Test |
599 | 617 |
public void testJournalRecord() throws Exception { |
... | ... | |
610 | 628 |
printAll(mapAll(buildTable(rows))); |
611 | 629 |
} |
612 | 630 |
|
631 |
@Test |
|
632 |
public void testOpenOrganizations() throws Exception { |
|
633 |
|
|
634 |
final List<Row> rows = Lists.newArrayList(); |
|
635 |
rows.addAll(asRows(loadFromTransformationProfile("organizations_2_hbase.xsl"), load("openorgs_sample.xml"))); |
|
636 |
|
|
637 |
printAll(mapAll(buildTable(rows))); |
|
638 |
} |
|
639 |
|
|
613 | 640 |
private void doTestJsonRow(final String json) throws Exception { |
614 |
Row row = asRowFromJson(json); |
|
641 |
final Row row = asRowFromJson(json);
|
|
615 | 642 |
log.info(row); |
616 |
List<Row> rows = new ArrayList<>(); |
|
643 |
final List<Row> rows = new ArrayList<>();
|
|
617 | 644 |
rows.add(row); |
618 | 645 |
final Map<String, Map<String, Map<String, byte[]>>> table = buildTable(rows); |
619 | 646 |
final Map<String, XmlRecordFactory> builders = mapAll(table); |
... | ... | |
654 | 681 |
final List<Row> rows = rowsIterator.next(); |
655 | 682 |
i++; |
656 | 683 |
|
657 |
if ((i % 10000) == 0) {
|
|
684 |
if (i % 10000 == 0) {
|
|
658 | 685 |
System.out.println(i); |
659 | 686 |
} |
660 | 687 |
|
... | ... | |
698 | 725 |
return asRows(xsltStream, params, recordStream, null); |
699 | 726 |
} |
700 | 727 |
|
701 |
private List<Row> asRows(final InputStream xsltStream, final Map<String, Object> params, final InputStream recordStream, final Function<Row, Row> p) throws Exception { |
|
728 |
private List<Row> asRows(final InputStream xsltStream, final Map<String, Object> params, final InputStream recordStream, final Function<Row, Row> p) |
|
729 |
throws Exception { |
|
702 | 730 |
final String xslt = IOUtils.toString(xsltStream); |
703 | 731 |
final XsltRowTransformer transformer = factory.getTransformer(xslt, params); |
704 | 732 |
assertNotNull(transformer); |
... | ... | |
804 | 832 |
if (c.getName().equals("body")) { |
805 | 833 |
final String theBody = new String(c.getValue(), "UTF-8"); |
806 | 834 |
assertTrue(StringUtils.isNotBlank(theBody)); |
807 |
//System.out.println(theBody); |
|
835 |
// System.out.println(theBody);
|
|
808 | 836 |
} |
809 | 837 |
} |
810 | 838 |
} |
... | ... | |
839 | 867 |
|
840 | 868 |
final Map<String, byte[]> familyMap = row.get(type.toString()); |
841 | 869 |
|
842 |
if (familyMap == null) return;
|
|
870 |
if (familyMap == null) { return; }
|
|
843 | 871 |
|
844 | 872 |
final byte[] bodyB = familyMap.get("body"); |
845 | 873 |
|
... | ... | |
857 | 885 |
final String it = ld.getRelDescriptor().getIt(); |
858 | 886 |
final Map<String, byte[]> cols = row.get(it); |
859 | 887 |
|
860 |
if ((cols != null) && !cols.isEmpty()) {
|
|
888 |
if (cols != null && !cols.isEmpty()) {
|
|
861 | 889 |
|
862 | 890 |
for (final byte[] oafB : cols.values()) { |
863 | 891 |
|
... | ... | |
901 | 929 |
for (final Entry<String, XmlRecordFactory> e : builders.entrySet()) { |
902 | 930 |
final OafRowKeyDecoder kd = OafRowKeyDecoder.decode(e.getKey()); |
903 | 931 |
|
904 |
if (!e.getValue().isValid()) throw new IllegalArgumentException("invalid builder: " + e.getKey());
|
|
932 |
if (!e.getValue().isValid()) { throw new IllegalArgumentException("invalid builder: " + e.getKey()); }
|
|
905 | 933 |
if (types.contains(kd.getType())) { |
906 | 934 |
final String val = IndentXmlString.apply(e.getValue().build()); |
907 | 935 |
|
908 |
if ((xpaths != null) && !xpaths.isEmpty() && (xpaths.get(kd.getType()) != null)) {
|
|
936 |
if (xpaths != null && !xpaths.isEmpty() && xpaths.get(kd.getType()) != null) {
|
|
909 | 937 |
final Document doc = r.read(new StringReader(val)); |
910 | 938 |
|
911 | 939 |
log.debug("\n" + e.getKey()); |
... | ... | |
937 | 965 |
|
938 | 966 |
private XmlRecordFactory newBuilder() throws TransformerConfigurationException, TransformerFactoryConfigurationError, DocumentException { |
939 | 967 |
return new XmlRecordFactory(entityConfigTable, ContextMapper.fromXml(Context.xml), |
940 |
RelClasses.fromJSon(RelClassesTest.relClassesJson), XmlRecordFactoryTest.SCHEMA_LOCATION, true, false, false, XmlRecordFactoryTest.specialDatasourceTypes); |
|
968 |
RelClasses.fromJSon(RelClassesTest.relClassesJson), XmlRecordFactoryTest.SCHEMA_LOCATION, true, false, false, |
|
969 |
XmlRecordFactoryTest.specialDatasourceTypes); |
|
941 | 970 |
} |
942 | 971 |
|
943 | 972 |
private InputStream load(final String fileName) { |
... | ... | |
946 | 975 |
|
947 | 976 |
private InputStream loadFromTransformationProfile(final String profilePath) { |
948 | 977 |
log.info("Loading xslt from: " + basePathProfiles + profilePath); |
949 |
InputStream profile = getClass().getResourceAsStream(basePathProfiles + profilePath); |
|
950 |
SAXReader saxReader = new SAXReader(); |
|
978 |
final InputStream profile = getClass().getResourceAsStream(basePathProfiles + profilePath);
|
|
979 |
final SAXReader saxReader = new SAXReader();
|
|
951 | 980 |
Document doc = null; |
952 | 981 |
try { |
953 | 982 |
doc = saxReader.read(profile); |
954 |
} catch (DocumentException e) { |
|
983 |
} catch (final DocumentException e) {
|
|
955 | 984 |
e.printStackTrace(); |
956 | 985 |
throw new RuntimeException(e); |
957 | 986 |
} |
958 |
String xslt = doc.selectSingleNode("//SCRIPT/CODE/*[local-name()='stylesheet']").asXML(); |
|
959 |
//log.info(xslt); |
|
987 |
final String xslt = doc.selectSingleNode("//SCRIPT/CODE/*[local-name()='stylesheet']").asXML();
|
|
988 |
// log.info(xslt);
|
|
960 | 989 |
return IOUtils.toInputStream(xslt); |
961 | 990 |
} |
962 | 991 |
|
963 |
|
|
964 |
private Row asRowFromJson(String json) throws ParseException { |
|
965 |
Oaf.Builder oafBuilder = Oaf.newBuilder(); |
|
992 |
private Row asRowFromJson(final String json) throws ParseException { |
|
993 |
final Oaf.Builder oafBuilder = Oaf.newBuilder(); |
|
966 | 994 |
JsonFormat.merge(json, oafBuilder); |
967 | 995 |
final Oaf oaf = oafBuilder.build(); |
968 |
Row row = new Row("result", oaf.getEntity().getId()); |
|
969 |
Column<String, byte[]> c = new Column<>("body", oaf.toByteArray()); |
|
996 |
final Row row = new Row("result", oaf.getEntity().getId());
|
|
997 |
final Column<String, byte[]> c = new Column<>("body", oaf.toByteArray());
|
|
970 | 998 |
row.setColumn("body", c); |
971 | 999 |
return row; |
972 | 1000 |
|
Also available in: Unified diff
(openorgs) added schemeid to pids in sql query