1 |
1 |
package eu.dnetlib.data.transform;
|
2 |
2 |
|
3 |
|
import java.io.*;
|
4 |
|
import java.util.*;
|
|
3 |
import static org.junit.Assert.assertFalse;
|
|
4 |
import static org.junit.Assert.assertNotNull;
|
|
5 |
import static org.junit.Assert.assertTrue;
|
|
6 |
|
|
7 |
import java.io.BufferedReader;
|
|
8 |
import java.io.IOException;
|
|
9 |
import java.io.InputStream;
|
|
10 |
import java.io.InputStreamReader;
|
|
11 |
import java.io.StringReader;
|
|
12 |
import java.io.UnsupportedEncodingException;
|
|
13 |
import java.util.ArrayList;
|
|
14 |
import java.util.HashMap;
|
|
15 |
import java.util.Iterator;
|
|
16 |
import java.util.List;
|
|
17 |
import java.util.Map;
|
5 |
18 |
import java.util.Map.Entry;
|
|
19 |
import java.util.Set;
|
6 |
20 |
import java.util.zip.GZIPInputStream;
|
|
21 |
|
7 |
22 |
import javax.xml.transform.TransformerConfigurationException;
|
8 |
23 |
import javax.xml.transform.TransformerFactoryConfigurationError;
|
9 |
24 |
|
|
25 |
import org.apache.commons.io.IOUtils;
|
|
26 |
import org.apache.commons.lang.StringUtils;
|
|
27 |
import org.apache.commons.logging.Log;
|
|
28 |
import org.apache.commons.logging.LogFactory;
|
|
29 |
import org.dom4j.Document;
|
|
30 |
import org.dom4j.DocumentException;
|
|
31 |
import org.dom4j.io.SAXReader;
|
|
32 |
import org.json.JSONObject;
|
|
33 |
import org.junit.Before;
|
|
34 |
import org.junit.Ignore;
|
|
35 |
import org.junit.Test;
|
|
36 |
import org.springframework.core.io.ByteArrayResource;
|
|
37 |
import org.springframework.core.io.Resource;
|
|
38 |
|
10 |
39 |
import com.google.common.base.Function;
|
11 |
40 |
import com.google.common.collect.Iterables;
|
12 |
41 |
import com.google.common.collect.Lists;
|
... | ... | |
15 |
44 |
import com.google.protobuf.InvalidProtocolBufferException;
|
16 |
45 |
import com.googlecode.protobuf.format.JsonFormat;
|
17 |
46 |
import com.googlecode.protobuf.format.JsonFormat.ParseException;
|
|
47 |
|
18 |
48 |
import eu.dnetlib.actionmanager.actions.ActionFactory;
|
19 |
49 |
import eu.dnetlib.actionmanager.actions.XsltInfoPackageAction;
|
20 |
50 |
import eu.dnetlib.actionmanager.common.Agent;
|
21 |
51 |
import eu.dnetlib.actionmanager.common.Operation;
|
22 |
52 |
import eu.dnetlib.actionmanager.common.Provenance;
|
23 |
|
import eu.dnetlib.data.mapreduce.hbase.index.config.*;
|
24 |
|
import eu.dnetlib.data.mapreduce.util.*;
|
|
53 |
import eu.dnetlib.data.mapreduce.hbase.index.config.Context;
|
|
54 |
import eu.dnetlib.data.mapreduce.hbase.index.config.ContextMapper;
|
|
55 |
import eu.dnetlib.data.mapreduce.hbase.index.config.EntityConfigTable;
|
|
56 |
import eu.dnetlib.data.mapreduce.hbase.index.config.IndexConfig;
|
|
57 |
import eu.dnetlib.data.mapreduce.hbase.index.config.IndexConfigTest;
|
|
58 |
import eu.dnetlib.data.mapreduce.hbase.index.config.LinkDescriptor;
|
|
59 |
import eu.dnetlib.data.mapreduce.hbase.index.config.RelClasses;
|
|
60 |
import eu.dnetlib.data.mapreduce.hbase.index.config.RelClassesTest;
|
|
61 |
import eu.dnetlib.data.mapreduce.util.OafDecoder;
|
|
62 |
import eu.dnetlib.data.mapreduce.util.OafRowKeyDecoder;
|
|
63 |
import eu.dnetlib.data.mapreduce.util.RelDescriptor;
|
|
64 |
import eu.dnetlib.data.mapreduce.util.UpdateMerger;
|
|
65 |
import eu.dnetlib.data.mapreduce.util.XmlRecordFactory;
|
|
66 |
import eu.dnetlib.data.mapreduce.util.XmlRecordFactoryTest;
|
25 |
67 |
import eu.dnetlib.data.proto.KindProtos.Kind;
|
26 |
68 |
import eu.dnetlib.data.proto.OafProtos.Oaf;
|
27 |
69 |
import eu.dnetlib.data.proto.TypeProtos.Type;
|
28 |
70 |
import eu.dnetlib.miscutils.functional.xml.IndentXmlString;
|
29 |
|
import org.apache.commons.io.IOUtils;
|
30 |
|
import org.apache.commons.lang.StringUtils;
|
31 |
|
import org.apache.commons.logging.Log;
|
32 |
|
import org.apache.commons.logging.LogFactory;
|
33 |
|
import org.dom4j.Document;
|
34 |
|
import org.dom4j.DocumentException;
|
35 |
|
import org.dom4j.io.SAXReader;
|
36 |
|
import org.json.JSONObject;
|
37 |
|
import org.junit.Before;
|
38 |
|
import org.junit.Ignore;
|
39 |
|
import org.junit.Test;
|
40 |
|
import org.springframework.core.io.ByteArrayResource;
|
41 |
|
import org.springframework.core.io.Resource;
|
42 |
71 |
|
43 |
|
import static org.junit.Assert.*;
|
44 |
|
|
45 |
72 |
public class XsltRowTransformerFactoryTest {
|
46 |
73 |
|
47 |
74 |
private static final Log log = LogFactory.getLog(XsltRowTransformerFactoryTest.class);
|
... | ... | |
78 |
105 |
doTest(loadFromTransformationProfile("claimRels_2_hbase.xml"), load("recordClaimRel.xml"));
|
79 |
106 |
}
|
80 |
107 |
|
81 |
|
|
82 |
108 |
@Test
|
83 |
109 |
public void testParseFp7IctPUB() throws Exception {
|
84 |
110 |
|
... | ... | |
121 |
147 |
doTest(loadFromTransformationProfile("odf2hbase.xml"), load("doecode.xml"));
|
122 |
148 |
}
|
123 |
149 |
|
124 |
|
|
125 |
150 |
@Test
|
126 |
151 |
public void testParseDatasetLindat() throws Exception {
|
127 |
152 |
|
... | ... | |
152 |
177 |
doTest(loadFromTransformationProfile("oaf2hbase.xml"), load("recordClaimedDedup.xml"));
|
153 |
178 |
}
|
154 |
179 |
|
155 |
|
|
156 |
180 |
@Test
|
157 |
181 |
public void testParseClaimDataset() throws Exception {
|
158 |
182 |
|
159 |
183 |
doTest(loadFromTransformationProfile("odf2hbase.xml"), load("recordClaimDataset.xml"));
|
160 |
184 |
}
|
161 |
185 |
|
162 |
|
|
163 |
186 |
@Test
|
164 |
187 |
public void testParseACM() throws Exception {
|
165 |
188 |
|
... | ... | |
184 |
207 |
doTest(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectRecordFCT.xml"));
|
185 |
208 |
}
|
186 |
209 |
|
187 |
|
|
188 |
210 |
@Test
|
189 |
211 |
public void testParseOaf() throws Exception {
|
190 |
212 |
|
... | ... | |
244 |
266 |
|
245 |
267 |
final List<Row> rows = Lists.newArrayList();
|
246 |
268 |
rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("odf.xml")));
|
247 |
|
//printAll(mapAll(buildTable(rows)));
|
|
269 |
// printAll(mapAll(buildTable(rows)));
|
248 |
270 |
}
|
249 |
271 |
|
250 |
272 |
@Test
|
... | ... | |
254 |
276 |
rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("pangaeOAF2.xml")));
|
255 |
277 |
printAll(mapAll(buildTable(rows)));
|
256 |
278 |
}
|
|
279 |
|
257 |
280 |
@Test
|
258 |
281 |
public void testZenodo() throws Exception {
|
259 |
282 |
|
... | ... | |
310 |
333 |
printAll(mapAll(buildTable(rows)));
|
311 |
334 |
}
|
312 |
335 |
|
313 |
|
|
314 |
|
|
315 |
336 |
@Test
|
316 |
337 |
public void testLinkOrganization() throws Exception {
|
317 |
338 |
|
... | ... | |
341 |
362 |
|
342 |
363 |
final Function<Row, Row> f = rowIn -> {
|
343 |
364 |
|
344 |
|
final List<Column<String,byte[]>> cols = Lists.newArrayList();
|
345 |
|
for(Column<String,byte[]> col : rowIn.getColumns()) {
|
|
365 |
final List<Column<String, byte[]>> cols = Lists.newArrayList();
|
|
366 |
for (final Column<String, byte[]> col : rowIn.getColumns()) {
|
346 |
367 |
if (col.getName().equals("body")) {
|
347 |
368 |
cols.add(new Column(col.getName(), col.getValue()));
|
348 |
369 |
|
... | ... | |
360 |
381 |
rows.addAll(puma1);
|
361 |
382 |
rows.addAll(puma2);
|
362 |
383 |
|
363 |
|
List<Oaf> duplicates = Lists.newArrayList();
|
|
384 |
final List<Oaf> duplicates = Lists.newArrayList();
|
364 |
385 |
duplicates.add(getOafBody(puma1));
|
365 |
386 |
duplicates.add(getOafBody(puma2));
|
366 |
387 |
final Oaf.Builder oafMerge = OafEntityMerger.merge(mergeId, duplicates);
|
... | ... | |
373 |
394 |
}
|
374 |
395 |
|
375 |
396 |
private Oaf getOafBody(final List<Row> rows) throws InvalidProtocolBufferException {
|
376 |
|
for(Row row : rows) {
|
377 |
|
if(StringUtils.startsWith(row.getKey(), "50")) {
|
378 |
|
return Oaf.parseFrom(row.getColumn("body").getValue());
|
|
397 |
for (final Row row : rows) {
|
|
398 |
if (StringUtils.startsWith(row.getKey(), "50")) { return Oaf.parseFrom(row.getColumn("body").getValue());
|
379 |
399 |
|
380 |
400 |
}
|
381 |
401 |
}
|
... | ... | |
393 |
413 |
|
394 |
414 |
doTest(loadFromTransformationProfile("datasources_2_hbase.xsl"), load("datasourceNative.xml"));
|
395 |
415 |
}
|
|
416 |
|
396 |
417 |
@Test
|
397 |
418 |
public void testParseDatasourcePiwik() throws Exception {
|
398 |
419 |
|
... | ... | |
410 |
431 |
doTestJsonGz(loadFromTransformationProfile("oaf2hbase.xml"), load("mdstore_cleaned.json.gz"));
|
411 |
432 |
}
|
412 |
433 |
|
413 |
|
|
414 |
434 |
@Test
|
415 |
435 |
public void testLoadFromTransformationProfile() throws IOException {
|
416 |
|
InputStream in = loadFromTransformationProfile("oaf2hbase.xml");
|
|
436 |
final InputStream in = loadFromTransformationProfile("oaf2hbase.xml");
|
417 |
437 |
log.info(IOUtils.toString(in));
|
418 |
438 |
}
|
419 |
439 |
|
... | ... | |
441 |
461 |
|
442 |
462 |
doTest(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectWithFunderOriginalName.xml"));
|
443 |
463 |
}
|
|
464 |
|
444 |
465 |
@Test
|
445 |
466 |
public void testLinkFunderOriginalName() throws Exception {
|
446 |
467 |
|
... | ... | |
513 |
534 |
final List<Row> rows = Lists.newArrayList();
|
514 |
535 |
rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("oafWithCommunity.xml")));
|
515 |
536 |
|
516 |
|
ActionFactory actionFactory = new ActionFactory();
|
|
537 |
final ActionFactory actionFactory = new ActionFactory();
|
517 |
538 |
|
518 |
|
Map<String, Resource> xslts = Maps.newHashMap();
|
|
539 |
final Map<String, Resource> xslts = Maps.newHashMap();
|
519 |
540 |
|
520 |
541 |
xslts.put("oaf2hbase", new ByteArrayResource(IOUtils.toString(loadFromTransformationProfile("oaf2hbase.xml")).getBytes()));
|
521 |
542 |
actionFactory.setXslts(xslts);
|
522 |
543 |
|
523 |
|
XsltInfoPackageAction pa = actionFactory.generateInfoPackageAction(
|
|
544 |
final XsltInfoPackageAction pa = actionFactory.generateInfoPackageAction(
|
524 |
545 |
"oaf2hbase",
|
525 |
546 |
"rawset-id",
|
526 |
547 |
new Agent("agent-id", "agent-name", Agent.AGENT_TYPE.algo),
|
... | ... | |
533 |
554 |
|
534 |
555 |
IOUtils.readLines(load("country_updates.json")).forEach(line -> {
|
535 |
556 |
|
536 |
|
Oaf.Builder oaf = Oaf.newBuilder();
|
|
557 |
final Oaf.Builder oaf = Oaf.newBuilder();
|
537 |
558 |
|
538 |
559 |
try {
|
539 |
560 |
JsonFormat.merge(line, oaf);
|
540 |
|
} catch (JsonFormat.ParseException e) {
|
|
561 |
} catch (final JsonFormat.ParseException e) {
|
541 |
562 |
throw new IllegalArgumentException(e);
|
542 |
563 |
}
|
543 |
564 |
|
544 |
|
Column<String, byte[]> col = new Column<>("update_" + System.nanoTime(), oaf.build().toByteArray());
|
|
565 |
final Column<String, byte[]> col = new Column<>("update_" + System.nanoTime(), oaf.build().toByteArray());
|
545 |
566 |
rows.add(new Row("result", oaf.getEntity().getId(), Lists.newArrayList(col)));
|
546 |
567 |
});
|
547 |
568 |
|
548 |
569 |
pa.asAtomicActions().forEach(a -> {
|
549 |
|
Column<String, byte[]> col = new Column<>("update_" + System.nanoTime(), a.getTargetValue());
|
|
570 |
final Column<String, byte[]> col = new Column<>("update_" + System.nanoTime(), a.getTargetValue());
|
550 |
571 |
rows.add(new Row(a.getTargetColumnFamily(), a.getTargetRowKey(), Lists.newArrayList(col)));
|
551 |
572 |
});
|
552 |
573 |
|
553 |
|
|
554 |
574 |
/*
|
555 |
|
rows.forEach(r -> {
|
556 |
|
log.info(r);
|
557 |
|
});
|
558 |
|
*/
|
|
575 |
* rows.forEach(r -> { log.info(r); });
|
|
576 |
*/
|
559 |
577 |
|
560 |
578 |
mapAll(buildTable(rows)).entrySet().forEach(b -> {
|
561 |
579 |
log.info(b.getKey());
|
... | ... | |
589 |
607 |
doTest(loadFromTransformationProfile("odf2hbase.xml"), load("guidelines4_qeios1.xml"));
|
590 |
608 |
}
|
591 |
609 |
|
592 |
|
@Test
|
593 |
|
public void testGuidelines4Aria() throws Exception {
|
|
610 |
@Test
|
|
611 |
public void testGuidelines4Aria() throws Exception {
|
594 |
612 |
|
595 |
|
doTest(loadFromTransformationProfile("odf2hbase.xml"), load("guidelines4_aria.xml"));
|
596 |
|
}
|
|
613 |
doTest(loadFromTransformationProfile("odf2hbase.xml"), load("guidelines4_aria.xml"));
|
|
614 |
}
|
597 |
615 |
|
598 |
616 |
@Test
|
599 |
617 |
public void testJournalRecord() throws Exception {
|
... | ... | |
610 |
628 |
printAll(mapAll(buildTable(rows)));
|
611 |
629 |
}
|
612 |
630 |
|
|
631 |
@Test
|
|
632 |
public void testOpenOrganizations() throws Exception {
|
|
633 |
|
|
634 |
final List<Row> rows = Lists.newArrayList();
|
|
635 |
rows.addAll(asRows(loadFromTransformationProfile("organizations_2_hbase.xsl"), load("openorgs_sample.xml")));
|
|
636 |
|
|
637 |
printAll(mapAll(buildTable(rows)));
|
|
638 |
}
|
|
639 |
|
613 |
640 |
private void doTestJsonRow(final String json) throws Exception {
|
614 |
|
Row row = asRowFromJson(json);
|
|
641 |
final Row row = asRowFromJson(json);
|
615 |
642 |
log.info(row);
|
616 |
|
List<Row> rows = new ArrayList<>();
|
|
643 |
final List<Row> rows = new ArrayList<>();
|
617 |
644 |
rows.add(row);
|
618 |
645 |
final Map<String, Map<String, Map<String, byte[]>>> table = buildTable(rows);
|
619 |
646 |
final Map<String, XmlRecordFactory> builders = mapAll(table);
|
... | ... | |
654 |
681 |
final List<Row> rows = rowsIterator.next();
|
655 |
682 |
i++;
|
656 |
683 |
|
657 |
|
if ((i % 10000) == 0) {
|
|
684 |
if (i % 10000 == 0) {
|
658 |
685 |
System.out.println(i);
|
659 |
686 |
}
|
660 |
687 |
|
... | ... | |
698 |
725 |
return asRows(xsltStream, params, recordStream, null);
|
699 |
726 |
}
|
700 |
727 |
|
701 |
|
private List<Row> asRows(final InputStream xsltStream, final Map<String, Object> params, final InputStream recordStream, final Function<Row, Row> p) throws Exception {
|
|
728 |
private List<Row> asRows(final InputStream xsltStream, final Map<String, Object> params, final InputStream recordStream, final Function<Row, Row> p)
|
|
729 |
throws Exception {
|
702 |
730 |
final String xslt = IOUtils.toString(xsltStream);
|
703 |
731 |
final XsltRowTransformer transformer = factory.getTransformer(xslt, params);
|
704 |
732 |
assertNotNull(transformer);
|
... | ... | |
804 |
832 |
if (c.getName().equals("body")) {
|
805 |
833 |
final String theBody = new String(c.getValue(), "UTF-8");
|
806 |
834 |
assertTrue(StringUtils.isNotBlank(theBody));
|
807 |
|
//System.out.println(theBody);
|
|
835 |
// System.out.println(theBody);
|
808 |
836 |
}
|
809 |
837 |
}
|
810 |
838 |
}
|
... | ... | |
839 |
867 |
|
840 |
868 |
final Map<String, byte[]> familyMap = row.get(type.toString());
|
841 |
869 |
|
842 |
|
if (familyMap == null) return;
|
|
870 |
if (familyMap == null) { return; }
|
843 |
871 |
|
844 |
872 |
final byte[] bodyB = familyMap.get("body");
|
845 |
873 |
|
... | ... | |
857 |
885 |
final String it = ld.getRelDescriptor().getIt();
|
858 |
886 |
final Map<String, byte[]> cols = row.get(it);
|
859 |
887 |
|
860 |
|
if ((cols != null) && !cols.isEmpty()) {
|
|
888 |
if (cols != null && !cols.isEmpty()) {
|
861 |
889 |
|
862 |
890 |
for (final byte[] oafB : cols.values()) {
|
863 |
891 |
|
... | ... | |
901 |
929 |
for (final Entry<String, XmlRecordFactory> e : builders.entrySet()) {
|
902 |
930 |
final OafRowKeyDecoder kd = OafRowKeyDecoder.decode(e.getKey());
|
903 |
931 |
|
904 |
|
if (!e.getValue().isValid()) throw new IllegalArgumentException("invalid builder: " + e.getKey());
|
|
932 |
if (!e.getValue().isValid()) { throw new IllegalArgumentException("invalid builder: " + e.getKey()); }
|
905 |
933 |
if (types.contains(kd.getType())) {
|
906 |
934 |
final String val = IndentXmlString.apply(e.getValue().build());
|
907 |
935 |
|
908 |
|
if ((xpaths != null) && !xpaths.isEmpty() && (xpaths.get(kd.getType()) != null)) {
|
|
936 |
if (xpaths != null && !xpaths.isEmpty() && xpaths.get(kd.getType()) != null) {
|
909 |
937 |
final Document doc = r.read(new StringReader(val));
|
910 |
938 |
|
911 |
939 |
log.debug("\n" + e.getKey());
|
... | ... | |
937 |
965 |
|
938 |
966 |
private XmlRecordFactory newBuilder() throws TransformerConfigurationException, TransformerFactoryConfigurationError, DocumentException {
|
939 |
967 |
return new XmlRecordFactory(entityConfigTable, ContextMapper.fromXml(Context.xml),
|
940 |
|
RelClasses.fromJSon(RelClassesTest.relClassesJson), XmlRecordFactoryTest.SCHEMA_LOCATION, true, false, false, XmlRecordFactoryTest.specialDatasourceTypes);
|
|
968 |
RelClasses.fromJSon(RelClassesTest.relClassesJson), XmlRecordFactoryTest.SCHEMA_LOCATION, true, false, false,
|
|
969 |
XmlRecordFactoryTest.specialDatasourceTypes);
|
941 |
970 |
}
|
942 |
971 |
|
943 |
972 |
private InputStream load(final String fileName) {
|
... | ... | |
946 |
975 |
|
947 |
976 |
private InputStream loadFromTransformationProfile(final String profilePath) {
|
948 |
977 |
log.info("Loading xslt from: " + basePathProfiles + profilePath);
|
949 |
|
InputStream profile = getClass().getResourceAsStream(basePathProfiles + profilePath);
|
950 |
|
SAXReader saxReader = new SAXReader();
|
|
978 |
final InputStream profile = getClass().getResourceAsStream(basePathProfiles + profilePath);
|
|
979 |
final SAXReader saxReader = new SAXReader();
|
951 |
980 |
Document doc = null;
|
952 |
981 |
try {
|
953 |
982 |
doc = saxReader.read(profile);
|
954 |
|
} catch (DocumentException e) {
|
|
983 |
} catch (final DocumentException e) {
|
955 |
984 |
e.printStackTrace();
|
956 |
985 |
throw new RuntimeException(e);
|
957 |
986 |
}
|
958 |
|
String xslt = doc.selectSingleNode("//SCRIPT/CODE/*[local-name()='stylesheet']").asXML();
|
959 |
|
//log.info(xslt);
|
|
987 |
final String xslt = doc.selectSingleNode("//SCRIPT/CODE/*[local-name()='stylesheet']").asXML();
|
|
988 |
// log.info(xslt);
|
960 |
989 |
return IOUtils.toInputStream(xslt);
|
961 |
990 |
}
|
962 |
991 |
|
963 |
|
|
964 |
|
private Row asRowFromJson(String json) throws ParseException {
|
965 |
|
Oaf.Builder oafBuilder = Oaf.newBuilder();
|
|
992 |
private Row asRowFromJson(final String json) throws ParseException {
|
|
993 |
final Oaf.Builder oafBuilder = Oaf.newBuilder();
|
966 |
994 |
JsonFormat.merge(json, oafBuilder);
|
967 |
995 |
final Oaf oaf = oafBuilder.build();
|
968 |
|
Row row = new Row("result", oaf.getEntity().getId());
|
969 |
|
Column<String, byte[]> c = new Column<>("body", oaf.toByteArray());
|
|
996 |
final Row row = new Row("result", oaf.getEntity().getId());
|
|
997 |
final Column<String, byte[]> c = new Column<>("body", oaf.toByteArray());
|
970 |
998 |
row.setColumn("body", c);
|
971 |
999 |
return row;
|
972 |
1000 |
|
(openorgs) added schemeid to pids in sql query