Project

General

Profile

« Previous | Next » 

Revision 57397

(openorgs) added schemeid to pids in sql query

View differences:

modules/dnet-mapreduce-jobs/trunk/src/test/java/eu/dnetlib/data/transform/XsltRowTransformerFactoryTest.java
1 1
package eu.dnetlib.data.transform;
2 2

  
3
import java.io.*;
4
import java.util.*;
3
import static org.junit.Assert.assertFalse;
4
import static org.junit.Assert.assertNotNull;
5
import static org.junit.Assert.assertTrue;
6

  
7
import java.io.BufferedReader;
8
import java.io.IOException;
9
import java.io.InputStream;
10
import java.io.InputStreamReader;
11
import java.io.StringReader;
12
import java.io.UnsupportedEncodingException;
13
import java.util.ArrayList;
14
import java.util.HashMap;
15
import java.util.Iterator;
16
import java.util.List;
17
import java.util.Map;
5 18
import java.util.Map.Entry;
19
import java.util.Set;
6 20
import java.util.zip.GZIPInputStream;
21

  
7 22
import javax.xml.transform.TransformerConfigurationException;
8 23
import javax.xml.transform.TransformerFactoryConfigurationError;
9 24

  
25
import org.apache.commons.io.IOUtils;
26
import org.apache.commons.lang.StringUtils;
27
import org.apache.commons.logging.Log;
28
import org.apache.commons.logging.LogFactory;
29
import org.dom4j.Document;
30
import org.dom4j.DocumentException;
31
import org.dom4j.io.SAXReader;
32
import org.json.JSONObject;
33
import org.junit.Before;
34
import org.junit.Ignore;
35
import org.junit.Test;
36
import org.springframework.core.io.ByteArrayResource;
37
import org.springframework.core.io.Resource;
38

  
10 39
import com.google.common.base.Function;
11 40
import com.google.common.collect.Iterables;
12 41
import com.google.common.collect.Lists;
......
15 44
import com.google.protobuf.InvalidProtocolBufferException;
16 45
import com.googlecode.protobuf.format.JsonFormat;
17 46
import com.googlecode.protobuf.format.JsonFormat.ParseException;
47

  
18 48
import eu.dnetlib.actionmanager.actions.ActionFactory;
19 49
import eu.dnetlib.actionmanager.actions.XsltInfoPackageAction;
20 50
import eu.dnetlib.actionmanager.common.Agent;
21 51
import eu.dnetlib.actionmanager.common.Operation;
22 52
import eu.dnetlib.actionmanager.common.Provenance;
23
import eu.dnetlib.data.mapreduce.hbase.index.config.*;
24
import eu.dnetlib.data.mapreduce.util.*;
53
import eu.dnetlib.data.mapreduce.hbase.index.config.Context;
54
import eu.dnetlib.data.mapreduce.hbase.index.config.ContextMapper;
55
import eu.dnetlib.data.mapreduce.hbase.index.config.EntityConfigTable;
56
import eu.dnetlib.data.mapreduce.hbase.index.config.IndexConfig;
57
import eu.dnetlib.data.mapreduce.hbase.index.config.IndexConfigTest;
58
import eu.dnetlib.data.mapreduce.hbase.index.config.LinkDescriptor;
59
import eu.dnetlib.data.mapreduce.hbase.index.config.RelClasses;
60
import eu.dnetlib.data.mapreduce.hbase.index.config.RelClassesTest;
61
import eu.dnetlib.data.mapreduce.util.OafDecoder;
62
import eu.dnetlib.data.mapreduce.util.OafRowKeyDecoder;
63
import eu.dnetlib.data.mapreduce.util.RelDescriptor;
64
import eu.dnetlib.data.mapreduce.util.UpdateMerger;
65
import eu.dnetlib.data.mapreduce.util.XmlRecordFactory;
66
import eu.dnetlib.data.mapreduce.util.XmlRecordFactoryTest;
25 67
import eu.dnetlib.data.proto.KindProtos.Kind;
26 68
import eu.dnetlib.data.proto.OafProtos.Oaf;
27 69
import eu.dnetlib.data.proto.TypeProtos.Type;
28 70
import eu.dnetlib.miscutils.functional.xml.IndentXmlString;
29
import org.apache.commons.io.IOUtils;
30
import org.apache.commons.lang.StringUtils;
31
import org.apache.commons.logging.Log;
32
import org.apache.commons.logging.LogFactory;
33
import org.dom4j.Document;
34
import org.dom4j.DocumentException;
35
import org.dom4j.io.SAXReader;
36
import org.json.JSONObject;
37
import org.junit.Before;
38
import org.junit.Ignore;
39
import org.junit.Test;
40
import org.springframework.core.io.ByteArrayResource;
41
import org.springframework.core.io.Resource;
42 71

  
43
import static org.junit.Assert.*;
44

  
45 72
public class XsltRowTransformerFactoryTest {
46 73

  
47 74
	private static final Log log = LogFactory.getLog(XsltRowTransformerFactoryTest.class);
......
78 105
		doTest(loadFromTransformationProfile("claimRels_2_hbase.xml"), load("recordClaimRel.xml"));
79 106
	}
80 107

  
81

  
82 108
	@Test
83 109
	public void testParseFp7IctPUB() throws Exception {
84 110

  
......
121 147
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("doecode.xml"));
122 148
	}
123 149

  
124

  
125 150
	@Test
126 151
	public void testParseDatasetLindat() throws Exception {
127 152

  
......
152 177
		doTest(loadFromTransformationProfile("oaf2hbase.xml"), load("recordClaimedDedup.xml"));
153 178
	}
154 179

  
155

  
156 180
	@Test
157 181
	public void testParseClaimDataset() throws Exception {
158 182

  
159 183
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("recordClaimDataset.xml"));
160 184
	}
161 185

  
162

  
163 186
	@Test
164 187
	public void testParseACM() throws Exception {
165 188

  
......
184 207
		doTest(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectRecordFCT.xml"));
185 208
	}
186 209

  
187

  
188 210
	@Test
189 211
	public void testParseOaf() throws Exception {
190 212

  
......
244 266

  
245 267
		final List<Row> rows = Lists.newArrayList();
246 268
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("odf.xml")));
247
		//printAll(mapAll(buildTable(rows)));
269
		// printAll(mapAll(buildTable(rows)));
248 270
	}
249 271

  
250 272
	@Test
......
254 276
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("pangaeOAF2.xml")));
255 277
		printAll(mapAll(buildTable(rows)));
256 278
	}
279

  
257 280
	@Test
258 281
	public void testZenodo() throws Exception {
259 282

  
......
310 333
		printAll(mapAll(buildTable(rows)));
311 334
	}
312 335

  
313

  
314

  
315 336
	@Test
316 337
	public void testLinkOrganization() throws Exception {
317 338

  
......
341 362

  
342 363
		final Function<Row, Row> f = rowIn -> {
343 364

  
344
			final List<Column<String,byte[]>> cols = Lists.newArrayList();
345
			for(Column<String,byte[]> col : rowIn.getColumns()) {
365
			final List<Column<String, byte[]>> cols = Lists.newArrayList();
366
			for (final Column<String, byte[]> col : rowIn.getColumns()) {
346 367
				if (col.getName().equals("body")) {
347 368
					cols.add(new Column(col.getName(), col.getValue()));
348 369

  
......
360 381
		rows.addAll(puma1);
361 382
		rows.addAll(puma2);
362 383

  
363
		List<Oaf> duplicates = Lists.newArrayList();
384
		final List<Oaf> duplicates = Lists.newArrayList();
364 385
		duplicates.add(getOafBody(puma1));
365 386
		duplicates.add(getOafBody(puma2));
366 387
		final Oaf.Builder oafMerge = OafEntityMerger.merge(mergeId, duplicates);
......
373 394
	}
374 395

  
375 396
	private Oaf getOafBody(final List<Row> rows) throws InvalidProtocolBufferException {
376
		for(Row row : rows) {
377
			if(StringUtils.startsWith(row.getKey(), "50")) {
378
				return Oaf.parseFrom(row.getColumn("body").getValue());
397
		for (final Row row : rows) {
398
			if (StringUtils.startsWith(row.getKey(), "50")) { return Oaf.parseFrom(row.getColumn("body").getValue());
379 399

  
380 400
			}
381 401
		}
......
393 413

  
394 414
		doTest(loadFromTransformationProfile("datasources_2_hbase.xsl"), load("datasourceNative.xml"));
395 415
	}
416

  
396 417
	@Test
397 418
	public void testParseDatasourcePiwik() throws Exception {
398 419

  
......
410 431
		doTestJsonGz(loadFromTransformationProfile("oaf2hbase.xml"), load("mdstore_cleaned.json.gz"));
411 432
	}
412 433

  
413

  
414 434
	@Test
415 435
	public void testLoadFromTransformationProfile() throws IOException {
416
		InputStream in = loadFromTransformationProfile("oaf2hbase.xml");
436
		final InputStream in = loadFromTransformationProfile("oaf2hbase.xml");
417 437
		log.info(IOUtils.toString(in));
418 438
	}
419 439

  
......
441 461

  
442 462
		doTest(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectWithFunderOriginalName.xml"));
443 463
	}
464

  
444 465
	@Test
445 466
	public void testLinkFunderOriginalName() throws Exception {
446 467

  
......
513 534
		final List<Row> rows = Lists.newArrayList();
514 535
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("oafWithCommunity.xml")));
515 536

  
516
		ActionFactory actionFactory = new ActionFactory();
537
		final ActionFactory actionFactory = new ActionFactory();
517 538

  
518
		Map<String, Resource> xslts = Maps.newHashMap();
539
		final Map<String, Resource> xslts = Maps.newHashMap();
519 540

  
520 541
		xslts.put("oaf2hbase", new ByteArrayResource(IOUtils.toString(loadFromTransformationProfile("oaf2hbase.xml")).getBytes()));
521 542
		actionFactory.setXslts(xslts);
522 543

  
523
		XsltInfoPackageAction pa = actionFactory.generateInfoPackageAction(
544
		final XsltInfoPackageAction pa = actionFactory.generateInfoPackageAction(
524 545
				"oaf2hbase",
525 546
				"rawset-id",
526 547
				new Agent("agent-id", "agent-name", Agent.AGENT_TYPE.algo),
......
533 554

  
534 555
		IOUtils.readLines(load("country_updates.json")).forEach(line -> {
535 556

  
536
			Oaf.Builder oaf = Oaf.newBuilder();
557
			final Oaf.Builder oaf = Oaf.newBuilder();
537 558

  
538 559
			try {
539 560
				JsonFormat.merge(line, oaf);
540
			} catch (JsonFormat.ParseException e) {
561
			} catch (final JsonFormat.ParseException e) {
541 562
				throw new IllegalArgumentException(e);
542 563
			}
543 564

  
544
			Column<String, byte[]> col = new Column<>("update_" + System.nanoTime(), oaf.build().toByteArray());
565
			final Column<String, byte[]> col = new Column<>("update_" + System.nanoTime(), oaf.build().toByteArray());
545 566
			rows.add(new Row("result", oaf.getEntity().getId(), Lists.newArrayList(col)));
546 567
		});
547 568

  
548 569
		pa.asAtomicActions().forEach(a -> {
549
			Column<String, byte[]> col = new Column<>("update_" + System.nanoTime(), a.getTargetValue());
570
			final Column<String, byte[]> col = new Column<>("update_" + System.nanoTime(), a.getTargetValue());
550 571
			rows.add(new Row(a.getTargetColumnFamily(), a.getTargetRowKey(), Lists.newArrayList(col)));
551 572
		});
552 573

  
553

  
554 574
		/*
555
		rows.forEach(r -> {
556
			log.info(r);
557
		});
558
		*/
575
		 * rows.forEach(r -> { log.info(r); });
576
		 */
559 577

  
560 578
		mapAll(buildTable(rows)).entrySet().forEach(b -> {
561 579
			log.info(b.getKey());
......
589 607
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("guidelines4_qeios1.xml"));
590 608
	}
591 609

  
592
    @Test
593
    public void testGuidelines4Aria() throws Exception {
610
	@Test
611
	public void testGuidelines4Aria() throws Exception {
594 612

  
595
        doTest(loadFromTransformationProfile("odf2hbase.xml"), load("guidelines4_aria.xml"));
596
    }
613
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("guidelines4_aria.xml"));
614
	}
597 615

  
598 616
	@Test
599 617
	public void testJournalRecord() throws Exception {
......
610 628
		printAll(mapAll(buildTable(rows)));
611 629
	}
612 630

  
631
	@Test
632
	public void testOpenOrganizations() throws Exception {
633

  
634
		final List<Row> rows = Lists.newArrayList();
635
		rows.addAll(asRows(loadFromTransformationProfile("organizations_2_hbase.xsl"), load("openorgs_sample.xml")));
636

  
637
		printAll(mapAll(buildTable(rows)));
638
	}
639

  
613 640
	private void doTestJsonRow(final String json) throws Exception {
614
		Row row = asRowFromJson(json);
641
		final Row row = asRowFromJson(json);
615 642
		log.info(row);
616
		List<Row> rows = new ArrayList<>();
643
		final List<Row> rows = new ArrayList<>();
617 644
		rows.add(row);
618 645
		final Map<String, Map<String, Map<String, byte[]>>> table = buildTable(rows);
619 646
		final Map<String, XmlRecordFactory> builders = mapAll(table);
......
654 681
			final List<Row> rows = rowsIterator.next();
655 682
			i++;
656 683

  
657
			if ((i % 10000) == 0) {
684
			if (i % 10000 == 0) {
658 685
				System.out.println(i);
659 686
			}
660 687

  
......
698 725
		return asRows(xsltStream, params, recordStream, null);
699 726
	}
700 727

  
701
	private List<Row> asRows(final InputStream xsltStream, final Map<String, Object> params, final InputStream recordStream, final Function<Row, Row> p) throws Exception {
728
	private List<Row> asRows(final InputStream xsltStream, final Map<String, Object> params, final InputStream recordStream, final Function<Row, Row> p)
729
			throws Exception {
702 730
		final String xslt = IOUtils.toString(xsltStream);
703 731
		final XsltRowTransformer transformer = factory.getTransformer(xslt, params);
704 732
		assertNotNull(transformer);
......
804 832
				if (c.getName().equals("body")) {
805 833
					final String theBody = new String(c.getValue(), "UTF-8");
806 834
					assertTrue(StringUtils.isNotBlank(theBody));
807
					//System.out.println(theBody);
835
					// System.out.println(theBody);
808 836
				}
809 837
			}
810 838
		}
......
839 867

  
840 868
		final Map<String, byte[]> familyMap = row.get(type.toString());
841 869

  
842
		if (familyMap == null) return;
870
		if (familyMap == null) { return; }
843 871

  
844 872
		final byte[] bodyB = familyMap.get("body");
845 873

  
......
857 885
				final String it = ld.getRelDescriptor().getIt();
858 886
				final Map<String, byte[]> cols = row.get(it);
859 887

  
860
				if ((cols != null) && !cols.isEmpty()) {
888
				if (cols != null && !cols.isEmpty()) {
861 889

  
862 890
					for (final byte[] oafB : cols.values()) {
863 891

  
......
901 929
		for (final Entry<String, XmlRecordFactory> e : builders.entrySet()) {
902 930
			final OafRowKeyDecoder kd = OafRowKeyDecoder.decode(e.getKey());
903 931

  
904
			if (!e.getValue().isValid()) throw new IllegalArgumentException("invalid builder: " + e.getKey());
932
			if (!e.getValue().isValid()) { throw new IllegalArgumentException("invalid builder: " + e.getKey()); }
905 933
			if (types.contains(kd.getType())) {
906 934
				final String val = IndentXmlString.apply(e.getValue().build());
907 935

  
908
				if ((xpaths != null) && !xpaths.isEmpty() && (xpaths.get(kd.getType()) != null)) {
936
				if (xpaths != null && !xpaths.isEmpty() && xpaths.get(kd.getType()) != null) {
909 937
					final Document doc = r.read(new StringReader(val));
910 938

  
911 939
					log.debug("\n" + e.getKey());
......
937 965

  
938 966
	private XmlRecordFactory newBuilder() throws TransformerConfigurationException, TransformerFactoryConfigurationError, DocumentException {
939 967
		return new XmlRecordFactory(entityConfigTable, ContextMapper.fromXml(Context.xml),
940
				RelClasses.fromJSon(RelClassesTest.relClassesJson), XmlRecordFactoryTest.SCHEMA_LOCATION, true, false, false, XmlRecordFactoryTest.specialDatasourceTypes);
968
				RelClasses.fromJSon(RelClassesTest.relClassesJson), XmlRecordFactoryTest.SCHEMA_LOCATION, true, false, false,
969
				XmlRecordFactoryTest.specialDatasourceTypes);
941 970
	}
942 971

  
943 972
	private InputStream load(final String fileName) {
......
946 975

  
947 976
	private InputStream loadFromTransformationProfile(final String profilePath) {
948 977
		log.info("Loading xslt from: " + basePathProfiles + profilePath);
949
		InputStream profile = getClass().getResourceAsStream(basePathProfiles + profilePath);
950
		SAXReader saxReader = new SAXReader();
978
		final InputStream profile = getClass().getResourceAsStream(basePathProfiles + profilePath);
979
		final SAXReader saxReader = new SAXReader();
951 980
		Document doc = null;
952 981
		try {
953 982
			doc = saxReader.read(profile);
954
		} catch (DocumentException e) {
983
		} catch (final DocumentException e) {
955 984
			e.printStackTrace();
956 985
			throw new RuntimeException(e);
957 986
		}
958
		String xslt = doc.selectSingleNode("//SCRIPT/CODE/*[local-name()='stylesheet']").asXML();
959
		//log.info(xslt);
987
		final String xslt = doc.selectSingleNode("//SCRIPT/CODE/*[local-name()='stylesheet']").asXML();
988
		// log.info(xslt);
960 989
		return IOUtils.toInputStream(xslt);
961 990
	}
962 991

  
963

  
964
	private Row asRowFromJson(String json) throws ParseException {
965
		Oaf.Builder oafBuilder = Oaf.newBuilder();
992
	private Row asRowFromJson(final String json) throws ParseException {
993
		final Oaf.Builder oafBuilder = Oaf.newBuilder();
966 994
		JsonFormat.merge(json, oafBuilder);
967 995
		final Oaf oaf = oafBuilder.build();
968
		Row row = new Row("result", oaf.getEntity().getId());
969
		Column<String, byte[]> c = new Column<>("body", oaf.toByteArray());
996
		final Row row = new Row("result", oaf.getEntity().getId());
997
		final Column<String, byte[]> c = new Column<>("body", oaf.toByteArray());
970 998
		row.setColumn("body", c);
971 999
		return row;
972 1000

  
modules/dnet-mapreduce-jobs/trunk/src/test/resources/eu/dnetlib/data/transform/openorgs_sample.xml
1
<?xml version="1.0" encoding="UTF-8"?>
2
<ROW>
3
	<FIELD name="trust">0.95</FIELD>
4
	<FIELD name="collectedfromid">openaire____::openorgs</FIELD>
5
	<FIELD name="country">AU@@@dnet:countries</FIELD>
6
	<FIELD name="legalname">Australian National University</FIELD>
7
	<FIELD name="legalshortname">ANU</FIELD>
8
	<FIELD name="pid">
9
		<ITEM>0000 0001 2180 7477###ISNI@@@dnet:pid_types</ITEM>
10
		<ITEM>100009020###FundRef@@@dnet:pid_types</ITEM>
11
		<ITEM>285106###OrgRef@@@dnet:pid_types</ITEM>
12
		<ITEM>501100000995###FundRef@@@dnet:pid_types</ITEM>
13
		<ITEM>501100001151###FundRef@@@dnet:pid_types</ITEM>
14
		<ITEM>Q127990###Wikidata@@@dnet:pid_types</ITEM>
15
		<ITEM>grid.1001.0###grid.ac@@@dnet:pid_types</ITEM>
16
	</FIELD>
17
	<FIELD name="organizationid">openorgs____::0000000001</FIELD>
18
	<FIELD name="alternativeNames">
19
		<ITEM>Australian National University</ITEM>
20
	</FIELD>
21
	<FIELD name="provenanceaction">sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions
22
	</FIELD>
23
	<FIELD name="inferenceprovenance"></FIELD>
24
	<FIELD name="websiteurl">http://www.anu.edu.au/</FIELD>
25
	<FIELD name="collectedfromname">OpenOrgs Database</FIELD>
26
	<FIELD name="dateoftransformation">2019-10-14 08:59:35.295767</FIELD>
27
	<FIELD name="inferred">false</FIELD>
28
	<FIELD name="deletedbyinference">false</FIELD>
29
</ROW>
modules/dnet-openaireplus-workflows/trunk/src/main/resources/eu/dnetlib/msro/openaireplus/workflows/hbase/queryOrganizationsFromOpenOrgsDB.sql
13 13
	'OpenOrgs Database'                                                                                                           AS collectedfromname,
14 14
	o.country || '@@@dnet:countries'                                                                                              AS country,
15 15
	'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction,
16
	--array_agg(DISTINCT i.otherid || '###' || i.type)                                                                              AS pid
17
	ARRAY[]::text[]                                           AS pid
16
	array_agg(DISTINCT i.otherid || '###' || i.type || '@@@dnet:pid_types')                                                       AS pid
18 17
FROM organizations o
19 18
	LEFT OUTER JOIN acronyms a    ON (a.id = o.id)
20 19
	LEFT OUTER JOIN urls u        ON (u.id = o.id)

Also available in: Unified diff