Project

General

Profile

1 26600 sandro.lab
package eu.dnetlib.data.transform;
2
3 39431 claudio.at
import java.io.*;
4
import java.util.*;
5 30967 claudio.at
import java.util.Map.Entry;
6 35746 alessia.ba
import java.util.zip.GZIPInputStream;
7 30967 claudio.at
import javax.xml.transform.TransformerConfigurationException;
8
import javax.xml.transform.TransformerFactoryConfigurationError;
9
10 44483 claudio.at
import com.google.common.base.Function;
11
import com.google.common.collect.Iterables;
12 31997 claudio.at
import com.google.common.collect.Lists;
13 30967 claudio.at
import com.google.common.collect.Maps;
14 38025 claudio.at
import com.google.common.collect.Sets;
15 26600 sandro.lab
import com.google.protobuf.InvalidProtocolBufferException;
16 53408 claudio.at
import com.googlecode.protobuf.format.JsonFormat;
17 55177 alessia.ba
import com.googlecode.protobuf.format.JsonFormat.ParseException;
18 53408 claudio.at
import eu.dnetlib.actionmanager.actions.ActionFactory;
19
import eu.dnetlib.actionmanager.actions.XsltInfoPackageAction;
20
import eu.dnetlib.actionmanager.common.Agent;
21
import eu.dnetlib.actionmanager.common.Operation;
22
import eu.dnetlib.actionmanager.common.Provenance;
23 39431 claudio.at
import eu.dnetlib.data.mapreduce.hbase.index.config.*;
24
import eu.dnetlib.data.mapreduce.util.*;
25 26600 sandro.lab
import eu.dnetlib.data.proto.KindProtos.Kind;
26
import eu.dnetlib.data.proto.OafProtos.Oaf;
27 30967 claudio.at
import eu.dnetlib.data.proto.TypeProtos.Type;
28 26600 sandro.lab
import eu.dnetlib.miscutils.functional.xml.IndentXmlString;
29 39431 claudio.at
import org.apache.commons.io.IOUtils;
30
import org.apache.commons.lang.StringUtils;
31 40063 alessia.ba
import org.apache.commons.logging.Log;
32
import org.apache.commons.logging.LogFactory;
33 39431 claudio.at
import org.dom4j.Document;
34
import org.dom4j.DocumentException;
35
import org.dom4j.io.SAXReader;
36
import org.json.JSONObject;
37
import org.junit.Before;
38 48702 claudio.at
import org.junit.Ignore;
39 39431 claudio.at
import org.junit.Test;
40 53408 claudio.at
import org.springframework.core.io.ByteArrayResource;
41
import org.springframework.core.io.Resource;
42 26600 sandro.lab
43 39431 claudio.at
import static org.junit.Assert.*;
44
45 26600 sandro.lab
public class XsltRowTransformerFactoryTest {
46
47 40063 alessia.ba
	private static final Log log = LogFactory.getLog(XsltRowTransformerFactoryTest.class);
48 52562 alessia.ba
	private static String basePathProfiles = "/eu/dnetlib/test/profiles/TransformationRuleDSResources/TransformationRuleDSResourceType/2hbase/";
49 26600 sandro.lab
	private XsltRowTransformerFactory factory;
50 30967 claudio.at
	private EntityConfigTable entityConfigTable;
51 26600 sandro.lab
52
	@Before
53
	public void setUp() throws Exception {
54
		factory = new XsltRowTransformerFactory();
55 30967 claudio.at
		entityConfigTable = IndexConfig.load(IndexConfigTest.config).getConfigMap();
56 26600 sandro.lab
	}
57
58
	@Test
59 48702 claudio.at
	@Ignore // need to reimplement because claimUpdates_2_hbase.xsl was removed
60 42534 alessia.ba
	public void testParseOafClaimUpdate() throws Exception {
61
		doTest(loadFromTransformationProfile("claimUpdates_2_hbase.xsl"), load("recordClaimUpdate.xml"));
62
	}
63
64
	@Test
65 48702 claudio.at
	@Ignore // need to reimplement because claimUpdates_2_hbase.xsl was removed
66 39616 claudio.at
	public void testParseClaimUpdate() throws Exception {
67
68 41468 claudio.at
		final List<Row> rows = Lists.newArrayList();
69
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("recordClaim.xml")));
70
		rows.addAll(asRows(loadFromTransformationProfile("claimUpdates_2_hbase.xsl"), load("recordClaimUpdate.xml")));
71
72
		printAll(mapAll(buildTable(rows)));
73 39616 claudio.at
	}
74
75
	@Test
76 52543 alessia.ba
	public void testParseClaimRel() throws Exception {
77
78
		doTest(loadFromTransformationProfile("claimRels_2_hbase.xml"), load("recordClaimRel.xml"));
79
	}
80
81
82
	@Test
83 49718 claudio.at
	public void testParseFp7IctPUB() throws Exception {
84
85
		doTest(loadFromTransformationProfile("oaf2hbase.xml"), load("ec_fp7_ict.xml"));
86
	}
87
88
	@Test
89 52524 claudio.at
	public void testParseRecordCrossref() throws Exception {
90
91
		doTest(loadFromTransformationProfile("oaf2hbase.xml"), load("recordCrossref.xml"));
92
	}
93
94
	@Test
95 38586 claudio.at
	public void testParseDatasetPUB() throws Exception {
96
97 40063 alessia.ba
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("recordDatasetPUB.xml"));
98 38586 claudio.at
	}
99
100
	@Test
101 52422 claudio.at
	public void testParseSoftwareEgiApp() throws Exception {
102
103
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("softwareEgiApp.xml"));
104
	}
105
106
	@Test
107
	public void testParseSoftwareEgiApp2() throws Exception {
108
109
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("softwareEgiApp2.xml"));
110
	}
111
112
	@Test
113
	public void testParseOrpEgiApp() throws Exception {
114
115
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("orpEgiApp.xml"));
116
	}
117
118
	@Test
119 55099 alessia.ba
	public void testParseSoftwareDOECODE() throws Exception {
120
121
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("doecode.xml"));
122
	}
123
124
125
	@Test
126 48854 claudio.at
	public void testParseDatasetLindat() throws Exception {
127
128
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("datasetLindat.xml"));
129
	}
130
131
	@Test
132 53362 miriam.bag
	public void testParseDatasetNeuroVault() throws Exception {
133
134
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("recordNeuroVault.xml"));
135
	}
136
137
	@Test
138 55182 alessia.ba
	public void testParseDatasetNeuroVault2() throws Exception {
139
140
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("recordNeuroVault2.xml"));
141
	}
142
143
	@Test
144 31997 claudio.at
	public void testParseClaim() throws Exception {
145 26600 sandro.lab
146 40063 alessia.ba
		doTest(loadFromTransformationProfile("oaf2hbase.xml"), load("recordClaim.xml"));
147 30967 claudio.at
	}
148 26600 sandro.lab
149 30967 claudio.at
	@Test
150 55093 alessia.ba
	public void testParseClaimDedup() throws Exception {
151
152
		doTest(loadFromTransformationProfile("oaf2hbase.xml"), load("recordClaimedDedup.xml"));
153
	}
154
155
156
	@Test
157 42825 alessia.ba
	public void testParseClaimDataset() throws Exception {
158
159
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("recordClaimDataset.xml"));
160
	}
161
162 52543 alessia.ba
163 42825 alessia.ba
	@Test
164 39431 claudio.at
	public void testParseACM() throws Exception {
165
166 40063 alessia.ba
		doTest(loadFromTransformationProfile("oaf2hbase.xml"), load("recordACM.xml"));
167 39431 claudio.at
	}
168
169
	@Test
170 39616 claudio.at
	public void testParseASB() throws Exception {
171
172 40063 alessia.ba
		doTest(loadFromTransformationProfile("oaf2hbase.xml"), load("recordASB.xml"));
173 39616 claudio.at
	}
174
175
	@Test
176 33382 claudio.at
	public void testParseProjectCorda() throws Exception {
177 26600 sandro.lab
178 40205 claudio.at
		doTest(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectRecordCorda.xml"));
179 26600 sandro.lab
	}
180
181
	@Test
182 33382 claudio.at
	public void testParseProjectFCT() throws Exception {
183
184 40205 claudio.at
		doTest(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectRecordFCT.xml"));
185 33382 claudio.at
	}
186
187 46587 alessia.ba
188 33382 claudio.at
	@Test
189 40205 claudio.at
	public void testParseOaf() throws Exception {
190 26600 sandro.lab
191 40063 alessia.ba
		doTest(loadFromTransformationProfile("oaf2hbase.xml"), load("record.xml"));
192 30967 claudio.at
	}
193 26600 sandro.lab
194 30967 claudio.at
	@Test
195 40205 claudio.at
	public void testParseOafPublication() throws Exception {
196
197 52980 alessia.ba
		doTest(loadFromTransformationProfile("oaf_entity2hbase.xml"), load("record.xml"));
198 40205 claudio.at
	}
199
200
	@Test
201 43558 claudio.at
	public void testParseLindat() throws Exception {
202
203
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("recordLindat.xml"));
204
	}
205
206
	@Test
207 31997 claudio.at
	public void testParseDatacite() throws Exception {
208 26600 sandro.lab
209 40063 alessia.ba
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("recordDatacite.xml"));
210 30967 claudio.at
	}
211 26600 sandro.lab
212 31997 claudio.at
	@Test
213 33382 claudio.at
	public void testParseDatacite2() throws Exception {
214
215 40063 alessia.ba
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("recordDatacite2.xml"));
216 33382 claudio.at
	}
217
218
	@Test
219 55263 alessia.ba
	public void testParseDataciteNewES() throws Exception {
220
221
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("dataciteNew.xml"));
222
	}
223
224
	@Test
225 42495 alessia.ba
	public void testParseOpenTrials() throws Exception {
226
227
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("opentrials_datacite1.xml"));
228
	}
229
230
	@Test
231 32094 claudio.at
	public void testLinkPangaea() throws Exception {
232 26600 sandro.lab
233 34438 claudio.at
		final List<Row> rows = Lists.newArrayList();
234 40205 claudio.at
		rows.addAll(asRows(loadFromTransformationProfile("odf2hbase.xml"), load("pangaeODF1.xml")));
235
		rows.addAll(asRows(loadFromTransformationProfile("odf2hbase.xml"), load("pangaeODF2.xml")));
236 40063 alessia.ba
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("pangaeOAF.xml")));
237 40205 claudio.at
		rows.addAll(asRows(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectRecordCordaPangaea.xml")));
238 31997 claudio.at
239 38025 claudio.at
		printAll(mapAll(buildTable(rows)));
240 31997 claudio.at
	}
241
242 33382 claudio.at
	@Test
243 57186 sandro.lab
	public void testODF() throws Exception {
244
245
		final List<Row> rows = Lists.newArrayList();
246
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("odf.xml")));
247 57193 sandro.lab
		//printAll(mapAll(buildTable(rows)));
248 57186 sandro.lab
	}
249
250
	@Test
251 43795 alessia.ba
	public void testPangaea() throws Exception {
252
253
		final List<Row> rows = Lists.newArrayList();
254
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("pangaeOAF2.xml")));
255
		printAll(mapAll(buildTable(rows)));
256
	}
257 45034 alessia.ba
	@Test
258
	public void testZenodo() throws Exception {
259 43795 alessia.ba
260 45034 alessia.ba
		final List<Row> rows = Lists.newArrayList();
261
		rows.addAll(asRows(loadFromTransformationProfile("odf2hbase.xml"), load("zenodoData.xml")));
262
		printAll(mapAll(buildTable(rows)));
263
	}
264
265 43795 alessia.ba
	@Test
266 52193 alessia.ba
	public void testZenodoSoftware() throws Exception {
267
268
		final List<Row> rows = Lists.newArrayList();
269
		rows.addAll(asRows(loadFromTransformationProfile("odf2hbase.xml"), load("softwareZenodo_odf.xml")));
270
		printAll(mapAll(buildTable(rows)));
271
	}
272
273
	@Test
274 35179 michele.ar
	public void testLinkCorda() throws Exception {
275
276
		final List<Row> rows = Lists.newArrayList();
277 40205 claudio.at
		rows.addAll(asRows(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectRecordCorda.xml")));
278 40063 alessia.ba
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("recordCorda.xml")));
279 35179 michele.ar
280 38025 claudio.at
		printAll(mapAll(buildTable(rows)));
281 35179 michele.ar
	}
282
283
	@Test
284 33382 claudio.at
	public void testLinkFCT() throws Exception {
285
286 34438 claudio.at
		final List<Row> rows = Lists.newArrayList();
287 40205 claudio.at
		rows.addAll(asRows(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectRecordFCT.xml")));
288 40063 alessia.ba
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("recordFCT.xml")));
289 33382 claudio.at
290 38025 claudio.at
		printAll(mapAll(buildTable(rows)));
291 33382 claudio.at
	}
292
293
	@Test
294 41468 claudio.at
	public void testLinkARC() throws Exception {
295
296
		final List<Row> rows = Lists.newArrayList();
297
		rows.addAll(asRows(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectRecordARC.xml")));
298
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("recordARC.xml")));
299
300
		printAll(mapAll(buildTable(rows)));
301
	}
302
303
	@Test
304 33382 claudio.at
	public void testLinkWT() throws Exception {
305
306 34438 claudio.at
		final List<Row> rows = Lists.newArrayList();
307 40205 claudio.at
		rows.addAll(asRows(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectRecordWT.xml")));
308 40063 alessia.ba
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("recordWT.xml")));
309 33382 claudio.at
310 38025 claudio.at
		printAll(mapAll(buildTable(rows)));
311 33382 claudio.at
	}
312
313 43169 alessia.ba
314
315 34438 claudio.at
	@Test
316
	public void testLinkOrganization() throws Exception {
317
318
		final List<Row> rows = Lists.newArrayList();
319 40205 claudio.at
		rows.addAll(asRows(loadFromTransformationProfile("organizations_2_hbase.xsl"), load("organization.xml")));
320
		rows.addAll(asRows(loadFromTransformationProfile("projectorganization_2_hbase.xsl"), load("project_organization.xml")));
321
		rows.addAll(asRows(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectRecordCorda.xml")));
322 34438 claudio.at
323 38025 claudio.at
		printAll(mapAll(buildTable(rows)));
324 34438 claudio.at
	}
325
326 35746 alessia.ba
	@Test
327 46587 alessia.ba
	public void testLinkOrganizationAffiliation() throws Exception {
328
329
		final List<Row> rows = Lists.newArrayList();
330
		rows.addAll(asRows(loadFromTransformationProfile("organizations_2_hbase.xsl"), load("organization.xml")));
331
		rows.addAll(asRows(loadFromTransformationProfile("resultorganization_2_hbase.xsl"), load("result_organization.xml")));
332
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("record.xml")));
333
334
		printAll(mapAll(buildTable(rows)));
335
	}
336
337
	@Test
338 44483 claudio.at
	public void testDuplicates() throws Exception {
339
		final String mergeId = "50|dedup_wf_001::08ed625d07e5738b794ff14d6773fd9f";
340
		final List<Row> rows = Lists.newArrayList();
341
342 49029 claudio.at
		final Function<Row, Row> f = rowIn -> {
343 44483 claudio.at
344 49029 claudio.at
			final List<Column<String,byte[]>> cols = Lists.newArrayList();
345
			for(Column<String,byte[]> col : rowIn.getColumns()) {
346
				if (col.getName().equals("body")) {
347
					cols.add(new Column(col.getName(), col.getValue()));
348 44483 claudio.at
349
				}
350
			}
351 49029 claudio.at
			return new Row("result", rowIn.getKey(), cols);
352 44483 claudio.at
		};
353
354
		final List<Row> puma1 = asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("recordPuma1.xml"), f);
355
		puma1.add(new Row("resultResult_dedup_isMergedIn", mergeId));
356
357
		final List<Row> puma2 = asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("recordPuma2.xml"), f);
358
		puma2.add(new Row("resultResult_dedup_isMergedIn", mergeId));
359
360
		rows.addAll(puma1);
361
		rows.addAll(puma2);
362
363
		List<Oaf> duplicates = Lists.newArrayList();
364
		duplicates.add(getOafBody(puma1));
365
		duplicates.add(getOafBody(puma2));
366
		final Oaf.Builder oafMerge = OafEntityMerger.merge(mergeId, duplicates);
367
368 48697 claudio.at
		final Row mergeRow = new Row("result", mergeId, Lists.newArrayList(new Column("body", oafMerge.build().toByteArray())));
369 44483 claudio.at
370
		rows.add(mergeRow);
371
372
		printAll(mapAll(buildTable(rows)));
373
	}
374
375
	private Oaf getOafBody(final List<Row> rows) throws InvalidProtocolBufferException {
376
		for(Row row : rows) {
377
			if(StringUtils.startsWith(row.getKey(), "50")) {
378
				return Oaf.parseFrom(row.getColumn("body").getValue());
379
380
			}
381
		}
382
		return null;
383
	}
384
385
	@Test
386 35746 alessia.ba
	public void testParseDoajOAF() throws Exception {
387
388 40063 alessia.ba
		doTest(loadFromTransformationProfile("oaf2hbase.xml"), load("doajUniversityRecord.xml"));
389 35746 alessia.ba
	}
390
391 39888 alessia.ba
	@Test
392
	public void testParseDatasource() throws Exception {
393
394 40205 claudio.at
		doTest(loadFromTransformationProfile("datasources_2_hbase.xsl"), load("datasourceNative.xml"));
395 39888 alessia.ba
	}
396 44899 alessia.ba
	@Test
397
	public void testParseDatasourcePiwik() throws Exception {
398 39888 alessia.ba
399 44899 alessia.ba
		doTest(loadFromTransformationProfile("datasources_2_hbase.xsl"), load("datasourcePiwik.xml"));
400
	}
401
402 40205 claudio.at
	@Test
403 40341 alessia.ba
	public void testParseDataDatasource() throws Exception {
404
405
		doTest(loadFromTransformationProfile("datasources_2_hbase.xsl"), load("dataDatasource.xml"));
406
	}
407
408
	@Test
409 36164 claudio.at
	public void testFromMongodbCompressedDump() throws Exception {
410 40063 alessia.ba
		doTestJsonGz(loadFromTransformationProfile("oaf2hbase.xml"), load("mdstore_cleaned.json.gz"));
411 35746 alessia.ba
	}
412
413 55177 alessia.ba
414 40205 claudio.at
	@Test
415
	public void testLoadFromTransformationProfile() throws IOException {
416
		InputStream in = loadFromTransformationProfile("oaf2hbase.xml");
417
		log.info(IOUtils.toString(in));
418
	}
419
420
	@Test
421
	public void test_template() throws Exception {
422
		final String xslt = IOUtils.toString(loadFromTransformationProfile("oaf2hbase.xml"));
423
		final XsltRowTransformer transformer = factory.getTransformer(xslt);
424
		assertNotNull(transformer);
425
426
		final String record = IOUtils.toString(load("record.xml"));
427
		final List<Row> rows = transformer.apply(record);
428
429
		System.out.println(rows);
430
	}
431
432
	@Test
433
	public void testWrongCharsOrganization() throws Exception {
434
		final List<Row> rows = Lists.newArrayList();
435
		rows.addAll(asRows(loadFromTransformationProfile("organizations_2_hbase.xsl"), load("organizationWrongChars.xml")));
436
		printAll(mapAll(buildTable(rows)));
437
	}
438
439 43169 alessia.ba
	@Test
440
	public void testParseProjectWithFunderOriginalName() throws Exception {
441
442
		doTest(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectWithFunderOriginalName.xml"));
443
	}
444
	@Test
445
	public void testLinkFunderOriginalName() throws Exception {
446
447
		final List<Row> rows = Lists.newArrayList();
448
		rows.addAll(asRows(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectWithFunderOriginalName.xml")));
449
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("recordFunderOriginalName.xml")));
450
451
		printAll(mapAll(buildTable(rows)));
452
	}
453
454 44070 alessia.ba
	@Test
455
	public void testProjectExtraInfo() throws Exception {
456
		final List<Row> rows = Lists.newArrayList();
457
		rows.addAll(asRows(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectRecordExtraInfo.xml")));
458
		printAll(mapAll(buildTable(rows)));
459
	}
460
461 48697 claudio.at
	@Test
462
	public void testParseSoftwareFromODF() throws Exception {
463
		final List<Row> rows = Lists.newArrayList();
464
		rows.addAll(asRows(loadFromTransformationProfile("odf2hbase.xml"), load("softwareODF.xml")));
465
		rows.addAll(asRows(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectRecordCorda.xml")));
466
		printAll(mapAll(buildTable(rows)));
467
	}
468
469 52212 alessia.ba
	@Test
470 48697 claudio.at
	public void testParseSoftwareFromOAF() throws Exception {
471
		final List<Row> rows = Lists.newArrayList();
472 52212 alessia.ba
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("recordOAFsoftwareCLOSED.xml")));
473 48697 claudio.at
		printAll(mapAll(buildTable(rows)));
474
	}
475
476 52212 alessia.ba
	@Test
477 54978 alessia.ba
	public void testParsePubFromODF() throws Exception {
478
		final List<Row> rows = Lists.newArrayList();
479
		rows.addAll(asRows(loadFromTransformationProfile("odf2hbase.xml"), load("recordODFPub.xml")));
480
		printAll(mapAll(buildTable(rows)));
481
	}
482
483
	@Test
484 52212 alessia.ba
	public void testParseSoftwareFromOAFOpen() throws Exception {
485
		final List<Row> rows = Lists.newArrayList();
486
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("recordOAFsoftwareOPEN.xml")));
487
		printAll(mapAll(buildTable(rows)));
488
	}
489
490 52275 alessia.ba
	@Test
491 53756 alessia.ba
	public void testParseSoftwareBiotool() throws Exception {
492
		final List<Row> rows = Lists.newArrayList();
493
		rows.addAll(asRows(loadFromTransformationProfile("odf2hbase.xml"), load("biotoolSw.xml")));
494
		printAll(mapAll(buildTable(rows)));
495
	}
496
497
	@Test
498 52275 alessia.ba
	public void testParseOafWithExternalRef() throws Exception {
499
		final List<Row> rows = Lists.newArrayList();
500
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("oafWithExternalReference.xml")));
501
		printAll(mapAll(buildTable(rows)));
502
	}
503
504 52277 alessia.ba
	@Test
505
	public void testParseOafWithCommunity() throws Exception {
506
		final List<Row> rows = Lists.newArrayList();
507
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("oafWithCommunity.xml")));
508
		printAll(mapAll(buildTable(rows)));
509
	}
510
511 53408 claudio.at
	@Test
512
	public void testParseOafWithUpdates() throws Exception {
513
		final List<Row> rows = Lists.newArrayList();
514
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("oafWithCommunity.xml")));
515
516
		ActionFactory actionFactory = new ActionFactory();
517
518
		Map<String, Resource> xslts = Maps.newHashMap();
519
520
		xslts.put("oaf2hbase", new ByteArrayResource(IOUtils.toString(loadFromTransformationProfile("oaf2hbase.xml")).getBytes()));
521
		actionFactory.setXslts(xslts);
522
523
		XsltInfoPackageAction pa = actionFactory.generateInfoPackageAction(
524
				"oaf2hbase",
525
				"rawset-id",
526
				new Agent("agent-id", "agent-name", Agent.AGENT_TYPE.algo),
527
				Operation.UPDATE,
528
				IOUtils.toString(load("oafUpdateWithCommunity.xml")),
529
				Provenance.sysimport_mining_aggregator,
530
				"0.9");
531
532
		final String qualifier = "update_" + System.nanoTime();
533
534
		IOUtils.readLines(load("country_updates.json")).forEach(line -> {
535
536
			Oaf.Builder oaf = Oaf.newBuilder();
537
538
			try {
539
				JsonFormat.merge(line, oaf);
540
			} catch (JsonFormat.ParseException e) {
541
				throw new IllegalArgumentException(e);
542
			}
543
544
			Column<String, byte[]> col = new Column<>("update_" + System.nanoTime(), oaf.build().toByteArray());
545
			rows.add(new Row("result", oaf.getEntity().getId(), Lists.newArrayList(col)));
546
		});
547
548
		pa.asAtomicActions().forEach(a -> {
549
			Column<String, byte[]> col = new Column<>("update_" + System.nanoTime(), a.getTargetValue());
550
			rows.add(new Row(a.getTargetColumnFamily(), a.getTargetRowKey(), Lists.newArrayList(col)));
551
		});
552
553
554
		/*
555
		rows.forEach(r -> {
556
			log.info(r);
557
		});
558
		*/
559
560
		mapAll(buildTable(rows)).entrySet().forEach(b -> {
561
			log.info(b.getKey());
562
			log.info(b.getValue());
563
		});
564
	}
565
566 54978 alessia.ba
	@Test
567
	public void testParseCrisPub() throws Exception {
568
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("cris_pub1.xml"));
569
	}
570
571 55177 alessia.ba
	@Test
572
	public void testBioToolSwRowJson() throws Exception {
573
		doTestJsonRow(IOUtils.toString(load("biotoolSwRow.json")));
574
	}
575
576 55887 alessia.ba
	@Test
577
	public void testParseVirta() throws Exception {
578
		doTest(loadFromTransformationProfile("datasources_2_hbase.xsl"), load("virta.xml"));
579
	}
580
581
	@Test
582
	public void testParseJournal() throws Exception {
583
		doTest(loadFromTransformationProfile("datasources_2_hbase.xsl"), load("datasourceWithISSN.xml"));
584
	}
585
586 57091 alessia.ba
	@Test
587 57092 alessia.ba
	public void testGuidelines4Qeios() throws Exception {
588 57091 alessia.ba
589
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("guidelines4_qeios1.xml"));
590
	}
591
592 57092 alessia.ba
    @Test
593
    public void testGuidelines4Aria() throws Exception {
594
595
        doTest(loadFromTransformationProfile("odf2hbase.xml"), load("guidelines4_aria.xml"));
596
    }
597
598 57162 alessia.ba
	@Test
599
	public void testJournalRecord() throws Exception {
600
		doTest(loadFromTransformationProfile("oaf2hbase.xml"), load("journalRecord.xml"));
601
	}
602
603 55177 alessia.ba
	private void doTestJsonRow(final String json) throws Exception {
604
		Row row = asRowFromJson(json);
605
		log.info(row);
606
		List<Row> rows = new ArrayList<>();
607
		rows.add(row);
608
		final Map<String, Map<String, Map<String, byte[]>>> table = buildTable(rows);
609
		final Map<String, XmlRecordFactory> builders = mapAll(table);
610
		printAll(builders);
611
612
	}
613
614 31997 claudio.at
	private void doTest(final InputStream xsltStream, final InputStream recordStream) throws Exception {
615
		try {
616 34438 claudio.at
			final List<Row> rows = asRows(xsltStream, recordStream);
617 31997 claudio.at
618 40063 alessia.ba
			log.info(rows);
619 26600 sandro.lab
620 34438 claudio.at
			final Map<String, Map<String, Map<String, byte[]>>> table = buildTable(rows);
621 26600 sandro.lab
622 30967 claudio.at
			// System.out.println("\n" + table.toString());
623 26600 sandro.lab
624 34438 claudio.at
			final Map<String, XmlRecordFactory> builders = mapAll(table);
625 28094 claudio.at
626 38025 claudio.at
			printAll(builders);
627 34438 claudio.at
		} catch (final InvalidProtocolBufferException e) {
628 31997 claudio.at
			throw new Exception(e);
629 34438 claudio.at
		} catch (final TransformerConfigurationException e) {
630 31997 claudio.at
			throw new Exception(e);
631 34438 claudio.at
		} catch (final TransformerFactoryConfigurationError e) {
632 31997 claudio.at
			throw new Exception(e);
633 34438 claudio.at
		} catch (final DocumentException e) {
634 31997 claudio.at
			throw new Exception(e);
635 26600 sandro.lab
		}
636
	}
637
638 35746 alessia.ba
	private void doTestJsonGz(final InputStream xsltStream, final InputStream recordStream) throws Exception {
639
640 36164 claudio.at
		final Iterator<List<Row>> rowsIterator = asRowsJsonGzip(xsltStream, recordStream);
641 35746 alessia.ba
642 36164 claudio.at
		int i = 0;
643
		while (rowsIterator.hasNext()) {
644
			final List<Row> rows = rowsIterator.next();
645
			i++;
646 35746 alessia.ba
647 36164 claudio.at
			if ((i % 10000) == 0) {
648
				System.out.println(i);
649
			}
650 35746 alessia.ba
651 36164 claudio.at
			final Map<String, Map<String, Map<String, byte[]>>> table = buildTableDoaj(rows);
652 35746 alessia.ba
653 36164 claudio.at
			for (final Map<String, Map<String, byte[]>> m : table.values()) {
654
				for (final Map<String, byte[]> mv : m.values()) {
655
					for (final byte[] v : mv.values()) {
656
						final OafDecoder d = OafDecoder.decode(v);
657
						assertNotNull(d);
658
						assertNotNull(d.getOaf());
659
660
						switch (d.getKind()) {
661
						case entity:
662
							assertNotNull(d.getMetadata());
663
							if (d.getOaf().getEntity().getType().equals(Type.result)) {
664
								System.out.println(d.getOaf());
665
							}
666
							break;
667
						case relation:
668
							assertNotNull(d.getRel());
669
							break;
670
						default:
671
							break;
672
						}
673
					}
674
				}
675 35746 alessia.ba
			}
676
		}
677
	}
678
679 44483 claudio.at
	private List<Row> asRows(final InputStream xsltStream, final InputStream recordStream, final Function<Row, Row> p) throws Exception {
680 48697 claudio.at
		return asRows(xsltStream, new HashMap<>(), recordStream, p);
681 44483 claudio.at
	}
682
683 31997 claudio.at
	private List<Row> asRows(final InputStream xsltStream, final InputStream recordStream) throws Exception {
684 48697 claudio.at
		return asRows(xsltStream, new HashMap<>(), recordStream);
685 38025 claudio.at
	}
686
687
	private List<Row> asRows(final InputStream xsltStream, final Map<String, Object> params, final InputStream recordStream) throws Exception {
688 44483 claudio.at
		return asRows(xsltStream, params, recordStream, null);
689
	}
690
691
	private List<Row> asRows(final InputStream xsltStream, final Map<String, Object> params, final InputStream recordStream, final Function<Row, Row> p) throws Exception {
692 34438 claudio.at
		final String xslt = IOUtils.toString(xsltStream);
693 38025 claudio.at
		final XsltRowTransformer transformer = factory.getTransformer(xslt, params);
694 31997 claudio.at
		assertNotNull(transformer);
695 26600 sandro.lab
696 34438 claudio.at
		final String record = IOUtils.toString(recordStream);
697
		final List<Row> rows = transformer.apply(record);
698 31997 claudio.at
699
		assertNotNull(rows);
700
		assertFalse(rows.isEmpty());
701 44483 claudio.at
		return p == null ? rows : Lists.newArrayList(Iterables.transform(rows, p));
702 31997 claudio.at
	}
703
704 35746 alessia.ba
	private Iterator<List<Row>> asRowsJsonGzip(final InputStream xsltStream, final InputStream recordStreamJsonGzip) throws Exception {
705
		final String xslt = IOUtils.toString(xsltStream);
706
		final XsltRowTransformer transformer = factory.getTransformer(xslt);
707
		assertNotNull(transformer);
708
		assertNotNull(recordStreamJsonGzip);
709
710 36164 claudio.at
		final GZIPInputStream stream = new GZIPInputStream(recordStreamJsonGzip);
711 35746 alessia.ba
		assertNotNull(stream);
712
		final BufferedReader inStream = new BufferedReader(new InputStreamReader(stream));
713
		assertNotNull(inStream);
714
		return new Iterator<List<Row>>() {
715
716
			String jsonRecord = null;
717
718
			@Override
719
			public boolean hasNext() {
720
				try {
721
					return (jsonRecord = inStream.readLine()) != null;
722 36164 claudio.at
				} catch (final IOException e) {
723 35746 alessia.ba
					throw new RuntimeException(e);
724
				}
725
			}
726
727
			@Override
728
			public List<Row> next() {
729
730 36164 claudio.at
				final JSONObject jsonObj = new JSONObject(jsonRecord);
731
				final String body = jsonObj.getString("body");
732
				try {
733
					assertTrue(StringUtils.isNotBlank(body));
734
					// System.out.println(body);
735
					final List<Row> rows = transformer.apply(body);
736
					assertNotNull(rows);
737
					assertFalse(rows.isEmpty());
738
					return rows;
739
				} catch (final Throwable e) {
740
					System.err.println("error transforming document: " + body);
741
					throw new RuntimeException(e);
742
				}
743 35746 alessia.ba
			}
744
745
			@Override
746
			public void remove() {
747 43394 claudio.at
				throw new UnsupportedOperationException();
748 35746 alessia.ba
			}
749
750
		};
751
752
	}
753
754
	private Map<String, Map<String, Map<String, byte[]>>> buildTableDoaj(final List<Row> rows) throws UnsupportedEncodingException {
755 34438 claudio.at
		final Map<String, Map<String, Map<String, byte[]>>> table = Maps.newHashMap();
756 31997 claudio.at
757 34438 claudio.at
		for (final Row row : rows) {
758
			final String rowKey = row.getKey();
759
			final String cf = row.getColumnFamily();
760 31997 claudio.at
			if (!table.containsKey(rowKey)) {
761 48697 claudio.at
				table.put(rowKey, new HashMap<>());
762 31997 claudio.at
			}
763
			if (!table.get(rowKey).containsKey(cf)) {
764 48697 claudio.at
				table.get(rowKey).put(row.getColumnFamily(), new HashMap<>());
765 31997 claudio.at
			}
766 34438 claudio.at
			for (final Column<String, byte[]> c : row.getColumns()) {
767 35746 alessia.ba
				// System.out.println(String.format("ADDING K:%s CF:%s Q:%s", rowKey, cf, c.getName()));
768
				table.get(rowKey).get(cf).put(c.getName(), c.getValue());
769
				if (cf.equals("result") && c.getName().equals("body")) {
770 36164 claudio.at
					// System.out.println(String.format("ADDING K:%s CF:%s Q:%s", rowKey, cf, c.getName()));
771 35746 alessia.ba
					assertTrue(StringUtils.isNotBlank(new String(c.getValue(), "UTF-8")));
772
				}
773
			}
774
		}
775
		return table;
776
777
	}
778
779 53588 sandro.lab
	protected Map<String, Map<String, Map<String, byte[]>>> buildTable(final List<Row> rows) throws UnsupportedEncodingException {
780 35746 alessia.ba
		final Map<String, Map<String, Map<String, byte[]>>> table = Maps.newHashMap();
781
782
		for (final Row row : rows) {
783
			final String rowKey = row.getKey();
784
			final String cf = row.getColumnFamily();
785
			if (!table.containsKey(rowKey)) {
786 49029 claudio.at
				table.put(rowKey, new HashMap<>());
787 35746 alessia.ba
			}
788
			if (!table.get(rowKey).containsKey(cf)) {
789 49029 claudio.at
				table.get(rowKey).put(row.getColumnFamily(), new HashMap<>());
790 35746 alessia.ba
			}
791
			for (final Column<String, byte[]> c : row.getColumns()) {
792 31997 claudio.at
				System.out.println(String.format("ADDING K:%s CF:%s Q:%s", rowKey, cf, c.getName()));
793
				table.get(rowKey).get(cf).put(c.getName(), c.getValue());
794 38586 claudio.at
				if (c.getName().equals("body")) {
795 36164 claudio.at
					final String theBody = new String(c.getValue(), "UTF-8");
796 35746 alessia.ba
					assertTrue(StringUtils.isNotBlank(theBody));
797 44070 alessia.ba
					//System.out.println(theBody);
798 35746 alessia.ba
				}
799 31997 claudio.at
			}
800
		}
801
		return table;
802 35746 alessia.ba
803 31997 claudio.at
	}
804
805 53588 sandro.lab
	protected Map<String, XmlRecordFactory> mapAll(final Map<String, Map<String, Map<String, byte[]>>> table) throws Exception {
806 31997 claudio.at
807 34438 claudio.at
		final Map<String, XmlRecordFactory> builders = Maps.newHashMap();
808
		for (final Entry<String, Map<String, Map<String, byte[]>>> e : table.entrySet()) {
809 31997 claudio.at
			map(builders, e.getKey(), e.getValue());
810
		}
811
		return builders;
812
	}
813
814 35746 alessia.ba
	// private Map<String, XmlRecordFactory> mapResultsOnly(final Map<String, Map<String, Map<String, byte[]>>> table) throws Exception {
815
	//
816
	// final Map<String, XmlRecordFactory> builders = Maps.newHashMap();
817
	// for (final Entry<String, Map<String, Map<String, byte[]>>> e : table.entrySet()) {
818
	// final Type type = OafRowKeyDecoder.decode(e.getKey()).getType();
819
	// if (type == Type.result) {
820
	// map(builders, e.getKey(), e.getValue());
821
	// }
822
	// }
823
	// return builders;
824
	// }
825
826 31997 claudio.at
	private void map(final Map<String, XmlRecordFactory> builders, final String rowKey, final Map<String, Map<String, byte[]>> row) throws Exception {
827
828 30967 claudio.at
		final Type type = OafRowKeyDecoder.decode(rowKey).getType();
829 26600 sandro.lab
830 41468 claudio.at
		final Map<String, byte[]> familyMap = row.get(type.toString());
831 26600 sandro.lab
832 41468 claudio.at
		if (familyMap == null) return;
833 26600 sandro.lab
834 41468 claudio.at
		final byte[] bodyB = familyMap.get("body");
835 26600 sandro.lab
836 30967 claudio.at
		if (bodyB != null) {
837
			ensureBuilder(builders, rowKey);
838 41468 claudio.at
839
			final Oaf oaf = UpdateMerger.mergeBodyUpdates(familyMap);
840
841
			final OafDecoder mainEntity = OafDecoder.decode(oaf);
842
843 30967 claudio.at
			builders.get(rowKey).setMainEntity(mainEntity);
844 28094 claudio.at
845 34438 claudio.at
			for (final LinkDescriptor ld : entityConfigTable.getDescriptors(type)) {
846 26600 sandro.lab
847 34438 claudio.at
				final String it = ld.getRelDescriptor().getIt();
848
				final Map<String, byte[]> cols = row.get(it);
849 26600 sandro.lab
850 35746 alessia.ba
				if ((cols != null) && !cols.isEmpty()) {
851 26600 sandro.lab
852 34438 claudio.at
					for (final byte[] oafB : cols.values()) {
853 26600 sandro.lab
854 41468 claudio.at
						final Oaf.Builder relBuilder = Oaf.newBuilder(Oaf.parseFrom(oafB));
855 26600 sandro.lab
856 30967 claudio.at
						if (ld.isSymmetric()) {
857 34438 claudio.at
							final RelDescriptor rd = ld.getRelDescriptor();
858 40205 claudio.at
859
							relBuilder.getRelBuilder().setCachedTarget(mainEntity.getEntity()).setRelType(rd.getRelType()).setSubRelType(rd.getSubRelType());
860 30967 claudio.at
						}
861 26600 sandro.lab
862 40205 claudio.at
						relBuilder.getRelBuilder().setChild(ld.isChild());
863 26600 sandro.lab
864 40314 claudio.at
						final Oaf.Builder oafBuilder = Oaf.newBuilder().setKind(Kind.relation).setLastupdatetimestamp(System.currentTimeMillis());
865 40205 claudio.at
						oafBuilder.mergeFrom(relBuilder.build());
866 26600 sandro.lab
867 40205 claudio.at
						final String targetId = ld.isSymmetric() ? oafBuilder.getRel().getTarget() : oafBuilder.getRel().getSource();
868 30967 claudio.at
						ensureBuilder(builders, targetId);
869 34438 claudio.at
						final OafDecoder decoder = OafDecoder.decode(oafBuilder.build());
870 37334 claudio.at
871 30967 claudio.at
						if (ld.isChild()) {
872 37334 claudio.at
							builders.get(targetId).addChild(type, decoder);
873 30967 claudio.at
						} else {
874 37334 claudio.at
							builders.get(targetId).addRelation(type, decoder);
875 30967 claudio.at
						}
876 37334 claudio.at
					}
877 26600 sandro.lab
878 30967 claudio.at
				}
879
			}
880 26600 sandro.lab
		}
881
882
	}
883
884 38025 claudio.at
	private void printAll(final Map<String, XmlRecordFactory> builders) throws DocumentException {
885
		print(Sets.newHashSet(Type.values()), builders, null);
886
	}
887
888
	private void print(final Set<Type> types, final Map<String, XmlRecordFactory> builders, final Map<Type, Set<String>> xpaths) throws DocumentException {
889
		final SAXReader r = new SAXReader();
890
891 34438 claudio.at
		for (final Entry<String, XmlRecordFactory> e : builders.entrySet()) {
892 38025 claudio.at
			final OafRowKeyDecoder kd = OafRowKeyDecoder.decode(e.getKey());
893
894
			if (!e.getValue().isValid()) throw new IllegalArgumentException("invalid builder: " + e.getKey());
895
			if (types.contains(kd.getType())) {
896
				final String val = IndentXmlString.apply(e.getValue().build());
897
898
				if ((xpaths != null) && !xpaths.isEmpty() && (xpaths.get(kd.getType()) != null)) {
899
					final Document doc = r.read(new StringReader(val));
900
901 40063 alessia.ba
					log.debug("\n" + e.getKey());
902 38025 claudio.at
					for (final String xpath : xpaths.get(kd.getType())) {
903 40063 alessia.ba
						log.debug(doc.valueOf(xpath));
904 38025 claudio.at
					}
905
				} else {
906 40063 alessia.ba
					log.info(val);
907 38025 claudio.at
				}
908 31997 claudio.at
			}
909
		}
910
	}
911
912 35746 alessia.ba
	private void printNoIndent(final Map<String, XmlRecordFactory> builders) {
913
		for (final Entry<String, XmlRecordFactory> e : builders.entrySet()) {
914
			if (e.getValue().isValid()) {
915 40063 alessia.ba
				log.debug(e.getValue().build());
916 35746 alessia.ba
			} else {
917 40063 alessia.ba
				log.debug("invalid builder: " + e.getKey());
918 35746 alessia.ba
			}
919
		}
920
	}
921
922 31997 claudio.at
	private void ensureBuilder(final Map<String, XmlRecordFactory> builders, final String rowKey) throws Exception {
923 30967 claudio.at
		if (!builders.containsKey(rowKey)) {
924
			builders.put(rowKey, newBuilder());
925 26600 sandro.lab
		}
926
	}
927
928 30967 claudio.at
	private XmlRecordFactory newBuilder() throws TransformerConfigurationException, TransformerFactoryConfigurationError, DocumentException {
929 46587 alessia.ba
		return new XmlRecordFactory(entityConfigTable, ContextMapper.fromXml(Context.xml),
930
				RelClasses.fromJSon(RelClassesTest.relClassesJson), XmlRecordFactoryTest.SCHEMA_LOCATION, true, false, false, XmlRecordFactoryTest.specialDatasourceTypes);
931 30967 claudio.at
	}
932
933 33382 claudio.at
	private InputStream load(final String fileName) {
934
		return getClass().getResourceAsStream(fileName);
935
	}
936
937 40063 alessia.ba
	private InputStream loadFromTransformationProfile(final String profilePath) {
938
		log.info("Loading xslt from: " + basePathProfiles + profilePath);
939
		InputStream profile = getClass().getResourceAsStream(basePathProfiles + profilePath);
940
		SAXReader saxReader = new SAXReader();
941
		Document doc = null;
942
		try {
943
			doc = saxReader.read(profile);
944
		} catch (DocumentException e) {
945
			e.printStackTrace();
946
			throw new RuntimeException(e);
947
		}
948
		String xslt = doc.selectSingleNode("//SCRIPT/CODE/*[local-name()='stylesheet']").asXML();
949 52422 claudio.at
		//log.info(xslt);
950 40063 alessia.ba
		return IOUtils.toInputStream(xslt);
951
	}
952
953 55177 alessia.ba
954
	private Row asRowFromJson(String json) throws ParseException {
955
		Oaf.Builder oafBuilder = Oaf.newBuilder();
956
		JsonFormat.merge(json, oafBuilder);
957
		final Oaf oaf = oafBuilder.build();
958
		Row row = new Row("result", oaf.getEntity().getId());
959
		Column<String, byte[]> c = new Column<>("body", oaf.toByteArray());
960
		row.setColumn("body", c);
961
		return row;
962
963
	}
964
965 26600 sandro.lab
}