Project

General

Profile

1
package eu.dnetlib.data.transform;
2

    
3
import static org.junit.Assert.assertFalse;
4
import static org.junit.Assert.assertNotNull;
5
import static org.junit.Assert.assertTrue;
6

    
7
import java.io.BufferedReader;
8
import java.io.IOException;
9
import java.io.InputStream;
10
import java.io.InputStreamReader;
11
import java.io.StringReader;
12
import java.io.UnsupportedEncodingException;
13
import java.util.ArrayList;
14
import java.util.HashMap;
15
import java.util.Iterator;
16
import java.util.List;
17
import java.util.Map;
18
import java.util.Map.Entry;
19
import java.util.Set;
20
import java.util.zip.GZIPInputStream;
21

    
22
import javax.xml.transform.TransformerConfigurationException;
23
import javax.xml.transform.TransformerFactoryConfigurationError;
24

    
25
import org.apache.commons.io.IOUtils;
26
import org.apache.commons.lang.StringUtils;
27
import org.apache.commons.logging.Log;
28
import org.apache.commons.logging.LogFactory;
29
import org.dom4j.Document;
30
import org.dom4j.DocumentException;
31
import org.dom4j.io.SAXReader;
32
import org.json.JSONObject;
33
import org.junit.Before;
34
import org.junit.Ignore;
35
import org.junit.Test;
36
import org.springframework.core.io.ByteArrayResource;
37
import org.springframework.core.io.Resource;
38

    
39
import com.google.common.base.Function;
40
import com.google.common.collect.Iterables;
41
import com.google.common.collect.Lists;
42
import com.google.common.collect.Maps;
43
import com.google.common.collect.Sets;
44
import com.google.protobuf.InvalidProtocolBufferException;
45
import com.googlecode.protobuf.format.JsonFormat;
46
import com.googlecode.protobuf.format.JsonFormat.ParseException;
47

    
48
import eu.dnetlib.actionmanager.actions.ActionFactory;
49
import eu.dnetlib.actionmanager.actions.XsltInfoPackageAction;
50
import eu.dnetlib.actionmanager.common.Agent;
51
import eu.dnetlib.actionmanager.common.Operation;
52
import eu.dnetlib.actionmanager.common.Provenance;
53
import eu.dnetlib.data.mapreduce.hbase.index.config.Context;
54
import eu.dnetlib.data.mapreduce.hbase.index.config.ContextMapper;
55
import eu.dnetlib.data.mapreduce.hbase.index.config.EntityConfigTable;
56
import eu.dnetlib.data.mapreduce.hbase.index.config.IndexConfig;
57
import eu.dnetlib.data.mapreduce.hbase.index.config.IndexConfigTest;
58
import eu.dnetlib.data.mapreduce.hbase.index.config.LinkDescriptor;
59
import eu.dnetlib.data.mapreduce.hbase.index.config.RelClasses;
60
import eu.dnetlib.data.mapreduce.hbase.index.config.RelClassesTest;
61
import eu.dnetlib.data.mapreduce.util.OafDecoder;
62
import eu.dnetlib.data.mapreduce.util.OafRowKeyDecoder;
63
import eu.dnetlib.data.mapreduce.util.RelDescriptor;
64
import eu.dnetlib.data.mapreduce.util.UpdateMerger;
65
import eu.dnetlib.data.mapreduce.util.XmlRecordFactory;
66
import eu.dnetlib.data.mapreduce.util.XmlRecordFactoryTest;
67
import eu.dnetlib.data.proto.KindProtos.Kind;
68
import eu.dnetlib.data.proto.OafProtos.Oaf;
69
import eu.dnetlib.data.proto.TypeProtos.Type;
70
import eu.dnetlib.miscutils.functional.xml.IndentXmlString;
71

    
72
public class XsltRowTransformerFactoryTest {
73

    
74
	private static final Log log = LogFactory.getLog(XsltRowTransformerFactoryTest.class);
75
	private static String basePathProfiles = "/eu/dnetlib/test/profiles/TransformationRuleDSResources/TransformationRuleDSResourceType/2hbase/";
76
	private XsltRowTransformerFactory factory;
77
	private EntityConfigTable entityConfigTable;
78

    
79
	@Before
80
	public void setUp() throws Exception {
81
		factory = new XsltRowTransformerFactory();
82
		entityConfigTable = IndexConfig.load(IndexConfigTest.config).getConfigMap();
83
	}
84

    
85
	@Test
86
	@Ignore // need to reimplement because claimUpdates_2_hbase.xsl was removed
87
	public void testParseOafClaimUpdate() throws Exception {
88
		doTest(loadFromTransformationProfile("claimUpdates_2_hbase.xsl"), load("recordClaimUpdate.xml"));
89
	}
90

    
91
	@Test
92
	@Ignore // need to reimplement because claimUpdates_2_hbase.xsl was removed
93
	public void testParseClaimUpdate() throws Exception {
94

    
95
		final List<Row> rows = Lists.newArrayList();
96
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("recordClaim.xml")));
97
		rows.addAll(asRows(loadFromTransformationProfile("claimUpdates_2_hbase.xsl"), load("recordClaimUpdate.xml")));
98

    
99
		printAll(mapAll(buildTable(rows)));
100
	}
101

    
102
	@Test
103
	public void testParseClaimRel() throws Exception {
104

    
105
		doTest(loadFromTransformationProfile("claimRels_2_hbase.xml"), load("recordClaimRel.xml"));
106
	}
107

    
108
	@Test
109
	public void testParseFp7IctPUB() throws Exception {
110

    
111
		doTest(loadFromTransformationProfile("oaf2hbase.xml"), load("ec_fp7_ict.xml"));
112
	}
113

    
114
	@Test
115
	public void testParseRecordCrossref() throws Exception {
116

    
117
		doTest(loadFromTransformationProfile("oaf2hbase.xml"), load("recordCrossref.xml"));
118
	}
119

    
120
	@Test
121
	public void testParseDatasetPUB() throws Exception {
122

    
123
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("recordDatasetPUB.xml"));
124
	}
125

    
126
	@Test
127
	public void testParseSoftwareEgiApp() throws Exception {
128

    
129
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("softwareEgiApp.xml"));
130
	}
131

    
132
	@Test
133
	public void testParseSoftwareEgiApp2() throws Exception {
134

    
135
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("softwareEgiApp2.xml"));
136
	}
137

    
138
	@Test
139
	public void testParseOrpEgiApp() throws Exception {
140

    
141
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("orpEgiApp.xml"));
142
	}
143

    
144
	@Test
145
	public void testParseSoftwareDOECODE() throws Exception {
146

    
147
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("doecode.xml"));
148
	}
149

    
150
	@Test
151
	public void testParseDatasetLindat() throws Exception {
152

    
153
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("datasetLindat.xml"));
154
	}
155

    
156
	@Test
157
	public void testParseDatasetNeuroVault() throws Exception {
158

    
159
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("recordNeuroVault.xml"));
160
	}
161

    
162
	@Test
163
	public void testParseDatasetNeuroVault2() throws Exception {
164

    
165
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("recordNeuroVault2.xml"));
166
	}
167

    
168
	@Test
169
	public void testParseClaim() throws Exception {
170

    
171
		doTest(loadFromTransformationProfile("oaf2hbase.xml"), load("recordClaim.xml"));
172
	}
173

    
174
	@Test
175
	public void testParseClaimDedup() throws Exception {
176

    
177
		doTest(loadFromTransformationProfile("oaf2hbase.xml"), load("recordClaimedDedup.xml"));
178
	}
179

    
180
	@Test
181
	public void testParseClaimDataset() throws Exception {
182

    
183
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("recordClaimDataset.xml"));
184
	}
185

    
186
	@Test
187
	public void testParseACM() throws Exception {
188

    
189
		doTest(loadFromTransformationProfile("oaf2hbase.xml"), load("recordACM.xml"));
190
	}
191

    
192
	@Test
193
	public void testParseASB() throws Exception {
194

    
195
		doTest(loadFromTransformationProfile("oaf2hbase.xml"), load("recordASB.xml"));
196
	}
197

    
198
	@Test
199
	public void testParseProjectCorda() throws Exception {
200

    
201
		doTest(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectRecordCorda.xml"));
202
	}
203

    
204
	@Test
205
	public void testParseProjectFCT() throws Exception {
206

    
207
		doTest(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectRecordFCT.xml"));
208
	}
209

    
210
	@Test
211
	public void testParseProjectWithSummaryAndAmount() throws Exception {
212

    
213
		doTest(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectWithSummaryAndAmount.xml"));
214
	}
215

    
216
	@Test
217
	public void testLinkOrganizationWithBudget() throws Exception {
218

    
219
		final List<Row> rows = Lists.newArrayList();
220
		rows.addAll(asRows(loadFromTransformationProfile("organizations_2_hbase.xsl"), load("organization.xml")));
221
		rows.addAll(asRows(loadFromTransformationProfile("projectorganization_2_hbase.xsl"), load("project_org_budget.xml")));
222
		rows.addAll(asRows(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectWithSummaryAndAmount.xml")));
223

    
224
		printAll(mapAll(buildTable(rows)));
225
	}
226

    
227

    
228
	@Test
229
	public void testParseOaf() throws Exception {
230

    
231
		doTest(loadFromTransformationProfile("oaf2hbase.xml"), load("record.xml"));
232
	}
233

    
234
	@Test
235
	public void testParseOafPublication() throws Exception {
236

    
237
		doTest(loadFromTransformationProfile("oaf_entity2hbase.xml"), load("record.xml"));
238
	}
239

    
240
	@Test
241
	public void testParseLindat() throws Exception {
242

    
243
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("recordLindat.xml"));
244
	}
245

    
246
	@Test
247
	public void testParseDatacite() throws Exception {
248

    
249
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("recordDatacite.xml"));
250
	}
251

    
252
	@Test
253
	public void testParseDatacite2() throws Exception {
254

    
255
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("recordDatacite2.xml"));
256
	}
257

    
258
	@Test
259
	public void testParseDataciteNewES() throws Exception {
260

    
261
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("dataciteNew.xml"));
262
	}
263

    
264
	@Test
265
	public void testParseDatacite2hostedby() throws Exception {
266

    
267
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("datacite2hostedby.xml"));
268
	}
269

    
270
	@Test
271
	public void testParseOpenTrials() throws Exception {
272

    
273
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("opentrials_datacite1.xml"));
274
	}
275

    
276
	@Test
277
	public void testLinkPangaea() throws Exception {
278

    
279
		final List<Row> rows = Lists.newArrayList();
280
		rows.addAll(asRows(loadFromTransformationProfile("odf2hbase.xml"), load("pangaeODF1.xml")));
281
		rows.addAll(asRows(loadFromTransformationProfile("odf2hbase.xml"), load("pangaeODF2.xml")));
282
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("pangaeOAF.xml")));
283
		rows.addAll(asRows(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectRecordCordaPangaea.xml")));
284

    
285
		printAll(mapAll(buildTable(rows)));
286
	}
287

    
288
	@Test
289
	public void testODF() throws Exception {
290

    
291
		final List<Row> rows = Lists.newArrayList();
292
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("odf.xml")));
293
		// printAll(mapAll(buildTable(rows)));
294
	}
295

    
296
	@Test
297
	public void testPangaea() throws Exception {
298

    
299
		final List<Row> rows = Lists.newArrayList();
300
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("pangaeOAF2.xml")));
301
		printAll(mapAll(buildTable(rows)));
302
	}
303

    
304
	@Test
305
	public void testZenodo() throws Exception {
306

    
307
		final List<Row> rows = Lists.newArrayList();
308
		rows.addAll(asRows(loadFromTransformationProfile("odf2hbase.xml"), load("zenodoData.xml")));
309
		printAll(mapAll(buildTable(rows)));
310
	}
311

    
312
	@Test
313
	public void testZenodoSoftware() throws Exception {
314

    
315
		final List<Row> rows = Lists.newArrayList();
316
		rows.addAll(asRows(loadFromTransformationProfile("odf2hbase.xml"), load("softwareZenodo_odf.xml")));
317
		printAll(mapAll(buildTable(rows)));
318
	}
319

    
320
	@Test
321
	public void testZenodoSoftware2() throws Exception {
322

    
323
		final List<Row> rows = Lists.newArrayList();
324
		rows.addAll(asRows(loadFromTransformationProfile("odf2hbase.xml"), load("softwareZenodo2_odf.xml")));
325
		printAll(mapAll(buildTable(rows)));
326
	}
327

    
328
	@Test
329
	public void testInfoscienceODF() throws Exception {
330

    
331
		final List<Row> rows = Lists.newArrayList();
332
		rows.addAll(asRows(loadFromTransformationProfile("odf2hbase.xml"), load("record_infoscience_odf.xml")));
333
		printAll(mapAll(buildTable(rows)));
334
	}
335

    
336
	@Test
337
	public void testLinkCorda() throws Exception {
338

    
339
		final List<Row> rows = Lists.newArrayList();
340
		rows.addAll(asRows(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectRecordCorda.xml")));
341
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("recordCorda.xml")));
342

    
343
		printAll(mapAll(buildTable(rows)));
344
	}
345

    
346
	@Test
347
	public void testLinkFCT() throws Exception {
348

    
349
		final List<Row> rows = Lists.newArrayList();
350
		rows.addAll(asRows(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectRecordFCT.xml")));
351
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("recordFCT.xml")));
352

    
353
		printAll(mapAll(buildTable(rows)));
354
	}
355

    
356
	@Test
357
	public void testLinkARC() throws Exception {
358

    
359
		final List<Row> rows = Lists.newArrayList();
360
		rows.addAll(asRows(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectRecordARC.xml")));
361
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("recordARC.xml")));
362

    
363
		printAll(mapAll(buildTable(rows)));
364
	}
365

    
366
	@Test
367
	public void testLinkWT() throws Exception {
368

    
369
		final List<Row> rows = Lists.newArrayList();
370
		rows.addAll(asRows(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectRecordWT.xml")));
371
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("recordWT.xml")));
372

    
373
		printAll(mapAll(buildTable(rows)));
374
	}
375

    
376
	@Test
377
	public void testLinkOrganization() throws Exception {
378

    
379
		final List<Row> rows = Lists.newArrayList();
380
		rows.addAll(asRows(loadFromTransformationProfile("organizations_2_hbase.xsl"), load("organization.xml")));
381
		rows.addAll(asRows(loadFromTransformationProfile("projectorganization_2_hbase.xsl"), load("project_organization.xml")));
382
		rows.addAll(asRows(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectRecordCorda.xml")));
383

    
384
		printAll(mapAll(buildTable(rows)));
385
	}
386

    
387
	@Test
388
	public void testLinkOrganizationAffiliation() throws Exception {
389

    
390
		final List<Row> rows = Lists.newArrayList();
391
		rows.addAll(asRows(loadFromTransformationProfile("organizations_2_hbase.xsl"), load("organization.xml")));
392
		rows.addAll(asRows(loadFromTransformationProfile("resultorganization_2_hbase.xsl"), load("result_organization.xml")));
393
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("record.xml")));
394

    
395
		printAll(mapAll(buildTable(rows)));
396
	}
397

    
398
	@Test
399
	public void testDuplicates() throws Exception {
400
		final String mergeId = "50|dedup_wf_001::08ed625d07e5738b794ff14d6773fd9f";
401
		final List<Row> rows = Lists.newArrayList();
402

    
403
		final Function<Row, Row> f = rowIn -> {
404

    
405
			final List<Column<String, byte[]>> cols = Lists.newArrayList();
406
			for (final Column<String, byte[]> col : rowIn.getColumns()) {
407
				if (col.getName().equals("body")) {
408
					cols.add(new Column(col.getName(), col.getValue()));
409

    
410
				}
411
			}
412
			return new Row("result", rowIn.getKey(), cols);
413
		};
414

    
415
		final List<Row> puma1 = asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("recordPuma1.xml"), f);
416
		puma1.add(new Row("resultResult_dedup_isMergedIn", mergeId));
417

    
418
		final List<Row> puma2 = asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("recordPuma2.xml"), f);
419
		puma2.add(new Row("resultResult_dedup_isMergedIn", mergeId));
420

    
421
		rows.addAll(puma1);
422
		rows.addAll(puma2);
423

    
424
		final List<Oaf> duplicates = Lists.newArrayList();
425
		duplicates.add(getOafBody(puma1));
426
		duplicates.add(getOafBody(puma2));
427
		final Oaf.Builder oafMerge = OafEntityMerger.merge(mergeId, duplicates);
428

    
429
		final Row mergeRow = new Row("result", mergeId, Lists.newArrayList(new Column("body", oafMerge.build().toByteArray())));
430

    
431
		rows.add(mergeRow);
432

    
433
		printAll(mapAll(buildTable(rows)));
434
	}
435

    
436
	private Oaf getOafBody(final List<Row> rows) throws InvalidProtocolBufferException {
437
		for (final Row row : rows) {
438
			if (StringUtils.startsWith(row.getKey(), "50")) { return Oaf.parseFrom(row.getColumn("body").getValue());
439

    
440
			}
441
		}
442
		return null;
443
	}
444

    
445
	@Test
446
	public void testParseDoajOAF() throws Exception {
447

    
448
		doTest(loadFromTransformationProfile("oaf2hbase.xml"), load("doajUniversityRecord.xml"));
449
	}
450

    
451
	@Test
452
	public void testParseDatasource() throws Exception {
453

    
454
		doTest(loadFromTransformationProfile("datasources_2_hbase.xsl"), load("datasourceNative.xml"));
455
	}
456

    
457
	@Test
458
	public void testParseDatasourcePiwik() throws Exception {
459

    
460
		doTest(loadFromTransformationProfile("datasources_2_hbase.xsl"), load("datasourcePiwik.xml"));
461
	}
462

    
463
	@Test
464
	public void testParseDataDatasource() throws Exception {
465

    
466
		doTest(loadFromTransformationProfile("datasources_2_hbase.xsl"), load("dataDatasource.xml"));
467
	}
468

    
469
	@Test
470
	public void testFromMongodbCompressedDump() throws Exception {
471
		doTestJsonGz(loadFromTransformationProfile("oaf2hbase.xml"), load("mdstore_cleaned.json.gz"));
472
	}
473

    
474
	@Test
475
	public void testLoadFromTransformationProfile() throws IOException {
476
		final InputStream in = loadFromTransformationProfile("oaf2hbase.xml");
477
		log.info(IOUtils.toString(in));
478
	}
479

    
480
	@Test
481
	public void test_template() throws Exception {
482
		final String xslt = IOUtils.toString(loadFromTransformationProfile("oaf2hbase.xml"));
483
		final XsltRowTransformer transformer = factory.getTransformer(xslt);
484
		assertNotNull(transformer);
485

    
486
		final String record = IOUtils.toString(load("record.xml"));
487
		final List<Row> rows = transformer.apply(record);
488

    
489
		System.out.println(rows);
490
	}
491

    
492
	@Test
493
	public void testWrongCharsOrganization() throws Exception {
494
		final List<Row> rows = Lists.newArrayList();
495
		rows.addAll(asRows(loadFromTransformationProfile("organizations_2_hbase.xsl"), load("organizationWrongChars.xml")));
496
		printAll(mapAll(buildTable(rows)));
497
	}
498

    
499
	@Test
500
	public void testParseProjectWithFunderOriginalName() throws Exception {
501

    
502
		doTest(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectWithFunderOriginalName.xml"));
503
	}
504

    
505
	@Test
506
	public void testLinkFunderOriginalName() throws Exception {
507

    
508
		final List<Row> rows = Lists.newArrayList();
509
		rows.addAll(asRows(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectWithFunderOriginalName.xml")));
510
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("recordFunderOriginalName.xml")));
511

    
512
		printAll(mapAll(buildTable(rows)));
513
	}
514

    
515
	@Test
516
	public void testProjectExtraInfo() throws Exception {
517
		final List<Row> rows = Lists.newArrayList();
518
		rows.addAll(asRows(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectRecordExtraInfo.xml")));
519
		printAll(mapAll(buildTable(rows)));
520
	}
521

    
522
	@Test
523
	public void testParseSoftwareFromODF() throws Exception {
524
		final List<Row> rows = Lists.newArrayList();
525
		rows.addAll(asRows(loadFromTransformationProfile("odf2hbase.xml"), load("softwareODF.xml")));
526
		rows.addAll(asRows(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectRecordCorda.xml")));
527
		printAll(mapAll(buildTable(rows)));
528
	}
529

    
530
	@Test
531
	public void testParseSoftwareFromOAF() throws Exception {
532
		final List<Row> rows = Lists.newArrayList();
533
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("recordOAFsoftwareCLOSED.xml")));
534
		printAll(mapAll(buildTable(rows)));
535
	}
536

    
537
	@Test
538
	public void testParsePubFromODF() throws Exception {
539
		final List<Row> rows = Lists.newArrayList();
540
		rows.addAll(asRows(loadFromTransformationProfile("odf2hbase.xml"), load("recordODFPub.xml")));
541
		printAll(mapAll(buildTable(rows)));
542
	}
543

    
544
	@Test
545
	public void testParseSoftwareFromOAFOpen() throws Exception {
546
		final List<Row> rows = Lists.newArrayList();
547
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("recordOAFsoftwareOPEN.xml")));
548
		printAll(mapAll(buildTable(rows)));
549
	}
550

    
551
	@Test
552
	public void testParseSoftwareBiotool() throws Exception {
553
		final List<Row> rows = Lists.newArrayList();
554
		rows.addAll(asRows(loadFromTransformationProfile("odf2hbase.xml"), load("biotoolSw.xml")));
555
		printAll(mapAll(buildTable(rows)));
556
	}
557

    
558
	@Test
559
	public void testParseOafWithExternalRef() throws Exception {
560
		final List<Row> rows = Lists.newArrayList();
561
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("oafWithExternalReference.xml")));
562
		printAll(mapAll(buildTable(rows)));
563
	}
564

    
565
	@Test
566
	public void testParseOafWithCommunity() throws Exception {
567
		final List<Row> rows = Lists.newArrayList();
568
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("oafWithCommunity.xml")));
569
		printAll(mapAll(buildTable(rows)));
570
	}
571

    
572
	@Test
573
	public void testParseOafWithUpdates() throws Exception {
574
		final List<Row> rows = Lists.newArrayList();
575
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("oafWithCommunity.xml")));
576

    
577
		final ActionFactory actionFactory = new ActionFactory();
578

    
579
		final Map<String, Resource> xslts = Maps.newHashMap();
580

    
581
		xslts.put("oaf2hbase", new ByteArrayResource(IOUtils.toString(loadFromTransformationProfile("oaf2hbase.xml")).getBytes()));
582
		actionFactory.setXslts(xslts);
583

    
584
		final XsltInfoPackageAction pa = actionFactory.generateInfoPackageAction(
585
				"oaf2hbase",
586
				"rawset-id",
587
				new Agent("agent-id", "agent-name", Agent.AGENT_TYPE.algo),
588
				Operation.UPDATE,
589
				IOUtils.toString(load("oafUpdateWithCommunity.xml")),
590
				Provenance.sysimport_mining_aggregator,
591
				"0.9");
592

    
593
		final String qualifier = "update_" + System.nanoTime();
594

    
595
		IOUtils.readLines(load("country_updates.json")).forEach(line -> {
596

    
597
			final Oaf.Builder oaf = Oaf.newBuilder();
598

    
599
			try {
600
				JsonFormat.merge(line, oaf);
601
			} catch (final JsonFormat.ParseException e) {
602
				throw new IllegalArgumentException(e);
603
			}
604

    
605
			final Column<String, byte[]> col = new Column<>("update_" + System.nanoTime(), oaf.build().toByteArray());
606
			rows.add(new Row("result", oaf.getEntity().getId(), Lists.newArrayList(col)));
607
		});
608

    
609
		pa.asAtomicActions().forEach(a -> {
610
			final Column<String, byte[]> col = new Column<>("update_" + System.nanoTime(), a.getTargetValue());
611
			rows.add(new Row(a.getTargetColumnFamily(), a.getTargetRowKey(), Lists.newArrayList(col)));
612
		});
613

    
614
		/*
615
		 * rows.forEach(r -> { log.info(r); });
616
		 */
617

    
618
		mapAll(buildTable(rows)).entrySet().forEach(b -> {
619
			log.info(b.getKey());
620
			log.info(b.getValue());
621
		});
622
	}
623

    
624
	@Test
625
	public void testParseCrisPub() throws Exception {
626
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("cris_pub1.xml"));
627
	}
628

    
629
	@Test
630
	public void testBioToolSwRowJson() throws Exception {
631
		doTestJsonRow(IOUtils.toString(load("biotoolSwRow.json")));
632
	}
633

    
634
	@Test
635
	public void testParseVirta() throws Exception {
636
		doTest(loadFromTransformationProfile("datasources_2_hbase.xsl"), load("virta.xml"));
637
	}
638

    
639
	@Test
640
	public void testParseJournal() throws Exception {
641
		doTest(loadFromTransformationProfile("datasources_2_hbase.xsl"), load("datasourceWithISSN.xml"));
642
	}
643

    
644
	@Test
645
	public void testGuidelines4Qeios() throws Exception {
646

    
647
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("guidelines4_qeios1.xml"));
648
	}
649

    
650
	@Test
651
	public void testGuidelines4Aria() throws Exception {
652

    
653
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("guidelines4_aria.xml"));
654
	}
655

    
656
	@Test
657
	public void testJournalRecord() throws Exception {
658
		doTest(loadFromTransformationProfile("oaf2hbase.xml"), load("journalRecord.xml"));
659
	}
660

    
661
	@Test
662
	public void testSygmaDatacite() throws Exception {
663

    
664
		final List<Row> rows = Lists.newArrayList();
665
		rows.addAll(asRows(loadFromTransformationProfile("odf2hbase.xml"), load("sygma_datacite.xml")));
666
		rows.addAll(asRows(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectRecordCorda.xml")));
667

    
668
		printAll(mapAll(buildTable(rows)));
669
	}
670

    
671
	@Test
672
	public void testOpenOrganizations() throws Exception {
673

    
674
		final List<Row> rows = Lists.newArrayList();
675
		rows.addAll(asRows(loadFromTransformationProfile("organizations_2_hbase.xsl"), load("openorgs_sample.xml")));
676

    
677
		printAll(mapAll(buildTable(rows)));
678
	}
679

    
680
	private void doTestJsonRow(final String json) throws Exception {
681
		final Row row = asRowFromJson(json);
682
		log.info(row);
683
		final List<Row> rows = new ArrayList<>();
684
		rows.add(row);
685
		final Map<String, Map<String, Map<String, byte[]>>> table = buildTable(rows);
686
		final Map<String, XmlRecordFactory> builders = mapAll(table);
687
		printAll(builders);
688

    
689
	}
690

    
691
	private void doTest(final InputStream xsltStream, final InputStream recordStream) throws Exception {
692
		try {
693
			final List<Row> rows = asRows(xsltStream, recordStream);
694

    
695
			log.info(rows);
696

    
697
			final Map<String, Map<String, Map<String, byte[]>>> table = buildTable(rows);
698

    
699
			// System.out.println("\n" + table.toString());
700

    
701
			final Map<String, XmlRecordFactory> builders = mapAll(table);
702

    
703
			printAll(builders);
704
		} catch (final InvalidProtocolBufferException e) {
705
			throw new Exception(e);
706
		} catch (final TransformerConfigurationException e) {
707
			throw new Exception(e);
708
		} catch (final TransformerFactoryConfigurationError e) {
709
			throw new Exception(e);
710
		} catch (final DocumentException e) {
711
			throw new Exception(e);
712
		}
713
	}
714

    
715
	private void doTestJsonGz(final InputStream xsltStream, final InputStream recordStream) throws Exception {
716

    
717
		final Iterator<List<Row>> rowsIterator = asRowsJsonGzip(xsltStream, recordStream);
718

    
719
		int i = 0;
720
		while (rowsIterator.hasNext()) {
721
			final List<Row> rows = rowsIterator.next();
722
			i++;
723

    
724
			if (i % 10000 == 0) {
725
				System.out.println(i);
726
			}
727

    
728
			final Map<String, Map<String, Map<String, byte[]>>> table = buildTableDoaj(rows);
729

    
730
			for (final Map<String, Map<String, byte[]>> m : table.values()) {
731
				for (final Map<String, byte[]> mv : m.values()) {
732
					for (final byte[] v : mv.values()) {
733
						final OafDecoder d = OafDecoder.decode(v);
734
						assertNotNull(d);
735
						assertNotNull(d.getOaf());
736

    
737
						switch (d.getKind()) {
738
						case entity:
739
							assertNotNull(d.getMetadata());
740
							if (d.getOaf().getEntity().getType().equals(Type.result)) {
741
								System.out.println(d.getOaf());
742
							}
743
							break;
744
						case relation:
745
							assertNotNull(d.getRel());
746
							break;
747
						default:
748
							break;
749
						}
750
					}
751
				}
752
			}
753
		}
754
	}
755

    
756
	private List<Row> asRows(final InputStream xsltStream, final InputStream recordStream, final Function<Row, Row> p) throws Exception {
757
		return asRows(xsltStream, new HashMap<>(), recordStream, p);
758
	}
759

    
760
	private List<Row> asRows(final InputStream xsltStream, final InputStream recordStream) throws Exception {
761
		return asRows(xsltStream, new HashMap<>(), recordStream);
762
	}
763

    
764
	private List<Row> asRows(final InputStream xsltStream, final Map<String, Object> params, final InputStream recordStream) throws Exception {
765
		return asRows(xsltStream, params, recordStream, null);
766
	}
767

    
768
	private List<Row> asRows(final InputStream xsltStream, final Map<String, Object> params, final InputStream recordStream, final Function<Row, Row> p)
769
			throws Exception {
770
		final String xslt = IOUtils.toString(xsltStream);
771
		final XsltRowTransformer transformer = factory.getTransformer(xslt, params);
772
		assertNotNull(transformer);
773

    
774
		final String record = IOUtils.toString(recordStream);
775
		final List<Row> rows = transformer.apply(record);
776

    
777
		assertNotNull(rows);
778
		assertFalse(rows.isEmpty());
779
		return p == null ? rows : Lists.newArrayList(Iterables.transform(rows, p));
780
	}
781

    
782
	private Iterator<List<Row>> asRowsJsonGzip(final InputStream xsltStream, final InputStream recordStreamJsonGzip) throws Exception {
783
		final String xslt = IOUtils.toString(xsltStream);
784
		final XsltRowTransformer transformer = factory.getTransformer(xslt);
785
		assertNotNull(transformer);
786
		assertNotNull(recordStreamJsonGzip);
787

    
788
		final GZIPInputStream stream = new GZIPInputStream(recordStreamJsonGzip);
789
		assertNotNull(stream);
790
		final BufferedReader inStream = new BufferedReader(new InputStreamReader(stream));
791
		assertNotNull(inStream);
792
		return new Iterator<List<Row>>() {
793

    
794
			String jsonRecord = null;
795

    
796
			@Override
797
			public boolean hasNext() {
798
				try {
799
					return (jsonRecord = inStream.readLine()) != null;
800
				} catch (final IOException e) {
801
					throw new RuntimeException(e);
802
				}
803
			}
804

    
805
			@Override
806
			public List<Row> next() {
807

    
808
				final JSONObject jsonObj = new JSONObject(jsonRecord);
809
				final String body = jsonObj.getString("body");
810
				try {
811
					assertTrue(StringUtils.isNotBlank(body));
812
					// System.out.println(body);
813
					final List<Row> rows = transformer.apply(body);
814
					assertNotNull(rows);
815
					assertFalse(rows.isEmpty());
816
					return rows;
817
				} catch (final Throwable e) {
818
					System.err.println("error transforming document: " + body);
819
					throw new RuntimeException(e);
820
				}
821
			}
822

    
823
			@Override
824
			public void remove() {
825
				throw new UnsupportedOperationException();
826
			}
827

    
828
		};
829

    
830
	}
831

    
832
	private Map<String, Map<String, Map<String, byte[]>>> buildTableDoaj(final List<Row> rows) throws UnsupportedEncodingException {
833
		final Map<String, Map<String, Map<String, byte[]>>> table = Maps.newHashMap();
834

    
835
		for (final Row row : rows) {
836
			final String rowKey = row.getKey();
837
			final String cf = row.getColumnFamily();
838
			if (!table.containsKey(rowKey)) {
839
				table.put(rowKey, new HashMap<>());
840
			}
841
			if (!table.get(rowKey).containsKey(cf)) {
842
				table.get(rowKey).put(row.getColumnFamily(), new HashMap<>());
843
			}
844
			for (final Column<String, byte[]> c : row.getColumns()) {
845
				// System.out.println(String.format("ADDING K:%s CF:%s Q:%s", rowKey, cf, c.getName()));
846
				table.get(rowKey).get(cf).put(c.getName(), c.getValue());
847
				if (cf.equals("result") && c.getName().equals("body")) {
848
					// System.out.println(String.format("ADDING K:%s CF:%s Q:%s", rowKey, cf, c.getName()));
849
					assertTrue(StringUtils.isNotBlank(new String(c.getValue(), "UTF-8")));
850
				}
851
			}
852
		}
853
		return table;
854

    
855
	}
856

    
857
	protected Map<String, Map<String, Map<String, byte[]>>> buildTable(final List<Row> rows) throws UnsupportedEncodingException {
858
		final Map<String, Map<String, Map<String, byte[]>>> table = Maps.newHashMap();
859

    
860
		for (final Row row : rows) {
861
			final String rowKey = row.getKey();
862
			final String cf = row.getColumnFamily();
863
			if (!table.containsKey(rowKey)) {
864
				table.put(rowKey, new HashMap<>());
865
			}
866
			if (!table.get(rowKey).containsKey(cf)) {
867
				table.get(rowKey).put(row.getColumnFamily(), new HashMap<>());
868
			}
869
			for (final Column<String, byte[]> c : row.getColumns()) {
870
				System.out.println(String.format("ADDING K:%s CF:%s Q:%s", rowKey, cf, c.getName()));
871
				table.get(rowKey).get(cf).put(c.getName(), c.getValue());
872
				if (c.getName().equals("body")) {
873
					final String theBody = new String(c.getValue(), "UTF-8");
874
					assertTrue(StringUtils.isNotBlank(theBody));
875
					// System.out.println(theBody);
876
				}
877
			}
878
		}
879
		return table;
880

    
881
	}
882

    
883
	protected Map<String, XmlRecordFactory> mapAll(final Map<String, Map<String, Map<String, byte[]>>> table) throws Exception {
884

    
885
		final Map<String, XmlRecordFactory> builders = Maps.newHashMap();
886
		for (final Entry<String, Map<String, Map<String, byte[]>>> e : table.entrySet()) {
887
			map(builders, e.getKey(), e.getValue());
888
		}
889
		return builders;
890
	}
891

    
892
	// private Map<String, XmlRecordFactory> mapResultsOnly(final Map<String, Map<String, Map<String, byte[]>>> table) throws Exception {
893
	//
894
	// final Map<String, XmlRecordFactory> builders = Maps.newHashMap();
895
	// for (final Entry<String, Map<String, Map<String, byte[]>>> e : table.entrySet()) {
896
	// final Type type = OafRowKeyDecoder.decode(e.getKey()).getType();
897
	// if (type == Type.result) {
898
	// map(builders, e.getKey(), e.getValue());
899
	// }
900
	// }
901
	// return builders;
902
	// }
903

    
904
	private void map(final Map<String, XmlRecordFactory> builders, final String rowKey, final Map<String, Map<String, byte[]>> row) throws Exception {
905

    
906
		final Type type = OafRowKeyDecoder.decode(rowKey).getType();
907

    
908
		final Map<String, byte[]> familyMap = row.get(type.toString());
909

    
910
		if (familyMap == null) { return; }
911

    
912
		final byte[] bodyB = familyMap.get("body");
913

    
914
		if (bodyB != null) {
915
			ensureBuilder(builders, rowKey);
916

    
917
			final Oaf oaf = UpdateMerger.mergeBodyUpdates(familyMap);
918

    
919
			final OafDecoder mainEntity = OafDecoder.decode(oaf);
920

    
921
			builders.get(rowKey).setMainEntity(mainEntity);
922

    
923
			for (final LinkDescriptor ld : entityConfigTable.getDescriptors(type)) {
924

    
925
				final String it = ld.getRelDescriptor().getIt();
926
				final Map<String, byte[]> cols = row.get(it);
927

    
928
				if (cols != null && !cols.isEmpty()) {
929

    
930
					for (final byte[] oafB : cols.values()) {
931

    
932
						final Oaf.Builder relBuilder = Oaf.newBuilder(Oaf.parseFrom(oafB));
933

    
934
						if (ld.isSymmetric()) {
935
							final RelDescriptor rd = ld.getRelDescriptor();
936

    
937
							relBuilder.getRelBuilder().setCachedTarget(mainEntity.getEntity()).setRelType(rd.getRelType()).setSubRelType(rd.getSubRelType());
938
						}
939

    
940
						relBuilder.getRelBuilder().setChild(ld.isChild());
941

    
942
						final Oaf.Builder oafBuilder = Oaf.newBuilder().setKind(Kind.relation).setLastupdatetimestamp(System.currentTimeMillis());
943
						oafBuilder.mergeFrom(relBuilder.build());
944

    
945
						final String targetId = ld.isSymmetric() ? oafBuilder.getRel().getTarget() : oafBuilder.getRel().getSource();
946
						ensureBuilder(builders, targetId);
947
						final OafDecoder decoder = OafDecoder.decode(oafBuilder.build());
948

    
949
						if (ld.isChild()) {
950
							builders.get(targetId).addChild(type, decoder);
951
						} else {
952
							builders.get(targetId).addRelation(type, decoder);
953
						}
954
					}
955

    
956
				}
957
			}
958
		}
959

    
960
	}
961

    
962
	private void printAll(final Map<String, XmlRecordFactory> builders) throws DocumentException {
963
		print(Sets.newHashSet(Type.values()), builders, null);
964
	}
965

    
966
	private void print(final Set<Type> types, final Map<String, XmlRecordFactory> builders, final Map<Type, Set<String>> xpaths) throws DocumentException {
967
		final SAXReader r = new SAXReader();
968

    
969
		for (final Entry<String, XmlRecordFactory> e : builders.entrySet()) {
970
			final OafRowKeyDecoder kd = OafRowKeyDecoder.decode(e.getKey());
971

    
972
			if (!e.getValue().isValid()) { throw new IllegalArgumentException("invalid builder: " + e.getKey()); }
973
			if (types.contains(kd.getType())) {
974
				final String val = IndentXmlString.apply(e.getValue().build());
975

    
976
				if (xpaths != null && !xpaths.isEmpty() && xpaths.get(kd.getType()) != null) {
977
					final Document doc = r.read(new StringReader(val));
978

    
979
					log.debug("\n" + e.getKey());
980
					for (final String xpath : xpaths.get(kd.getType())) {
981
						log.debug(doc.valueOf(xpath));
982
					}
983
				} else {
984
					log.info(val);
985
				}
986
			}
987
		}
988
	}
989

    
990
	private void printNoIndent(final Map<String, XmlRecordFactory> builders) {
991
		for (final Entry<String, XmlRecordFactory> e : builders.entrySet()) {
992
			if (e.getValue().isValid()) {
993
				log.debug(e.getValue().build());
994
			} else {
995
				log.debug("invalid builder: " + e.getKey());
996
			}
997
		}
998
	}
999

    
1000
	private void ensureBuilder(final Map<String, XmlRecordFactory> builders, final String rowKey) throws Exception {
1001
		if (!builders.containsKey(rowKey)) {
1002
			builders.put(rowKey, newBuilder());
1003
		}
1004
	}
1005

    
1006
	private XmlRecordFactory newBuilder() throws TransformerConfigurationException, TransformerFactoryConfigurationError, DocumentException {
1007
		return new XmlRecordFactory(entityConfigTable, ContextMapper.fromXml(Context.xml),
1008
				RelClasses.fromJSon(RelClassesTest.relClassesJson), XmlRecordFactoryTest.SCHEMA_LOCATION, true, false, false,
1009
				XmlRecordFactoryTest.specialDatasourceTypes);
1010
	}
1011

    
1012
	private InputStream load(final String fileName) {
1013
		return getClass().getResourceAsStream(fileName);
1014
	}
1015

    
1016
	private InputStream loadFromTransformationProfile(final String profilePath) {
1017
		log.info("Loading xslt from: " + basePathProfiles + profilePath);
1018
		final InputStream profile = getClass().getResourceAsStream(basePathProfiles + profilePath);
1019
		final SAXReader saxReader = new SAXReader();
1020
		Document doc = null;
1021
		try {
1022
			doc = saxReader.read(profile);
1023
		} catch (final DocumentException e) {
1024
			e.printStackTrace();
1025
			throw new RuntimeException(e);
1026
		}
1027
		final String xslt = doc.selectSingleNode("//SCRIPT/CODE/*[local-name()='stylesheet']").asXML();
1028
		// log.info(xslt);
1029
		return IOUtils.toInputStream(xslt);
1030
	}
1031

    
1032
	private Row asRowFromJson(final String json) throws ParseException {
1033
		final Oaf.Builder oafBuilder = Oaf.newBuilder();
1034
		JsonFormat.merge(json, oafBuilder);
1035
		final Oaf oaf = oafBuilder.build();
1036
		final Row row = new Row("result", oaf.getEntity().getId());
1037
		final Column<String, byte[]> c = new Column<>("body", oaf.toByteArray());
1038
		row.setColumn("body", c);
1039
		return row;
1040

    
1041
	}
1042

    
1043
}
    (1-1/1)