Project

General

Profile

1
package eu.dnetlib.data.transform;
2

    
3
import java.io.BufferedReader;
4
import java.io.IOException;
5
import java.io.InputStream;
6
import java.io.InputStreamReader;
7
import java.io.StringReader;
8
import java.io.UnsupportedEncodingException;
9
import java.util.ArrayList;
10
import java.util.HashMap;
11
import java.util.Iterator;
12
import java.util.List;
13
import java.util.Map;
14
import java.util.Map.Entry;
15
import java.util.Set;
16
import java.util.zip.GZIPInputStream;
17

    
18
import javax.xml.transform.TransformerConfigurationException;
19
import javax.xml.transform.TransformerFactoryConfigurationError;
20

    
21
import org.apache.commons.io.IOUtils;
22
import org.apache.commons.lang.StringUtils;
23
import org.apache.commons.logging.Log;
24
import org.apache.commons.logging.LogFactory;
25
import org.dom4j.Document;
26
import org.dom4j.DocumentException;
27
import org.dom4j.io.SAXReader;
28
import org.json.JSONObject;
29
import org.junit.Before;
30
import org.junit.Ignore;
31
import org.junit.Test;
32
import org.springframework.core.io.ByteArrayResource;
33
import org.springframework.core.io.Resource;
34

    
35
import com.google.common.base.Function;
36
import com.google.common.collect.Iterables;
37
import com.google.common.collect.Lists;
38
import com.google.common.collect.Maps;
39
import com.google.common.collect.Sets;
40
import com.google.protobuf.InvalidProtocolBufferException;
41
import com.googlecode.protobuf.format.JsonFormat;
42
import com.googlecode.protobuf.format.JsonFormat.ParseException;
43

    
44
import eu.dnetlib.actionmanager.actions.ActionFactory;
45
import eu.dnetlib.actionmanager.actions.XsltInfoPackageAction;
46
import eu.dnetlib.actionmanager.common.Agent;
47
import eu.dnetlib.actionmanager.common.Operation;
48
import eu.dnetlib.actionmanager.common.Provenance;
49
import eu.dnetlib.data.mapreduce.hbase.index.config.Context;
50
import eu.dnetlib.data.mapreduce.hbase.index.config.ContextMapper;
51
import eu.dnetlib.data.mapreduce.hbase.index.config.EntityConfigTable;
52
import eu.dnetlib.data.mapreduce.hbase.index.config.IndexConfig;
53
import eu.dnetlib.data.mapreduce.hbase.index.config.IndexConfigTest;
54
import eu.dnetlib.data.mapreduce.hbase.index.config.LinkDescriptor;
55
import eu.dnetlib.data.mapreduce.hbase.index.config.RelClasses;
56
import eu.dnetlib.data.mapreduce.hbase.index.config.RelClassesTest;
57
import eu.dnetlib.data.mapreduce.util.OafDecoder;
58
import eu.dnetlib.data.mapreduce.util.OafRowKeyDecoder;
59
import eu.dnetlib.data.mapreduce.util.RelDescriptor;
60
import eu.dnetlib.data.mapreduce.util.UpdateMerger;
61
import eu.dnetlib.data.mapreduce.util.XmlRecordFactory;
62
import eu.dnetlib.data.mapreduce.util.XmlRecordFactoryTest;
63
import eu.dnetlib.data.proto.KindProtos.Kind;
64
import eu.dnetlib.data.proto.OafProtos.Oaf;
65
import eu.dnetlib.data.proto.TypeProtos.Type;
66
import eu.dnetlib.miscutils.functional.xml.IndentXmlString;
67

    
68
import static org.junit.Assert.*;
69

    
70
public class XsltRowTransformerFactoryTest {
71

    
72
	private static final Log log = LogFactory.getLog(XsltRowTransformerFactoryTest.class);
73
	private static String basePathProfiles = "/eu/dnetlib/test/profiles/TransformationRuleDSResources/TransformationRuleDSResourceType/2hbase/";
74
	private XsltRowTransformerFactory factory;
75
	private EntityConfigTable entityConfigTable;
76

    
77
	@Before
78
	public void setUp() throws Exception {
79
		factory = new XsltRowTransformerFactory();
80
		entityConfigTable = IndexConfig.load(IndexConfigTest.config).getConfigMap();
81
	}
82

    
83
	@Test
84
	@Ignore // need to reimplement because claimUpdates_2_hbase.xsl was removed
85
	public void testParseOafClaimUpdate() throws Exception {
86
		doTest(loadFromTransformationProfile("claimUpdates_2_hbase.xsl"), load("recordClaimUpdate.xml"));
87
	}
88

    
89
	@Test
90
	@Ignore // need to reimplement because claimUpdates_2_hbase.xsl was removed
91
	public void testParseClaimUpdate() throws Exception {
92

    
93
		final List<Row> rows = Lists.newArrayList();
94
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("recordClaim.xml")));
95
		rows.addAll(asRows(loadFromTransformationProfile("claimUpdates_2_hbase.xsl"), load("recordClaimUpdate.xml")));
96

    
97
		printAll(mapAll(buildTable(rows)));
98
	}
99

    
100
	@Test
101
	public void testParseClaimRel() throws Exception {
102

    
103
		doTest(loadFromTransformationProfile("claimRels_2_hbase.xml"), load("recordClaimRel.xml"));
104
	}
105

    
106
	@Test
107
	public void testParseFp7IctPUB() throws Exception {
108

    
109
		doTest(loadFromTransformationProfile("oaf2hbase.xml"), load("ec_fp7_ict.xml"));
110
	}
111

    
112
	@Test
113
	public void testParseRecordCrossref() throws Exception {
114

    
115
		doTest(loadFromTransformationProfile("oaf2hbase.xml"), load("recordCrossref.xml"));
116
	}
117

    
118
	@Test
119
	public void testParseDatasetPUB() throws Exception {
120

    
121
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("recordDatasetPUB.xml"));
122
	}
123

    
124
	@Test
125
	public void testParseSoftwareEgiApp() throws Exception {
126

    
127
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("softwareEgiApp.xml"));
128
	}
129

    
130
	@Test
131
	public void testParseSoftwareEgiApp2() throws Exception {
132

    
133
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("softwareEgiApp2.xml"));
134
	}
135

    
136
	@Test
137
	public void testParseOrpEgiApp() throws Exception {
138

    
139
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("orpEgiApp.xml"));
140
	}
141

    
142
	@Test
143
	public void testParseSoftwareDOECODE() throws Exception {
144

    
145
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("doecode.xml"));
146
	}
147

    
148
	@Test
149
	public void testParseDatasetLindat() throws Exception {
150

    
151
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("datasetLindat.xml"));
152
	}
153

    
154
	@Test
155
	public void testParseDatasetNeuroVault() throws Exception {
156

    
157
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("recordNeuroVault.xml"));
158
	}
159

    
160
	@Test
161
	public void testParseDatasetNeuroVault2() throws Exception {
162

    
163
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("recordNeuroVault2.xml"));
164
	}
165

    
166
	@Test
167
	public void testParseClaim() throws Exception {
168

    
169
		doTest(loadFromTransformationProfile("oaf2hbase.xml"), load("recordClaim.xml"));
170
	}
171

    
172
	@Test
173
	public void testParseClaimDedup() throws Exception {
174

    
175
		doTest(loadFromTransformationProfile("oaf2hbase.xml"), load("recordClaimedDedup.xml"));
176
	}
177

    
178
	@Test
179
	public void testParseClaimDataset() throws Exception {
180

    
181
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("recordClaimDataset.xml"));
182
	}
183

    
184
	@Test
185
	public void testParseACM() throws Exception {
186

    
187
		doTest(loadFromTransformationProfile("oaf2hbase.xml"), load("recordACM.xml"));
188
	}
189

    
190
	@Test
191
	public void testParseASB() throws Exception {
192

    
193
		doTest(loadFromTransformationProfile("oaf2hbase.xml"), load("recordASB.xml"));
194
	}
195

    
196
	@Test
197
	public void testParseProjectCorda() throws Exception {
198

    
199
		doTest(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectRecordCorda.xml"));
200
	}
201

    
202
	@Test
203
	public void testParseProjectFCT() throws Exception {
204

    
205
		doTest(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectRecordFCT.xml"));
206
	}
207

    
208
	@Test
209
	public void testParseProjectWithSummaryAndAmount() throws Exception {
210

    
211
		doTest(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectWithSummaryAndAmount.xml"));
212
	}
213

    
214
	@Test
215
	public void testLinkOrganizationWithBudget() throws Exception {
216

    
217
		final List<Row> rows = Lists.newArrayList();
218
		rows.addAll(asRows(loadFromTransformationProfile("organizations_2_hbase.xsl"), load("organization.xml")));
219
		rows.addAll(asRows(loadFromTransformationProfile("projectorganization_2_hbase.xsl"), load("project_org_budget.xml")));
220
		rows.addAll(asRows(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectWithSummaryAndAmount.xml")));
221

    
222
		printAll(mapAll(buildTable(rows)));
223
	}
224

    
225
	@Test
226
	public void testXpath() throws DocumentException {
227

    
228
		final String value = "CONICYT";
229

    
230
		String ftree = "<fundingtree><funder><id>conicytf____::CONICYT</id><shortname>"+value+"</shortname><name>ComisiónNacionaldeInvestigaciónCientíficayTecnológica</name><jurisdiction>CL</jurisdiction></funder><funding_level_1><id>conicytf____::CONICYT::FONDECYT::REGULAR</id><description>Fondecytstream,REGULAR</description><name>Fondecytstream,REGULAR</name><class>conicyt:fondecytfundings</class><parent><funding_level_0><id>conicytf____::CONICYT::FONDECYT</id><name>FONDECYT</name><description>Fondecytfundings</description><parent/><class>conicyt:fondecytfundings</class></funding_level_0></parent></funding_level_1></fundingtree>";
231
		final Document doc = new SAXReader().read(new StringReader(ftree));
232

    
233
		assertEquals(value, doc.valueOf("//fundingtree/funder/shortname/text()"));
234
	}
235

    
236

    
237
	@Test
238
	public void testParseOaf() throws Exception {
239

    
240
		doTest(loadFromTransformationProfile("oaf2hbase.xml"), load("record.xml"));
241
	}
242

    
243
	@Test
244
	public void testParseOafPublication() throws Exception {
245

    
246
		doTest(loadFromTransformationProfile("oaf_entity2hbase.xml"), load("record.xml"));
247
	}
248

    
249
	@Test
250
	public void testParseLindat() throws Exception {
251

    
252
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("recordLindat.xml"));
253
	}
254

    
255
	@Test
256
	public void testParseOpenAPCrecord() throws Exception {
257

    
258
		doTest(loadFromTransformationProfile("oaf2hbase.xml"), load("recordOpenAPC.xml"));
259
	}
260

    
261
	@Test
262
	public void testParseDatacite() throws Exception {
263

    
264
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("recordDatacite.xml"));
265
	}
266

    
267
	@Test
268
	public void testParseDatacite2() throws Exception {
269

    
270
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("recordDatacite2.xml"));
271
	}
272

    
273
	@Test
274
	public void testParseDataciteNewES() throws Exception {
275

    
276
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("dataciteNew.xml"));
277
	}
278

    
279
	@Test
280
	public void testParseDatacite2hostedby() throws Exception {
281

    
282
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("datacite2hostedby.xml"));
283
	}
284

    
285
	@Test
286
	public void testParseOpenTrials() throws Exception {
287

    
288
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("opentrials_datacite1.xml"));
289
	}
290

    
291
	@Test
292
	public void testLinkPangaea() throws Exception {
293

    
294
		final List<Row> rows = Lists.newArrayList();
295
		rows.addAll(asRows(loadFromTransformationProfile("odf2hbase.xml"), load("pangaeODF1.xml")));
296
		rows.addAll(asRows(loadFromTransformationProfile("odf2hbase.xml"), load("pangaeODF2.xml")));
297
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("pangaeOAF.xml")));
298
		rows.addAll(asRows(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectRecordCordaPangaea.xml")));
299

    
300
		printAll(mapAll(buildTable(rows)));
301
	}
302

    
303
	@Test
304
	public void testODF() throws Exception {
305

    
306
		final List<Row> rows = Lists.newArrayList();
307
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("odf.xml")));
308
		printAll(mapAll(buildTable(rows)));
309
	}
310

    
311
	@Test
312
	public void testPangaea() throws Exception {
313

    
314
		final List<Row> rows = Lists.newArrayList();
315
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("pangaeOAF2.xml")));
316
		printAll(mapAll(buildTable(rows)));
317
	}
318

    
319
	@Test
320
	public void testZenodo() throws Exception {
321

    
322
		final List<Row> rows = Lists.newArrayList();
323
		rows.addAll(asRows(loadFromTransformationProfile("odf2hbase.xml"), load("zenodoData.xml")));
324
		printAll(mapAll(buildTable(rows)));
325
	}
326

    
327
	@Test
328
	public void testZenodoSoftware() throws Exception {
329

    
330
		final List<Row> rows = Lists.newArrayList();
331
		rows.addAll(asRows(loadFromTransformationProfile("odf2hbase.xml"), load("softwareZenodo_odf.xml")));
332
		printAll(mapAll(buildTable(rows)));
333
	}
334

    
335
	@Test
336
	public void testZenodoSoftware2() throws Exception {
337

    
338
		final List<Row> rows = Lists.newArrayList();
339
		rows.addAll(asRows(loadFromTransformationProfile("odf2hbase.xml"), load("softwareZenodo2_odf.xml")));
340
		printAll(mapAll(buildTable(rows)));
341
	}
342

    
343
	@Test
344
	public void testInfoscienceODF() throws Exception {
345

    
346
		final List<Row> rows = Lists.newArrayList();
347
		rows.addAll(asRows(loadFromTransformationProfile("odf2hbase.xml"), load("record_infoscience_odf.xml")));
348
		printAll(mapAll(buildTable(rows)));
349
	}
350

    
351
	@Test
352
	public void testLinkCorda() throws Exception {
353

    
354
		final List<Row> rows = Lists.newArrayList();
355
		rows.addAll(asRows(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectRecordCorda.xml")));
356
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("recordCorda.xml")));
357

    
358
		printAll(mapAll(buildTable(rows)));
359
	}
360

    
361
	@Test
362
	public void testLinkFCT() throws Exception {
363

    
364
		final List<Row> rows = Lists.newArrayList();
365
		rows.addAll(asRows(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectRecordFCT.xml")));
366
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("recordFCT.xml")));
367

    
368
		printAll(mapAll(buildTable(rows)));
369
	}
370

    
371
	@Test
372
	public void testLinkARC() throws Exception {
373

    
374
		final List<Row> rows = Lists.newArrayList();
375
		rows.addAll(asRows(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectRecordARC.xml")));
376
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("recordARC.xml")));
377

    
378
		printAll(mapAll(buildTable(rows)));
379
	}
380

    
381
	@Test
382
	public void testLinkWT() throws Exception {
383

    
384
		final List<Row> rows = Lists.newArrayList();
385
		rows.addAll(asRows(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectRecordWT.xml")));
386
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("recordWT.xml")));
387

    
388
		printAll(mapAll(buildTable(rows)));
389
	}
390

    
391
	@Test
392
	public void testLinkOrganization() throws Exception {
393

    
394
		final List<Row> rows = Lists.newArrayList();
395
		rows.addAll(asRows(loadFromTransformationProfile("organizations_2_hbase.xsl"), load("organization.xml")));
396
		rows.addAll(asRows(loadFromTransformationProfile("projectorganization_2_hbase.xsl"), load("project_organization.xml")));
397
		rows.addAll(asRows(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectRecordCorda.xml")));
398

    
399
		printAll(mapAll(buildTable(rows)));
400
	}
401

    
402
	@Test
403
	public void testLinkOrganizationAffiliation() throws Exception {
404

    
405
		final List<Row> rows = Lists.newArrayList();
406
		rows.addAll(asRows(loadFromTransformationProfile("organizations_2_hbase.xsl"), load("organization.xml")));
407
		rows.addAll(asRows(loadFromTransformationProfile("resultorganization_2_hbase.xsl"), load("result_organization.xml")));
408
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("record.xml")));
409

    
410
		printAll(mapAll(buildTable(rows)));
411
	}
412

    
413
	@Test
414
	public void testDuplicates() throws Exception {
415
		final String mergeId = "50|dedup_wf_001::08ed625d07e5738b794ff14d6773fd9f";
416
		final List<Row> rows = Lists.newArrayList();
417

    
418
		final Function<Row, Row> f = rowIn -> {
419

    
420
			final List<Column<String, byte[]>> cols = Lists.newArrayList();
421
			for (final Column<String, byte[]> col : rowIn.getColumns()) {
422
				if (col.getName().equals("body")) {
423
					cols.add(new Column(col.getName(), col.getValue()));
424

    
425
				}
426
			}
427
			return new Row("result", rowIn.getKey(), cols);
428
		};
429

    
430
		final List<Row> puma1 = asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("recordPuma1.xml"), f);
431
		puma1.add(new Row("resultResult_dedup_isMergedIn", mergeId));
432

    
433
		final List<Row> puma2 = asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("recordPuma2.xml"), f);
434
		puma2.add(new Row("resultResult_dedup_isMergedIn", mergeId));
435

    
436
		rows.addAll(puma1);
437
		rows.addAll(puma2);
438

    
439
		final List<Oaf> duplicates = Lists.newArrayList();
440
		duplicates.add(getOafBody(puma1));
441
		duplicates.add(getOafBody(puma2));
442
		final Oaf.Builder oafMerge = OafEntityMerger.merge(mergeId, duplicates);
443

    
444
		final Row mergeRow = new Row("result", mergeId, Lists.newArrayList(new Column("body", oafMerge.build().toByteArray())));
445

    
446
		rows.add(mergeRow);
447

    
448
		printAll(mapAll(buildTable(rows)));
449
	}
450

    
451
	private Oaf getOafBody(final List<Row> rows) throws InvalidProtocolBufferException {
452
		for (final Row row : rows) {
453
			if (StringUtils.startsWith(row.getKey(), "50")) { return Oaf.parseFrom(row.getColumn("body").getValue());
454

    
455
			}
456
		}
457
		return null;
458
	}
459

    
460
	@Test
461
	public void testParseDoajOAF() throws Exception {
462

    
463
		doTest(loadFromTransformationProfile("oaf2hbase.xml"), load("doajUniversityRecord.xml"));
464
	}
465

    
466
	@Test
467
	public void testParseDatasource() throws Exception {
468

    
469
		doTest(loadFromTransformationProfile("datasources_2_hbase.xsl"), load("datasourceNative.xml"));
470
	}
471

    
472
	@Test
473
	public void testParseDatasourcePiwik() throws Exception {
474

    
475
		doTest(loadFromTransformationProfile("datasources_2_hbase.xsl"), load("datasourcePiwik.xml"));
476
	}
477

    
478
	@Test
479
	public void testParseDataDatasource() throws Exception {
480

    
481
		doTest(loadFromTransformationProfile("datasources_2_hbase.xsl"), load("dataDatasource.xml"));
482
	}
483

    
484
	@Test
485
	public void testFromMongodbCompressedDump() throws Exception {
486
		doTestJsonGz(loadFromTransformationProfile("oaf2hbase.xml"), load("mdstore_cleaned.json.gz"));
487
	}
488

    
489
	@Test
490
	public void testLoadFromTransformationProfile() throws IOException {
491
		final InputStream in = loadFromTransformationProfile("oaf2hbase.xml");
492
		log.info(IOUtils.toString(in));
493
	}
494

    
495
	@Test
496
	public void test_template() throws Exception {
497
		final String xslt = IOUtils.toString(loadFromTransformationProfile("oaf2hbase.xml"));
498
		final XsltRowTransformer transformer = factory.getTransformer(xslt);
499
		assertNotNull(transformer);
500

    
501
		final String record = IOUtils.toString(load("record.xml"));
502
		final List<Row> rows = transformer.apply(record);
503

    
504
		System.out.println(rows);
505
	}
506

    
507
	@Test
508
	public void testWrongCharsOrganization() throws Exception {
509
		final List<Row> rows = Lists.newArrayList();
510
		rows.addAll(asRows(loadFromTransformationProfile("organizations_2_hbase.xsl"), load("organizationWrongChars.xml")));
511
		printAll(mapAll(buildTable(rows)));
512
	}
513

    
514
	@Test
515
	public void testParseProjectWithFunderOriginalName() throws Exception {
516

    
517
		doTest(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectWithFunderOriginalName.xml"));
518
	}
519

    
520
	@Test
521
	public void testLinkFunderOriginalName() throws Exception {
522

    
523
		final List<Row> rows = Lists.newArrayList();
524
		rows.addAll(asRows(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectWithFunderOriginalName.xml")));
525
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("recordFunderOriginalName.xml")));
526

    
527
		printAll(mapAll(buildTable(rows)));
528
	}
529

    
530
	@Test
531
	public void testProjectExtraInfo() throws Exception {
532
		final List<Row> rows = Lists.newArrayList();
533
		rows.addAll(asRows(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectRecordExtraInfo.xml")));
534
		printAll(mapAll(buildTable(rows)));
535
	}
536

    
537
	@Test
538
	public void testParseSoftwareFromODF() throws Exception {
539
		final List<Row> rows = Lists.newArrayList();
540
		rows.addAll(asRows(loadFromTransformationProfile("odf2hbase.xml"), load("softwareODF.xml")));
541
		rows.addAll(asRows(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectRecordCorda.xml")));
542
		printAll(mapAll(buildTable(rows)));
543
	}
544

    
545
	@Test
546
	public void testParseSoftwareFromOAF() throws Exception {
547
		final List<Row> rows = Lists.newArrayList();
548
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("recordOAFsoftwareCLOSED.xml")));
549
		printAll(mapAll(buildTable(rows)));
550
	}
551

    
552
	@Test
553
	public void testParsePubFromODF() throws Exception {
554
		final List<Row> rows = Lists.newArrayList();
555
		rows.addAll(asRows(loadFromTransformationProfile("odf2hbase.xml"), load("recordODFPub.xml")));
556
		printAll(mapAll(buildTable(rows)));
557
	}
558

    
559
	@Test
560
	public void testParseSoftwareFromOAFOpen() throws Exception {
561
		final List<Row> rows = Lists.newArrayList();
562
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("recordOAFsoftwareOPEN.xml")));
563
		printAll(mapAll(buildTable(rows)));
564
	}
565

    
566
	@Test
567
	public void testParseSoftwareBiotool() throws Exception {
568
		final List<Row> rows = Lists.newArrayList();
569
		rows.addAll(asRows(loadFromTransformationProfile("odf2hbase.xml"), load("biotoolSw.xml")));
570
		printAll(mapAll(buildTable(rows)));
571
	}
572

    
573
	@Test
574
	public void testParseOafWithExternalRef() throws Exception {
575
		final List<Row> rows = Lists.newArrayList();
576
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("oafWithExternalReference.xml")));
577
		printAll(mapAll(buildTable(rows)));
578
	}
579

    
580
	@Test
581
	public void testParseOafWithCommunity() throws Exception {
582
		final List<Row> rows = Lists.newArrayList();
583
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("oafWithCommunity.xml")));
584
		printAll(mapAll(buildTable(rows)));
585
	}
586

    
587
	@Test
588
	public void testParseOafWithUpdates() throws Exception {
589
		final List<Row> rows = Lists.newArrayList();
590
		rows.addAll(asRows(loadFromTransformationProfile("oaf2hbase.xml"), load("oafWithCommunity.xml")));
591

    
592
		final ActionFactory actionFactory = new ActionFactory();
593

    
594
		final Map<String, Resource> xslts = Maps.newHashMap();
595

    
596
		xslts.put("oaf2hbase", new ByteArrayResource(IOUtils.toString(loadFromTransformationProfile("oaf2hbase.xml")).getBytes()));
597
		actionFactory.setXslts(xslts);
598

    
599
		final XsltInfoPackageAction pa = actionFactory.generateInfoPackageAction(
600
				"oaf2hbase",
601
				"rawset-id",
602
				new Agent("agent-id", "agent-name", Agent.AGENT_TYPE.algo),
603
				Operation.UPDATE,
604
				IOUtils.toString(load("oafUpdateWithCommunity.xml")),
605
				Provenance.sysimport_mining_aggregator,
606
				"0.9");
607

    
608
		final String qualifier = "update_" + System.nanoTime();
609

    
610
		IOUtils.readLines(load("country_updates.json")).forEach(line -> {
611

    
612
			final Oaf.Builder oaf = Oaf.newBuilder();
613

    
614
			try {
615
				JsonFormat.merge(line, oaf);
616
			} catch (final JsonFormat.ParseException e) {
617
				throw new IllegalArgumentException(e);
618
			}
619

    
620
			final Column<String, byte[]> col = new Column<>("update_" + System.nanoTime(), oaf.build().toByteArray());
621
			rows.add(new Row("result", oaf.getEntity().getId(), Lists.newArrayList(col)));
622
		});
623

    
624
		pa.asAtomicActions().forEach(a -> {
625
			final Column<String, byte[]> col = new Column<>("update_" + System.nanoTime(), a.getTargetValue());
626
			rows.add(new Row(a.getTargetColumnFamily(), a.getTargetRowKey(), Lists.newArrayList(col)));
627
		});
628

    
629
		/*
630
		 * rows.forEach(r -> { log.info(r); });
631
		 */
632

    
633
		mapAll(buildTable(rows)).entrySet().forEach(b -> {
634
			log.info(b.getKey());
635
			log.info(b.getValue());
636
		});
637
	}
638

    
639
	@Test
640
	public void testParseCrisPub() throws Exception {
641
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("cris_pub1.xml"));
642
	}
643

    
644
	@Test
645
	public void testBioToolSwRowJson() throws Exception {
646
		doTestJsonRow(IOUtils.toString(load("biotoolSwRow.json")));
647
	}
648

    
649
	@Test
650
	public void testParseVirta() throws Exception {
651
		doTest(loadFromTransformationProfile("datasources_2_hbase.xsl"), load("virta.xml"));
652
	}
653

    
654
	@Test
655
	public void testParseJournal() throws Exception {
656
		doTest(loadFromTransformationProfile("datasources_2_hbase.xsl"), load("datasourceWithISSN.xml"));
657
	}
658

    
659
	@Test
660
	public void testGuidelines4Qeios() throws Exception {
661

    
662
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("guidelines4_qeios1.xml"));
663
	}
664

    
665
	@Test
666
	public void testGuidelines4Aria() throws Exception {
667

    
668
		doTest(loadFromTransformationProfile("odf2hbase.xml"), load("guidelines4_aria.xml"));
669
	}
670

    
671
	@Test
672
	public void testJournalRecord() throws Exception {
673
		doTest(loadFromTransformationProfile("oaf2hbase.xml"), load("journalRecord.xml"));
674
	}
675

    
676
	@Test
677
	public void testSygmaDatacite() throws Exception {
678

    
679
		final List<Row> rows = Lists.newArrayList();
680
		rows.addAll(asRows(loadFromTransformationProfile("odf2hbase.xml"), load("sygma_datacite.xml")));
681
		rows.addAll(asRows(loadFromTransformationProfile("projects_2_hbase.xsl"), load("projectRecordCorda.xml")));
682

    
683
		printAll(mapAll(buildTable(rows)));
684
	}
685

    
686
	@Test
687
	public void testOpenOrganizations() throws Exception {
688

    
689
		final List<Row> rows = Lists.newArrayList();
690
		rows.addAll(asRows(loadFromTransformationProfile("organizations_2_hbase.xsl"), load("openorgs_sample.xml")));
691

    
692
		printAll(mapAll(buildTable(rows)));
693
	}
694

    
695
	private void doTestJsonRow(final String json) throws Exception {
696
		final Row row = asRowFromJson(json);
697
		log.info(row);
698
		final List<Row> rows = new ArrayList<>();
699
		rows.add(row);
700
		final Map<String, Map<String, Map<String, byte[]>>> table = buildTable(rows);
701
		final Map<String, XmlRecordFactory> builders = mapAll(table);
702
		printAll(builders);
703

    
704
	}
705

    
706
	private void doTest(final InputStream xsltStream, final InputStream recordStream) throws Exception {
707
		try {
708
			final List<Row> rows = asRows(xsltStream, recordStream);
709

    
710
			log.info(rows);
711

    
712
			final Map<String, Map<String, Map<String, byte[]>>> table = buildTable(rows);
713

    
714
			// System.out.println("\n" + table.toString());
715

    
716
			final Map<String, XmlRecordFactory> builders = mapAll(table);
717

    
718
			printAll(builders);
719
		} catch (final InvalidProtocolBufferException e) {
720
			throw new Exception(e);
721
		} catch (final TransformerConfigurationException e) {
722
			throw new Exception(e);
723
		} catch (final TransformerFactoryConfigurationError e) {
724
			throw new Exception(e);
725
		} catch (final DocumentException e) {
726
			throw new Exception(e);
727
		}
728
	}
729

    
730
	private void doTestJsonGz(final InputStream xsltStream, final InputStream recordStream) throws Exception {
731

    
732
		final Iterator<List<Row>> rowsIterator = asRowsJsonGzip(xsltStream, recordStream);
733

    
734
		int i = 0;
735
		while (rowsIterator.hasNext()) {
736
			final List<Row> rows = rowsIterator.next();
737
			i++;
738

    
739
			if (i % 10000 == 0) {
740
				System.out.println(i);
741
			}
742

    
743
			final Map<String, Map<String, Map<String, byte[]>>> table = buildTableDoaj(rows);
744

    
745
			for (final Map<String, Map<String, byte[]>> m : table.values()) {
746
				for (final Map<String, byte[]> mv : m.values()) {
747
					for (final byte[] v : mv.values()) {
748
						final OafDecoder d = OafDecoder.decode(v);
749
						assertNotNull(d);
750
						assertNotNull(d.getOaf());
751

    
752
						switch (d.getKind()) {
753
						case entity:
754
							assertNotNull(d.getMetadata());
755
							if (d.getOaf().getEntity().getType().equals(Type.result)) {
756
								System.out.println(d.getOaf());
757
							}
758
							break;
759
						case relation:
760
							assertNotNull(d.getRel());
761
							break;
762
						default:
763
							break;
764
						}
765
					}
766
				}
767
			}
768
		}
769
	}
770

    
771
	private List<Row> asRows(final InputStream xsltStream, final InputStream recordStream, final Function<Row, Row> p) throws Exception {
772
		return asRows(xsltStream, new HashMap<>(), recordStream, p);
773
	}
774

    
775
	private List<Row> asRows(final InputStream xsltStream, final InputStream recordStream) throws Exception {
776
		return asRows(xsltStream, new HashMap<>(), recordStream);
777
	}
778

    
779
	private List<Row> asRows(final InputStream xsltStream, final Map<String, Object> params, final InputStream recordStream) throws Exception {
780
		return asRows(xsltStream, params, recordStream, null);
781
	}
782

    
783
	private List<Row> asRows(final InputStream xsltStream, final Map<String, Object> params, final InputStream recordStream, final Function<Row, Row> p)
784
			throws Exception {
785
		final String xslt = IOUtils.toString(xsltStream);
786
		final XsltRowTransformer transformer = factory.getTransformer(xslt, params);
787
		assertNotNull(transformer);
788

    
789
		final String record = IOUtils.toString(recordStream);
790
		final List<Row> rows = transformer.apply(record);
791

    
792
		assertNotNull(rows);
793
		assertFalse(rows.isEmpty());
794
		return p == null ? rows : Lists.newArrayList(Iterables.transform(rows, p));
795
	}
796

    
797
	private Iterator<List<Row>> asRowsJsonGzip(final InputStream xsltStream, final InputStream recordStreamJsonGzip) throws Exception {
798
		final String xslt = IOUtils.toString(xsltStream);
799
		final XsltRowTransformer transformer = factory.getTransformer(xslt);
800
		assertNotNull(transformer);
801
		assertNotNull(recordStreamJsonGzip);
802

    
803
		final GZIPInputStream stream = new GZIPInputStream(recordStreamJsonGzip);
804
		assertNotNull(stream);
805
		final BufferedReader inStream = new BufferedReader(new InputStreamReader(stream));
806
		assertNotNull(inStream);
807
		return new Iterator<List<Row>>() {
808

    
809
			String jsonRecord = null;
810

    
811
			@Override
812
			public boolean hasNext() {
813
				try {
814
					return (jsonRecord = inStream.readLine()) != null;
815
				} catch (final IOException e) {
816
					throw new RuntimeException(e);
817
				}
818
			}
819

    
820
			@Override
821
			public List<Row> next() {
822

    
823
				final JSONObject jsonObj = new JSONObject(jsonRecord);
824
				final String body = jsonObj.getString("body");
825
				try {
826
					assertTrue(StringUtils.isNotBlank(body));
827
					// System.out.println(body);
828
					final List<Row> rows = transformer.apply(body);
829
					assertNotNull(rows);
830
					assertFalse(rows.isEmpty());
831
					return rows;
832
				} catch (final Throwable e) {
833
					System.err.println("error transforming document: " + body);
834
					throw new RuntimeException(e);
835
				}
836
			}
837

    
838
			@Override
839
			public void remove() {
840
				throw new UnsupportedOperationException();
841
			}
842

    
843
		};
844

    
845
	}
846

    
847
	private Map<String, Map<String, Map<String, byte[]>>> buildTableDoaj(final List<Row> rows) throws UnsupportedEncodingException {
848
		final Map<String, Map<String, Map<String, byte[]>>> table = Maps.newHashMap();
849

    
850
		for (final Row row : rows) {
851
			final String rowKey = row.getKey();
852
			final String cf = row.getColumnFamily();
853
			if (!table.containsKey(rowKey)) {
854
				table.put(rowKey, new HashMap<>());
855
			}
856
			if (!table.get(rowKey).containsKey(cf)) {
857
				table.get(rowKey).put(row.getColumnFamily(), new HashMap<>());
858
			}
859
			for (final Column<String, byte[]> c : row.getColumns()) {
860
				// System.out.println(String.format("ADDING K:%s CF:%s Q:%s", rowKey, cf, c.getName()));
861
				table.get(rowKey).get(cf).put(c.getName(), c.getValue());
862
				if (cf.equals("result") && c.getName().equals("body")) {
863
					// System.out.println(String.format("ADDING K:%s CF:%s Q:%s", rowKey, cf, c.getName()));
864
					assertTrue(StringUtils.isNotBlank(new String(c.getValue(), "UTF-8")));
865
				}
866
			}
867
		}
868
		return table;
869

    
870
	}
871

    
872
	protected Map<String, Map<String, Map<String, byte[]>>> buildTable(final List<Row> rows) throws UnsupportedEncodingException {
873
		final Map<String, Map<String, Map<String, byte[]>>> table = Maps.newHashMap();
874

    
875
		for (final Row row : rows) {
876
			final String rowKey = row.getKey();
877
			final String cf = row.getColumnFamily();
878
			if (!table.containsKey(rowKey)) {
879
				table.put(rowKey, new HashMap<>());
880
			}
881
			if (!table.get(rowKey).containsKey(cf)) {
882
				table.get(rowKey).put(row.getColumnFamily(), new HashMap<>());
883
			}
884
			for (final Column<String, byte[]> c : row.getColumns()) {
885
				System.out.println(String.format("ADDING K:%s CF:%s Q:%s", rowKey, cf, c.getName()));
886
				table.get(rowKey).get(cf).put(c.getName(), c.getValue());
887
				if (c.getName().equals("body")) {
888
					final String theBody = new String(c.getValue(), "UTF-8");
889
					assertTrue(StringUtils.isNotBlank(theBody));
890
					// System.out.println(theBody);
891
				}
892
			}
893
		}
894
		return table;
895

    
896
	}
897

    
898
	protected Map<String, XmlRecordFactory> mapAll(final Map<String, Map<String, Map<String, byte[]>>> table) throws Exception {
899

    
900
		final Map<String, XmlRecordFactory> builders = Maps.newHashMap();
901
		for (final Entry<String, Map<String, Map<String, byte[]>>> e : table.entrySet()) {
902
			map(builders, e.getKey(), e.getValue());
903
		}
904
		return builders;
905
	}
906

    
907
	// private Map<String, XmlRecordFactory> mapResultsOnly(final Map<String, Map<String, Map<String, byte[]>>> table) throws Exception {
908
	//
909
	// final Map<String, XmlRecordFactory> builders = Maps.newHashMap();
910
	// for (final Entry<String, Map<String, Map<String, byte[]>>> e : table.entrySet()) {
911
	// final Type type = OafRowKeyDecoder.decode(e.getKey()).getType();
912
	// if (type == Type.result) {
913
	// map(builders, e.getKey(), e.getValue());
914
	// }
915
	// }
916
	// return builders;
917
	// }
918

    
919
	private void map(final Map<String, XmlRecordFactory> builders, final String rowKey, final Map<String, Map<String, byte[]>> row) throws Exception {
920

    
921
		final Type type = OafRowKeyDecoder.decode(rowKey).getType();
922

    
923
		final Map<String, byte[]> familyMap = row.get(type.toString());
924

    
925
		if (familyMap == null) { return; }
926

    
927
		final byte[] bodyB = familyMap.get("body");
928

    
929
		if (bodyB != null) {
930
			ensureBuilder(builders, rowKey);
931

    
932
			final Oaf oaf = UpdateMerger.mergeBodyUpdates(familyMap);
933

    
934
			final OafDecoder mainEntity = OafDecoder.decode(oaf);
935

    
936
			builders.get(rowKey).setMainEntity(mainEntity);
937

    
938
			for (final LinkDescriptor ld : entityConfigTable.getDescriptors(type)) {
939

    
940
				final String it = ld.getRelDescriptor().getIt();
941
				final Map<String, byte[]> cols = row.get(it);
942

    
943
				if (cols != null && !cols.isEmpty()) {
944

    
945
					for (final byte[] oafB : cols.values()) {
946

    
947
						final Oaf.Builder relBuilder = Oaf.newBuilder(Oaf.parseFrom(oafB));
948

    
949
						if (ld.isSymmetric()) {
950
							final RelDescriptor rd = ld.getRelDescriptor();
951

    
952
							relBuilder.getRelBuilder().setCachedTarget(mainEntity.getEntity()).setRelType(rd.getRelType()).setSubRelType(rd.getSubRelType());
953
						}
954

    
955
						relBuilder.getRelBuilder().setChild(ld.isChild());
956

    
957
						final Oaf.Builder oafBuilder = Oaf.newBuilder().setKind(Kind.relation).setLastupdatetimestamp(System.currentTimeMillis());
958
						oafBuilder.mergeFrom(relBuilder.build());
959

    
960
						final String targetId = ld.isSymmetric() ? oafBuilder.getRel().getTarget() : oafBuilder.getRel().getSource();
961
						ensureBuilder(builders, targetId);
962
						final OafDecoder decoder = OafDecoder.decode(oafBuilder.build());
963

    
964
						if (ld.isChild()) {
965
							builders.get(targetId).addChild(type, decoder);
966
						} else {
967
							builders.get(targetId).addRelation(type, decoder);
968
						}
969
					}
970

    
971
				}
972
			}
973
		}
974

    
975
	}
976

    
977
	private void printAll(final Map<String, XmlRecordFactory> builders) throws DocumentException {
978
		print(Sets.newHashSet(Type.values()), builders, null);
979
	}
980

    
981
	private void print(final Set<Type> types, final Map<String, XmlRecordFactory> builders, final Map<Type, Set<String>> xpaths) throws DocumentException {
982
		final SAXReader r = new SAXReader();
983

    
984
		for (final Entry<String, XmlRecordFactory> e : builders.entrySet()) {
985
			final OafRowKeyDecoder kd = OafRowKeyDecoder.decode(e.getKey());
986

    
987
			if (!e.getValue().isValid()) { throw new IllegalArgumentException("invalid builder: " + e.getKey()); }
988
			if (types.contains(kd.getType())) {
989
				final String val = IndentXmlString.apply(e.getValue().build());
990

    
991
				if (xpaths != null && !xpaths.isEmpty() && xpaths.get(kd.getType()) != null) {
992
					final Document doc = r.read(new StringReader(val));
993

    
994
					log.debug("\n" + e.getKey());
995
					for (final String xpath : xpaths.get(kd.getType())) {
996
						log.debug(doc.valueOf(xpath));
997
					}
998
				} else {
999
					log.info(val);
1000
				}
1001
			}
1002
		}
1003
	}
1004

    
1005
	private void printNoIndent(final Map<String, XmlRecordFactory> builders) {
1006
		for (final Entry<String, XmlRecordFactory> e : builders.entrySet()) {
1007
			if (e.getValue().isValid()) {
1008
				log.debug(e.getValue().build());
1009
			} else {
1010
				log.debug("invalid builder: " + e.getKey());
1011
			}
1012
		}
1013
	}
1014

    
1015
	private void ensureBuilder(final Map<String, XmlRecordFactory> builders, final String rowKey) throws Exception {
1016
		if (!builders.containsKey(rowKey)) {
1017
			builders.put(rowKey, newBuilder());
1018
		}
1019
	}
1020

    
1021
	private XmlRecordFactory newBuilder() throws TransformerConfigurationException, TransformerFactoryConfigurationError, DocumentException {
1022
		return new XmlRecordFactory(entityConfigTable, ContextMapper.fromXml(Context.xml),
1023
				RelClasses.fromJSon(RelClassesTest.relClassesJson), XmlRecordFactoryTest.SCHEMA_LOCATION, true, false, false,
1024
				XmlRecordFactoryTest.specialDatasourceTypes);
1025
	}
1026

    
1027
	private InputStream load(final String fileName) {
1028
		return getClass().getResourceAsStream(fileName);
1029
	}
1030

    
1031
	private InputStream loadFromTransformationProfile(final String profilePath) {
1032
		log.info("Loading xslt from: " + basePathProfiles + profilePath);
1033
		final InputStream profile = getClass().getResourceAsStream(basePathProfiles + profilePath);
1034
		final SAXReader saxReader = new SAXReader();
1035
		Document doc = null;
1036
		try {
1037
			doc = saxReader.read(profile);
1038
		} catch (final DocumentException e) {
1039
			e.printStackTrace();
1040
			throw new RuntimeException(e);
1041
		}
1042
		final String xslt = doc.selectSingleNode("//SCRIPT/CODE/*[local-name()='stylesheet']").asXML();
1043
		// log.info(xslt);
1044
		return IOUtils.toInputStream(xslt);
1045
	}
1046

    
1047
	private Row asRowFromJson(final String json) throws ParseException {
1048
		final Oaf.Builder oafBuilder = Oaf.newBuilder();
1049
		JsonFormat.merge(json, oafBuilder);
1050
		final Oaf oaf = oafBuilder.build();
1051
		final Row row = new Row("result", oaf.getEntity().getId());
1052
		final Column<String, byte[]> c = new Column<>("body", oaf.toByteArray());
1053
		row.setColumn("body", c);
1054
		return row;
1055

    
1056
	}
1057

    
1058
}
    (1-1/1)