Project

General

Profile

1
package eu.dnetlib.data.mapreduce.hbase.statsExport.utils;
2

    
3
import java.util.ArrayList;
4
import java.util.HashMap;
5
import java.util.List;
6

    
7
import org.apache.log4j.Logger;
8

    
9
import eu.dnetlib.data.mapreduce.util.LicenseComparator;
10
import eu.dnetlib.data.proto.DatasourceProtos.Datasource;
11
import eu.dnetlib.data.proto.DatasourceProtos.Datasource.Metadata;
12
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
13
import eu.dnetlib.data.proto.FieldTypeProtos.StringField;
14
import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty;
15
import eu.dnetlib.data.proto.OafProtos.Oaf;
16
import eu.dnetlib.data.proto.OafProtos.OafEntity;
17
import eu.dnetlib.data.proto.OafProtos.OafRel;
18
import eu.dnetlib.data.proto.OrganizationProtos.Organization;
19
import eu.dnetlib.data.proto.ProjectProtos.Project;
20
import eu.dnetlib.data.proto.RelTypeProtos.RelType;
21
import eu.dnetlib.data.proto.ResultProtos.Result;
22
import eu.dnetlib.data.proto.ResultProtos.Result.Instance;
23

    
24
/**
 * Simple serializer that parses input Oaf protos and prepares them for sqoop.
 *
 * @author eri
 */
29
public class Serializer {
30

    
31
	private static String DELIM;
32
	private Logger log = Logger.getLogger(this.getClass());
33
	private String NULL_STRING;
34
	private String NULL_NUM;
35
	private static String ENCLOSED;
36

    
37
	public String serialize(Oaf oaf) {
38

    
39
		switch (oaf.getKind()) {
40
		case entity:
41
			OafEntity valueEntity = oaf.getEntity();
42

    
43
			switch (valueEntity.getType()) {
44
			case datasource:
45

    
46
				return buildDatasource(valueEntity);
47

    
48
			case organization:
49

    
50
				return buildOrganization(valueEntity);
51

    
52
			case project:
53

    
54
				return buildProject(valueEntity);
55
			case result:
56

    
57
				return buildResult(valueEntity);
58
			default:
59
				log.error("wrong type");
60
				break;
61
			}
62
			break;
63
		case relation:
64
			OafRel valueRel = oaf.getRel();
65
			return buildRel(valueRel);
66

    
67
		}
68

    
69
		return null;
70

    
71
	}
72

    
73
	public String serialize(OafRel oaf) {
74

    
75
		switch (oaf.getRelType()) {
76
		case resultProject:
77

    
78
			return buildresultProject(oaf);
79

    
80
		default:
81
			return buildRel(oaf);
82
		}
83
	}
84

    
85
	private String buildRel(OafRel Rel) {
86

    
87
		return getStringField(Rel.getTarget());
88
	}
89

    
90
	public HashMap<String, List<String>> extractRelations(Oaf oaf) {
91
		OafEntity valueEntity = oaf.getEntity();
92
		switch (valueEntity.getType()) {
93
		case datasource:
94

    
95
			return getDatasourceLanguages(valueEntity);
96

    
97
		case result:
98
			HashMap<String, List<String>> relations = new HashMap<String, List<String>>();
99

    
100
			// relations.putAll(getResultLanguages(valueEntity));
101
			relations.putAll(getResultTopics(valueEntity));
102
			// relations.putAll(getResultClassifications(valueEntity));
103
			// relations.putAll(getResultDatasources(valueEntity));
104
			relations.putAll(getResultConcepts(valueEntity));
105
			return relations;
106
		default:
107

    
108
			return null;
109
		}
110

    
111
	}
112

    
113
	private String buildresultProject(OafRel oaf) {
114
		String buff = new String();
115
		buff += getStringField(oaf.getTarget());
116
		// TODO is declared as int!!!
117
		buff += getYearDifferenceInteger(oaf.getResultProject().getOutcome().getRelMetadata().getEnddate(), oaf.getResultProject().getOutcome().getRelMetadata().getStartdate());
118

    
119
		return buff;
120
	}
121

    
122
	private HashMap<String, List<String>> getDatasourceLanguages(OafEntity valueEntity) {
123
		HashMap<String, List<String>> rels = new HashMap<String, List<String>>();
124
		List<String> buffs = new ArrayList<String>();
125

    
126
		Datasource d = valueEntity.getDatasource();
127

    
128
		Metadata metadata = d.getMetadata();
129

    
130
		for (StringField lang : metadata.getOdlanguagesList()) {
131

    
132
			buffs.add(getStringField(lang.getValue()));
133
		}
134
		rels.put("datasourceLanguage", buffs);
135
		return rels;
136
	}
137

    
138
	private HashMap<String, List<String>> getResultLanguages(OafEntity valueEntity) {
139
		HashMap<String, List<String>> rels = new HashMap<String, List<String>>();
140
		List<String> buffs = new ArrayList<String>();
141
		Result d = valueEntity.getResult();
142

    
143
		eu.dnetlib.data.proto.ResultProtos.Result.Metadata metadata = d.getMetadata();
144

    
145
		if (metadata.getLanguage().getClassname() != null && !metadata.getLanguage().getClassname().isEmpty()) {
146

    
147
			buffs.add(getStringField(metadata.getLanguage().getClassname()));
148
		}
149
		rels.put("resultLanguage", buffs);
150
		return rels;
151

    
152
	}
153

    
154
	private HashMap<String, List<String>> getResultClassifications(OafEntity valueEntity) {
155

    
156
		HashMap<String, List<String>> rels = new HashMap<String, List<String>>();
157
		List<String> buffs = new ArrayList<String>();
158
		Result result = valueEntity.getResult();
159

    
160
		for (Instance instance : (result.getInstanceList())) {
161

    
162
			buffs.add(getStringField(instance.getInstancetype().getClassname()));
163
		}
164
		rels.put("resultClassification", buffs);
165
		return rels;
166

    
167
	}
168

    
169
	private HashMap<String, List<String>> getResultConcepts(OafEntity valueEntity) {
170
		HashMap<String, List<String>> rels = new HashMap<String, List<String>>();
171
		List<String> buffs = new ArrayList<String>();
172

    
173
		Result result = valueEntity.getResult();
174

    
175
		for (eu.dnetlib.data.proto.ResultProtos.Result.Context context : result.getMetadata().getContextList()) {
176

    
177
			buffs.add(getStringField(context.getId()));
178

    
179
		}
180
		rels.put("resultConcept", buffs);
181
		return rels;
182

    
183
	}
184

    
185
	private HashMap<String, List<String>> getResultDatasources(OafEntity valueEntity) {
186

    
187
		HashMap<String, List<String>> rels = new HashMap<String, List<String>>();
188
		List<String> buffs = new ArrayList<String>();
189
		Result result = valueEntity.getResult();
190

    
191
		for (Instance instance : (result.getInstanceList())) {
192
			String hostedBy = null;
193

    
194
			hostedBy = instance.getHostedby().getKey();
195

    
196
			buffs.add((getStringField(hostedBy)));
197
		}
198
		rels.put("resultDatasource", buffs);
199
		return rels;
200

    
201
	}
202

    
203
	public static boolean isNumeric(String str) {
204

    
205
		str = str.replaceAll("[^A-Za-z0-9 ]", "");
206
		str = str.replaceAll(" ", "");
207
		return str.matches("-?\\d+(\\.\\d+)?"); // match a number with optional
208
												// '-' and decimal.
209
	}
210

    
211
	// TODO there are topics with "null" as value -> replace them
212
	private boolean isValidTopic(String t) {
213

    
214
		if (t == null || t.isEmpty()) {
215
			return false;
216
		}
217

    
218
		if (t.equals("") || t.equals(" ")) {
219
			return false;
220
		}
221
		if (t.equals("null") || t.equals("Null") || t.equals("NULL")) {
222
			return false;
223
		}
224

    
225
		if (t.equals(ENCLOSED + ENCLOSED + DELIM) || t.equals(ENCLOSED + NULL_STRING + ENCLOSED + DELIM)) {
226
			return false;
227
		}
228
		// skip dedups
229
		if (t.contains("ddc:")) {
230
			return false;
231
		}
232
		return true;
233
	}
234

    
235
	private HashMap<String, List<String>> getResultTopics(OafEntity valueEntity) {
236
		HashMap<String, List<String>> rels = new HashMap<String, List<String>>();
237
		List<String> buffs = new ArrayList<String>();
238
		Result d = valueEntity.getResult();
239

    
240
		eu.dnetlib.data.proto.ResultProtos.Result.Metadata metadata = d.getMetadata();
241
		List<StructuredProperty> Topics = metadata.getSubjectList();
242
		for (StructuredProperty topic : Topics) {
243
			// TODOs
244
			if (topic.getValue() != null && !topic.getValue().isEmpty() && !topic.getValue().equals("") && !topic.getValue().equals(" ") && !topic.getValue().equals("null")) {
245

    
246
				if (!isNumeric(topic.getValue())) {
247
					String t = getStringField(topic.getValue());
248
					if (isValidTopic(t)) {
249
						buffs.add(t);
250
					}
251

    
252
				}
253
			}
254
		}
255
		rels.put("resultTopic", buffs);
256

    
257
		return rels;
258

    
259
	}
260

    
261
	private String buildDatasource(OafEntity data) {
262

    
263
		String buff = new String();
264

    
265
		Datasource d = data.getDatasource();
266

    
267
		Metadata metadata = d.getMetadata();
268
		String full_id = getStringField(data.getId());
269

    
270
		buff += full_id;
271
		buff += full_id;
272
		buff += full_id;
273
		buff += full_id;
274

    
275
		// TODO move this here???
276
		// UPDATE "shadow".datasource SET name='Other' where name='Unknown
277
		// Repository';
278

    
279
		// name
280
		if (metadata.getOfficialname().getValue().equalsIgnoreCase("unknown")) {
281
			buff += getStringField("Unknown Repository");
282
		} else {
283
			buff += getStringField(metadata.getOfficialname().getValue());
284
		}
285
		// type
286

    
287
		if (metadata.hasDatasourcetype())
288

    
289
		{
290
			buff += getStringField(metadata.getDatasourcetype().getClassname().replaceFirst(".*::", ""));
291

    
292
		} else {
293
			buff += getStringField(null);
294
		}
295

    
296
		// compatibility,
297
		buff += getStringField(metadata.getOpenairecompatibility().getClassname());
298

    
299
		// latitude
300
		buff += getStringField(metadata.getLatitude().getValue());
301

    
302
		// longtitude
303
		buff += getStringField(metadata.getLongitude().getValue());
304

    
305
		// dateofvalidation,
306
		buff += getStringField(metadata.getDateofvalidation().getValue());
307

    
308
		// yearofvalidation,
309

    
310
		// parse year of validation
311
		buff += getYearInt(metadata.getDateofvalidation().getValue());
312

    
313
		// number??
314

    
315
		buff += getStringField("1");
316

    
317
		return buff;
318
	}
319

    
320
	private String buildOrganization(OafEntity data) {
321

    
322
		String buff = new String();
323

    
324
		Organization organization = data.getOrganization();
325
		eu.dnetlib.data.proto.OrganizationProtos.Organization.Metadata metadata = organization.getMetadata();
326

    
327
		// `organization_datasources`,
328
		String full_id = getStringField(data.getId());
329
		buff += full_id;
330
		// organization_projects
331
		buff += full_id;
332
		// `name`,
333
		buff += getStringField(metadata.getLegalname().getValue());
334
		// `country`,
335

    
336
		if (metadata.getCountry().getClassname().equals("UNITED KINGDOM"))
337

    
338
		{
339
			buff += getStringField("United Kingdom");
340
		} else if (metadata.getCountry().getClassname().equals("GREECE")) {
341
			buff += getStringField("Greece");
342
		} else
343

    
344
		{
345
			buff += getStringField(metadata.getCountry().getClassname());
346
		}
347

    
348
		// `number`,
349

    
350
		buff += getStringField("1");
351
		return buff;
352

    
353
	}
354

    
355
	private String buildResult(OafEntity data) {
356

    
357
		String buff = new String();
358

    
359
		Result result = data.getResult();
360
		eu.dnetlib.data.proto.ResultProtos.Result.Metadata metadata = result.getMetadata();
361

    
362
		// result_topics/
363
		String full_id = getStringField(data.getId());
364

    
365
		buff += full_id;
366

    
367
		// result_languages
368
		buff += full_id;
369

    
370
		// `result_projects`,
371
		buff += full_id;
372

    
373
		// `result_datasources`,
374
		buff += full_id;
375

    
376
		// `result_classifications`,
377
		buff += full_id;
378

    
379
		// / `result_infrastructures`,
380
		buff += full_id;
381

    
382
		// `result_claims`,
383
		buff += full_id;
384

    
385
		// `result_results`,
386
		buff += full_id;
387
		// year
388
		buff += getYearInt(metadata.getDateofacceptance().getValue());
389

    
390
		// date
391
		buff += getStringField(metadata.getDateofacceptance().getValue());
392

    
393
		// access_mode,
394
		buff += getStringField(getAcessMode(result));
395

    
396
		// bestlicense
397

    
398
		buff += getStringField(getBestLicense(result));
399
		// type
400
		buff += getStringField(metadata.getResulttype().getClassname());
401
		// embargo_end_date
402
		buff += getStringField(metadata.getEmbargoenddate().getValue());
403

    
404
		// `delayed`,
405
		buff += getStringField("no");
406

    
407
		// `authors`,
408
		int authors = 0;
409

    
410
		for (OafRel rel : data.getCachedRelList()) {
411

    
412
			if (rel.getRelType().equals(RelType.personResult)) {
413

    
414
				authors++;
415
			}
416
		}
417
		// log.info("Result " + full_id +"Author count : " + authors );
418
		buff += getNumericField(String.valueOf(authors));
419

    
420
		// number??
421

    
422
		buff += getStringField("1");
423
		// TODO this is a rotten record.//
424
		// if
425
		// (full_id.contentEquals("datacite____::5903fa1fba477116aba4a922cc9eb56f"))
426
		// {
427
		// log.error("ROTTEN RECORD " + buff);
428
		// return null;
429
		// }
430
		// TODO check if valid or empty protobuff
431
		// TODO do it in all protos?
432
		if (isValid(buff, full_id)) {
433
			return buff;
434
		} else {
435
			return null;
436
		}
437

    
438
	}
439

    
440
	private boolean isValid(String buff, String id) {
441
		if (buff.endsWith(ENCLOSED)) {
442
			log.error("Empty Result with  " + id + " with body: \n" + buff);
443
			return false;
444
		}
445
		return true;
446
	}
447

    
448
	private String getBestLicense(Result result) {
449
		Qualifier bestLicense = null;
450
		LicenseComparator lc = new LicenseComparator();
451
		for (Instance instance : (result.getInstanceList())) {
452
			if (lc.compare(bestLicense, instance.getLicence()) > 0) {
453
				bestLicense = instance.getLicence();
454
			}
455
		}
456
		if (bestLicense != null) {
457
			return bestLicense.getClassname();
458
		} else {
459
			return null;
460
		}
461
	}
462

    
463
	private String getAcessMode(Result result) {
464

    
465
		for (Instance instance : (result.getInstanceList())) {
466
			return instance.getLicence().getClassname();
467

    
468
		}
469

    
470
		return NULL_STRING;
471
	}
472

    
473
	private String buildProject(OafEntity data) {
474

    
475
		String buff = new String();
476

    
477
		Project project = data.getProject();
478
		eu.dnetlib.data.proto.ProjectProtos.Project.Metadata metadata = project.getMetadata();
479
		// project_organizations
480

    
481
		String full_id = getStringField(data.getId());
482
		buff += full_id;
483

    
484
		// project_results
485
		buff += full_id;
486
		// `acronym`,
487
		buff += getStringField(metadata.getAcronym().getValue());
488

    
489
		// `funding_lvl0`,
490
		String funding_tree_0 = NULL_STRING;
491
		String funding_tree_1 = NULL_STRING;
492
		String funding_tree_2 = NULL_STRING;
493

    
494
		List<StringField> fundList = metadata.getFundingtreeList();
495

    
496
		if (!fundList.isEmpty()) // `funding_lvl0`,
497
		{
498
			funding_tree_0 = getFundingLevel(fundList.get(0).getValue(), 0);
499

    
500
			funding_tree_1 = getFundingLevel(fundList.get(0).getValue(), 1);
501
			// log.info(funding_tree_1);
502

    
503
			funding_tree_2 = getFundingLevel(fundList.get(0).getValue(), 2);
504
			// log.info(funding_tree_2);
505

    
506
		}
507

    
508
		buff += getStringField(funding_tree_0);
509
		// `funding_lvl1`,
510
		if (funding_tree_1.equalsIgnoreCase("SP1")) {
511
			funding_tree_1 = "SP1-Cooperation";
512

    
513
		} else if (funding_tree_1.equalsIgnoreCase("SP2")) {
514
			funding_tree_1 = "SP2-Ideas";
515
		} else if (funding_tree_1.equalsIgnoreCase("SP3")) {
516
			funding_tree_1 = "SP3-People";
517
		} else if (funding_tree_1.equalsIgnoreCase("SP4")) {
518
			funding_tree_1 = "SP4-Capacities";
519
		} else if (funding_tree_1.equalsIgnoreCase("SP5")) {
520
			funding_tree_1 = "SP5-Euratom";
521
		}
522

    
523
		buff += getStringField(funding_tree_1);
524

    
525
		// / `funding_lvl2`,
526
		buff += getStringField(funding_tree_2);
527

    
528
		// `sc39`,
529

    
530
		String sc39 = metadata.getEcsc39().getValue();
531
		if (sc39.contains("true")) {
532
			sc39 = "yes";
533
		} else if (sc39.contains("false")) {
534
			sc39 = "no";
535
		}
536

    
537
		buff += getStringField(sc39);
538

    
539
		// `url`,
540

    
541
		buff += getStringField(metadata.getWebsiteurl().getValue());
542

    
543
		// start_year
544

    
545
		buff += getYearInt(metadata.getStartdate().getValue());
546

    
547
		// end_year
548
		buff += getYearInt(metadata.getEnddate().getValue());
549

    
550
		// duration enddate-startdate
551

    
552
		buff += getYearDifferenceInteger(metadata.getEnddate().getValue(), metadata.getStartdate().getValue());
553

    
554
		// haspubs
555
		buff += getStringField("");
556

    
557
		// numpubs
558
		buff += getNumericField("");
559
		// enddate
560
		buff += getStringField(metadata.getEnddate().getValue());
561
		// startdate
562
		buff += getStringField(metadata.getStartdate().getValue());
563

    
564
		// `daysforlastpub`,
565
		buff += getNumericField("");
566
		// `delayedpubs`,
567
		buff += getNumericField("");
568
		// `number`
569
		buff += getStringField("1");
570
		return buff;
571

    
572
	}
573

    
574
	private String getFundingLevel(String funding_level, int level) {
575

    
576
		if (funding_level.isEmpty()) {
577
			return NULL_STRING;
578

    
579
		}
580

    
581
		if (!funding_level.contains("funding_level_" + level)) {
582
			return NULL_STRING;
583
		}
584
		String[] split = funding_level.split("funding_level_" + level);
585

    
586
		funding_level = split[1];
587
		split = funding_level.split("name");
588
		split = split[1].split(",");
589

    
590
		funding_level = split[0].replaceAll(".*:\"", "");
591
		funding_level = funding_level.replaceFirst(ENCLOSED, "");
592
		funding_level = funding_level.trim();
593

    
594
		return funding_level;
595
	}
596

    
597
	private String getYearDifferenceInteger(String enddate, String startdate) {
598

    
599
		if (!enddate.isEmpty() && enddate != null && startdate != null && !startdate.isEmpty()) {
600

    
601
			String[] split = startdate.split("-");
602

    
603
			int Startdate = Integer.parseInt(split[0]);
604

    
605
			split = enddate.split("-");
606

    
607
			int Enddate = Integer.parseInt(split[0]);
608

    
609
			int diff = Enddate - Startdate;
610
			return ENCLOSED + diff + ENCLOSED + DELIM;
611
		}
612

    
613
		return ENCLOSED + NULL_NUM + ENCLOSED + DELIM;
614
	}
615

    
616
	private String getYearInt(String data) {
617
		if (data == null || data.isEmpty()) {
618
			return ENCLOSED + NULL_NUM + ENCLOSED + DELIM;
619
		}
620

    
621
		String[] split = data.split("-");
622

    
623
		if (split != null) {
624

    
625
			String year = split[0];
626
			year = cleanNumber(year);
627
			return ENCLOSED + year + ENCLOSED + DELIM;
628
		} else {
629
			return ENCLOSED + NULL_NUM + ENCLOSED + DELIM;
630
		}
631

    
632
	}
633

    
634
	private String cleanNumber(String number) {
635
		number = number.replaceAll("[^A-Za-z0-9:,____]", "");
636
		number.trim();
637
		return number;
638
	}
639

    
640
	private String getStringField(String data) {
641

    
642
		if (data == null || data.isEmpty() || data.equals("")) {
643

    
644
			return ENCLOSED + NULL_STRING + ENCLOSED + DELIM;
645
		} else {
646

    
647
			String field = clean(data);
648
			if (field == null) {
649
				return ENCLOSED + NULL_STRING + ENCLOSED + DELIM;
650
			} else {
651
				return field + DELIM;
652
			}
653
		}
654
	}
655

    
656
	private String getNumericField(String data) {
657
		if (data == null || data.isEmpty()) {
658
			return ENCLOSED + NULL_NUM + ENCLOSED + DELIM;
659
		} else {
660

    
661
			return ENCLOSED + data + ENCLOSED + DELIM;
662
		}
663
	}
664

    
665
	public String getId(Oaf oaf) {
666
		switch (oaf.getKind()) {
667
		case entity:
668

    
669
			return cleanId(oaf.getEntity().getId());
670
		case relation:
671

    
672
			return cleanId(oaf.getRel().getSource());
673

    
674
		}
675
		return null;
676

    
677
	}
678

    
679
	public String getId(OafRel relOaf) {
680
		return cleanId(relOaf.getSource());
681
	}
682

    
683
	public static String clean(String value) {
684
		if (value != null) {
685
			// TODO DO NOT CHANGE THIS: IT REMOVES ID PREFIX ( 5|datacite____::
686
			// to datacite____:: )
687
			// AND REPLACES OCCURENCES OF DELIM CHARS IN DATA
688
			value = value.replaceFirst(".*\\|", "");
689
			value = value.replaceAll("[^A-Za-z0-9:,____]", " ");
690
			value = value.trim();
691

    
692
		}
693
		if (value == null) {
694
			return null;
695
		}
696
		return ENCLOSED + value + ENCLOSED;
697

    
698
	}
699

    
700
	public static String cleanId(String value) {
701
		if (value != null) {
702
			// TODO DO NOT CHANGE THIS: IT REMOVES ID PREFIX ( 5|datacite____::
703
			// to datacite____:: )
704
			// AND REPLACES OCCURENCES OF DELIM CHARS IN DATA
705
			value = value.replaceFirst(".*\\|", "");
706
			value = value.replaceAll("\n", "");
707
			value = value.replaceAll(DELIM, "");
708
			value = value.replaceAll(ENCLOSED, "");
709
			value = value.trim();
710

    
711
		}
712
		if (value == null) {
713
			return null;
714
		}
715
		return ENCLOSED + value + ENCLOSED;
716

    
717
	}
718

    
719
	public String getName(Oaf oaf) {
720
		switch (oaf.getKind()) {
721
		case entity:
722

    
723
			return oaf.getEntity().getType().name();
724

    
725
		case relation:
726

    
727
			return oaf.getRel().getSource() + oaf.getRel().getTarget();
728

    
729
		}
730
		return null;
731

    
732
	}
733

    
734
	public String getDELIM() {
735
		return DELIM;
736
	}
737

    
738
	public void setDELIM(String dELIM) {
739
		DELIM = dELIM;
740
	}
741

    
742
	public String getNULL_STRING() {
743
		return NULL_STRING;
744
	}
745

    
746
	public void setNULL_STRING(String nULL_STRING) {
747
		NULL_STRING = nULL_STRING;
748
	}
749

    
750
	public String getNULL_NUM() {
751
		return NULL_NUM;
752
	}
753

    
754
	public void setNULL_NUM(String nULL_NUM) {
755
		NULL_NUM = nULL_NUM;
756
	}
757

    
758
	public static String getENCLOSED() {
759
		return ENCLOSED;
760
	}
761

    
762
	public void setENCLOSED(String eNCLOSED) {
763
		ENCLOSED = eNCLOSED;
764
	}
765

    
766
}
(4-4/4)