Project

General

Profile

1
package eu.dnetlib.data.mapreduce.hbase.statsExport.utils;
2

    
3
import com.google.common.collect.Multimap;
4

    
5
import eu.dnetlib.data.mapreduce.util.LicenseComparator;
6
import eu.dnetlib.data.proto.DatasourceProtos.Datasource;
7
import eu.dnetlib.data.proto.DatasourceProtos.Datasource.Metadata;
8
import eu.dnetlib.data.proto.FieldTypeProtos;
9
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
10
import eu.dnetlib.data.proto.FieldTypeProtos.StringField;
11
import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty;
12
import eu.dnetlib.data.proto.OafProtos.Oaf;
13
import eu.dnetlib.data.proto.OafProtos.OafEntity;
14
import eu.dnetlib.data.proto.OafProtos.OafRel;
15
import eu.dnetlib.data.proto.OrganizationProtos.Organization;
16
import eu.dnetlib.data.proto.ProjectProtos.Project;
17
import eu.dnetlib.data.proto.RelTypeProtos.RelType;
18
import eu.dnetlib.data.proto.ResultProtos.Result;
19
import eu.dnetlib.data.proto.ResultProtos.Result.Instance;
20
import org.apache.log4j.Logger;
21

    
22
import java.text.DateFormat;
23
import java.text.ParseException;
24
import java.text.SimpleDateFormat;
25
import java.util.Date;
26
import java.util.List;
27

    
28
import org.w3c.dom.Element;
29
import org.w3c.dom.NodeList;
30
import org.xml.sax.InputSource;
31
import com.sun.org.apache.xerces.internal.parsers.DOMParser;
32
import org.w3c.dom.Document;
33

    
34
/**
 * Simple serializer that parses input Oaf protos and prepares them for Sqoop.
 *
 * @author eri
 */
38
public class Serializer {
39
    /** log4j logger shared by all instances of this class. */
    private static final Logger logger = Logger.getLogger(Serializer.class);

    /** Delimiter string appended after each serialized field; fixed at construction. */
    private final String DELIM;

    /** String written immediately before and after each serialized value; fixed at construction. */
    private final String ENCLOSING;

    /**
     * Creates a serializer that uses the given field delimiter and value-enclosing string.
     *
     * @param DELIM     delimiter appended after each field
     * @param ENCLOSING string written before and after each value
     */
    public Serializer(String DELIM, String ENCLOSING) {
        this.DELIM = DELIM;
        this.ENCLOSING = ENCLOSING;
    }
48

    
49
    public String serialize(Oaf oaf) {
50

    
51
        switch (oaf.getKind()) {
52
            case entity:
53
                OafEntity valueEntity = oaf.getEntity();
54

    
55
                switch (valueEntity.getType()) {
56
                    case datasource:
57

    
58
                        return buildDatasource(oaf);
59

    
60
                    case organization:
61

    
62
                        return buildOrganization(oaf);
63

    
64
                    case project:
65

    
66
                        return buildProject(oaf);
67
                    case result:
68

    
69
                        return buildResult(oaf);
70
                    default:
71
                        break;
72
                }
73
                break;
74
            case relation:
75
                return buildRel(oaf.getRel());
76
        }
77

    
78
        return null;
79
    }
80

    
81
    public String serialize(OafRel oaf) {
82

    
83
        switch (oaf.getRelType()) {
84
            case resultProject:
85
                return getResultProject(oaf);
86
            default:
87
                return buildRel(oaf);
88
        }
89
    }
90

    
91
    private String buildRel(OafRel Rel) {
92
        return cleanId(Rel.getTarget()) + DELIM;
93
    }
94

    
95
    public void extractRelations(Oaf oaf, Multimap<String, String> relations) {
96
        OafEntity valueEntity = oaf.getEntity();
97
        getOriginalId(valueEntity, relations);
98

    
99
        switch (valueEntity.getType()) {
100
            case datasource:
101
                getDatasourceLanguages(valueEntity, relations);
102
            case result:
103
                getResultTopics(valueEntity, relations);
104
                getResultLanguages(valueEntity, relations);
105
                getResultClassifications(valueEntity, relations);
106
                getResultDatasources(valueEntity, relations);
107
                getResultConcepts(valueEntity, relations);
108
                getResultDois(valueEntity, relations);
109
                getResultCitations(valueEntity, relations);
110

    
111
            case project:
112
                getProjectKeywords(valueEntity, relations);
113
                getProjectSubjects(valueEntity, relations);
114

    
115
            default:
116
        }
117

    
118
    }
119

    
120
    private void getOriginalId(OafEntity oafEntity, Multimap<String, String> relations) {
121

    
122
        String relName = oafEntity.getType().toString().toLowerCase() + "Oid";
123
        for (String oid : oafEntity.getOriginalIdList()) {
124
            relations.put(relName, cleanId(oid));
125
        }
126

    
127
    }
128

    
129
    private void getProjectKeywords(OafEntity oafEntity, Multimap<String, String> relations) {
130
        relations.put("projectKeyword", getStringField(oafEntity.getProject().getMetadata().getKeywords().getValue()));
131

    
132
    }
133

    
134
    private void getProjectSubjects(OafEntity oafEntity, Multimap<String, String> relations) {
135
        for (StructuredProperty subj : oafEntity.getProject().getMetadata().getSubjectsList()) {
136
            relations.put("projectSubject", getStringField(subj.getValue()));
137
        }
138
    }
139

    
140
    private String getResultProject(OafRel oaf) {
141
        StringBuilder buff = new StringBuilder();
142
        buff.append(cleanId(oaf.getTarget())).append(DELIM);
143
        // is declared as int!!!
144
        long diff = DATEDIFF(oaf.getResultProject().getOutcome().getRelMetadata().getEnddate(), oaf.getResultProject().getOutcome().getRelMetadata().getStartdate());
145

    
146
        if (diff < 0) {
147
            diff = 0;
148
        }
149

    
150
        buff.append(getNumericField(String.valueOf(diff)));
151
        return buff.toString();
152
    }
153

    
154

    
155
    private void getDatasourceLanguages(OafEntity valueEntity, Multimap<String, String> rels) {
156
        Datasource d = valueEntity.getDatasource();
157
        Metadata metadata = d.getMetadata();
158

    
159
        for (StringField lang : metadata.getOdlanguagesList()) {
160
            rels.put("datasourceLanguage", getStringField(lang.getValue()));
161
        }
162
    }
163

    
164
    private void getResultLanguages(OafEntity valueEntity, Multimap<String, String> rels) {
165

    
166
        Result d = valueEntity.getResult();
167
        Result.Metadata metadata = d.getMetadata();
168
        if (metadata.getLanguage().getClassname() != null && !metadata.getLanguage().getClassname().isEmpty()) {
169
            rels.put("resultLanguage", getStringField(metadata.getLanguage().getClassname()));
170
        }
171

    
172
    }
173

    
174
    private void getResultDois(OafEntity valueEntity, Multimap<String, String> rels) {
175

    
176
        for (StructuredProperty pid : valueEntity.getPidList()) {
177
            rels.put("resultPid", getStringField(pid.getQualifier().getClassname()) + getStringField(pid.getValue()));
178
        }
179
    }
180

    
181
    private void getResultClassifications(OafEntity valueEntity, Multimap<String, String> rels) {
182

    
183
        Result result = valueEntity.getResult();
184

    
185
        for (Instance instance : (result.getInstanceList())) {
186
            String classification = instance.getInstancetype().getClassname();
187

    
188
            if (classification != null && !classification.isEmpty()) {
189
                rels.put("resultClassification", getStringField(instance.getInstancetype().getClassname()));
190
            }
191
        }
192
    }
193

    
194
    private void getResultConcepts(OafEntity valueEntity, Multimap<String, String> rels) {
195
        Result result = valueEntity.getResult();
196

    
197
        for (Result.Context context : result.getMetadata().getContextList()) {
198
            rels.put("resultConcept", cleanId(context.getId()));
199
        }
200
    }
201

    
202
    private void getResultDatasources(OafEntity valueEntity, Multimap<String, String> rels) {
203
        Result result = valueEntity.getResult();
204

    
205
    // hosted by
206
        for (Instance instance : (result.getInstanceList())) {
207
            String hostedBy = instance.getHostedby().getKey();
208

    
209
            if (hostedBy != null && !hostedBy.isEmpty()) {
210
                rels.put("resultDatasource", cleanId(hostedBy) + DELIM);
211
            }
212
        }
213

    
214
    // collected from
215
        for (FieldTypeProtos.KeyValue collectedFromValue : (valueEntity.getCollectedfromList())) {
216
            String collectedFrom = collectedFromValue.getKey();
217

    
218
            if (collectedFrom != null && !collectedFrom.isEmpty()) {
219
                rels.put("resultDatasource", cleanId(collectedFrom) + DELIM);
220
            }
221
        }
222
    }
223

    
224
    private void getResultTopics(OafEntity valueEntity, Multimap<String, String> rels) {
225
        Result d = valueEntity.getResult();
226
        Result.Metadata metadata = d.getMetadata();
227
        List<StructuredProperty> Topics = metadata.getSubjectList();
228

    
229
        for (StructuredProperty topic : Topics) {
230
            rels.put("resultTopic", getStringField(topic.getValue()));
231
        }
232
    }
233

    
234

    
235
    private void getResultCitations(OafEntity oafEntity, Multimap<String, String> rels) {
236
        for (FieldTypeProtos.ExtraInfo extraInfo : oafEntity.getExtraInfoList()) {
237
            if (extraInfo.getName().equals("result citations")) {
238
                DOMParser parser = new DOMParser();
239
                try {
240
                    parser.parse(new InputSource(new java.io.StringReader(extraInfo.getValue())));
241
                    Document doc = parser.getDocument();
242
                    doc.getDocumentElement().normalize();
243

    
244
                    NodeList citations = doc.getElementsByTagName("citation");
245
                    for (int temp = 0; temp < citations.getLength(); temp++) {
246
                        Element citation = (Element) citations.item(temp);
247
                        NodeList ids = citation.getElementsByTagName("id");
248
                        for(int temp1 = 0; temp1 < ids.getLength(); temp1++){
249
                            Element id = (Element) ids.item(temp1);
250
                            if(id.getAttribute("type").equals("openaire")){
251
                                //System.out.println(id.getAttribute("value"));
252
                                rels.put("resultCitation", id.getAttribute("value"));
253
                            }
254
                        }
255
                    }
256
                } catch (Exception e) {
257
                    logger.error("Error getting result citations", e);
258
                }
259
            }
260
        }
261
    }
262

    
263
    private String buildDatasource(Oaf oaf) {
264
        Metadata metadata = oaf.getEntity().getDatasource().getMetadata();
265
        StringBuilder buff = new StringBuilder();
266

    
267
        // name
268
        if (metadata.getOfficialname().getValue().equalsIgnoreCase("unknown")) {
269
            buff.append(getStringField("Unknown Repository"));
270
        } else {
271
            buff.append(getStringField(metadata.getOfficialname().getValue()));
272
        }
273

    
274
        // type
275
        if (metadata.hasDatasourcetype()) {
276
            buff.append(getStringField(metadata.getDatasourcetype().getClassname().replaceFirst(".*::", "")));
277
        }
278

    
279
        // compatibility,
280
        buff.append(getStringField(metadata.getOpenairecompatibility().getClassname()));
281

    
282
        // latitude
283
        buff.append(getLatLongField(metadata.getLatitude().getValue()));
284

    
285
        // longtitude
286
        buff.append(getLatLongField(metadata.getLongitude().getValue()));
287

    
288
        // dateofvalidation,
289
        buff.append(getStringDateField(metadata.getDateofvalidation().getValue()));
290

    
291
        // yearofvalidation,
292
        buff.append(getYearInt(metadata.getDateofvalidation().getValue()));
293

    
294
        //harvested
295
        buff.append(getStringField("false"));
296

    
297
        //piwik_id
298
        String piwik_id = "";
299
        for (String oid : oaf.getEntity().getOriginalIdList()) {
300
            if (oid.contains("piwik")) {
301
                piwik_id = oid.split(":")[1];
302
                break;
303
            }
304
        }
305
        buff.append(getStringField(cleanNumber(piwik_id)));
306

    
307
        return buff.toString();
308

    
309
    }
310

    
311
    private String buildOrganization(Oaf oaf) {
312

    
313
        StringBuilder buff = new StringBuilder();
314
        Organization.Metadata metadata = oaf.getEntity().getOrganization().getMetadata();
315

    
316
        // `name`,
317
        buff.append(getStringField(metadata.getLegalname().getValue()));
318

    
319
        // `country`,
320
        buff.append(getStringField(metadata.getCountry().getClassid()));
321

    
322
        return buff.toString();
323
    }
324

    
325
    private String buildResult(Oaf oaf) {
326
        StringBuilder buff = new StringBuilder();
327

    
328
        Result.Metadata metadata = oaf.getEntity().getResult().getMetadata();
329

    
330
        String titleString = "";
331

    
332
        if (metadata.getTitleList().size() > 0) {
333
            StructuredProperty title = metadata.getTitleList().get(0);
334

    
335
            titleString = title.getValue().replaceAll("\\s+", " ");
336
            titleString = titleString.replaceAll("\n", " ");
337
        }
338

    
339
        //  pubtitle
340
        buff.append(getStringField(titleString));
341

    
342

    
343
        //  publisher
344
        buff.append(getStringField(metadata.getPublisher().getValue()));
345

    
346
        //  journal
347
        buff.append(getStringField(metadata.getJournal().getName()));  //#null#!
348

    
349
        // year
350
        buff.append(getYearInt(metadata.getDateofacceptance().getValue()));
351

    
352
        // date
353
        buff.append(getStringDateField(metadata.getDateofacceptance().getValue()));
354

    
355
        // bestlicense
356
        buff.append(getStringField(getBestLicense(oaf.getEntity().getResult())));
357

    
358
        // type
359
        buff.append(getStringField(metadata.getResulttype().getClassname()));
360

    
361
        // embargo_end_date
362
        buff.append(getStringDateField(metadata.getEmbargoenddate().getValue()));
363

    
364
        // `authors`,
365
        int authors = metadata.getAuthorCount();
366
        String delayed = "no";
367

    
368
        for (OafRel rel : oaf.getEntity().getCachedRelList()) {
369
            if (rel.getRelType().equals(RelType.resultProject))
370
            // remember : in result Project, first id is project, second is result.
371
            {
372
                String daysfromend = getYearDifferenceInteger(rel.getResultProject().getOutcome().getRelMetadata().getEnddate(),
373
                        rel.getResultProject().getOutcome().getRelMetadata().getStartdate());
374
                if (Integer.parseInt(daysfromend) > 0) {
375
                    delayed = "yes";
376
                }
377
            }
378
        }
379

    
380
        // `delayed`,
381
        buff.append(getStringField(delayed));
382
        //authors
383
        buff.append(getNumericField(String.valueOf(authors)));
384

    
385
        String sources = "";
386

    
387

    
388
        for (Instance instance : (oaf.getEntity().getResult().getInstanceList())) {
389
            List<String> urls = instance.getUrlList();
390
            for (String url : urls) {
391
                sources += cleanUrl(url) + " ;";
392
            }
393
        }
394

    
395
        //sources
396
        sources = ENCLOSING + sources + ENCLOSING + DELIM;
397

    
398
        buff.append(sources);
399

    
400
        return buff.toString();
401

    
402
    }
403

    
404
    private String getBestLicense(Result result) {
405
        Qualifier bestLicense = null;
406
        LicenseComparator lc = new LicenseComparator();
407
        for (Instance instance : (result.getInstanceList())) {
408
            if (lc.compare(bestLicense, instance.getAccessright()) > 0) {
409
                bestLicense = instance.getAccessright();
410
            }
411
        }
412
        if (bestLicense != null) {
413
            return bestLicense.getClassname();
414
        } else {
415
            return "";
416
        }
417
    }
418

    
419
    private String buildProject(Oaf oaf) {
420

    
421
        FundingParser fundingParser = new FundingParser(DELIM, ENCLOSING);
422
        StringBuilder buff = new StringBuilder();
423
        Project.Metadata metadata = oaf.getEntity().getProject().getMetadata();
424
        
425
        // `acronym`,
426
        String acronym = metadata.getAcronym().getValue();
427
        if (acronym.equalsIgnoreCase("UNKNOWN")) {
428
            acronym = metadata.getTitle().getValue();
429
        }
430
        buff.append(getStringField(acronym));
431

    
432
        //title
433
        buff.append(getStringField(metadata.getTitle().getValue()));
434

    
435
        //funding_lvl
436
        List<StringField> fundList = metadata.getFundingtreeList();
437
        if (!fundList.isEmpty()) // `funding_lvl0`,
438
        {
439
            //funder + 3 funding levels
440
            buff.append(fundingParser.getFundingInfo(fundList.get(0).getValue()));
441
        } else {
442
            buff.append(fundingParser.getFundingInfo(""));
443
        }
444

    
445
        //sc39
446
        String sc39 = metadata.getEcsc39().getValue();
447
        if (sc39.equalsIgnoreCase("true") || sc39.equalsIgnoreCase("t") || sc39.contains("yes")) {
448
            sc39 = "yes";
449
        } else if (sc39.equalsIgnoreCase("false") || sc39.equalsIgnoreCase("f") || sc39.contains("no")) {
450
            sc39 = "no";
451
        }
452
        buff.append(getStringField(sc39));
453

    
454
        //project_type
455
        buff.append(getStringField(metadata.getContracttype().getClassid()));
456

    
457
        // start_year
458
        buff.append(getYearInt(metadata.getStartdate().getValue()));
459

    
460
        // end_year
461
        buff.append(getYearInt(metadata.getEnddate().getValue()));
462

    
463
        // duration enddate-startdate
464
        buff.append(getYearDifferenceInteger(metadata.getEnddate().getValue(), metadata.getStartdate().getValue()));
465

    
466
        // haspubs
467
        buff.append(getStringField("no"));
468

    
469
        // numpubs
470
        buff.append(getNumericField("0"));
471

    
472
        // enddate
473
        buff.append(getStringDateField(metadata.getEnddate().getValue()));
474

    
475
        // startdate
476
        buff.append(getStringDateField(metadata.getStartdate().getValue()));
477

    
478
        // `daysforlastpub`,
479
        buff.append(getNumericField(""));
480

    
481
        // `delayedpubs`,
482
        buff.append(getNumericField(""));
483

    
484
        //call identifier
485
        buff.append(getStringField(metadata.getCallidentifier().getValue()));
486

    
487
        //code
488
        buff.append(getStringField(metadata.getCode().getValue()));
489

    
490
        return buff.toString();
491
    }
492

    
493

    
494
    private String getYearDifferenceInteger(String enddate, String startdate) {
495

    
496
        if (enddate != null && !enddate.isEmpty() && startdate != null && !startdate.isEmpty()) {
497

    
498
            String[] split = startdate.split("-");
499

    
500
            if (split.length == 0) {
501
                return ENCLOSING + "0" + ENCLOSING + DELIM;
502
            }
503

    
504
            int Startdate = Integer.parseInt(split[0]);
505

    
506
            split = enddate.split("-");
507

    
508
            if (split.length == 0) {
509
                return ENCLOSING + "0" + ENCLOSING + DELIM;
510
            }
511

    
512
            int Enddate = Integer.parseInt(split[0]);
513

    
514
            int diff = Enddate - Startdate;
515

    
516
            return ENCLOSING + diff + ENCLOSING + DELIM;
517

    
518
        }
519

    
520
        return ENCLOSING + "0" + ENCLOSING + DELIM;
521
    }
522

    
523
    private String getYearInt(String data) {
524
        if (data == null || data.isEmpty() || data.equals("-1")) {
525
            return ENCLOSING + "0" + ENCLOSING + DELIM;
526
        }
527

    
528
        String[] split = data.split("-");
529

    
530
        if (split.length == 0) {
531
            return ENCLOSING + "0" + ENCLOSING + DELIM;
532
        }
533

    
534
        String year = split[0];
535

    
536
        year = cleanNumber(year);
537

    
538
        if (year == null || year.isEmpty()) year = "0";
539

    
540
        return ENCLOSING + year + ENCLOSING + DELIM;
541

    
542
    }
543

    
544
    private String cleanNumber(String number) {
545
        number = number.replaceAll("[^A-Za-z0-9:,_]", "");
546
        return number;
547
    }
548

    
549
    private String getLatLongField(String data) {
550

    
551
        if (data == null || data.isEmpty())
552
            return ENCLOSING + "null" + ENCLOSING + DELIM;
553

    
554
        return ENCLOSING + data.replaceAll("[^-0-9.]+", "")  + ENCLOSING + DELIM;
555

    
556
    }
557

    
558
    private String getStringField(String data) {
559

    
560
        if (data == null || data.isEmpty())
561
            return ENCLOSING + "null" + ENCLOSING + DELIM;
562

    
563
        return ENCLOSING + clean(data) + ENCLOSING + DELIM;
564

    
565
    }
566

    
567
    private String getStringDateField(String data) {
568
        if (data == null || data.isEmpty() || data.equals("-1")) {
569
            return ENCLOSING + "0" + ENCLOSING + DELIM;
570
        } else {
571
            data = data.replace(DELIM, " ");
572
            data = data.replace(ENCLOSING, " ");
573
            data = data.replaceAll("\\r\\n|\\r|\\n", "");
574
            try {
575
                DateFormat format = new SimpleDateFormat("yyyy-MM-dd");
576
                data = format.format(format.parse(data));
577
                return ENCLOSING + data + ENCLOSING + DELIM;
578
            } catch (ParseException e) {
579
                return ENCLOSING + "0" + ENCLOSING + DELIM;
580
            }
581
        }
582
    }
583

    
584
    private String getNumericField(String data) {
585
        if (data == null || data.isEmpty()) {
586
            return ENCLOSING + "0" + ENCLOSING + DELIM;
587
        } else {
588
            return ENCLOSING + data + ENCLOSING + DELIM;
589
        }
590
    }
591

    
592
    public String getId(Oaf oaf) {
593
        switch (oaf.getKind()) {
594
            case entity:
595
                return cleanId(oaf.getEntity().getId());
596
            case relation:
597
                return cleanId(oaf.getRel().getSource());
598

    
599
        }
600
        return null;
601

    
602
    }
603

    
604
    public String getId(OafRel relOaf) {
605
        return cleanId(relOaf.getSource());
606
    }
607

    
608
    private String clean(String value) {
609
        if (value != null) {
610

    
611
            value = value.replaceAll("[\"\\r\\\\;]", "");
612
            value = value.replace(DELIM, " ");
613
            value = value.replace(ENCLOSING, " ");
614
            value = value.replaceAll("\\r\\n|\\r|\\n", " ");
615

    
616
            return value;
617
        } else {
618
            return "";
619
        }
620

    
621
    }
622

    
623
    private String cleanId(String value) {
624
        if (value != null) {
625
            // DO NOT CHANGE THIS: IT REMOVES ID PREFIX ( "5|datacite____::" to "datacite____::")
626
            // AND REPLACES OCCURRENCES OF DELIM CHARS IN DATA
627
            value = value.replaceFirst(".*\\|", "");
628
            value = value.replace("\n", "");
629
            value = value.replace(ENCLOSING, "");
630
            value = value.replace(DELIM, "");
631
            value = value.replace("\"", "");
632
            value = value.replace("«", " ");
633
            value = value.replace("»", " ");
634
        }
635

    
636
        return ENCLOSING + value + ENCLOSING;
637
    }
638

    
639
    private String cleanUrl(String value) {
640
        value = value.replace(DELIM, " ");
641
        value = value.replace(ENCLOSING, " ");
642
        value = value.replace(" ", "");
643
        value = value.replace("\n", "");
644
        return value;
645
    }
646

    
647
    private long DATEDIFF(String startDate, String endDate) {
648
        long MILLISECS_PER_DAY = 24 * 60 * 60 * 1000L;
649
        long days;
650
        SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd"); // "dd/MM/yyyy HH:mm:ss");
651
        // <startdate>2011-09-01</startdate>
652
        // <enddate>2015-08-31</enddate>
653
        Date dateIni;
654
        Date dateFin;
655

    
656
        if (startDate == null || startDate.isEmpty() || endDate == null || endDate.isEmpty()) {
657
            return 0;
658
        }
659
        try {
660
            dateIni = format.parse(startDate);
661
            dateFin = format.parse(endDate);
662
            days = (dateFin.getTime() - dateIni.getTime()) / MILLISECS_PER_DAY;
663
        } catch (Exception e) {
664

    
665
            return 0;
666
        }
667

    
668
        return days;
669
    }
670
}
(3-3/3)