1
|
package eu.dnetlib.data.mapreduce.hbase.statsExport.utils;
|
2
|
|
3
|
import java.io.IOException;
|
4
|
import java.io.InputStream;
|
5
|
import java.text.SimpleDateFormat;
|
6
|
import java.util.ArrayList;
|
7
|
import java.util.Date;
|
8
|
import java.util.HashMap;
|
9
|
import java.util.List;
|
10
|
|
11
|
import org.apache.log4j.Logger;
|
12
|
|
13
|
import eu.dnetlib.data.mapreduce.util.LicenseComparator;
|
14
|
import eu.dnetlib.data.proto.DatasourceProtos.Datasource;
|
15
|
import eu.dnetlib.data.proto.DatasourceProtos.Datasource.Metadata;
|
16
|
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
|
17
|
import eu.dnetlib.data.proto.FieldTypeProtos.StringField;
|
18
|
import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty;
|
19
|
import eu.dnetlib.data.proto.OafProtos.Oaf;
|
20
|
import eu.dnetlib.data.proto.OafProtos.OafEntity;
|
21
|
import eu.dnetlib.data.proto.OafProtos.OafRel;
|
22
|
import eu.dnetlib.data.proto.OrganizationProtos.Organization;
|
23
|
import eu.dnetlib.data.proto.ProjectProtos.Project;
|
24
|
import eu.dnetlib.data.proto.RelTypeProtos.RelType;
|
25
|
import eu.dnetlib.data.proto.ResultProtos.Result;
|
26
|
import eu.dnetlib.data.proto.ResultProtos.Result.Instance;
|
27
|
|
28
|
/**
 * Simple serializer that parses input Oaf protos and prepares them for Sqoop.
 *
 * @author eri
 */
|
33
|
public class Serializer {
|
34
|
|
35
|
/** Separator appended after every serialized field; static, so shared by all instances. */
private static String DELIM;
/** Token placed before and after every serialized value; static, so shared by all instances. */
private static String ENCLOSED;

private Logger log = Logger.getLogger(this.getClass());
/** Placeholder written in place of a missing string value. */
private String NULL_STRING;
/** Placeholder written in place of a missing numeric value. */
private String NULL_NUM;
/** Id fragments filled in by loadProjectResults() and consulted by isValidProjectResult(). */
private String[] excludedIds;

/**
 * Creates a serializer and loads the list of excluded project-result ids.
 *
 * @throws IOException if loading the exclusion list fails
 */
public Serializer() throws IOException {
    loadProjectResults();
}
|
45
|
|
46
|
public String serialize(Oaf oaf) {
|
47
|
|
48
|
switch (oaf.getKind()) {
|
49
|
case entity:
|
50
|
OafEntity valueEntity = oaf.getEntity();
|
51
|
|
52
|
switch (valueEntity.getType()) {
|
53
|
case datasource:
|
54
|
|
55
|
return buildDatasource(valueEntity);
|
56
|
|
57
|
case organization:
|
58
|
|
59
|
return buildOrganization(valueEntity);
|
60
|
|
61
|
case project:
|
62
|
|
63
|
return buildProject(valueEntity);
|
64
|
case result:
|
65
|
|
66
|
return buildResult(valueEntity);
|
67
|
default:
|
68
|
log.error("wrong type");
|
69
|
break;
|
70
|
}
|
71
|
break;
|
72
|
case relation:
|
73
|
OafRel valueRel = oaf.getRel();
|
74
|
return buildRel(valueRel);
|
75
|
|
76
|
}
|
77
|
|
78
|
return null;
|
79
|
|
80
|
}
|
81
|
|
82
|
public String serialize(OafRel oaf) {
|
83
|
|
84
|
switch (oaf.getRelType()) {
|
85
|
case resultProject:
|
86
|
|
87
|
return getResultProject(oaf);
|
88
|
|
89
|
default:
|
90
|
return buildRel(oaf);
|
91
|
}
|
92
|
}
|
93
|
|
94
|
private String buildRel(OafRel Rel) {
|
95
|
|
96
|
return getStringField(Rel.getTarget());
|
97
|
}
|
98
|
|
99
|
public HashMap<String, List<String>> extractRelations(Oaf oaf) {
|
100
|
OafEntity valueEntity = oaf.getEntity();
|
101
|
switch (valueEntity.getType()) {
|
102
|
case datasource:
|
103
|
|
104
|
return getDatasourceLanguages(valueEntity);
|
105
|
|
106
|
case result:
|
107
|
HashMap<String, List<String>> relations = new HashMap<String, List<String>>();
|
108
|
relations.putAll(getResultLanguages(valueEntity));
|
109
|
relations.putAll(getResultTopics(valueEntity));
|
110
|
relations.putAll(getResultClassifications(valueEntity));
|
111
|
relations.putAll(getResultDatasources(valueEntity));
|
112
|
relations.putAll(getResultConcepts(valueEntity));
|
113
|
return relations;
|
114
|
default:
|
115
|
|
116
|
return null;
|
117
|
}
|
118
|
|
119
|
}
|
120
|
|
121
|
private String getResultProject(OafRel oaf) {
|
122
|
String buff = new String();
|
123
|
String result = oaf.getTarget();
|
124
|
|
125
|
if (isValidProjectResult(result)) {
|
126
|
buff += getStringField(result);
|
127
|
// TODO is declared as int!!!
|
128
|
long diff = DATEDIFF(oaf.getResultProject().getOutcome().getRelMetadata().getEnddate(), oaf.getResultProject().getOutcome().getRelMetadata().getStartdate());
|
129
|
if (diff < 0) {
|
130
|
diff = 0;
|
131
|
}
|
132
|
buff += getNumericField(String.valueOf(diff));
|
133
|
|
134
|
} else {
|
135
|
return null;
|
136
|
}
|
137
|
return buff;
|
138
|
}
|
139
|
|
140
|
private boolean isValidProjectResult(String id) {
|
141
|
for (String excludedId : excludedIds) {
|
142
|
if (id.contains(excludedId)) {
|
143
|
log.info("contained -> result project is excluded");
|
144
|
return false;
|
145
|
}
|
146
|
}
|
147
|
return true;
|
148
|
}
|
149
|
|
150
|
private HashMap<String, List<String>> getDatasourceLanguages(OafEntity valueEntity) {
|
151
|
HashMap<String, List<String>> rels = new HashMap<String, List<String>>();
|
152
|
List<String> buffs = new ArrayList<String>();
|
153
|
|
154
|
Datasource d = valueEntity.getDatasource();
|
155
|
|
156
|
Metadata metadata = d.getMetadata();
|
157
|
|
158
|
for (StringField lang : metadata.getOdlanguagesList()) {
|
159
|
|
160
|
buffs.add(getStringField(lang.getValue()));
|
161
|
}
|
162
|
rels.put("datasourceLanguage", buffs);
|
163
|
return rels;
|
164
|
}
|
165
|
|
166
|
private HashMap<String, List<String>> getResultLanguages(OafEntity valueEntity) {
|
167
|
HashMap<String, List<String>> rels = new HashMap<String, List<String>>();
|
168
|
List<String> buffs = new ArrayList<String>();
|
169
|
Result d = valueEntity.getResult();
|
170
|
|
171
|
eu.dnetlib.data.proto.ResultProtos.Result.Metadata metadata = d.getMetadata();
|
172
|
|
173
|
if (metadata.getLanguage().getClassname() != null && !metadata.getLanguage().getClassname().isEmpty()) {
|
174
|
|
175
|
buffs.add(getStringField(metadata.getLanguage().getClassname()));
|
176
|
}
|
177
|
rels.put("resultLanguage", buffs);
|
178
|
return rels;
|
179
|
|
180
|
}
|
181
|
|
182
|
private HashMap<String, List<String>> getResultClassifications(OafEntity valueEntity) {
|
183
|
|
184
|
HashMap<String, List<String>> rels = new HashMap<String, List<String>>();
|
185
|
List<String> buffs = new ArrayList<String>();
|
186
|
Result result = valueEntity.getResult();
|
187
|
|
188
|
for (Instance instance : (result.getInstanceList())) {
|
189
|
|
190
|
buffs.add(getStringField(instance.getInstancetype().getClassname()));
|
191
|
}
|
192
|
rels.put("resultClassification", buffs);
|
193
|
return rels;
|
194
|
|
195
|
}
|
196
|
|
197
|
private HashMap<String, List<String>> getResultConcepts(OafEntity valueEntity) {
|
198
|
HashMap<String, List<String>> rels = new HashMap<String, List<String>>();
|
199
|
List<String> buffs = new ArrayList<String>();
|
200
|
|
201
|
Result result = valueEntity.getResult();
|
202
|
|
203
|
for (eu.dnetlib.data.proto.ResultProtos.Result.Context context : result.getMetadata().getContextList()) {
|
204
|
|
205
|
buffs.add(getStringField(context.getId()));
|
206
|
}
|
207
|
rels.put("resultConcept", buffs);
|
208
|
return rels;
|
209
|
|
210
|
}
|
211
|
|
212
|
private HashMap<String, List<String>> getResultDatasources(OafEntity valueEntity) {
|
213
|
|
214
|
HashMap<String, List<String>> rels = new HashMap<String, List<String>>();
|
215
|
List<String> buffs = new ArrayList<String>();
|
216
|
Result result = valueEntity.getResult();
|
217
|
|
218
|
for (Instance instance : (result.getInstanceList())) {
|
219
|
String hostedBy = null;
|
220
|
|
221
|
hostedBy = instance.getHostedby().getKey();
|
222
|
|
223
|
buffs.add((getStringField(hostedBy)));
|
224
|
}
|
225
|
rels.put("resultDatasource", buffs);
|
226
|
return rels;
|
227
|
|
228
|
}
|
229
|
|
230
|
public static boolean isNumeric(String str) {
|
231
|
|
232
|
str = str.replaceAll("[^A-Za-z0-9 ]", "");
|
233
|
str = str.replaceAll(" ", "");
|
234
|
return str.matches("-?\\d+(\\.\\d+)?"); // match a number with optional
|
235
|
// '-' and decimal.
|
236
|
}
|
237
|
|
238
|
// TODO there are topics with "null" as value -> repalce them
|
239
|
private boolean isValidTopic(String t) {
|
240
|
|
241
|
if (t == null || t.isEmpty()) {
|
242
|
return false;
|
243
|
}
|
244
|
|
245
|
if (t.equals("") || t.equals(" ")) {
|
246
|
return false;
|
247
|
}
|
248
|
if (t.equals("null") || t.equals("Null") || t.equals("NULL")) {
|
249
|
return false;
|
250
|
}
|
251
|
|
252
|
if (t.equals(ENCLOSED + ENCLOSED + DELIM) || t.equals(ENCLOSED + NULL_STRING + ENCLOSED + DELIM)) {
|
253
|
return false;
|
254
|
}
|
255
|
// skip dedups
|
256
|
if (t.contains("ddc:")) {
|
257
|
return false;
|
258
|
}
|
259
|
return true;
|
260
|
}
|
261
|
|
262
|
private HashMap<String, List<String>> getResultTopics(OafEntity valueEntity) {
|
263
|
HashMap<String, List<String>> rels = new HashMap<String, List<String>>();
|
264
|
List<String> buffs = new ArrayList<String>();
|
265
|
Result d = valueEntity.getResult();
|
266
|
|
267
|
eu.dnetlib.data.proto.ResultProtos.Result.Metadata metadata = d.getMetadata();
|
268
|
|
269
|
List<StructuredProperty> Topics = metadata.getSubjectList();
|
270
|
String buff = new String();
|
271
|
for (StructuredProperty topic : Topics) {
|
272
|
// TODOs
|
273
|
if (isValidTopic(topic.getValue())) {
|
274
|
if (!isNumeric(topic.getValue())) {
|
275
|
String t = getStringField(topic.getValue());
|
276
|
if (isValidTopic(t)) {
|
277
|
buff += t + " ";
|
278
|
|
279
|
}
|
280
|
|
281
|
}
|
282
|
}
|
283
|
if (!buff.isEmpty()) {
|
284
|
buff=getStringField(buff);
|
285
|
buffs.add(buff);
|
286
|
}
|
287
|
}
|
288
|
rels.put("resultTopic", buffs);
|
289
|
|
290
|
return rels;
|
291
|
|
292
|
}
|
293
|
|
294
|
private String buildDatasource(OafEntity data) {
|
295
|
|
296
|
String buff = new String();
|
297
|
|
298
|
Datasource d = data.getDatasource();
|
299
|
|
300
|
Metadata metadata = d.getMetadata();
|
301
|
String full_id = getStringField(data.getId());
|
302
|
|
303
|
buff += full_id;
|
304
|
buff += full_id;
|
305
|
buff += full_id;
|
306
|
buff += full_id;
|
307
|
|
308
|
// TODO move this here???
|
309
|
// UPDATE "shadow".datasource SET name='Other' where name='Unknown
|
310
|
// Repository';
|
311
|
|
312
|
// name
|
313
|
if (metadata.getOfficialname().getValue().equalsIgnoreCase("unknown")) {
|
314
|
buff += getStringField("Unknown Repository");
|
315
|
} else {
|
316
|
buff += getStringField(metadata.getOfficialname().getValue());
|
317
|
}
|
318
|
// type
|
319
|
|
320
|
if (metadata.hasDatasourcetype())
|
321
|
|
322
|
{
|
323
|
buff += getStringField(metadata.getDatasourcetype().getClassname().replaceFirst(".*::", ""));
|
324
|
|
325
|
} else {
|
326
|
buff += getStringField(null);
|
327
|
}
|
328
|
|
329
|
// compatibility,
|
330
|
buff += getStringField(metadata.getOpenairecompatibility().getClassname());
|
331
|
|
332
|
// latitude
|
333
|
buff += getStringField(metadata.getLatitude().getValue());
|
334
|
|
335
|
// longtitude
|
336
|
buff += getStringField(metadata.getLongitude().getValue());
|
337
|
|
338
|
// dateofvalidation,
|
339
|
buff += getStringField(metadata.getDateofvalidation().getValue());
|
340
|
|
341
|
// yearofvalidation,
|
342
|
|
343
|
// parse year of validation
|
344
|
buff += getYearInt(metadata.getDateofvalidation().getValue());
|
345
|
|
346
|
// number??
|
347
|
|
348
|
buff += getStringField("1");
|
349
|
|
350
|
return buff;
|
351
|
}
|
352
|
|
353
|
private String buildOrganization(OafEntity data) {
|
354
|
|
355
|
String buff = new String();
|
356
|
|
357
|
Organization organization = data.getOrganization();
|
358
|
eu.dnetlib.data.proto.OrganizationProtos.Organization.Metadata metadata = organization.getMetadata();
|
359
|
|
360
|
// `organization_datasources`,
|
361
|
String full_id = getStringField(data.getId());
|
362
|
buff += full_id;
|
363
|
// organization_projects
|
364
|
buff += full_id;
|
365
|
// `name`,
|
366
|
buff += getStringField(metadata.getLegalname().getValue());
|
367
|
// `country`,
|
368
|
|
369
|
if (metadata.getCountry().getClassname().equals("UNITED KINGDOM"))
|
370
|
|
371
|
{
|
372
|
buff += getStringField("United Kingdom");
|
373
|
} else if (metadata.getCountry().getClassname().equals("GREECE")) {
|
374
|
buff += getStringField("Greece");
|
375
|
} else
|
376
|
|
377
|
{
|
378
|
buff += getStringField(metadata.getCountry().getClassname());
|
379
|
}
|
380
|
|
381
|
// `number`,
|
382
|
|
383
|
buff += getStringField("1");
|
384
|
return buff;
|
385
|
|
386
|
}
|
387
|
|
388
|
private String buildResult(OafEntity data) {
|
389
|
|
390
|
String buff = new String();
|
391
|
|
392
|
Result result = data.getResult();
|
393
|
eu.dnetlib.data.proto.ResultProtos.Result.Metadata metadata = result.getMetadata();
|
394
|
|
395
|
// result_topics/
|
396
|
String full_id = getStringField(data.getId());
|
397
|
|
398
|
buff += full_id;
|
399
|
|
400
|
// result_languages
|
401
|
buff += full_id;
|
402
|
|
403
|
// `result_projects`,
|
404
|
buff += full_id;
|
405
|
|
406
|
// `result_datasources`,
|
407
|
buff += full_id;
|
408
|
|
409
|
// `result_classifications`,
|
410
|
buff += full_id;
|
411
|
|
412
|
// / `result_infrastructures`,
|
413
|
buff += full_id;
|
414
|
|
415
|
// `result_claims`,
|
416
|
buff += full_id;
|
417
|
|
418
|
// `result_results`,
|
419
|
buff += full_id;
|
420
|
// year
|
421
|
buff += getYearInt(metadata.getDateofacceptance().getValue());
|
422
|
|
423
|
// date
|
424
|
buff += getYearInt(metadata.getDateofacceptance().getValue());
|
425
|
|
426
|
// access_mode,
|
427
|
buff += getStringField(getAcessMode(result));
|
428
|
|
429
|
// bestlicense
|
430
|
|
431
|
buff += getStringField(getBestLicense(result));
|
432
|
// type
|
433
|
buff += getStringField(metadata.getResulttype().getClassname());
|
434
|
// embargo_end_date
|
435
|
buff += getStringField(metadata.getEmbargoenddate().getValue());
|
436
|
|
437
|
// `authors`,
|
438
|
int authors = 0;
|
439
|
String delayed = "no";
|
440
|
|
441
|
// UPDATE "shadow"."result" SET delayed = 'yes' WHERE id IN
|
442
|
// (SELECT id FROM "shadow"."result_projects"
|
443
|
// WHERE "shadow"."result_projects".id = "shadow"."result".id AND
|
444
|
// daysfromend > 0);
|
445
|
|
446
|
for (OafRel rel : data.getCachedRelList()) {
|
447
|
|
448
|
if (rel.getRelType().equals(RelType.personResult)) {
|
449
|
|
450
|
authors++;
|
451
|
} else if (rel.getRelType().equals(RelType.resultProject))
|
452
|
// TODO remember : in result Project, first id is project, second is
|
453
|
// result.
|
454
|
{
|
455
|
|
456
|
String daysfromend = getYearDifferenceInteger(rel.getResultProject().getOutcome().getRelMetadata().getEnddate(), rel.getResultProject().getOutcome().getRelMetadata().getStartdate());
|
457
|
if (Integer.parseInt(daysfromend) > 0) {
|
458
|
delayed = "yes";
|
459
|
}
|
460
|
}
|
461
|
}
|
462
|
// `delayed`,
|
463
|
buff += getStringField(delayed);
|
464
|
|
465
|
// log.info("Result " + full_id +"Author count : " + authors );
|
466
|
buff += getNumericField(String.valueOf(authors));
|
467
|
|
468
|
// number??
|
469
|
|
470
|
buff += getStringField("1");
|
471
|
|
472
|
if (isValid(buff, full_id)) {
|
473
|
return buff;
|
474
|
} else {
|
475
|
return null;
|
476
|
}
|
477
|
|
478
|
}
|
479
|
|
480
|
private boolean isValid(String buff, String id) {
|
481
|
if (buff.endsWith(ENCLOSED)) {
|
482
|
log.error("Empty Result with " + id + " with body: \n" + buff);
|
483
|
return false;
|
484
|
}
|
485
|
return true;
|
486
|
}
|
487
|
|
488
|
private String getBestLicense(Result result) {
|
489
|
Qualifier bestLicense = null;
|
490
|
LicenseComparator lc = new LicenseComparator();
|
491
|
for (Instance instance : (result.getInstanceList())) {
|
492
|
if (lc.compare(bestLicense, instance.getLicence()) > 0) {
|
493
|
bestLicense = instance.getLicence();
|
494
|
}
|
495
|
}
|
496
|
if (bestLicense != null) {
|
497
|
return bestLicense.getClassname();
|
498
|
} else {
|
499
|
return null;
|
500
|
}
|
501
|
}
|
502
|
|
503
|
private String getAcessMode(Result result) {
|
504
|
|
505
|
for (Instance instance : (result.getInstanceList())) {
|
506
|
return instance.getLicence().getClassname();
|
507
|
|
508
|
}
|
509
|
|
510
|
return NULL_STRING;
|
511
|
}
|
512
|
|
513
|
private String buildProject(OafEntity data) {
|
514
|
|
515
|
String buff = new String();
|
516
|
|
517
|
Project project = data.getProject();
|
518
|
eu.dnetlib.data.proto.ProjectProtos.Project.Metadata metadata = project.getMetadata();
|
519
|
// project_organizations
|
520
|
|
521
|
String full_id = getStringField(data.getId());
|
522
|
buff += full_id;
|
523
|
|
524
|
// project_results
|
525
|
buff += full_id;
|
526
|
// `acronym`,
|
527
|
buff += getStringField(metadata.getAcronym().getValue());
|
528
|
|
529
|
// `funding_lvl0`,
|
530
|
String funding_tree_0 = NULL_STRING;
|
531
|
String funding_tree_1 = NULL_STRING;
|
532
|
String funding_tree_2 = NULL_STRING;
|
533
|
|
534
|
List<StringField> fundList = metadata.getFundingtreeList();
|
535
|
|
536
|
if (!fundList.isEmpty()) // `funding_lvl0`,
|
537
|
{
|
538
|
funding_tree_0 = getFundingLevel(fundList.get(0).getValue(), 0);
|
539
|
|
540
|
funding_tree_1 = getFundingLevel(fundList.get(0).getValue(), 1);
|
541
|
// log.info(funding_tree_1);
|
542
|
|
543
|
funding_tree_2 = getFundingLevel(fundList.get(0).getValue(), 2);
|
544
|
// log.info(funding_tree_2);
|
545
|
|
546
|
}
|
547
|
funding_tree_0 = funding_tree_0.replaceAll("\"", "");
|
548
|
buff += getStringField(funding_tree_0);
|
549
|
// `funding_lvl1`,
|
550
|
|
551
|
funding_tree_1 = funding_tree_1.replaceAll("\"", "");
|
552
|
|
553
|
if (funding_tree_1.equalsIgnoreCase("SP1")) {
|
554
|
|
555
|
funding_tree_1 = "SP1-Cooperation";
|
556
|
|
557
|
} else if (funding_tree_1.equalsIgnoreCase("SP2")) {
|
558
|
funding_tree_1 = "SP2-Ideas";
|
559
|
} else if (funding_tree_1.equalsIgnoreCase("SP3")) {
|
560
|
funding_tree_1 = "SP3-People";
|
561
|
} else if (funding_tree_1.equalsIgnoreCase("SP4")) {
|
562
|
funding_tree_1 = "SP4-Capacities";
|
563
|
} else if (funding_tree_1.equalsIgnoreCase("SP5")) {
|
564
|
funding_tree_1 = "SP5-Euratom";
|
565
|
}
|
566
|
|
567
|
buff += getStringField(funding_tree_1);
|
568
|
funding_tree_2 = funding_tree_2.replaceAll("\"", "");
|
569
|
|
570
|
// / `funding_lvl2`,
|
571
|
buff += getStringField(funding_tree_2);
|
572
|
|
573
|
// `sc39`,
|
574
|
|
575
|
String sc39 = metadata.getEcsc39().getValue();
|
576
|
if (sc39.contains("true")) {
|
577
|
sc39 = "yes";
|
578
|
} else if (sc39.contains("false")) {
|
579
|
sc39 = "no";
|
580
|
}
|
581
|
|
582
|
buff += getStringField(sc39);
|
583
|
|
584
|
// `url`,
|
585
|
|
586
|
buff += getStringField(metadata.getWebsiteurl().getValue());
|
587
|
|
588
|
// start_year
|
589
|
|
590
|
buff += getYearInt(metadata.getStartdate().getValue());
|
591
|
|
592
|
// end_year
|
593
|
buff += getYearInt(metadata.getEnddate().getValue());
|
594
|
|
595
|
// duration enddate-startdate
|
596
|
|
597
|
buff += getYearDifferenceInteger(metadata.getEnddate().getValue(), metadata.getStartdate().getValue());
|
598
|
|
599
|
// haspubs
|
600
|
buff += getStringField("no");
|
601
|
|
602
|
// numpubs
|
603
|
buff += getNumericField("0");
|
604
|
// enddate
|
605
|
buff += getYearInt(metadata.getEnddate().getValue());
|
606
|
// startdate
|
607
|
buff += getYearInt(metadata.getStartdate().getValue());
|
608
|
|
609
|
// `daysforlastpub`,
|
610
|
buff += getNumericField("");
|
611
|
// `delayedpubs`,
|
612
|
buff += getNumericField("");
|
613
|
// `number`
|
614
|
buff += getStringField("1");
|
615
|
return buff;
|
616
|
|
617
|
}
|
618
|
|
619
|
private String getFundingLevel(String funding_level, int level) {
|
620
|
|
621
|
if (funding_level.isEmpty()) {
|
622
|
return NULL_STRING;
|
623
|
|
624
|
}
|
625
|
|
626
|
if (!funding_level.contains("funding_level_" + level)) {
|
627
|
return NULL_STRING;
|
628
|
}
|
629
|
String[] split = funding_level.split("funding_level_" + level);
|
630
|
|
631
|
funding_level = split[1];
|
632
|
split = funding_level.split("name");
|
633
|
split = split[1].split(",");
|
634
|
|
635
|
funding_level = split[0].replaceAll(".*:\"", "");
|
636
|
funding_level = funding_level.replaceFirst(ENCLOSED, "");
|
637
|
funding_level = funding_level.trim();
|
638
|
|
639
|
return funding_level;
|
640
|
}
|
641
|
|
642
|
private String getYearDifferenceInteger(String enddate, String startdate) {
|
643
|
|
644
|
if (!enddate.isEmpty() && enddate != null && startdate != null && !startdate.isEmpty()) {
|
645
|
|
646
|
// if (enddate != null && startdate != null&&) {
|
647
|
|
648
|
String[] split = startdate.split("-");
|
649
|
|
650
|
int Startdate = Integer.parseInt(split[0]);
|
651
|
|
652
|
split = enddate.split("-");
|
653
|
|
654
|
int Enddate = Integer.parseInt(split[0]);
|
655
|
|
656
|
int diff = Enddate - Startdate;
|
657
|
return ENCLOSED + diff + ENCLOSED + DELIM;
|
658
|
}
|
659
|
|
660
|
return ENCLOSED + NULL_NUM + ENCLOSED + DELIM;
|
661
|
}
|
662
|
|
663
|
private String getYearInt(String data) {
|
664
|
if (data == null || data.isEmpty()) {
|
665
|
return ENCLOSED + NULL_NUM + ENCLOSED + DELIM;
|
666
|
}
|
667
|
|
668
|
String[] split = data.split("-");
|
669
|
|
670
|
if (split != null) {
|
671
|
|
672
|
String year = split[0];
|
673
|
year = cleanNumber(year);
|
674
|
return ENCLOSED + year + ENCLOSED + DELIM;
|
675
|
} else {
|
676
|
return ENCLOSED + NULL_NUM + ENCLOSED + DELIM;
|
677
|
}
|
678
|
|
679
|
}
|
680
|
|
681
|
private String cleanNumber(String number) {
|
682
|
number = number.replaceAll("[^A-Za-z0-9:,____]", "");
|
683
|
number.trim();
|
684
|
return number;
|
685
|
}
|
686
|
|
687
|
private String getStringField(String data) {
|
688
|
|
689
|
if (data == null || data.isEmpty() || data.equals("")) {
|
690
|
|
691
|
return ENCLOSED + NULL_STRING + ENCLOSED + DELIM;
|
692
|
} else {
|
693
|
|
694
|
String field = clean(data);
|
695
|
if (field == null) {
|
696
|
return ENCLOSED + NULL_STRING + ENCLOSED + DELIM;
|
697
|
} else {
|
698
|
return field + DELIM;
|
699
|
}
|
700
|
}
|
701
|
}
|
702
|
|
703
|
private String getNumericField(String data) {
|
704
|
if (data == null || data.isEmpty()) {
|
705
|
return ENCLOSED + NULL_NUM + ENCLOSED + DELIM;
|
706
|
} else {
|
707
|
|
708
|
return ENCLOSED + data + ENCLOSED + DELIM;
|
709
|
}
|
710
|
}
|
711
|
|
712
|
public String getId(Oaf oaf) {
|
713
|
switch (oaf.getKind()) {
|
714
|
case entity:
|
715
|
|
716
|
return cleanId(oaf.getEntity().getId());
|
717
|
case relation:
|
718
|
|
719
|
return cleanId(oaf.getRel().getSource());
|
720
|
|
721
|
}
|
722
|
return null;
|
723
|
|
724
|
}
|
725
|
|
726
|
public String getId(OafRel relOaf) {
|
727
|
return cleanId(relOaf.getSource());
|
728
|
}
|
729
|
|
730
|
public static String clean(String value) {
|
731
|
if (value != null) {
|
732
|
// TODO DO NOT CHANGE THIS: IT REMOVES ID PREFIX ( 5|datacite____::
|
733
|
// to datacite____:: )
|
734
|
// AND REPLACES OCCURENCES OF DELIM CHARS IN DATA
|
735
|
value = value.replaceFirst(".*\\|", "");
|
736
|
value = value.replaceAll(DELIM, "");
|
737
|
value = value.replaceAll(ENCLOSED, "");
|
738
|
|
739
|
// value = value.replaceAll("[^A-Za-z0-9:,____-;:]", " ");
|
740
|
value = value.trim();
|
741
|
|
742
|
}
|
743
|
if (value == null) {
|
744
|
return null;
|
745
|
}
|
746
|
return ENCLOSED + value + ENCLOSED;
|
747
|
|
748
|
}
|
749
|
|
750
|
public static String cleanId(String value) {
|
751
|
if (value != null) {
|
752
|
// TODO DO NOT CHANGE THIS: IT REMOVES ID PREFIX ( 5|datacite____::
|
753
|
// to datacite____:: )
|
754
|
// AND REPLACES OCCURENCES OF DELIM CHARS IN DATA
|
755
|
value = value.replaceFirst(".*\\|", "");
|
756
|
value = value.replaceAll("\n", "");
|
757
|
value = value.replaceAll(DELIM, "");
|
758
|
value = value.replaceAll(ENCLOSED, "");
|
759
|
value = value.trim();
|
760
|
|
761
|
}
|
762
|
if (value == null) {
|
763
|
return null;
|
764
|
}
|
765
|
return ENCLOSED + value + ENCLOSED;
|
766
|
|
767
|
}
|
768
|
|
769
|
public String getName(Oaf oaf) {
|
770
|
switch (oaf.getKind()) {
|
771
|
case entity:
|
772
|
|
773
|
return oaf.getEntity().getType().name();
|
774
|
|
775
|
case relation:
|
776
|
|
777
|
return oaf.getRel().getSource() + oaf.getRel().getTarget();
|
778
|
|
779
|
}
|
780
|
return null;
|
781
|
|
782
|
}
|
783
|
|
784
|
private void loadProjectResults() throws IOException {
|
785
|
InputStream in = ClassLoader.getSystemResourceAsStream("eu/dnetlib/data/mapreduce/hbase/statsExport/excludedProjectResults");
|
786
|
byte[] b = new byte[in.available()];
|
787
|
in.read(b);
|
788
|
String q = new String(b);
|
789
|
// log.info(q);
|
790
|
excludedIds = q.split("\n");
|
791
|
// log.info(excludedIds.length);
|
792
|
in.close();
|
793
|
|
794
|
}
|
795
|
|
796
|
public long DATEDIFF(String startDate, String endDate) {
|
797
|
long MILLISECS_PER_DAY = 24 * 60 * 60 * 1000;
|
798
|
long days = 0l;
|
799
|
SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd"); // "dd/MM/yyyy HH:mm:ss");
|
800
|
// <startdate>2011-09-01</startdate>
|
801
|
// <enddate>2015-08-31</enddate>
|
802
|
Date dateIni = null;
|
803
|
Date dateFin = null;
|
804
|
try {
|
805
|
dateIni = (Date) format.parse(startDate);
|
806
|
dateFin = (Date) format.parse(endDate);
|
807
|
days = (dateFin.getTime() - dateIni.getTime()) / MILLISECS_PER_DAY;
|
808
|
} catch (Exception e) {
|
809
|
e.printStackTrace();
|
810
|
}
|
811
|
|
812
|
return days;
|
813
|
}
|
814
|
|
815
|
public String getDELIM() {
|
816
|
return DELIM;
|
817
|
}
|
818
|
|
819
|
public void setDELIM(String dELIM) {
|
820
|
DELIM = dELIM;
|
821
|
}
|
822
|
|
823
|
public String getNULL_STRING() {
|
824
|
return NULL_STRING;
|
825
|
}
|
826
|
|
827
|
public void setNULL_STRING(String nULL_STRING) {
|
828
|
NULL_STRING = nULL_STRING;
|
829
|
}
|
830
|
|
831
|
public String getNULL_NUM() {
|
832
|
return NULL_NUM;
|
833
|
}
|
834
|
|
835
|
public void setNULL_NUM(String nULL_NUM) {
|
836
|
NULL_NUM = nULL_NUM;
|
837
|
}
|
838
|
|
839
|
public static String getENCLOSED() {
|
840
|
return ENCLOSED;
|
841
|
}
|
842
|
|
843
|
public void setENCLOSED(String eNCLOSED) {
|
844
|
ENCLOSED = eNCLOSED;
|
845
|
}
|
846
|
|
847
|
}
|