Revision 35787
Added by Eri Katsari over 9 years ago
modules/dnet-openaire-stats/branches/oaimpact/src/main/java/eu/dnetlib/data/mapreduce/hbase/statsExport/utils/Serializer.java
package eu.dnetlib.data.mapreduce.hbase.statsExport.utils;

import eu.dnetlib.data.mapreduce.util.LicenseComparator;
import eu.dnetlib.data.proto.DatasourceProtos.Datasource;
import eu.dnetlib.data.proto.DatasourceProtos.Datasource.Metadata;
import eu.dnetlib.data.proto.FieldTypeProtos;
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
import eu.dnetlib.data.proto.FieldTypeProtos.StringField;
import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty;
import eu.dnetlib.data.proto.OafProtos.Oaf;
import eu.dnetlib.data.proto.OafProtos.OafEntity;
import eu.dnetlib.data.proto.OafProtos.OafRel;
import eu.dnetlib.data.proto.OrganizationProtos.Organization;
import eu.dnetlib.data.proto.PersonProtos;
import eu.dnetlib.data.proto.ProjectProtos.Project;
import eu.dnetlib.data.proto.RelTypeProtos.RelType;
import eu.dnetlib.data.proto.ResultProtos.Result;
import eu.dnetlib.data.proto.ResultProtos.Result.Instance;
import org.apache.log4j.Logger;

import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;

/**
 * @author eri
 *
 * Simple serializer that parses input Oaf protos and prepares them for Sqoop.
 */
public class Serializer {

    private static String DELIM;
    private Logger log = Logger.getLogger(this.getClass());
    private String NULL_STRING;
    private String NULL_NUM;
    // TODO no longer used
    private static String ENCLOSED;

    public Serializer() {
    }

    public String serialize(Oaf oaf) {
        switch (oaf.getKind()) {
            case entity:
                OafEntity valueEntity = oaf.getEntity();
                switch (valueEntity.getType()) {
                    case datasource:
                        return buildDatasource(valueEntity);
                    case organization:
                        return buildOrganization(valueEntity);
                    case project:
                        return buildProject(valueEntity);
                    case result:
                        return buildResult(valueEntity);
                    // case person:
                    //     return buildPerson(valueEntity);
                    default:
                        log.error("wrong type");
                        break;
                }
                break;
            case relation:
                OafRel valueRel = oaf.getRel();
                return buildRel(valueRel);
        }
        return null;
    }

    public String serialize(OafRel oaf) {
        switch (oaf.getRelType()) {
            case resultProject:
                return getResultProject(oaf);
            default:
                return buildRel(oaf);
        }
    }

    private String buildRel(OafRel rel) {
        return getStringField(rel.getTarget());
    }

    public HashMap<String, List<String>> extractRelations(Oaf oaf) {
        OafEntity valueEntity = oaf.getEntity();
        switch (valueEntity.getType()) {
            case datasource:
                return getDatasourceLanguages(valueEntity);
            case result:
                HashMap<String, List<String>> relations = new HashMap<String, List<String>>();
                relations.putAll(getResultLanguages(valueEntity));
                relations.putAll(getResultTopics(valueEntity));
                relations.putAll(getResultClassifications(valueEntity));
                relations.putAll(getResultDatasources(valueEntity));
                relations.putAll(getResultConcepts(valueEntity));
                return relations;
            default:
                return null;
        }
    }

    private String getResultProject(OafRel oaf) {
        String buff = "";
        String result = oaf.getTarget();

        buff += getStringField(result);
        // TODO is declared as int!!!
        // NOTE: the arguments are passed as (enddate, startdate), so DATEDIFF
        // returns startdate - enddate in days; negative values are clamped to 0.
        long diff = DATEDIFF(oaf.getResultProject().getOutcome().getRelMetadata().getEnddate(), oaf.getResultProject().getOutcome().getRelMetadata().getStartdate());
        if (diff < 0) {
            diff = 0;
        }
        buff += getNumericField(String.valueOf(diff));

        return buff;
    }

    private HashMap<String, List<String>> getDatasourceLanguages(OafEntity valueEntity) {
        HashMap<String, List<String>> rels = new HashMap<String, List<String>>();
        List<String> buffs = new ArrayList<String>();

        Datasource d = valueEntity.getDatasource();
        Metadata metadata = d.getMetadata();

        for (StringField lang : metadata.getOdlanguagesList()) {
            buffs.add(getStringField(lang.getValue()));
        }
        rels.put("datasourceLanguage", buffs);
        return rels;
    }

    private HashMap<String, List<String>> getResultLanguages(OafEntity valueEntity) {
        HashMap<String, List<String>> rels = new HashMap<String, List<String>>();
        List<String> buffs = new ArrayList<String>();
        Result d = valueEntity.getResult();

        eu.dnetlib.data.proto.ResultProtos.Result.Metadata metadata = d.getMetadata();

        if (metadata.getLanguage().getClassname() != null && !metadata.getLanguage().getClassname().isEmpty()) {
            buffs.add(getStringField(metadata.getLanguage().getClassname()));
        }
        rels.put("resultLanguage", buffs);
        return rels;
    }

    private HashMap<String, List<String>> getResultClassifications(OafEntity valueEntity) {
        HashMap<String, List<String>> rels = new HashMap<String, List<String>>();
        List<String> buffs = new ArrayList<String>();
        Result result = valueEntity.getResult();

        for (Instance instance : result.getInstanceList()) {
            String classification = instance.getInstancetype().getClassname();
            if (classification != null && !classification.isEmpty()) {
                buffs.add(getStringField(classification));
                // TODO: for now only the first classification per result is kept
                break;
            }
        }
        rels.put("resultClassification", buffs);
        return rels;
    }

    private HashMap<String, List<String>> getResultConcepts(OafEntity valueEntity) {
        HashMap<String, List<String>> rels = new HashMap<String, List<String>>();
        List<String> buffs = new ArrayList<String>();

        Result result = valueEntity.getResult();

        for (eu.dnetlib.data.proto.ResultProtos.Result.Context context : result.getMetadata().getContextList()) {
            buffs.add(getStringField(context.getId()));
        }
        rels.put("resultConcept", buffs);
        return rels;
    }

    private HashMap<String, List<String>> getResultDatasources(OafEntity valueEntity) {
        HashMap<String, List<String>> rels = new HashMap<String, List<String>>();
        List<String> buffs = new ArrayList<String>();
        Result result = valueEntity.getResult();

        // TODO hosted-by datasources
        for (Instance instance : result.getInstanceList()) {
            String hostedBy = instance.getHostedby().getKey();
            if (hostedBy != null && !hostedBy.isEmpty()) {
                buffs.add(getStringField(hostedBy));
            }
        }

        // TODO collected-from datasources
        for (FieldTypeProtos.KeyValue collectedFromValue : valueEntity.getCollectedfromList()) {
            String collectedFrom = collectedFromValue.getKey();
            if (collectedFrom != null && !collectedFrom.isEmpty()) {
                buffs.add(getStringField(collectedFrom));
            }
        }
        rels.put("resultDatasource", buffs);
        return rels;
    }

    public static boolean isNumeric(String str) {
        // Strips everything but letters, digits and spaces (note that this also
        // removes any '-' sign or decimal point), then tests the remainder
        // against a number pattern with optional sign and decimal part.
        str = str.replaceAll("[^A-Za-z0-9 ]", "");
        str = str.replaceAll(" ", "");
        return str.matches("-?\\d+(\\.\\d+)?");
    }

    // TODO there are topics with "null" as value -> replace them
    private boolean isValidTopic(String t) {
        if (t == null || t.isEmpty() || t.equals(" ")) {
            return false;
        }
        if (t.equalsIgnoreCase("null")) {
            return false;
        }
        if (t.equals(ENCLOSED + ENCLOSED + DELIM) || t.equals(ENCLOSED + NULL_STRING + ENCLOSED + DELIM)) {
            return false;
        }
        // skip ddc-prefixed subjects (classification codes rather than free-text topics)
        if (t.contains("ddc:")) {
            return false;
        }
        return true;
    }

    private HashMap<String, List<String>> getResultTopics(OafEntity valueEntity) {
        HashMap<String, List<String>> rels = new HashMap<String, List<String>>();
        List<String> buffs = new ArrayList<String>();
        Result d = valueEntity.getResult();

        eu.dnetlib.data.proto.ResultProtos.Result.Metadata metadata = d.getMetadata();

        List<StructuredProperty> topics = metadata.getSubjectList();
        String buff = "";
        for (StructuredProperty topic : topics) {
            if (isValidTopic(topic.getValue()) && !isNumeric(topic.getValue())) {
                String t = getStringField(topic.getValue());
                if (isValidTopic(t)) {
                    buff += t + " ";
                }
            }
        }
        if (!buff.isEmpty()) {
            buff = getStringField(buff);
            buffs.add(buff);
        }
        rels.put("resultTopic", buffs);
        return rels;
    }

    private String buildDatasource(OafEntity data) {
        String buff = "";

        Datasource d = data.getDatasource();
        Metadata metadata = d.getMetadata();
        String full_id = getStringField(data.getId());

        // the id is written four times, presumably once per datasource link table
        buff += full_id;
        buff += full_id;
        buff += full_id;
        buff += full_id;

        // name
        if (metadata.getOfficialname().getValue().equalsIgnoreCase("unknown")) {
            buff += getStringField("Unknown Repository");
        } else {
            buff += getStringField(metadata.getOfficialname().getValue());
        }

        // type
        if (metadata.hasDatasourcetype()) {
            buff += getStringField(metadata.getDatasourcetype().getClassname().replaceFirst(".*::", ""));
        } else {
            buff += getStringField(null);
        }

        // compatibility
        buff += getStringField(metadata.getOpenairecompatibility().getClassname());

        // latitude
        buff += getStringField(metadata.getLatitude().getValue());

        // longitude
        buff += getStringField(metadata.getLongitude().getValue());

        // dateofvalidation
        buff += getStringField(metadata.getDateofvalidation().getValue());

        // yearofvalidation, parsed from the validation date
        buff += getYearInt(metadata.getDateofvalidation().getValue());

        // number??
        buff += getStringField("1");

        return buff;
    }

    private String buildOrganization(OafEntity data) {
        String buff = "";

        Organization organization = data.getOrganization();
        eu.dnetlib.data.proto.OrganizationProtos.Organization.Metadata metadata = organization.getMetadata();

        // `organization_datasources`,
        String full_id = getStringField(data.getId());
        buff += full_id;
        // `organization_projects`,
        buff += full_id;
        // `name`,
        buff += getStringField(metadata.getLegalname().getValue());
        // `country`,
        buff += getStringField(metadata.getCountry().getClassname());
        // `number`,
        buff += getStringField("1");
        return buff;
    }

    private String buildResult(OafEntity data) {
        String buff = "";

        Result result = data.getResult();
        eu.dnetlib.data.proto.ResultProtos.Result.Metadata metadata = result.getMetadata();

        // `result_topics`,
        String full_id = getStringField(data.getId());
        buff += full_id;
        // `result_languages`,
        buff += full_id;
        // `result_projects`,
        buff += full_id;
        // `result_datasources`,
        buff += full_id;
        // `result_classifications`,
        buff += full_id;
        // `result_infrastructures`,
        buff += full_id;
        // `result_claims`,
        buff += full_id;
        // `result_results`,
        buff += full_id;

        // TODO pubtitle: only the first title is kept
        String titleString = NULL_STRING;
        for (StructuredProperty title : metadata.getTitleList()) {
            titleString = title.getValue();
            break;
        }
        buff += getStringField(titleString);

        // TODO format: only the first format is kept
        String formatString = NULL_STRING;
        for (StringField format : metadata.getFormatList()) {
            formatString = format.getValue();
            break;
        }
        buff += getStringField(formatString);

        // TODO publisher
        buff += getStringField(metadata.getPublisher().getValue());
        // TODO journal
        buff += getStringField(metadata.getJournal().getName());

        // year
        buff += getYearInt(metadata.getDateofacceptance().getValue());
        // date (changed to date format)
        buff += getStringDateField(metadata.getDateofacceptance().getValue());
        // access_mode,
        buff += getStringField(getAccessMode(result));
        // bestlicense
        buff += getStringField(getBestLicense(result));
        // type
        buff += getStringField(metadata.getResulttype().getClassname());
        // embargo_end_date
        buff += getStringField(metadata.getEmbargoenddate().getValue());

        // `authors`,
        int authors = 0;
        String delayed = "no";

        for (OafRel rel : data.getCachedRelList()) {
            if (rel.getRelType().equals(RelType.personResult)) {
                authors++;
            } else if (rel.getRelType().equals(RelType.resultProject)) {
                // TODO remember: in resultProject the first id is the project,
                // the second is the result.
                // NOTE: getYearDifferenceInteger() returns the value wrapped in
                // ENCLOSED/DELIM, so parseInt() only succeeds if both are empty.
                String daysfromend = getYearDifferenceInteger(rel.getResultProject().getOutcome().getRelMetadata().getEnddate(), rel.getResultProject().getOutcome().getRelMetadata().getStartdate());
                if (Integer.parseInt(daysfromend) > 0) {
                    delayed = "yes";
                }
            }
        }
        // `delayed`,
        buff += getStringField(delayed);

        // log.info("Result " + full_id + " author count: " + authors);
        buff += getNumericField(String.valueOf(authors));

        // number??
        buff += getStringField("1");

        if (isValid(buff, full_id)) {
            return buff;
        } else {
            return null;
        }
    }

    // TODO here see if check is ok
    private boolean isValid(String buff, String id) {
        if (buff.endsWith(ENCLOSED)) {
            log.error("Empty Result with " + id + " with body: \n" + buff);
            return false;
        }
        return true;
    }

    private String getBestLicense(Result result) {
        Qualifier bestLicense = null;
        LicenseComparator lc = new LicenseComparator();
        for (Instance instance : result.getInstanceList()) {
            if (lc.compare(bestLicense, instance.getLicence()) > 0) {
                bestLicense = instance.getLicence();
            }
        }
        if (bestLicense != null) {
            return bestLicense.getClassname();
        } else {
            return null;
        }
    }

    // TODO here iterate over all values
    private String getAccessMode(Result result) {
        String accessMode = NULL_STRING;
        for (Instance instance : result.getInstanceList()) {
            if (instance.getLicence().getClassname() != null && !instance.getLicence().getClassname().isEmpty()) {
                accessMode = instance.getLicence().getClassname();
                break;
            }
        }
        return accessMode;
    }

    private String buildProject(OafEntity data) {
        String buff = "";

        Project project = data.getProject();
        eu.dnetlib.data.proto.ProjectProtos.Project.Metadata metadata = project.getMetadata();

        // `project_organizations`,
        String full_id = getStringField(data.getId());
        buff += full_id;
        // `project_results`,
        buff += full_id;

        // `acronym`,
        String acronym = metadata.getAcronym().getValue();
        if (acronym.equalsIgnoreCase("UNKNOWN")) {
            acronym = metadata.getTitle().getValue();
        }
        buff += getStringField(acronym);

        // `title`,
        buff += getStringField(metadata.getTitle().getValue());

        // `funding_lvl0`,
        String funding_tree_0 = NULL_STRING;
        String funding_tree_1 = NULL_STRING;
        String funding_tree_2 = NULL_STRING;

        List<StringField> fundList = metadata.getFundingtreeList();

        if (!fundList.isEmpty()) {
            funding_tree_0 = getFundingLevel(fundList.get(0).getValue(), 0);
            funding_tree_1 = getFundingLevel(fundList.get(0).getValue(), 1);
            funding_tree_2 = getFundingLevel(fundList.get(0).getValue(), 2);
        }
        funding_tree_0 = funding_tree_0.replaceAll("\"", "");
        buff += getStringField(funding_tree_0);

        // `funding_lvl1`: normalize FP7 specific-programme names
        funding_tree_1 = funding_tree_1.replaceAll("\"", "");
        if (funding_tree_1.equalsIgnoreCase("SP1")) {
            funding_tree_1 = "SP1-Cooperation";
        } else if (funding_tree_1.equalsIgnoreCase("SP2")) {
            funding_tree_1 = "SP2-Ideas";
        } else if (funding_tree_1.equalsIgnoreCase("SP3")) {
            funding_tree_1 = "SP3-People";
        } else if (funding_tree_1.equalsIgnoreCase("SP4")) {
            funding_tree_1 = "SP4-Capacities";
        } else if (funding_tree_1.equalsIgnoreCase("SP5")) {
            funding_tree_1 = "SP5-Euratom";
        }
        buff += getStringField(funding_tree_1);

        // TODO check for a level-2 value in the other funding trees
        if (funding_tree_2.equals(NULL_STRING)) {
            if (fundList.size() > 1) {
                funding_tree_2 = getFundingLevel(fundList.get(1).getValue(), 2);
            }
        }
        funding_tree_2 = funding_tree_2.replaceAll("\"", "");

        // `funding_lvl2`,
        buff += getStringField(funding_tree_2);

        // `sc39`,
        String sc39 = metadata.getEcsc39().getValue();
        if (sc39.equalsIgnoreCase("true") || sc39.equalsIgnoreCase("t") || sc39.contains("yes")) {
            sc39 = "yes";
        } else if (sc39.equalsIgnoreCase("false") || sc39.equalsIgnoreCase("f") || sc39.contains("no")) {
            sc39 = "no";
        }
        buff += getStringField(sc39);

        // `url`,
        buff += getStringField(metadata.getWebsiteurl().getValue());

        // start_year
        buff += getYearInt(metadata.getStartdate().getValue());
        // end_year
        buff += getYearInt(metadata.getEnddate().getValue());

        // duration: enddate - startdate
        buff += getYearDifferenceInteger(metadata.getEnddate().getValue(), metadata.getStartdate().getValue());

        // haspubs
        buff += getStringField("no");
        // numpubs
        buff += getNumericField("0");
        // enddate
        buff += getNumericField(metadata.getEnddate().getValue());
        // startdate
        buff += getNumericField(metadata.getStartdate().getValue());

        // `daysforlastpub`,
        buff += getNumericField("");
        // `delayedpubs`,
        buff += getNumericField("");
        // `number`
        buff += getStringField("1");
        return buff;
    }

    // TODO remove for production - not yet ready
    private String buildPerson(OafEntity data) {
        String buff = "";

        PersonProtos.Person person = data.getPerson();
        eu.dnetlib.data.proto.PersonProtos.Person.Metadata metadata = person.getMetadata();

        // `person_id`,
        String full_id = getStringField(data.getId());
        buff += full_id;
        // `person_result`,
        buff += full_id;

        // `fullname`,
        buff += metadata.getFullname();
        // `nationality`,
        buff += metadata.getNationality();
        // `email`,
        buff += metadata.getEmail();
        // `phone`,
        buff += metadata.getPhone();
        // `coauthorscount`,
        buff += person.getCoauthorsCount();
        // `number`,
        buff += getStringField("1");
        return buff;
    }

    private String getFundingLevel(String funding_level, int level) {
        if (funding_level.isEmpty()) {
            return NULL_STRING;
        }
        if (!funding_level.contains("funding_level_" + level)) {
            return NULL_STRING;
        }
        String[] split = funding_level.split("funding_level_" + level);

        funding_level = split[1];
        split = funding_level.split("name");
        split = split[1].split(",");

        funding_level = split[0].replaceAll(".*:\"", "");
        funding_level = funding_level.replaceFirst(ENCLOSED, "");
        funding_level = funding_level.trim();

        return funding_level;
    }

    private String getYearDifferenceInteger(String enddate, String startdate) {
        if (enddate != null && !enddate.isEmpty() && startdate != null && !startdate.isEmpty()) {
            String[] split = startdate.split("-");
            int startYear = Integer.parseInt(split[0]);

            split = enddate.split("-");
            int endYear = Integer.parseInt(split[0]);

            int diff = endYear - startYear;
            return ENCLOSED + diff + ENCLOSED + DELIM;
        }
        return ENCLOSED + NULL_NUM + ENCLOSED + DELIM;
    }

    private String getYearInt(String data) {
        if (data == null || data.isEmpty() || data.equals("-1")) {
            return ENCLOSED + NULL_NUM + ENCLOSED + DELIM;
        }

        // String.split() never returns null, so the year is always taken from
        // the first '-'-separated component.
        String[] split = data.split("-");
        String year = cleanNumber(split[0]);

        return ENCLOSED + year + ENCLOSED + DELIM;
    }

    private String cleanNumber(String number) {
        return number.replaceAll("[^A-Za-z0-9:,_]", "");
    }

    private String getStringField(String data) {
        if (data == null || data.isEmpty()) {
            return ENCLOSED + NULL_STRING + ENCLOSED + DELIM;
        }
        String field = clean(data);
        if (field == null) {
            return ENCLOSED + NULL_STRING + ENCLOSED + DELIM;
        }
        return field + DELIM;
    }

    private String getStringDateField(String data) {
        if (data == null || data.isEmpty() || data.equals("-1")) {
            return ENCLOSED + NULL_NUM + ENCLOSED + DELIM;
        }
        String field = clean(data);
        if (field == null) {
            return ENCLOSED + NULL_NUM + ENCLOSED + DELIM;
        }
        return field + DELIM;
    }

    private String getNumericField(String data) {
        if (data == null || data.isEmpty()) {
            return ENCLOSED + NULL_NUM + ENCLOSED + DELIM;
        }
        return ENCLOSED + data + ENCLOSED + DELIM;
    }

    public String getId(Oaf oaf) {
        switch (oaf.getKind()) {
            case entity:
                return cleanId(oaf.getEntity().getId());
            case relation:
                return cleanId(oaf.getRel().getSource());
        }
        return null;
    }

    public String getId(OafRel relOaf) {
        return cleanId(relOaf.getSource());
    }

    public static String clean(String value) {
        if (value == null) {
            return null;
        }
        // TODO DO NOT CHANGE THIS: it removes the id prefix
        // (5|datacite____:: becomes datacite____::) and strips occurrences
        // of the DELIM and ENCLOSED characters from the data.
        value = value.replaceFirst(".*\\|", "");
        value = value.replaceAll(DELIM, "");
        value = value.replaceAll(ENCLOSED, "");
        // value = value.replaceAll("[^A-Za-z0-9:,____-;:]", " ");
        value = value.trim();

        return ENCLOSED + value + ENCLOSED;
    }

    public static String cleanId(String value) {
        if (value == null) {
            return null;
        }
        // TODO DO NOT CHANGE THIS: it removes the id prefix
        // (5|datacite____:: becomes datacite____::) and strips occurrences
        // of the DELIM and ENCLOSED characters from the data.
        value = value.replaceFirst(".*\\|", "");
        value = value.replaceAll("\n", "");
        value = value.replaceAll(DELIM, "");
        value = value.replaceAll(ENCLOSED, "");
        value = value.trim();

        return ENCLOSED + value + ENCLOSED;
    }

    public long DATEDIFF(String startDate, String endDate) {
        long MILLISECS_PER_DAY = 24 * 60 * 60 * 1000;
        long days = 0L;
        // dates look like <startdate>2011-09-01</startdate>, <enddate>2015-08-31</enddate>
        SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd");
        Date dateIni = null;
        Date dateFin = null;
        try {
            dateIni = format.parse(startDate);
            dateFin = format.parse(endDate);
            days = (dateFin.getTime() - dateIni.getTime()) / MILLISECS_PER_DAY;
        } catch (Exception e) {
            e.printStackTrace();
        }
        return days;
    }

    public String getDELIM() {
        return DELIM;
    }

    public void setDELIM(String dELIM) {
        DELIM = dELIM;
    }

    public String getNULL_STRING() {
        return NULL_STRING;
    }

    public void setNULL_STRING(String nULL_STRING) {
        NULL_STRING = nULL_STRING;
    }

    public String getNULL_NUM() {
        return NULL_NUM;
    }

    public void setNULL_NUM(String nULL_NUM) {
        NULL_NUM = nULL_NUM;
    }

    public String getENCLOSED() {
        return ENCLOSED;
    }

    public void setENCLOSED(String eNCLOSED) {
        ENCLOSED = eNCLOSED;
    }

}
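A minimal usage sketch for the serializer's public helpers (hypothetical driver code, not part of this revision; the delimiter, enclosure and null placeholders are illustrative, and the class is assumed to be on the classpath in the same package):

    public class SerializerDemo {
        public static void main(String[] args) {
            Serializer serializer = new Serializer();
            serializer.setDELIM("!");       // field delimiter expected by the Sqoop export
            serializer.setENCLOSED("#");    // quote character wrapped around every field
            serializer.setNULL_STRING("");  // placeholder for missing string fields
            serializer.setNULL_NUM("-1");   // placeholder for missing numeric fields

            // cleanId() drops the "50|" style prefix and wraps the id in the quote character:
            // prints #od_______133::000d16306f933e487892fdad65e1fc59#
            System.out.println(Serializer.cleanId("50|od_______133::000d16306f933e487892fdad65e1fc59"));

            // DATEDIFF() parses yyyy-MM-dd dates and returns whole days (endDate - startDate):
            System.out.println(serializer.DATEDIFF("2011-09-01", "2015-08-31")); // 1460
        }
    }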
modules/dnet-openaire-stats/branches/oaimpact/src/main/java/eu/dnetlib/data/mapreduce/hbase/statsExport/utils/ContextExporter.java
package eu.dnetlib.data.mapreduce.hbase.statsExport.utils;

import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Logger;
import org.apache.cxf.jaxws.JaxWsProxyFactoryBean;

import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;

public class ContextExporter {

    private ContextTransformer contextTransformer = new ContextTransformer();
    private String outputPath;
    private Logger log = Logger.getLogger(this.getClass());

    private ArrayList<String> context = new ArrayList<String>();
    private ArrayList<String> category = new ArrayList<String>();
    private ArrayList<String> concept = new ArrayList<String>();

    public ContextExporter(String outputPath, String contextMap, boolean readFromUrl) throws Exception {
        if (!outputPath.endsWith("/")) {
            outputPath += "/";
        }
        this.outputPath = outputPath;
        if (readFromUrl) {
            readFromUrl(contextMap);
        } else {
            readFromBuffer(contextMap);
        }
    }

    public void readFromUrl(String url) throws Exception {
        List<String> concepts = getContextResouces(url);
        log.info("Returned concepts: " + concepts.size());

        for (String data : concepts) {
            log.info("++++++++++++++ Transforming concept data");
            String res = contextTransformer.transformXSL(data);
            processData(res);
        }
        writeData(this.context, "context");
        writeData(this.category, "category");
        writeData(this.concept, "concept");
    }

    private void readFromBuffer(String contextMap) throws Exception {
        if (contextMap == null || contextMap.isEmpty()) {
            log.error("Context Resources file is empty.");
            throw new Exception("Context Resources file is empty.");
        }

        String data = contextTransformer.transformXSL(contextMap);

        log.info(data);
        processData(data);
    }

    private List<String> getContextResouces(String url) throws ISLookUpException {
        JaxWsProxyFactoryBean factory = new JaxWsProxyFactoryBean();
        factory.setServiceClass(ISLookUpService.class);
        factory.setAddress(url);

        ISLookUpService lookUpService = (ISLookUpService) factory.create();
        // for $x in //RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='ContextDSResourceType']
        // [.//RESOURCE_KIND/@value='ContextDSResources'] return $x
        return lookUpService.quickSearchProfile("//RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='ContextDSResourceType'][.//RESOURCE_KIND/@value='ContextDSResources']");
    }

    private void writeData(ArrayList<String> dataList, String listName) throws Exception {
        log.info(listName + " size " + dataList.size());

        String data = "";
        for (int i = 0; i < dataList.size(); i++) {
            data += dataList.get(i);
        }

        // drop the trailing newline
        data = data.substring(0, data.lastIndexOf("\n"));

        flushString(data, outputPath + listName);
    }

    private void processData(String data) throws Exception {
        try {
            String[] split = data.split("COPY\n");

            context.add(split[0]);

            // if (split[1].endsWith("\n")) {
            //     split[1] = split[1].substring(0, split[1].lastIndexOf("\n"));
            // }

            category.add(split[1]);
            concept.add(split[2]);
        } catch (Exception e) {
            String msg = "Unable to create file with context, concept and category values in output path " + outputPath + ". Reason: " + e.getMessage();
            log.error(msg);
            throw new Exception(msg, e);
        }
    }

    private void flushString(String data, String destination) throws Exception {
        FSDataOutputStream fin = null;
        try {
            log.info("***********************Writing data:***********************\n" + data);
            FileSystem fs = FileSystem.get(new Configuration());
            fin = fs.create(new Path(destination), true);
            fin.write(data.getBytes());
        } catch (Exception e) {
            log.error("Failed to write exported data to a file : ", e);
            throw new Exception("Failed to write exported data to a file : " + e.toString(), e);
        } finally {
            if (fin != null) {
                fin.close();
            }
        }
    }

    public String getOutputPath() {
        return outputPath;
    }

    public void setOutputPath(String outputPath) {
        this.outputPath = outputPath;
    }

}
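A sketch of a possible invocation (hypothetical, not part of this revision; the IS lookup endpoint URL and the output directory are placeholders):

    public class ContextExporterDemo {
        public static void main(String[] args) throws Exception {
            // With readFromUrl = true the exporter queries the IS lookup service for
            // ContextDSResourceType profiles, transforms each profile via XSL and
            // writes the context, category and concept lists under the output path.
            new ContextExporter("/tmp/stats-export", "http://localhost:8280/is/services/isLookUp", true);
        }
    }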
modules/dnet-openaire-stats/branches/oaimpact/src/test/java/eu/dnetlib/data/mapreduce/hbase/statsExport/utils/ExportTest.java
package eu.dnetlib.data.mapreduce.hbase.statsExport.utils;

import java.io.InputStream;
import java.util.Properties;

import org.apache.hadoop.conf.Configuration;
import org.apache.log4j.Logger;
import org.junit.Before;
import org.junit.Test;

import eu.dnetlib.data.mapreduce.hbase.statsExport.daos.SqlDAO;
import eu.dnetlib.data.mapreduce.hbase.statsExport.daos.SqlStore;
import eu.dnetlib.data.mapreduce.hbase.statsExport.drivers.DBDriver;
import eu.dnetlib.data.mapreduce.hbase.statsExport.drivers.SqoopDriver;
import eu.dnetlib.data.mapreduce.hbase.statsExport.drivers.StatsJobDriver;

public class ExportTest {

    private Logger log = Logger.getLogger(this.getClass());

    private SqlDAO statsDao;
    private SqlStore statsStore;
    private SqoopDriver sqoopDriver;
    private Properties props;
    private StatsJobDriver statsJobDriver;
    private DBDriver dbDriver;

    // @Before
    public void init() throws Exception {
        InputStream file = ClassLoader.getSystemResourceAsStream("eu/dnetlib/data/mapreduce/hbase/statsExport/StatsProperties");
        props = new Properties();
        props.load(file);
        file.close();

        statsStore = new SqlStore();
        statsStore.setDbDriver(props.getProperty("Stats_db_Driver"));
        statsStore.setDB_URL(props.getProperty("Stats_db_Url"));
        statsStore.setDbUser(props.getProperty("Stats_db_User"));
        statsStore.setDbPassword(props.getProperty("Stats_db_Pass"));

        statsDao = new SqlDAO();
        statsDao.setStore(statsStore);

        dbDriver = new DBDriver();
        dbDriver.setStatsDao(statsDao);

        sqoopDriver = new SqoopDriver();
        sqoopDriver.setDelim(props.getProperty("Stats_delim_Character"));
        sqoopDriver.setOutputPath(props.getProperty("Stats_outputPath"));
        sqoopDriver.setConnectionUrl(props.getProperty("Stats_db_Url"));
        sqoopDriver.setDbUser(props.getProperty("Stats_db_User"));
        sqoopDriver.setDbPass(props.getProperty("Stats_db_Pass"));
        sqoopDriver.setSqoopReducersCount(props.getProperty("Stats_sqoop_ReducersCount"));
        sqoopDriver.setRecsPerStatement("1");
        sqoopDriver.setStatementPerTrans("1");
        sqoopDriver.setBatch(false);
        sqoopDriver.setVerbose(true);
        sqoopDriver.setConf(new Configuration());
        sqoopDriver.setUseHdfsStore(false);
    }

    // @Test
    public void parser() {
        String url = "jdbc:postgresql://kiczora.vls.icm.edu.pl:5432/stats";
        if (url.endsWith("/")) {
            url = url.substring(0, url.lastIndexOf('/'));
            System.out.print("url 1 " + url + "\n");
        }
        url = url.substring(0, url.lastIndexOf('/') + 1);
        System.out.print("url 2 " + url);
    }

    public void runStatsExport() throws Exception {
        statsJobDriver = new StatsJobDriver();
        statsJobDriver.init();
        statsJobDriver.run();
    }

    // @Test
    public void runPrepareDB() throws Exception {
        statsDao.setSearchPathDB(props.getProperty("Stats_db_User"));
        // dbDriver.prepareDB(props.getProperty("Stats_db_User"));
    }

    // @Test
    public void runSqoopImport() throws Exception {
        // dbDriver.prepareDB(props.getProperty("Stats.dbUser"));
        sqoopDriver.initSqoopJob();
    }

    // @Test
    public void test() throws Exception {
        // InputStream in = ClassLoader.getSystemResourceAsStream("eu/dnetlib/data/mapreduce/hbase/statsExport/"
        //         + "ContextResourceProfile.xml");
        // byte[] b = new byte[in.available()];
        // in.read(b);
        // String data = new String(b);
        // in.close();
        // log.info("data" + data);

        // ContextExporter exp = new ContextExporter(props.getProperty("Stats_outputPath"),
        //         props.getProperty("ContextResourceXML"), false);
        ContextTransformer tr = new ContextTransformer();
        log.info(tr.transformXSL(props.getProperty("ContextResourceXML")));
    }

    // @Test
    public void testExport() throws Exception {
        ContextExporter exp = new ContextExporter(props.getProperty("Stats_outputPath"), props.getProperty("isLookupEndpoint"), true);
    }

    // @Test
    public void executeExtraInserts() throws Exception {
        // statsDao.buildViews();
        statsDao.executeExtraInserts();
    }

    // @Test
    public void buildIndexesDb() throws Exception {
        // NOTE: this re-runs executeExtraInserts(); an index-building call may
        // have been intended here.
        statsDao.executeExtraInserts();
    }

    // @Test
    public void finalizeDb() throws Exception {
        dbDriver.finalizedDB();
    }

    // @Test
    public void clean() throws Exception {
        log.info(cleanId("od_______133::000d16306f933e487892fdad65e1fc59"));
    }

    public String cleanId(String value) {
        if (value == null) {
            return null;
        }
        // TODO DO NOT CHANGE THIS: it removes the id prefix
        // (5|datacite____:: becomes datacite____::) and strips occurrences
        // of the '#' and '!' delimiter characters from the data.
        value = value.replaceFirst(".*\\|", "");
        value = value.replaceAll("\n", "");
        value = value.replaceAll("#", "");
        value = value.replaceAll("!", "");
        value = value.trim();

        return value;
    }

}
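All @Before/@Test annotations in this class are commented out, so the suite is inert as committed. A sketch of how one flow might be re-enabled (methods to be placed inside ExportTest; assumes the StatsProperties resource is available on the test classpath):

    @Before
    public void setUp() throws Exception {
        init(); // loads StatsProperties and wires the store, DAO and Sqoop driver
    }

    @Test
    public void sqoopImportRuns() throws Exception {
        runSqoopImport(); // submits the Sqoop job against the configured stats database
    }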
modules/dnet-openaire-stats/branches/oaimpact/pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <parent>
        <groupId>eu.dnetlib</groupId>
        <artifactId>dnet-hadoop-parent</artifactId>
        <version>1.0.0-SNAPSHOT</version>
        <relativePath></relativePath>
    </parent>
    <modelVersion>4.0.0</modelVersion>
    <groupId>eu.dnetlib</groupId>
    <artifactId>dnet-openaire-stats</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <build>
        <plugins>
            <!-- <plugin> -->
            <!-- <groupId>org.codehaus.mojo</groupId> -->
            <!-- <artifactId>exec-maven-plugin</artifactId> -->
            <!-- <version>1.2.1</version> -->
            <!-- <configuration> -->
            <!-- <mainClass>eu.dnetlib.data.mapreduce.hbase.statsExport.drivers.SqoopDriver</mainClass> -->
            <!-- </configuration> -->
            <!-- </plugin> -->
            <!-- <plugin> -->
            <!-- <groupId>org.apache.maven.plugins</groupId> -->
            <!-- <artifactId>maven-shade-plugin</artifactId> -->
            <!-- <version>2.1</version> -->
            <!-- <executions> -->
            <!-- <execution> -->
            <!-- <phase>package</phase> -->
            <!-- <goals> -->
            <!-- <goal>shade</goal> -->
            <!-- </goals> -->
            <!-- <configuration> -->
            <!-- <transformers> -->
            <!-- <transformer -->
            <!-- implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer"> -->
            <!-- </transformer> -->
            <!-- </transformers> -->
            <!-- </configuration> -->
            <!-- </execution> -->
            <!-- </executions> </plugin> -->
        </plugins>
    </build>
    <dependencies>
        <dependency>
            <groupId>javax.xml</groupId>
            <artifactId>jaxp-api</artifactId>
            <version>1.4.2</version>
        </dependency>
        <dependency>
            <groupId>net.sf.opencsv</groupId>
            <artifactId>opencsv</artifactId>
            <version>2.3</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi</artifactId>
            <version>3.8</version>
        </dependency>
        <dependency>
            <groupId>jdk.tools</groupId>
            <artifactId>jdk.tools</artifactId>
            <version>1.7.0_05</version>
            <scope>system</scope>
            <systemPath>${JAVA_HOME}/lib/tools.jar</systemPath>
        </dependency>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-api</artifactId>
            <version>2.0-rc2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-core</artifactId>
            <version>2.0-rc2</version>
        </dependency>
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>1.2.17</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>1.7.6</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.6</version>
        </dependency>
        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava-collections</artifactId>
            <version>r03</version>
        </dependency>
        <dependency>
            <groupId>eu.dnetlib</groupId>
            <artifactId>dnet-mapreduce-jobs</artifactId>
            <version>0.0.6.2-SNAPSHOT</version>
        </dependency>
        <dependency>
            <groupId>eu.dnetlib</groupId>
            <artifactId>dnet-openaire-data-protos</artifactId>
            <version>[3.0.0,4.0.0)</version>
        </dependency>
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.0-alpha4</version>
        </dependency>
        <dependency>
            <groupId>commons-httpclient</groupId>
            <artifactId>commons-httpclient</artifactId>
            <version>3.1</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.8.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.sqoop</groupId>
            <artifactId>sqoop</artifactId>
            <version>1.4.4</version>
        </dependency>
        <dependency>
            <groupId>postgresql</groupId>
            <artifactId>postgresql</artifactId>
            <version>9.1-901.jdbc4</version>
        </dependency>
        <dependency>
            <groupId>eu.dnetlib</groupId>
            <artifactId>cnr-rmi-api</artifactId>
            <version>2.0.0-SNAPSHOT</version>
        </dependency>
        <dependency>
            <groupId>org.apache.cxf</groupId>
            <artifactId>cxf-rt-frontend-jaxws</artifactId>
            <version>2.7.8</version>
        </dependency>
    </dependencies>
    <repositories>
        <repository>
            <id>cloudera</id>
            <name>Cloudera Repository</name>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
            <releases>
                <enabled>true</enabled>
            </releases>
            <snapshots>
                <enabled>false</enabled>
            </snapshots>
        </repository>
    </repositories>
</project>
modules/dnet-openaire-stats/branches/oaimpact/src/main/java/eu/dnetlib/data/mapreduce/hbase/statsExport/utils/HdfsWriter.java
package eu.dnetlib.data.mapreduce.hbase.statsExport.utils;

import java.io.BufferedWriter;
import java.io.OutputStreamWriter;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HdfsWriter {

    public static void write(String data, String filename) throws Exception {
        Configuration conf = new Configuration();
        FileSystem hdfs = FileSystem.get(conf);

        try {
            Path exportPath = new Path(hdfs.getUri() + filename);
            // create(path, false) fails if the file already exists;
            // to append data to a file, use fs.append(Path f) instead
            BufferedWriter br = new BufferedWriter(new OutputStreamWriter(hdfs.create(exportPath, false)));
            br.write(data);
            br.close();
        } catch (Exception e) {
            throw new Exception("Error while writing file ", e);
        }
    }

}
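A minimal caller sketch (hypothetical, not part of this revision; the HDFS path is a placeholder, and a Hadoop configuration is assumed to be on the classpath):

    public class HdfsWriterDemo {
        public static void main(String[] args) throws Exception {
            // Creates a new file on the configured filesystem; since the stream is
            // opened with overwrite = false, the call fails if the path exists.
            HdfsWriter.write("#od_______133::000d#!#Unknown Repository#\n", "/tmp/stats-export/datasource");
        }
    }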
creating branch for measuring oaimpact