Revision 41251
Added by Eri Katsari over 8 years ago
modules/dnet-openaire-lodexport/trunk/src/test/java/Test.java | ||
---|---|---|
4 | 4 |
import java.text.DateFormat; |
5 | 5 |
import java.text.ParseException; |
6 | 6 |
import java.text.SimpleDateFormat; |
7 |
import java.util.ArrayList; |
|
7 | 8 |
|
8 | 9 |
/** |
9 | 10 |
* Created by eri on 1/14/16. |
... | ... | |
11 | 12 |
public class Test { |
12 | 13 |
|
13 | 14 |
|
14 |
public static void main(String[] args) throws ParseException {
|
|
15 |
public static void main(String[] args) throws ParseException {
|
|
15 | 16 |
|
16 |
String date = "2015-05-26";
|
|
17 |
String date = "2015-05-26";
|
|
17 | 18 |
|
18 |
DateTimeFormatter formatter = DateTimeFormat.forPattern("yyyy-MM-dd");
|
|
19 |
System.out.println(formatter.parseDateTime(date));
|
|
20 |
DateFormat df = new SimpleDateFormat("yyyy-mm-dd");
|
|
21 |
System.out.println(df.parse(date));
|
|
19 |
DateTimeFormatter formatter = DateTimeFormat.forPattern("yyyy-MM-dd");
|
|
20 |
System.out.println(formatter.parseDateTime(date));
|
|
21 |
DateFormat df = new SimpleDateFormat("yyyy-mm-dd");
|
|
22 |
System.out.println(df.parse(date));
|
|
22 | 23 |
|
23 | 24 |
|
25 |
String value = " \"lala\" /lo {}li? la "; |
|
24 | 26 |
|
25 |
String value=" \"lala\" /lo {}li? la ";
|
|
27 |
value = "piii-33:/ .-_:/\\ aa;;a,,a ooo\n yy";
|
|
26 | 28 |
|
27 |
value= "piii-33:/ .-_:/\\ aa;;a,,a ooo\n yy"; |
|
28 | 29 |
|
29 |
|
|
30 |
|
|
31 |
value = value.replaceAll("[\"\\r\\\\;|-]", ""); |
|
32 |
/*value = value.replace("\\r", " "); |
|
30 |
value = value.replaceAll("[\"\\r\\\\;|-]", ""); |
|
31 |
/*value = value.replace("\\r", " "); |
|
33 | 32 |
value = value.replace("\\", ""); |
34 | 33 |
*/ |
35 | 34 |
|
... | ... | |
37 | 36 |
value = value.replace("\\", ""); |
38 | 37 |
*/ |
39 | 38 |
|
40 |
value = value.replaceAll("[^a-zA-Z0-9 .-_:/]+", " ");
|
|
39 |
value = value.replaceAll("[^a-zA-Z0-9 .-_:/]+", " ");
|
|
41 | 40 |
|
42 |
System.out.println(value); |
|
43 |
} |
|
41 |
System.out.println(value); |
|
42 |
|
|
43 |
ArrayList<String> l = new ArrayList<String>(); |
|
44 |
fillList(l); |
|
45 |
for (String s : l) { |
|
46 |
System.out.println(s); |
|
47 |
} |
|
48 |
|
|
49 |
} |
|
50 |
|
|
51 |
|
|
52 |
static void fillList(ArrayList list) { |
|
53 |
list.add("lala"); |
|
54 |
|
|
55 |
} |
|
56 |
|
|
44 | 57 |
} |
modules/dnet-openaire-lodexport/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/utils/NewSerializer.java | ||
---|---|---|
159 | 159 |
|
160 | 160 |
private static String buildDatasource(OafEntity data, String DELIM) { |
161 | 161 |
|
162 |
Datasource d = data.getDatasource();
|
|
162 |
Metadata metadata = data.getDatasource().getMetadata();
|
|
163 | 163 |
|
164 |
Metadata metadata = d.getMetadata(); |
|
165 |
|
|
166 |
|
|
167 | 164 |
String buff = getHeader(data, DELIM); |
168 | 165 |
|
169 | 166 |
//Datasourcetype |
... | ... | |
223 | 220 |
|
224 | 221 |
//languages |
225 | 222 |
String dataStr = new String(); |
223 |
|
|
226 | 224 |
for (StringField lang : metadata.getOdlanguagesList()) { |
227 |
dataStr += clean(lang.getValue()) + SEPERATOR;
|
|
228 |
} |
|
225 |
dataStr += lang.getValue() + SEPERATOR;}
|
|
226 |
|
|
229 | 227 |
buff += dataStr + DELIM; |
230 | 228 |
|
231 | 229 |
|
232 | 230 |
// Content type |
233 | 231 |
dataStr=" "; |
234 | 232 |
for (StringField c : metadata.getOdcontenttypesList()) { |
235 |
dataStr += clean(c.getValue()) + SEPERATOR;
|
|
233 |
dataStr += c.getValue() + SEPERATOR;
|
|
236 | 234 |
} |
237 | 235 |
buff += dataStr + DELIM; |
238 | 236 |
|
239 | 237 |
//Access info package |
240 | 238 |
dataStr = " "; |
241 | 239 |
for (StringField c : metadata.getAccessinfopackageList()) { |
242 |
dataStr += clean(c.getValue()) + SEPERATOR;
|
|
240 |
dataStr += c.getValue() + SEPERATOR;
|
|
243 | 241 |
} |
244 | 242 |
|
245 | 243 |
buff += dataStr + DELIM; |
... | ... | |
286 | 284 |
//Policies |
287 | 285 |
dataStr = " "; |
288 | 286 |
for (FieldTypeProtos.KeyValue property : metadata.getPoliciesList()) { |
289 |
dataStr += clean(property.getValue()) + SEPERATOR;
|
|
287 |
dataStr += property.getValue() + SEPERATOR;
|
|
290 | 288 |
} |
291 | 289 |
buff += dataStr + DELIM; |
292 | 290 |
|
293 | 291 |
buff += getTrust(data, DELIM); |
294 |
|
|
295 | 292 |
return buff; |
296 |
|
|
297 | 293 |
} |
298 | 294 |
|
299 | 295 |
|
... | ... | |
314 | 310 |
String dataStr = new String(); |
315 | 311 |
|
316 | 312 |
for (String s : split) { |
317 |
dataStr += clean(s) + SEPERATOR;
|
|
313 |
dataStr += s.replace(DELIM," ")+ SEPERATOR;
|
|
318 | 314 |
} |
315 |
|
|
319 | 316 |
buff += dataStr + DELIM; |
320 |
|
|
321 | 317 |
//logourl |
322 |
buff += clean(metadata.getLogourl().getValue()) + DELIM;
|
|
318 |
buff += metadata.getLogourl().getValue().replace(DELIM," ") + DELIM;
|
|
323 | 319 |
// `country`, |
324 | 320 |
buff += metadata.getCountry().getClassid() + DELIM; |
325 | 321 |
|
... | ... | |
339 | 335 |
|
340 | 336 |
private static String buildResult(OafEntity data,String DELIM) { |
341 | 337 |
|
342 |
Result result = data.getResult(); |
|
343 |
Result.Metadata metadata = result.getMetadata(); |
|
338 |
Result.Metadata metadata = data.getResult().getMetadata(); |
|
344 | 339 |
|
345 |
|
|
346 | 340 |
String buff = getHeader(data,DELIM); |
347 | 341 |
|
348 | 342 |
|
... | ... | |
357 | 351 |
|
358 | 352 |
// pubtitle |
359 | 353 |
buff += clean(dataStr) + DELIM; |
354 |
|
|
360 | 355 |
// date of acceptance CHANGED THIS TO DATE FORMAT |
361 | 356 |
buff += metadata.getDateofacceptance().getValue() + DELIM; |
362 | 357 |
|
... | ... | |
369 | 364 |
for (StructuredProperty p : data.getPidList()) { |
370 | 365 |
dataStr += clean(p.getValue()) + SEPERATOR; |
371 | 366 |
} |
367 |
|
|
372 | 368 |
buff += dataStr + DELIM; |
373 | 369 |
|
374 | 370 |
//language |
... | ... | |
376 | 372 |
|
377 | 373 |
// RelevantDate |
378 | 374 |
dataStr = " "; |
375 |
|
|
379 | 376 |
for (StructuredProperty p : metadata.getRelevantdateList()) { |
380 | 377 |
dataStr += p.getValue(); |
381 | 378 |
break; |
... | ... | |
407 | 404 |
|
408 | 405 |
buff += dataStr + DELIM; |
409 | 406 |
|
410 |
|
|
411 | 407 |
//TODO Format |
412 | 408 |
buff += " " + DELIM; |
413 | 409 |
|
... | ... | |
425 | 421 |
buff += " " + DELIM; |
426 | 422 |
|
427 | 423 |
//Best License |
428 |
buff += getBestLicense(result) + DELIM;
|
|
424 |
buff += getBestLicense(data.getResult()) + DELIM;
|
|
429 | 425 |
//Description |
430 | 426 |
dataStr = " "; |
431 | 427 |
|
... | ... | |
437 | 433 |
buff += dataStr + DELIM; |
438 | 434 |
|
439 | 435 |
//Journal |
440 |
buff += metadata.getJournal().getName() + DELIM; //#null#!
|
|
436 |
buff += clean(metadata.getJournal().getName()) + DELIM; //#null#!
|
|
441 | 437 |
|
442 | 438 |
|
443 | 439 |
// TODO ERI SOS : HERE IN GET JOURNAL. GET DATA INFO I CAN FIND PROVENANCE AND SIMILARITY |
444 | 440 |
|
445 |
|
|
446 | 441 |
// TODO isRelatedTo |
447 | 442 |
|
448 | 443 |
// resource type |
... | ... | |
466 | 461 |
// type |
467 | 462 |
buff += metadata.getResulttype().getClassname() + DELIM; |
468 | 463 |
|
469 |
|
|
470 | 464 |
buff += getTrust(data,DELIM); |
471 | 465 |
|
472 | 466 |
|
... | ... | |
477 | 471 |
private static String buildProject(OafEntity data,String DELIM) { |
478 | 472 |
|
479 | 473 |
|
480 |
Project project = data.getProject(); |
|
481 |
Project.Metadata metadata = project.getMetadata(); |
|
482 | 474 |
|
475 |
Project.Metadata metadata = data.getProject().getMetadata(); |
|
483 | 476 |
|
484 | 477 |
String buff = getHeader(data,DELIM); |
485 | 478 |
|
486 |
|
|
487 | 479 |
//Code |
488 | 480 |
buff += metadata.getCode().getValue() + DELIM; |
489 | 481 |
// `Websiteurl`, |
... | ... | |
592 | 584 |
dataStr = " "; |
593 | 585 |
for (StructuredProperty s : data.getPidList()) { |
594 | 586 |
|
595 |
dataStr += s.getValue() + ";";
|
|
587 |
dataStr += cleanId(s.getValue()) + ";";
|
|
596 | 588 |
} |
597 | 589 |
buff += dataStr + DELIM; |
598 | 590 |
|
... | ... | |
633 | 625 |
public static String cleanId(String value) { |
634 | 626 |
if (value != null) { |
635 | 627 |
// DO NOT CHANGE THIS: IT REMOVES ID PREFIX ( 5|datacite____:: |
636 |
|
|
637 | 628 |
// to datacite____:: ) |
638 | 629 |
// AND REPLACES OCCURRENCES OF DELIM CHARS IN DATA |
639 | 630 |
value = value.replaceFirst(".*\\|", ""); |
640 |
value = value.replace("\n", ""); |
|
631 |
value = value.replace("\n", " ");
|
|
641 | 632 |
} |
642 | 633 |
|
643 | 634 |
return value; |
... | ... | |
647 | 638 |
//TODO make them in pairs |
648 | 639 |
private static void getDedups(OafEntity valueEntity,String DELIM,ArrayList<String> returnList) { |
649 | 640 |
|
650 |
|
|
651 | 641 |
if (!valueEntity.getChildrenList().isEmpty()) { |
652 | 642 |
|
653 | 643 |
String header = "dedup" + DELIM + valueEntity.getType().name() + DELIM + |
... | ... | |
679 | 669 |
return " " + DELIM; |
680 | 670 |
} |
681 | 671 |
|
682 |
String year = split[0]; |
|
683 | 672 |
|
684 |
year = cleanNumber(year);
|
|
673 |
return split[0] + DELIM;
|
|
685 | 674 |
|
686 |
return year + DELIM; |
|
687 | 675 |
|
688 |
|
|
689 | 676 |
} |
690 | 677 |
|
691 |
private static String cleanNumber(String number) { |
|
692 |
number = number.replaceAll("[^A-Za-z0-9:,____]", ""); |
|
693 | 678 |
|
694 |
return number; |
|
695 |
} |
|
696 | 679 |
|
697 |
|
|
698 | 680 |
private static String getBestLicense(Result result) { |
699 | 681 |
FieldTypeProtos.Qualifier bestLicense = null; |
700 | 682 |
LicenseComparator lc = new LicenseComparator(); |
... | ... | |
716 | 698 |
// TODO DO NOT CHANGE THIS: IT REMOVES ID PREFIX ( 5|datacite____:: |
717 | 699 |
// to datacite____:: ) |
718 | 700 |
// AND REPLACES OCCURRENCES OF DELIM CHARS IN DATA |
719 |
value = value.replaceFirst(".*\\|", "");
|
|
701 |
/* value = value.replaceFirst(".*\\|", "");*/
|
|
720 | 702 |
value = value.replaceAll("[\"\\r\\\\;]", ""); |
721 | 703 |
value = value.replace(SEPERATOR, " "); |
722 |
value = value.replaceAll("[^a-zA-Z0-9 .-_:/]+", ""); |
|
704 |
value = value.replaceAll("[^a-zA-Z0-9 .-_:/]+", " ");
|
|
723 | 705 |
} |
724 | 706 |
|
725 | 707 |
return value; |
726 | 708 |
|
727 | 709 |
} |
728 | 710 |
|
729 |
|
|
730 |
|
|
731 | 711 |
} |
modules/dnet-openaire-lodexport/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/LodMapper.java | ||
---|---|---|
1 | 1 |
package eu.dnetlib.data.mapreduce.hbase.lodExport; |
2 | 2 |
|
3 | 3 |
import com.google.protobuf.InvalidProtocolBufferException; |
4 |
import com.sun.tools.internal.xjc.reader.gbind.ElementSets; |
|
4 | 5 |
import eu.dnetlib.data.mapreduce.hbase.index.config.EntityConfigTable; |
5 | 6 |
import eu.dnetlib.data.mapreduce.hbase.index.config.IndexConfig; |
6 | 7 |
import eu.dnetlib.data.mapreduce.hbase.index.config.LinkDescriptor; |
... | ... | |
12 | 13 |
import eu.dnetlib.data.proto.OafProtos.OafRelOrBuilder; |
13 | 14 |
import eu.dnetlib.data.proto.RelTypeProtos.SubRelType; |
14 | 15 |
import eu.dnetlib.data.proto.TypeProtos.Type; |
16 |
import org.apache.hadoop.fs.Syncable; |
|
15 | 17 |
import org.apache.hadoop.hbase.client.Result; |
16 | 18 |
import org.apache.hadoop.hbase.io.ImmutableBytesWritable; |
17 | 19 |
import org.apache.hadoop.hbase.mapreduce.TableMapper; |
... | ... | |
49 | 51 |
PROJECT, |
50 | 52 |
DATASOURCE, |
51 | 53 |
PERSON, |
52 |
ORGANIZATION |
|
54 |
ORGANIZATION, |
|
55 |
DELETED_BY_INFERENCE, |
|
56 |
NOT_DELETED_BY_INFERENCE, |
|
57 |
TOTAL_ENTITIES, |
|
58 |
TOTAL_RELATIONS |
|
59 |
|
|
53 | 60 |
} |
54 | 61 |
|
55 | 62 |
; |
... | ... | |
79 | 86 |
// TODO set this only when the configuration file has include dups |
80 | 87 |
// to false for Results |
81 | 88 |
// or else we are going to get deleted-by-inference entries |
82 |
if (!deletedByInference(oaf)) |
|
89 |
// if (!deletedByInference(oaf)) |
|
90 |
if(deletedByInference(oaf)) |
|
91 |
{context.getCounter(ENTITIES_COUNTER.DELETED_BY_INFERENCE).increment(1);} |
|
92 |
else |
|
93 |
{context.getCounter(ENTITIES_COUNTER.NOT_DELETED_BY_INFERENCE).increment(1);} |
|
94 |
context.getCounter(ENTITIES_COUNTER.TOTAL_ENTITIES).increment(1); |
|
83 | 95 |
|
84 | 96 |
// entityConfigTable.includeDuplicates(type)) { |
85 | 97 |
//TODO eri: here allow entities deletedbyinfer |
... | ... | |
146 | 158 |
} |
147 | 159 |
|
148 | 160 |
private void emitEntity(Context context, Oaf oaf, Type type, Oaf.Builder oafBuilder) { |
149 |
String serialized = serializer.Serialize(oafBuilder.build(),DELIM); |
|
161 |
String serialized = serializer.Serialize(oafBuilder.build(), DELIM);
|
|
150 | 162 |
|
151 |
if (serialized != null && !oaf.getEntity().getId().startsWith("dedup")) {
|
|
163 |
if (serialized != null && !oaf.getEntity().getId().contains("dedup")) {
|
|
152 | 164 |
try { |
153 | 165 |
Text TextKeyOut = new Text("entities"); |
154 | 166 |
context.write((TextKeyOut), new ImmutableBytesWritable(serialized.getBytes())); |
... | ... | |
171 | 183 |
// Existing Hbase relations are generated here |
172 | 184 |
if (entityConfigTable.getDescriptors(type) != null && !entityConfigTable.getDescriptors(type).isEmpty()) { |
173 | 185 |
for (LinkDescriptor ld : entityConfigTable.getDescriptors(type)) { |
174 |
|
|
175 | 186 |
try { |
176 | 187 |
|
177 | 188 |
final Map<byte[], byte[]> columnMap = result.getFamilyMap(Bytes.toBytes(ld.getRelDescriptor().getIt())); |
... | ... | |
180 | 191 |
|
181 | 192 |
for (OafRel rel : relOaf) { |
182 | 193 |
builder.getEntityBuilder().addCachedRel(rel); |
183 |
|
|
184 | 194 |
try { |
185 | 195 |
Text TextKeyOut = new Text("relations"); |
186 |
String buff = serializer.Serialize(rel,DELIM); |
|
196 |
String buff = serializer.Serialize(rel, DELIM);
|
|
187 | 197 |
context.write((TextKeyOut), new ImmutableBytesWritable(buff.getBytes())); |
188 | 198 |
|
199 |
relOaf.clear(); |
|
200 |
relOaf = null; |
|
201 |
context.getCounter(ENTITIES_COUNTER.TOTAL_RELATIONS).increment(1); |
|
202 |
|
|
189 | 203 |
} catch (Exception e) { |
190 | 204 |
log.error("Error while writing Relation Proto to M/R output", e); |
191 | 205 |
} |
... | ... | |
200 | 214 |
} |
201 | 215 |
} |
202 | 216 |
|
203 |
List<String> relationsList = serializer.extractRelations(oaf,DELIM);
|
|
204 |
|
|
217 |
if (!oaf.getEntity().getId().startsWith("dedup")) {
|
|
218 |
List<String> relationsList = serializer.extractRelations(oaf, DELIM); |
|
205 | 219 |
for (String rel : relationsList) { |
206 | 220 |
try { |
207 | 221 |
Text TextKeyOut = new Text("relations"); |
208 |
|
|
209 |
if (oaf.getEntity().getId().startsWith("dedup")) { |
|
210 |
|
|
211 |
if (rel.startsWith("dedup")) { |
|
212 |
context.write((TextKeyOut), new ImmutableBytesWritable(rel.getBytes())); |
|
213 |
} |
|
214 |
} else { |
|
215 |
context.write((TextKeyOut), new ImmutableBytesWritable(rel.getBytes())); |
|
216 |
} |
|
217 |
|
|
222 |
context.write((TextKeyOut), new ImmutableBytesWritable(rel.getBytes())); |
|
223 |
context.getCounter(ENTITIES_COUNTER.TOTAL_RELATIONS).increment(1); |
|
218 | 224 |
} catch (Exception e) { |
219 | 225 |
log.error("Error writing relations to output : " + rel); |
220 | 226 |
} |
221 | 227 |
} |
222 |
relationsList.clear(); |
|
223 |
relationsList=null;
|
|
224 |
|
|
228 |
relationsList.clear();
|
|
229 |
relationsList = null;
|
|
230 |
} |
|
225 | 231 |
} |
226 | 232 |
|
227 | 233 |
private ArrayList<OafRel> decodeRelation(final Oaf body, final Context context, Map<byte[], byte[]> columnMap, final LinkDescriptor ld) throws IOException, InterruptedException { |
Also available in: Unified diff
new optimized serializer