Project

General

Profile

« Previous | Next » 

Revision 41251

Added by Eri Katsari over 8 years ago

new optimized serializer

View differences:

modules/dnet-openaire-lodexport/trunk/src/test/java/Test.java
4 4
import java.text.DateFormat;
5 5
import java.text.ParseException;
6 6
import java.text.SimpleDateFormat;
7
import java.util.ArrayList;
7 8

  
8 9
/**
9 10
 * Created by eri on 1/14/16.
......
11 12
public class Test {
12 13

  
13 14

  
14
	public static void main(String[] args) throws ParseException {
15
    public static void main(String[] args) throws ParseException {
15 16

  
16
		String date = "2015-05-26";
17
        String date = "2015-05-26";
17 18

  
18
		DateTimeFormatter formatter = DateTimeFormat.forPattern("yyyy-MM-dd");
19
		System.out.println(formatter.parseDateTime(date));
20
		DateFormat df = new SimpleDateFormat("yyyy-mm-dd");
21
		System.out.println(df.parse(date));
19
        DateTimeFormatter formatter = DateTimeFormat.forPattern("yyyy-MM-dd");
20
        System.out.println(formatter.parseDateTime(date));
21
        DateFormat df = new SimpleDateFormat("yyyy-mm-dd");
22
        System.out.println(df.parse(date));
22 23

  
23 24

  
25
        String value = " \"lala\" /lo {}li?   la ";
24 26

  
25
		String value=" \"lala\" /lo {}li?   la ";
27
        value = "piii-33:/ .-_:/\\ aa;;a,,a ooo\n yy";
26 28

  
27
		value= "piii-33:/ .-_:/\\ aa;;a,,a ooo\n yy";
28 29

  
29

  
30

  
31
		value = value.replaceAll("[\"\\r\\\\;|-]", "");
32
		/*value = value.replace("\\r", " ");
30
        value = value.replaceAll("[\"\\r\\\\;|-]", "");
31
        /*value = value.replace("\\r", " ");
33 32
		value = value.replace("\\", "");
34 33
		*/
35 34

  
......
37 36
	     value = value.replace("\\", "");
38 37
*/
39 38

  
40
		value = value.replaceAll("[^a-zA-Z0-9 .-_:/]+", " ");
39
        value = value.replaceAll("[^a-zA-Z0-9 .-_:/]+", " ");
41 40

  
42
		System.out.println(value);
43
	}
41
        System.out.println(value);
42

  
43
        ArrayList<String> l = new ArrayList<String>();
44
        fillList(l);
45
        for (String s : l) {
46
            System.out.println(s);
47
        }
48

  
49
    }
50

  
51

  
52
    static void fillList(ArrayList list) {
53
        list.add("lala");
54

  
55
    }
56

  
44 57
}
modules/dnet-openaire-lodexport/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/utils/NewSerializer.java
159 159

  
160 160
    private static String buildDatasource(OafEntity data, String DELIM) {
161 161

  
162
        Datasource d = data.getDatasource();
162
        Metadata metadata = data.getDatasource().getMetadata();
163 163

  
164
        Metadata metadata = d.getMetadata();
165

  
166

  
167 164
        String buff = getHeader(data, DELIM);
168 165

  
169 166
        //Datasourcetype
......
223 220

  
224 221
        //languages
225 222
        String dataStr = new String();
223

  
226 224
        for (StringField lang : metadata.getOdlanguagesList()) {
227
            dataStr += clean(lang.getValue()) + SEPERATOR;
228
        }
225
            dataStr += lang.getValue() + SEPERATOR;}
226

  
229 227
        buff += dataStr + DELIM;
230 228

  
231 229

  
232 230
        // Content type
233 231
        dataStr=" ";
234 232
        for (StringField c : metadata.getOdcontenttypesList()) {
235
            dataStr += clean(c.getValue()) + SEPERATOR;
233
            dataStr += c.getValue() + SEPERATOR;
236 234
        }
237 235
        buff += dataStr + DELIM;
238 236

  
239 237
        //Access info package
240 238
          dataStr = " ";
241 239
        for (StringField c : metadata.getAccessinfopackageList()) {
242
            dataStr += clean(c.getValue()) + SEPERATOR;
240
            dataStr += c.getValue() + SEPERATOR;
243 241
        }
244 242

  
245 243
        buff += dataStr + DELIM;
......
286 284
        //Policies
287 285
          dataStr = " ";
288 286
        for (FieldTypeProtos.KeyValue property : metadata.getPoliciesList()) {
289
            dataStr += clean(property.getValue()) + SEPERATOR;
287
            dataStr += property.getValue() + SEPERATOR;
290 288
        }
291 289
        buff += dataStr + DELIM;
292 290

  
293 291
        buff += getTrust(data, DELIM);
294

  
295 292
        return buff;
296

  
297 293
    }
298 294

  
299 295

  
......
314 310
        String dataStr = new String();
315 311

  
316 312
        for (String s : split) {
317
            dataStr += clean(s) + SEPERATOR;
313
            dataStr += s.replace(DELIM," ")+ SEPERATOR;
318 314
        }
315

  
319 316
        buff += dataStr + DELIM;
320

  
321 317
        //logourl
322
        buff += clean(metadata.getLogourl().getValue()) + DELIM;
318
        buff += metadata.getLogourl().getValue().replace(DELIM," ") + DELIM;
323 319
        // `country`,
324 320
        buff += metadata.getCountry().getClassid() + DELIM;
325 321

  
......
339 335

  
340 336
    private static String buildResult(OafEntity data,String DELIM) {
341 337

  
342
        Result result = data.getResult();
343
        Result.Metadata metadata = result.getMetadata();
338
        Result.Metadata metadata = data.getResult().getMetadata();
344 339

  
345

  
346 340
        String buff = getHeader(data,DELIM);
347 341

  
348 342

  
......
357 351

  
358 352
        //  pubtitle
359 353
        buff += clean(dataStr) + DELIM;
354

  
360 355
        // date of acceptance CHANGED THIS TO DATE FORMAT
361 356
        buff += metadata.getDateofacceptance().getValue() + DELIM;
362 357

  
......
369 364
        for (StructuredProperty p : data.getPidList()) {
370 365
            dataStr += clean(p.getValue()) + SEPERATOR;
371 366
        }
367

  
372 368
        buff += dataStr + DELIM;
373 369

  
374 370
        //language
......
376 372

  
377 373
        // RelevantDate
378 374
        dataStr = " ";
375

  
379 376
        for (StructuredProperty p : metadata.getRelevantdateList()) {
380 377
            dataStr += p.getValue();
381 378
            break;
......
407 404

  
408 405
        buff += dataStr + DELIM;
409 406

  
410

  
411 407
        //TODO Format     
412 408
        buff += " " + DELIM;
413 409

  
......
425 421
        buff += " " + DELIM;
426 422

  
427 423
        //Best License
428
        buff += getBestLicense(result) + DELIM;
424
        buff += getBestLicense(data.getResult()) + DELIM;
429 425
        //Description
430 426
        dataStr = " ";
431 427

  
......
437 433
        buff += dataStr + DELIM;
438 434

  
439 435
        //Journal  
440
        buff += metadata.getJournal().getName() + DELIM;  //#null#!
436
        buff += clean(metadata.getJournal().getName()) + DELIM;  //#null#!
441 437

  
442 438

  
443 439
        // TODO ERI SOS : HERE IN GET JOURNAL. GET DATA INFO I CAN FIND PROVENANCE AND SIMILARITY
444 440

  
445

  
446 441
        // TODO isRelatedTo
447 442

  
448 443
        //   resource type
......
466 461
        // type
467 462
        buff += metadata.getResulttype().getClassname() + DELIM;
468 463

  
469

  
470 464
        buff += getTrust(data,DELIM);
471 465

  
472 466

  
......
477 471
    private static String buildProject(OafEntity data,String DELIM) {
478 472

  
479 473

  
480
        Project project = data.getProject();
481
        Project.Metadata metadata = project.getMetadata();
482 474

  
475
        Project.Metadata metadata = data.getProject().getMetadata();
483 476

  
484 477
        String buff = getHeader(data,DELIM);
485 478

  
486

  
487 479
        //Code
488 480
        buff += metadata.getCode().getValue() + DELIM;
489 481
        // `Websiteurl`,
......
592 584
        dataStr = " ";
593 585
        for (StructuredProperty s : data.getPidList()) {
594 586

  
595
            dataStr += s.getValue() + ";";
587
            dataStr += cleanId(s.getValue()) + ";";
596 588
        }
597 589
        buff += dataStr + DELIM;
598 590

  
......
633 625
    public static String cleanId(String value) {
634 626
        if (value != null) {
635 627
            //   DO NOT CHANGE THIS: IT REMOVES ID PREFIX ( 5|datacite____::
636

  
637 628
            // to datacite____:: )
638 629
            // AND REPLACES OCCURRENCES OF DELIM CHARS IN DATA
639 630
            value = value.replaceFirst(".*\\|", "");
640
            value = value.replace("\n", "");
631
            value = value.replace("\n", " ");
641 632
        }
642 633

  
643 634
        return value;
......
647 638
    //TODO make them in pairs
648 639
    private static void getDedups(OafEntity valueEntity,String DELIM,ArrayList<String> returnList) {
649 640

  
650

  
651 641
        if (!valueEntity.getChildrenList().isEmpty()) {
652 642

  
653 643
            String header = "dedup" + DELIM + valueEntity.getType().name() + DELIM +
......
679 669
            return " " + DELIM;
680 670
        }
681 671

  
682
        String year = split[0];
683 672

  
684
        year = cleanNumber(year);
673
        return split[0] + DELIM;
685 674

  
686
        return year + DELIM;
687 675

  
688

  
689 676
    }
690 677

  
691
    private static String cleanNumber(String number) {
692
        number = number.replaceAll("[^A-Za-z0-9:,____]", "");
693 678

  
694
        return number;
695
    }
696 679

  
697

  
698 680
    private static String getBestLicense(Result result) {
699 681
        FieldTypeProtos.Qualifier bestLicense = null;
700 682
        LicenseComparator lc = new LicenseComparator();
......
716 698
            // TODO DO NOT CHANGE THIS: IT REMOVES ID PREFIX ( 5|datacite____::
717 699
            // to datacite____:: )
718 700
            // AND REPLACES OCCURRENCES OF DELIM CHARS IN DATA
719
            value = value.replaceFirst(".*\\|", "");
701
           /* value = value.replaceFirst(".*\\|", "");*/
720 702
            value = value.replaceAll("[\"\\r\\\\;]", "");
721 703
            value = value.replace(SEPERATOR, " ");
722
            value = value.replaceAll("[^a-zA-Z0-9 .-_:/]+", "");
704
            value = value.replaceAll("[^a-zA-Z0-9 .-_:/]+", " ");
723 705
        }
724 706

  
725 707
        return value;
726 708

  
727 709
    }
728 710

  
729

  
730

  
731 711
}
modules/dnet-openaire-lodexport/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/LodMapper.java
1 1
package eu.dnetlib.data.mapreduce.hbase.lodExport;
2 2

  
3 3
import com.google.protobuf.InvalidProtocolBufferException;
4
import com.sun.tools.internal.xjc.reader.gbind.ElementSets;
4 5
import eu.dnetlib.data.mapreduce.hbase.index.config.EntityConfigTable;
5 6
import eu.dnetlib.data.mapreduce.hbase.index.config.IndexConfig;
6 7
import eu.dnetlib.data.mapreduce.hbase.index.config.LinkDescriptor;
......
12 13
import eu.dnetlib.data.proto.OafProtos.OafRelOrBuilder;
13 14
import eu.dnetlib.data.proto.RelTypeProtos.SubRelType;
14 15
import eu.dnetlib.data.proto.TypeProtos.Type;
16
import org.apache.hadoop.fs.Syncable;
15 17
import org.apache.hadoop.hbase.client.Result;
16 18
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
17 19
import org.apache.hadoop.hbase.mapreduce.TableMapper;
......
49 51
        PROJECT,
50 52
        DATASOURCE,
51 53
        PERSON,
52
        ORGANIZATION
54
        ORGANIZATION,
55
        DELETED_BY_INFERENCE,
56
        NOT_DELETED_BY_INFERENCE,
57
        TOTAL_ENTITIES,
58
        TOTAL_RELATIONS
59

  
53 60
    }
54 61

  
55 62
    ;
......
79 86
            // TODO set this only when the configuration file has include dups
80 87
            // to false for Results
81 88
            // or else we're going to get deleted by inference entries
82
            if (!deletedByInference(oaf))
89
           // if (!deletedByInference(oaf))
90
if(deletedByInference(oaf))
91
{context.getCounter(ENTITIES_COUNTER.DELETED_BY_INFERENCE).increment(1);}
92
else
93
{context.getCounter(ENTITIES_COUNTER.NOT_DELETED_BY_INFERENCE).increment(1);}
94
 context.getCounter(ENTITIES_COUNTER.TOTAL_ENTITIES).increment(1);
83 95

  
84 96
            // entityConfigTable.includeDuplicates(type)) {
85 97
            //TODO eri: here allow entities deletedbyinfer
......
146 158
    }
147 159

  
148 160
    private void emitEntity(Context context, Oaf oaf, Type type, Oaf.Builder oafBuilder) {
149
        String serialized = serializer.Serialize(oafBuilder.build(),DELIM);
161
        String serialized = serializer.Serialize(oafBuilder.build(), DELIM);
150 162

  
151
        if (serialized != null && !oaf.getEntity().getId().startsWith("dedup")) {
163
        if (serialized != null && !oaf.getEntity().getId().contains("dedup")) {
152 164
            try {
153 165
                Text TextKeyOut = new Text("entities");
154 166
                context.write((TextKeyOut), new ImmutableBytesWritable(serialized.getBytes()));
......
171 183
            // Existing Hbase relations are generated here
172 184
            if (entityConfigTable.getDescriptors(type) != null && !entityConfigTable.getDescriptors(type).isEmpty()) {
173 185
                for (LinkDescriptor ld : entityConfigTable.getDescriptors(type)) {
174

  
175 186
                    try {
176 187

  
177 188
                        final Map<byte[], byte[]> columnMap = result.getFamilyMap(Bytes.toBytes(ld.getRelDescriptor().getIt()));
......
180 191

  
181 192
                        for (OafRel rel : relOaf) {
182 193
                            builder.getEntityBuilder().addCachedRel(rel);
183

  
184 194
                            try {
185 195
                                Text TextKeyOut = new Text("relations");
186
                                String buff = serializer.Serialize(rel,DELIM);
196
                                String buff = serializer.Serialize(rel, DELIM);
187 197
                                context.write((TextKeyOut), new ImmutableBytesWritable(buff.getBytes()));
188 198

  
199
                                relOaf.clear();
200
                                relOaf = null;
201
                                context.getCounter(ENTITIES_COUNTER.TOTAL_RELATIONS).increment(1);
202

  
189 203
                            } catch (Exception e) {
190 204
                                log.error("Error while writing Relation Proto to M/R output", e);
191 205
                            }
......
200 214
            }
201 215
        }
202 216

  
203
        List<String> relationsList = serializer.extractRelations(oaf,DELIM);
204

  
217
        if (!oaf.getEntity().getId().startsWith("dedup")) {
218
        List<String> relationsList = serializer.extractRelations(oaf, DELIM);
205 219
        for (String rel : relationsList) {
206 220
            try {
207 221
                Text TextKeyOut = new Text("relations");
208

  
209
                if (oaf.getEntity().getId().startsWith("dedup")) {
210

  
211
                    if (rel.startsWith("dedup")) {
212
                        context.write((TextKeyOut), new ImmutableBytesWritable(rel.getBytes()));
213
                    }
214
                } else {
215
                    context.write((TextKeyOut), new ImmutableBytesWritable(rel.getBytes()));
216
                }
217

  
222
                context.write((TextKeyOut), new ImmutableBytesWritable(rel.getBytes()));
223
                context.getCounter(ENTITIES_COUNTER.TOTAL_RELATIONS).increment(1);
218 224
            } catch (Exception e) {
219 225
                log.error("Error writing relations to output : " + rel);
220 226
            }
221 227
        }
222
        relationsList.clear();
223
        relationsList=null;
224

  
228
            relationsList.clear();
229
            relationsList = null;
230
        }
225 231
    }
226 232

  
227 233
    private ArrayList<OafRel> decodeRelation(final Oaf body, final Context context, Map<byte[], byte[]> columnMap, final LinkDescriptor ld) throws IOException, InterruptedException {

Also available in: Unified diff