Project

General

Profile

« Previous | Next » 

Revision 42280

Added by Eri Katsari about 8 years ago

Fix for "" in person ids; fix for duplicates in relations; updated the delimiter to "!".

View differences:

modules/dnet-openaire-lodexport/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/utils/NewSerializer.java
17 17

  
18 18
import java.util.ArrayList;
19 19
import java.util.List;
20
import java.util.Set;
20 21

  
21 22
/**
22 23
 * @author eri Simple serializer that parses input Oaf Protos and prepares them
......
64 65
    }
65 66

  
66 67

  
67
    public static void extractRelations(Oaf oaf, String DELIM, List<String> relations) {
68
    public static void extractRelations(Oaf oaf, String DELIM, Set<String> relations) {
68 69
        OafEntity valueEntity = oaf.getEntity();
69 70
        switch (valueEntity.getType()) {
70 71
            case result:
......
87 88
    }
88 89

  
89 90
    public static String Serialize(OafRel Rel, String DELIM) {
90
        StringBuilder buff = new StringBuilder();
91
        StringBuilder buff ;
91 92

  
92 93
        switch (Rel.getRelType()) {
93 94
            case datasourceOrganization:
94

  
95
                buff = new StringBuilder();
95 96
                buff.append(Rel.getRelType().name()).append(DELIM).append("datasource").append(DELIM).append(cleanId(Rel.getSource())).append(DELIM)
96
                        .append("organization").append(DELIM).append(cleanId(Rel.getTarget())).append(DELIM);
97
                        .append("organization").append(DELIM).append(cleanId(Rel.getTarget())).append(DELIM).append("\n");
98
                return buff.toString();
97 99
            case resultResult:
100
                buff = new StringBuilder();
98 101
                buff.append(Rel.getRelType().name()).append(DELIM).append("result").append(DELIM).append(cleanId(Rel.getSource())).append(DELIM)
99 102
                        .append("result").append(DELIM).append(cleanId(Rel.getTarget())).append(DELIM);
103
                return buff.toString();
100 104
            case personPerson:
101

  
105
                buff = new StringBuilder();
102 106
                buff.append(Rel.getRelType().name()).append(DELIM).append("person").append(DELIM).append(cleanId(Rel.getSource())).append(DELIM)
103 107
                        .append("person").append(DELIM).append(cleanId(Rel.getTarget())).append(DELIM);
104

  
108
                return buff.toString();
105 109
            case organizationOrganization:
106

  
110
                buff = new StringBuilder();
107 111
                buff.append(Rel.getRelType().name()).append(DELIM).append("organization").append(DELIM).append(cleanId(Rel.getSource())).append(DELIM)
108 112
                        .append("organization").append(DELIM).append(cleanId(Rel.getTarget())).append(DELIM);
109

  
110

  
113
                return buff.toString();
111 114
            case personResult:
112

  
115
                buff = new StringBuilder();
113 116
                buff.append(Rel.getRelType().name()).append(DELIM).append("person").append(DELIM).append(cleanId(Rel.getSource())).append(DELIM)
114 117
                        .append("result").append(DELIM).append(cleanId(Rel.getTarget())).append(DELIM);
115

  
116

  
118
                return buff.toString();
117 119
            case projectOrganization:
120
                buff = new StringBuilder();
118 121
                buff.append(Rel.getRelType().name()).append(DELIM).append("project").append(DELIM).append(cleanId(Rel.getSource())).append(DELIM)
119 122
                        .append("organization").append(DELIM).append(cleanId(Rel.getTarget())).append(DELIM);
120

  
121

  
123
                return buff.toString();
122 124
            case projectPerson:
123

  
125
                buff = new StringBuilder();
124 126
                buff.append(Rel.getRelType().name()).append(DELIM).append("project").append(DELIM).append(cleanId(Rel.getSource())).append(DELIM)
125 127
                        .append("person").append(DELIM).append(cleanId(Rel.getTarget())).append(DELIM);
126

  
128
                return buff.toString();
127 129
            case resultOrganization:
128

  
130
                buff = new StringBuilder();
129 131
                buff.append(Rel.getRelType().name()).append(DELIM).append("result").append(DELIM).append(cleanId(Rel.getSource())).append(DELIM)
130 132
                        .append("organization").append(DELIM).append(cleanId(Rel.getTarget())).append(DELIM);
131

  
133
                return buff.toString();
132 134
//TODO maybe switch them???
133 135

  
134 136
            case resultProject:
135

  
137
                buff = new StringBuilder();
136 138
                buff.append(Rel.getRelType().name()).append(DELIM).append("result").append(DELIM).append(cleanId(Rel.getSource())).append(DELIM)
137 139
                        .append("project").append(DELIM).append(cleanId(Rel.getTarget())).append(DELIM);
138

  
139

  
140
                return buff.toString();
140 141
            default:
141
                buff = null;
142 142

  
143 143
        }
144
        return buff.toString();
145 144

  
145
        return "";
146

  
146 147
    }
147 148

  
148 149
    private static String getHeader(OafEntity data, String DELIM) {
......
190 191
        buff.append(clean(metadata.getDatasourcetype().getClassname()) + DELIM);
191 192

  
192 193
        //Openairecompatibility
193
        buff.append(clean(metadata.getOpenairecompatibility().getClassname()) + DELIM;
194
        buff.append(clean(metadata.getOpenairecompatibility().getClassname()) + DELIM);
194 195

  
195 196
        //OfficialName
196 197
        buff.append(clean(metadata.getOfficialname().getValue()) + DELIM);
......
342 343

  
343 344
        buff.append(dataStr + DELIM);
344 345
        //logourl
345
        buff.append(metadata.getLogourl().getValue().replace(DELIM, " ") + DELIM);
346
        buff.append(cleanUrl(metadata.getLogourl().getValue(), DELIM) + DELIM);
346 347
        // `country`,
347 348
        buff.append(clean(metadata.getCountry().getClassid()) + DELIM);
348 349
        buff.append(getTrust(data) + DELIM);
......
443 444
        }
444 445
        buff.append(dataStr + DELIM);
445 446

  
446
        //country TODO does not exist; throws error
447
        buff.append(" " + DELIM);
447
        //country
448 448

  
449
        String country = " ";
450

  
451
        for (FieldTypeProtos.Qualifier c : metadata.getCountryList()) {
452
            country += clean(c.getClassid()) + SEPERATOR;
453
        }
454

  
455
        buff.append(country + DELIM);
456

  
449 457
        //Best License
450 458
        buff.append(getBestLicense(data.getResult()) + DELIM);
451 459

  
......
595 603
        buff.append(dataStr + DELIM);
596 604

  
597 605
        // `fullname`,
598
        buff.append(clean(metadata.getFullname().getValue().replace(DELIM, " ")) + DELIM);
606
        buff.append(clean(metadata.getFullname().getValue()) + DELIM);
599 607

  
600 608
        // `Fax`,
601 609
        buff.append(clean(metadata.getFax().getValue()) + DELIM);
......
624 632
    }
625 633

  
626 634

  
627
    private static void getResultDatasources(OafEntity valueEntity, String DELIM, List<String> returnList) {
635
    private static void getResultDatasources(OafEntity valueEntity, String DELIM, Set<String> returnList) {
628 636
        String SEPERATOR = ";";
629 637

  
630 638
        Result result = valueEntity.getResult();
......
661 669
        //ATTENTION, ATTENTION: the replace regex does not catch this!!!!
662 670

  
663 671
        value = value.replace("\n", " ");
664
        value =value.replace(">", " ");
665
        value =value.replace("<", " ");
666
        value =value.replace(",", " ");
667
        value =value.replace("\"", " ");
668
        value =value.replace("'", " ");
669
        value = value.replaceAll("[^a-zA-Z0-9 .-_:/!@+=]+", " ");
672
        value = value.replace(">", " ");
673
        value = value.replace("<", " ");
674
        value = value.replace(",", " ");
675
        value = value.replace("\"", " ");
676
        value = value.replace("'", " ");
670 677

  
678
        value = value.replace("«", " ");
679
        value = value.replace("»", " ");
680
        value = value.replace("!", " ");
681
        value = value.replace("#", " ");
682

  
683

  
684
        value = value.replaceAll("[^a-zA-Z0-9 .-_:/@+=]+", " ");
685

  
671 686
        return value;
672 687
    }
673 688

  
......
676 691
        value = value.replace(" ", "");
677 692
        value = value.replace("\"", " ");
678 693
        value = value.replace("\n", "");
694

  
695

  
696
        value = value.replace(">", " ");
697
        value = value.replace("<", " ");
698
        value = value.replace(",", " ");
699
        value = value.replace("\"", " ");
700
        value = value.replace("'", " ");
701
        value = value.replace("«", " ");
702
        value = value.replace("»", " ");
703

  
704
        value = value.replace("!", " ");
705
        value = value.replace("#", " ");
706

  
707

  
679 708
        return value;
680 709
    }
681 710

  
......
687 716
            // AND REPLACES OCCURRENCES OF DELIM CHARS IN DATA
688 717
            value = value.replaceFirst(".*\\|", "");
689 718
            value = value.replace("\n", " ");
719
            value = value.replace("\"", "");
720
            value = value.replace("'", "");
721
            value = value.replace("#", "");
722
            value = value.replace("!", "");
690 723

  
724
            value = value.replace("«", " ");
725
            value = value.replace("»", " ");
726

  
691 727
        }
692 728

  
729

  
693 730
        return value;
694 731

  
695 732
    }
696 733

  
697 734

  
698 735
    //TODO make them in pairs
699
    private static void getDedups(OafEntity valueEntity, String DELIM, List<String> returnList) {
736
    private static void getDedups(OafEntity valueEntity, String DELIM, Set<String> returnList) {
700 737
        if (!valueEntity.getChildrenList().isEmpty() && valueEntity.getId().contains("dedup")) {
701 738
            ArrayList<String> entries = new ArrayList<String>();
702 739

  
......
758 795
    private static String clean(String value) {
759 796
        String SEPERATOR = ";";
760 797
        String enclosing = "\"";
798

  
799

  
761 800
        if (value != null) {
762 801
            // TODO DO NOT CHANGE THIS: IT REMOVES ID PREFIX ( 5|datacite____::
763 802
            // to datacite____:: )
......
773 812
            value = value.replace("\"", " ");
774 813
            value = value.replace("'", " ");
775 814
            value = value.replace(enclosing, " ");
815
            value = value.replace("«", " ");
816
            value = value.replace("»", " ");
817
            value = value.replace("!", " ");
818
            value = value.replace("#", " ");
776 819

  
777
            value = value.replaceAll("[^a-zA-Z0-9 .-_:/!@+=]+", " ");
820
            value = value.replaceAll("[^a-zA-Z0-9 .-_:/@+=]+", " ");
778 821
        }
779 822

  
823

  
780 824
        return value;
781 825

  
782 826
    }
modules/dnet-openaire-lodexport/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/LodMapper.java
23 23
import org.joda.time.format.DateTimeFormatter;
24 24

  
25 25
import java.io.IOException;
26
import java.util.ArrayList;
27
import java.util.List;
28
import java.util.Map;
26
import java.util.*;
29 27
import java.util.Map.Entry;
30 28

  
31 29
/**
......
153 151
        String serialized = serializer.Serialize(oafBuilder.build(), DELIM);
154 152

  
155 153
        if (serialized != null && !oaf.getEntity().getId().contains("dedup")) {
154

  
156 155
            try {
157 156
                Text TextKeyOut = new Text("entities");
158 157
                context.write((TextKeyOut), new ImmutableBytesWritable(serialized.getBytes()));
158

  
159 159
                //counter
160 160
                context.getCounter(type).increment(1);
161 161

  
......
179 179

  
180 180
                        final Map<byte[], byte[]> columnMap = result.getFamilyMap(Bytes.toBytes(ld.getRelDescriptor().getIt()));
181 181

  
182
                       List<OafRel> relOaf=decodeRelation(oaf, context, columnMap, ld);
182
                        List<OafRel> relOaf = decodeRelation(oaf, context, columnMap, ld);
183 183

  
184

  
184 185
                        for (OafRel rel : relOaf) {
185 186
                            builder.getEntityBuilder().addCachedRel(rel);
186 187
                            try {
187 188
                                Text TextKeyOut = new Text("relations");
189

  
188 190
                                String buff = serializer.Serialize(rel, DELIM);
189 191

  
190
                                if (!rel.getTarget().contains("dedup")) {
192
                                if (!buff.isEmpty() && !rel.getTarget().contains("dedup")) {
191 193
                                    context.write((TextKeyOut), new ImmutableBytesWritable(buff.getBytes()));
192 194
                                    context.getCounter(ENTITIES_COUNTER.TOTAL_RELATIONS).increment(1);
193 195
                                }
194 196

  
195

  
196 197
                            } catch (Exception e) {
197 198
                                log.error("Error while writing Relation Proto to M/R output", e);
198 199
                            }
......
211 212
        }
212 213

  
213 214

  
214
        List<String> relationsList = new ArrayList<String>();
215
        Set<String> relationsList = new HashSet<String>();
215 216

  
216
                serializer.extractRelations(oaf, DELIM,relationsList);
217
        serializer.extractRelations(oaf, DELIM, relationsList);
217 218

  
218 219
        for (String rel : relationsList) {
219 220
            try {
221

  
220 222
                Text TextKeyOut = new Text("relations");
223

  
221 224
                if (!oaf.getEntity().getId().contains("dedup")) {
222
                    if(!rel.contains("dedup")){
223
                    context.write((TextKeyOut), new ImmutableBytesWritable(rel.getBytes()));
224
                    context.getCounter(ENTITIES_COUNTER.TOTAL_RELATIONS).increment(1);
225
                }}
226
                else
227
                {
225
                    if (!rel.contains("dedup")) {
226
                        context.write((TextKeyOut), new ImmutableBytesWritable(rel.getBytes()));
227
                        context.getCounter(ENTITIES_COUNTER.TOTAL_RELATIONS).increment(1);
228

  
229
                    }
230
                } else {
228 231
                    //for dedup entities write only dedup relationships: all the permutations
229 232
                    // of children
230
                 if(rel.contains("dedup")){
231
                     context.write((TextKeyOut), new ImmutableBytesWritable(rel.getBytes()));
232
                     context.getCounter(ENTITIES_COUNTER.TOTAL_RELATIONS).increment(1);
233
                 }
233
                    if (rel.contains("dedup")) {
234
                        context.write((TextKeyOut), new ImmutableBytesWritable(rel.getBytes()));
235
                        context.getCounter(ENTITIES_COUNTER.TOTAL_RELATIONS).increment(1);
236
                    }
234 237

  
235 238
                }
236 239

  
......
255 258
                final Oaf decodedOaf = decodeProto(context, e.getValue());
256 259
                if (isValid(decodedOaf)) {
257 260
                    OafRel.Builder relBuilder = OafRel.newBuilder(decodedOaf.getRel());
261

  
258 262
                    // skip dedups
259 263

  
260 264
                    if (ld.getRelDescriptor().getIt().contains(SubRelType.dedup.toString()) && isDedupSelf(relBuilder)) {
modules/dnet-openaire-lodexport/trunk/pom.xml
10 10
    <groupId>eu.dnetlib</groupId>
11 11
    <artifactId>dnet-openaire-lod-export</artifactId>
12 12
    <version>1.0.0-SNAPSHOT</version>
13

  
13 14
    <build>
15

  
14 16
        <plugins>
15

  
16 17
        </plugins>
17 18

  
18 19
    </build>

Also available in: Unified diff