Project

General

Profile

« Previous | Next » 

Revision 55436

Updated classes for DOIBoost based on the trunk version

View differences:

modules/dnet-mapreduce-jobs/branches/master/src/main/java/eu/dnetlib/data/mapreduce/hbase/Reporter.java
1
package eu.dnetlib.data.mapreduce.hbase;
2

  
3
import java.io.Serializable;
4

  
5
/**
6
 * Created by Alessia Bardi on 2019-04-08.
7
 *
8
 * @author Alessia Bardi
9
 */
10
public interface Reporter extends Serializable {
11

  
12
	void incrementCounter(String counterGroup, String counterName, long delta);
13

  
14
}
modules/dnet-mapreduce-jobs/branches/master/src/main/java/eu/dnetlib/data/mapreduce/hbase/dataimport/DOIBoostToActions.java
1 1
package eu.dnetlib.data.mapreduce.hbase.dataimport;
2 2

  
3
import java.io.ByteArrayOutputStream;
4
import java.io.IOException;
5
import java.io.InputStream;
6
import java.util.*;
7
import java.util.concurrent.atomic.AtomicInteger;
8
import java.util.function.Function;
9
import java.util.stream.Collectors;
10
import java.util.stream.Stream;
11
import java.util.zip.Inflater;
12

  
3 13
import com.google.gson.Gson;
4 14
import com.google.gson.JsonElement;
5 15
import com.google.gson.JsonObject;
6 16
import eu.dnetlib.actionmanager.actions.ActionFactory;
7 17
import eu.dnetlib.actionmanager.actions.AtomicAction;
8 18
import eu.dnetlib.actionmanager.common.Agent;
19
import eu.dnetlib.data.mapreduce.hbase.Reporter;
9 20
import eu.dnetlib.data.mapreduce.util.StreamUtils;
10 21
import eu.dnetlib.data.proto.*;
11 22
import eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions;
......
14 25
import org.apache.commons.io.IOUtils;
15 26
import org.apache.commons.lang3.StringUtils;
16 27

  
17
import java.io.ByteArrayOutputStream;
18
import java.io.IOException;
19
import java.io.InputStream;
20
import java.util.*;
21
import java.util.concurrent.atomic.AtomicInteger;
22
import java.util.function.Function;
23
import java.util.stream.Collectors;
24
import java.util.zip.Inflater;
25

  
26 28
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.*;
27 29
import static eu.dnetlib.data.proto.ResultOrganizationProtos.ResultOrganization;
28 30

  
......
36 38
    public static final String GRID_AC = "grid.ac";
37 39
    public static final String WIKPEDIA = "wikpedia";
38 40

  
39
    public final static String doiBoostNSPREFIX ="doiboost____";
41
    public final static String doiBoostNSPREFIX = "doiboost____";
40 42
    public static final String OPENAIRE_PREFIX = "openaire____";
41 43

  
42 44
    public static final String SEPARATOR = "::";
43 45

  
44
    private static Map<String, Pair<String, String>> datasources =  new HashMap<String, Pair<String, String>>() {{
45
        put(MAG, new Pair<>("Microsoft Academic Graph", OPENAIRE_PREFIX + SEPARATOR + "microsoft"));
46
        put(ORCID, new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
47
        put(CROSSREF, new Pair<>(CROSSREF, OPENAIRE_PREFIX + SEPARATOR + "crossref"));
48
        put(UNPAYWALL, new Pair<>(UNPAYWALL, OPENAIRE_PREFIX + SEPARATOR + "unpaywall"));
46
    private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {{
47
        put(MAG.toLowerCase(), new Pair<>("Microsoft Academic Graph", OPENAIRE_PREFIX + SEPARATOR + "microsoft"));
48
        put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
49
        put(CROSSREF.toLowerCase(), new Pair<>(CROSSREF, OPENAIRE_PREFIX + SEPARATOR + "crossref"));
50
        put(UNPAYWALL.toLowerCase(), new Pair<>(UNPAYWALL, OPENAIRE_PREFIX + SEPARATOR + "unpaywall"));
49 51

  
50 52
    }};
51 53

  
52
    private static String decompressAbstract(final String abstractCompressed)  {
54
    private static String decompressAbstract(final String abstractCompressed) {
53 55
        try {
54 56
            byte[] byteArray = Base64.decodeBase64(abstractCompressed.getBytes());
55 57
            final Inflater decompresser = new Inflater();
......
64 66
            decompresser.end();
65 67
            return new String(unzippeddata);
66 68
        } catch (Throwable e) {
67
            System.out.println("Wrong abstract:"+ abstractCompressed);
68
            throw  new RuntimeException(e);
69
            System.out.println("Wrong abstract:" + abstractCompressed);
70
            throw new RuntimeException(e);
69 71
        }
70 72
    }
71 73

  
72 74
    public static final String PID_TYPES = "dnet:pid_types";
73
    private static Map<String, FieldTypeProtos.Qualifier> affiliationPIDType =  new HashMap<String, FieldTypeProtos.Qualifier>() {{
74
        put(MAG, FieldTypeProtos.Qualifier.newBuilder().setClassid("mag_id" ).setClassname("Microsoft Academic Graph Identifier").setSchemename(PID_TYPES).setSchemeid(PID_TYPES).build());
75
    private static Map<String, FieldTypeProtos.Qualifier> affiliationPIDType = new HashMap<String, FieldTypeProtos.Qualifier>() {{
76
        put(MAG, FieldTypeProtos.Qualifier.newBuilder().setClassid("mag_id").setClassname("Microsoft Academic Graph Identifier").setSchemename(PID_TYPES)
77
                .setSchemeid(PID_TYPES).build());
75 78
        put(GRID_AC, getQualifier("grid", PID_TYPES));
76 79
        put(WIKPEDIA, getQualifier("urn", PID_TYPES));
77 80
    }};
......
81 84
    static {
82 85
        try {
83 86
            final InputStream is = DOIBoostToActions.class.getResourceAsStream("/eu/dnetlib/data/mapreduce/hbase/dataimport/mapping_typologies.json");
84
            final String tt =IOUtils.toString(is);
87
            final String tt = IOUtils.toString(is);
85 88
            typologiesMapping = new Gson().fromJson(tt, Map.class);
86 89
        } catch (IOException e) {
87 90
            e.printStackTrace();
88 91
        }
89 92
    }
90 93

  
94
    protected static boolean isValid(final JsonObject rootElement, final Reporter context) {
91 95

  
96
        final String doi = getStringValue(rootElement, "doi");
97
        if (doi == null) {
98
            context.incrementCounter("filtered", "no_doi", 1);
99
            return false;
100
        }
101
        final String type = getStringValue(rootElement, "type");
102
        if (!typologiesMapping.containsKey(type)) {
103
            context.incrementCounter("filtered", "unknowntype_" + type, 1);
104
            return false;
105
        }
106
        // fixes #4360 (test publisher)
107
        final String publisher = getStringValue(rootElement, "publisher");
108
        if (StringUtils.isNotBlank(publisher) && publisher.equalsIgnoreCase("Test accounts")) {
109
            context.incrementCounter("filtered", "test_publisher", 1);
110
            return false;
111
        }
92 112

  
113
        List<JsonObject> authors = getArrayObjects(rootElement, "authors");
114
        boolean hasAuthors = false;
115
        for (JsonObject author : authors) {
116
            final String given = getStringValue(author, "given");
117
            final String family = getStringValue(author, "family");
118
            String fullname = getStringValue(author, "fullname");
119
            if (StringUtils.isBlank(fullname) && StringUtils.isNotBlank(given) && StringUtils.isNotBlank(family)) {
120
                fullname = String.format("%s %s", given, family);
121
            }
122
            // fixes #4368
123
            if (fullname.equalsIgnoreCase("Addie Jackson") && publisher.equalsIgnoreCase("Elsevier BV")) {
124
                context.incrementCounter("invalid_author", "addiejackson", 1);
125
                context.incrementCounter("filtered", "invalid_authors", 1);
126
                return false;
127
            }
128
            if (isValidAuthorName(fullname, context)) hasAuthors = true;
129
        }
93 130

  
94
    public static List<AtomicAction> generatePublicationActionsFromDump(final JsonObject rootElement, final ActionFactory factory, final String setName, final Agent agent, boolean invisible,
95
                                                                        final boolean onlyOrganization) {
131
        if (!hasAuthors) {
132
            context.incrementCounter("filtered", "invalid_authors", 1);
133
            return false;
134
        }
135
        // fixes #4360
136
        if (getCleanedTitles(rootElement).isEmpty()) {
137
            context.incrementCounter("filtered", "invalid_title", 1);
138
            return false;
139
        }
96 140

  
141
        return true;
142
    }
143

  
144
    private static List<String> getCleanedTitles(final JsonObject rootElement) {
145
        List<String> titles = getArrayValues(rootElement, "title");
146
        return titles.stream().filter(t -> StringUtils.isNotBlank(t) && !t.equalsIgnoreCase("[NO TITLE AVAILABLE]")).collect(Collectors.toList());
147
    }
148

  
149
    private static boolean isValidAuthorName(final String fullName, final Reporter context) {
150
        if (StringUtils.isBlank(fullName)) {
151
            if(context != null) context.incrementCounter("invalid_author", "blank", 1);
152
            return false;
153
        }
154
        // fixes #4391 and subtasks related to DOIBoost
155
        switch (fullName) {
156
        case ",":
157
        case "none none":
158
        case "none &na;":
159
        case "(:null)":
160
        case "&na; &na;": {
161
            if(context != null) context.incrementCounter("invalid_author", "value_" + fullName, 1);
162
            return false;
163
        }
164
        }
165
        return true;
166
    }
167

  
168
    public static List<AtomicAction> generatePublicationActionsFromDump(final JsonObject rootElement,
169
            final ActionFactory factory,
170
            final String setName,
171
            final Agent agent,
172
            boolean invisible,
173
            final boolean onlyOrganization,
174
            final Reporter context) {
175

  
176
        if (!isValid(rootElement, context)) return null;
177

  
97 178
        //Create OAF Proto
98 179

  
99 180
        final OafProtos.Oaf.Builder oaf = OafProtos.Oaf.newBuilder();
......
112 193
        //creating Result Proto
113 194
        final OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder().setType(TypeProtos.Type.result);
114 195

  
115
        entity.setDateofcollection("2018-10-10");
196
        entity.setDateofcollection("2019-02-15");
116 197

  
117
        if (rootElement.has("collectedFrom") && rootElement.get("collectedFrom").isJsonArray()){
198
        if (rootElement.has("collectedFrom") && rootElement.get("collectedFrom").isJsonArray()) {
118 199
            StreamUtils.toStream(rootElement.getAsJsonArray("collectedFrom").iterator())
119 200
                    .map(JsonElement::getAsString)
120 201
                    .forEach(cf -> {
121
                                final String id = datasources.get(cf).getValue();
122
                                final String name = datasources.get(cf).getKey();
202
                                final String id = datasources.get(cf.toLowerCase()).getValue();
203
                                final String name = datasources.get(cf.toLowerCase()).getKey();
123 204
                                if (StringUtils.isNotBlank(id) && StringUtils.isNotBlank(name)) {
124 205
                                    final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
125 206
                                            .setValue(name)
......
132 213
        }
133 214
        //Adding identifier
134 215
        final String doi = getStringValue(rootElement, "doi");
135
        if (doi == null)
136
            return null;
216
        entity.addOriginalId(doi);
217

  
137 218
        final String sourceId = String.format("50|%s" + SEPARATOR + "%s", doiBoostNSPREFIX, AbstractDNetXsltFunctions.md5(doi));
138 219
        entity.setId(sourceId);
139 220

  
......
142 223
                .setQualifier(getQualifier("doi", PID_TYPES))
143 224
                .build());
144 225

  
145

  
146 226
        //Create Result Field
147 227
        ResultProtos.Result.Builder result = ResultProtos.Result.newBuilder();
148 228

  
149
        final String type = getStringValue(rootElement,"type");
229
        final String type = getStringValue(rootElement, "type");
150 230

  
151
        if (!typologiesMapping.containsKey(type))
152
            return null;
153

  
154 231
        //Adding Instances
155 232
        final String typeValue = typologiesMapping.get(type).get("value");
156 233
        final String cobjValue = typologiesMapping.get(type).get("cobj");
157 234

  
235
        // TODO: workaround for #4362: remove it when UnpayWall is correctly mapped
236
        List<JsonObject> unpaywallLicenses = getArrayObjects(rootElement, "license").stream().filter(prov -> {
237
            String provS = getStringValue(prov, "provenance");
238
            if (StringUtils.isNotBlank(provS) && provS.equalsIgnoreCase(UNPAYWALL)) return true;
239
            else return false;
240
        }).collect(Collectors.toList());
158 241

  
159
        getArrayObjects(rootElement, "instances").stream().map(it ->
242
        Stream.concat(unpaywallLicenses.stream(), getArrayObjects(rootElement, "instances").stream()).map(it ->
160 243
        {
161
            ResultProtos.Result.Instance.Builder instance= ResultProtos.Result.Instance.newBuilder();
244
            ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder();
162 245
            instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder()
163 246
                    .setClassid(cobjValue)
164 247
                    .setClassname(typeValue)
......
170 253
                    .setValue("Unknown Repository")
171 254
                    .build());
172 255

  
173
            final String acc_class_id =it.get("access-rights").getAsString();
256
            final String acc_class_id = it.get("access-rights").getAsString();
174 257
            String acc_class_value;
175
            switch (acc_class_id){
176
                case "OPEN": {
177
                    acc_class_value = "open access";
178
                    break;
179
                }
180
                case "CLOSED": {
181
                    acc_class_value = "closed access";
182
                    break;
183
                }
258
            switch (acc_class_id) {
259
            case "OPEN": {
260
                acc_class_value = "open access";
261
                break;
262
            }
263
            case "CLOSED": {
264
                acc_class_value = "closed access";
265
                break;
266
            }
267
            default: {
268
                acc_class_value = "not available";
269
            }
184 270

  
185
                default: {
186
                    acc_class_value = "not available";
187
                }
188

  
189 271
            }
190 272

  
191 273
            instance.addUrl(it.get("url").getAsString());
......
196 278
                    .setSchemename("dnet:access_modes")
197 279
                    .build());
198 280

  
199
            final String id =datasources.get(it.get("provenance").getAsString()).getValue();
200
            final String name =datasources.get(it.get("provenance").getAsString()).getKey();
281
            final String id = datasources.get(it.get("provenance").getAsString().toLowerCase()).getValue();
282
            final String name = datasources.get(it.get("provenance").getAsString().toLowerCase()).getKey();
201 283
            if (StringUtils.isNotBlank(id) && StringUtils.isNotBlank(name)) {
202 284
                final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
203 285
                        .setValue(name)
......
207 289
                instance.setCollectedfrom(collectedFrom);
208 290
            }
209 291

  
210
            return  instance.build();
292
            return instance.build();
211 293
        }).forEach(result::addInstance);
212 294

  
213 295
        //Adding DOI URL as  Instance
214 296
        final String doiURL = getStringValue(rootElement, "doi-url");
297
        JsonObject hostedByOpenAire = null;
298
        if (rootElement.has("hostedByOpenAire")) {
299
            hostedByOpenAire = rootElement.getAsJsonObject("hostedByOpenAire");
300
        }
301

  
215 302
        if (StringUtils.isNotBlank(doiURL)) {
216

  
217

  
218 303
            final ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder();
219 304
            instance.addUrl(doiURL);
305
            instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder()
306
                    .setClassid(cobjValue)
307
                    .setClassname(typeValue)
308
                    .setSchemeid("dnet:publication_resource")
309
                    .setSchemename("dnet:publication_resource")
310
                    .build());
220 311
            instance.setAccessright(FieldTypeProtos.Qualifier.newBuilder()
221 312
                    .setClassid("CLOSED")
222 313
                    .setClassname("Closed Access")
......
224 315
                    .setSchemename("dnet:access_modes")
225 316
                    .build());
226 317
            instance.setCollectedfrom(FieldTypeProtos.KeyValue.newBuilder()
227
                    .setValue("CrossRef")
318
                    .setValue(CROSSREF)
228 319
                    .setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions.md5("crossref"))
229 320
                    .build());
321

  
322
            if (hostedByOpenAire == null)
323
                instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
324
                        .setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c")
325
                        .setValue("Unknown Repository")
326
                        .build());
327
            else {
328
                instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
329
                        .setKey(AbstractDNetXsltFunctions.oafSplitId("datasource", hostedByOpenAire.get("id").getAsString()))
330
                        .setValue(hostedByOpenAire.get("name").getAsString())
331
                        .build());
332
            }
333

  
230 334
            result.addInstance(instance);
231 335
        }
232 336

  
233 337
        //Create Metadata Proto
234 338
        final ResultProtos.Result.Metadata.Builder metadata = ResultProtos.Result.Metadata.newBuilder();
235 339

  
236

  
237 340
        Pair<List<FieldTypeProtos.Author>, Collection<OafProtos.Oaf>> authorsOrganizations = createAuthorsOrganization(rootElement);
238 341

  
239 342
        if (authorsOrganizations.getKey().size() > 0) {
240 343
            metadata.addAllAuthor(authorsOrganizations.getKey());
241
        }
242
        else {
344
        } else {
345
            //Should never enter here becasue of the isValid method at the beginning.
346
            context.incrementCounter("filtered", "unexpected_no_authors", 1);
243 347
            return null;
244 348
        }
245 349
        //adding Language
......
251 355
                .build());
252 356

  
253 357
        //Adding subjects
254
        List<String> subjects =getArrayValues(rootElement, "subject");
358
        List<String> subjects = getArrayValues(rootElement, "subject");
255 359

  
256
        subjects.forEach(s-> metadata.addSubject(FieldTypeProtos.StructuredProperty.newBuilder()
360
        subjects.forEach(s -> metadata.addSubject(FieldTypeProtos.StructuredProperty.newBuilder()
257 361
                .setValue(s)
258 362
                .setQualifier(getQualifier("keyword", "dnet:subject"))
259 363
                .build()));
260 364

  
261
        List<String>titles =getArrayValues(rootElement, "title");
262
        titles.forEach(t->
365
        List<String> titles = getCleanedTitles(rootElement);
366
        titles.forEach(t ->
263 367
                metadata.addTitle(FieldTypeProtos.StructuredProperty.newBuilder()
264 368
                        .setValue(t)
265 369
                        .setQualifier(getQualifier("main title", "dnet:dataCite_title"))
266 370
                        .build()));
371

  
267 372
        settingRelevantDate(rootElement, metadata, "issued", "issued", true);
268 373
        settingRelevantDate(rootElement, metadata, "accepted", "accepted", false);
269 374
        settingRelevantDate(rootElement, metadata, "published-online", "published-online", false);
270 375
        settingRelevantDate(rootElement, metadata, "published-print", "published-print", false);
271 376

  
272

  
273 377
        getArrayObjects(rootElement, "abstract").forEach(d ->
274 378
                {
275 379
                    if (MAG.equals(d.get("provenance").getAsString()))
......
277 381
                    else
278 382
                        metadata.addDescription(FieldTypeProtos.StringField.newBuilder().setValue(d.get("value").getAsString()).build());
279 383
                }
280
                );
384
        );
281 385

  
282

  
283

  
284 386
        //Adding Journal
285
        final String publisher = getStringValue(rootElement,"publisher");
286
        if (StringUtils.isNotBlank(publisher)){
387
        final String publisher = getStringValue(rootElement, "publisher");
388
        if (StringUtils.isNotBlank(publisher)) {
287 389

  
288 390
            final ResultProtos.Result.Journal.Builder journal = ResultProtos.Result.Journal.newBuilder().setName(publisher);
289 391

  
290
            if (hasJSONArrayField(rootElement,"issn" )){
392
            if (hasJSONArrayField(rootElement, "issn")) {
291 393
                StreamUtils.toStream(rootElement.getAsJsonArray("issn").iterator())
292 394
                        .map(JsonElement::getAsJsonObject)
293 395
                        .forEach(it -> {
294 396
                            final String issntype = getStringValue(it, "type");
295 397
                            final String value = getStringValue(it, "value");
296
                            if("electronic".equals(issntype)){
398
                            if ("electronic".equals(issntype)) {
297 399
                                journal.setIssnOnline(value);
298 400
                            }
299 401
                            if ("print".equals(issntype))
......
306 408
        result.setMetadata(metadata.build());
307 409
        entity.setResult(result.build());
308 410
        oaf.setEntity(entity.build());
411

  
412
        //System.out.println(JsonFormat.printToString(oaf.build()));
413

  
309 414
        final List<AtomicAction> actionList = new ArrayList<>();
310 415

  
311 416
        if (!onlyOrganization)
......
320 425
                if (!onlyOrganization)
321 426
                    actionList.addAll(createPublicationOrganizationRelation(oaf.build(), o, factory, setName, agent));
322 427
                final String gridOrganization = getSimilarGridOrganization(o.getEntity());
323
                if (gridOrganization!= null) {
324
                    actionList.add(factory.createAtomicAction(setName, agent, o.getEntity().getId(), "organizationOrganization_dedupSimilarity_isSimilarTo", gridOrganization, "".getBytes()));
325
                    actionList.add(factory.createAtomicAction(setName, agent, gridOrganization, "organizationOrganization_dedupSimilarity_isSimilarTo", o.getEntity().getId(), "".getBytes()));
428
                if (gridOrganization != null) {
429
                    actionList.add(factory
430
                            .createAtomicAction(setName, agent, o.getEntity().getId(), "organizationOrganization_dedupSimilarity_isSimilarTo", gridOrganization,
431
                                    "".getBytes()));
432
                    actionList.add(factory
433
                            .createAtomicAction(setName, agent, gridOrganization, "organizationOrganization_dedupSimilarity_isSimilarTo", o.getEntity().getId(),
434
                                    "".getBytes()));
326 435
                }
327 436
            });
328 437
        }
......
330 439

  
331 440
    }
332 441

  
333

  
334 442
    private static String getSimilarGridOrganization(final OafProtos.OafEntity organization) {
335 443

  
336 444
        final List<FieldTypeProtos.StructuredProperty> pidList = organization.getPidList();
337
        if (pidList!= null ) {
338
            for (FieldTypeProtos.StructuredProperty p: pidList) {
339
                if (p.getQualifier().getClassname().equals("grid")){
340
                    return "20|grid________" + SEPARATOR +AbstractDNetXsltFunctions.md5(p.getValue());
445
        if (pidList != null) {
446
            for (FieldTypeProtos.StructuredProperty p : pidList) {
447
                if (p.getQualifier().getClassname().equals("grid")) {
448
                    return "20|grid________" + SEPARATOR + AbstractDNetXsltFunctions.md5(p.getValue());
341 449
                }
342 450
            }
343 451
        }
......
345 453

  
346 454
    }
347 455

  
348
    private static List<AtomicAction> createPublicationOrganizationRelation(final OafProtos.Oaf publication, final OafProtos.Oaf organization, final ActionFactory factory, final String setName, final Agent agent) {
456
    private static List<AtomicAction> createPublicationOrganizationRelation(final OafProtos.Oaf publication,
457
            final OafProtos.Oaf organization,
458
            final ActionFactory factory,
459
            final String setName,
460
            final Agent agent) {
349 461

  
350 462
        List<AtomicAction> result = new ArrayList<>();
351 463

  
......
360 472
                .setProvenanceaction(getQualifier("sysimport:actionset", "dnet:provenanceActions"))
361 473
                .build());
362 474

  
363

  
364 475
        final OafProtos.OafRel.Builder rel = OafProtos.OafRel.newBuilder();
365 476

  
366 477
        rel.setRelType(RelTypeProtos.RelType.resultOrganization);
......
381 492
        rel.setResultOrganization(rel_instance.build());
382 493

  
383 494
        rel.addCollectedfrom(FieldTypeProtos.KeyValue.newBuilder()
384
                .setValue(datasources.get(MAG).getKey())
385
                .setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions.md5(StringUtils.substringAfter(datasources.get(MAG).getValue(), SEPARATOR)))
495
                .setValue(datasources.get(MAG.toLowerCase()).getKey())
496
                .setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions
497
                        .md5(StringUtils.substringAfter(datasources.get(MAG.toLowerCase()).getValue(), SEPARATOR)))
386 498
                .build());
387 499

  
388

  
389

  
390 500
        rel.setChild(false);
391 501
        roaf.setRel(rel.build());
392 502

  
393
        result.add(factory.createAtomicAction(setName, agent, publication.getEntity().getId(), "resultOrganization_affiliation_hasAuthorInstitution", organization.getEntity().getId(), roaf.build().toByteArray() ));
503
        result.add(factory.createAtomicAction(setName, agent, publication.getEntity().getId(), "resultOrganization_affiliation_hasAuthorInstitution",
504
                organization.getEntity().getId(), roaf.build().toByteArray()));
394 505

  
395

  
396 506
        //Create a relation Organization --> Result
397 507
        rel.setTarget(publication.getEntity().getId());
398 508
        rel.setSource(organization.getEntity().getId());
399 509
        rel.setRelClass(ResultOrganization.Affiliation.RelName.isAuthorInstitutionOf.toString());
400 510

  
401

  
402 511
        affiliationRel.setRelMetadata(RelMetadataProtos.RelMetadata.newBuilder()
403 512
                .setSemantics(getQualifier("isAuthorInstitutionOf", "dnet:result_organization_relations"))
404 513
                .build());
405 514
        rel_instance.setAffiliation(affiliationRel.build());
406 515
        rel.setResultOrganization(rel_instance.build());
407 516
        roaf.setRel(rel.build());
408
        result.add(factory.createAtomicAction(setName, agent, organization.getEntity().getId(), "resultOrganization_affiliation_isAuthorInstitutionOf", publication.getEntity().getId(), roaf.build().toByteArray()));
517
        result.add(factory.createAtomicAction(setName, agent, organization.getEntity().getId(), "resultOrganization_affiliation_isAuthorInstitutionOf",
518
                publication.getEntity().getId(), roaf.build().toByteArray()));
409 519

  
410 520
        return result;
411 521

  
......
415 525
        return root.has(key) && root.get(key).isJsonArray();
416 526
    }
417 527

  
418
    private static void settingRelevantDate(JsonObject rootElement, ResultProtos.Result.Metadata.Builder metadata , final String jsonKey, final String dictionaryKey, final boolean addToDateOfAcceptance) {
528
    private static void settingRelevantDate(JsonObject rootElement,
529
            ResultProtos.Result.Metadata.Builder metadata,
530
            final String jsonKey,
531
            final String dictionaryKey,
532
            final boolean addToDateOfAcceptance) {
419 533
        //Adding date
420
        String date = getStringValue(rootElement,jsonKey);
534
        String date = getStringValue(rootElement, jsonKey);
421 535
        if (date == null)
422 536
            return;
423 537
        if (date.length() == 4) {
......
428 542
                metadata.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(date).build());
429 543
            metadata.addRelevantdate(FieldTypeProtos.StructuredProperty.newBuilder()
430 544
                    .setValue(date)
431
                    .setQualifier(getQualifier(dictionaryKey,"dnet:dataCite_date"))
545
                    .setQualifier(getQualifier(dictionaryKey, "dnet:dataCite_date"))
432 546
                    .build());
433 547
        }
434 548
    }
435 549

  
436

  
437 550
    public static FieldTypeProtos.KeyValue extractIdentifier(final String value) {
438 551
        FieldTypeProtos.KeyValue.Builder pid = FieldTypeProtos.KeyValue.newBuilder();
439
        if (StringUtils.contains(value, "orcid.org")){
440
            return pid.setValue(value)
552
        if (StringUtils.contains(value, "orcid.org")) {
553
            return pid.setValue(value.replaceAll("https://orcid.org/", ""))
441 554
                    .setKey(ORCID).build();
442 555
        }
443
        if (StringUtils.contains(value, "academic.microsoft.com/#/detail")){
444
            return pid.setValue(value)
556
        if (StringUtils.contains(value, "academic.microsoft.com/#/detail")) {
557
            return pid.setValue(value.replaceAll("https://academic.microsoft.com/#/detail/", ""))
445 558
                    .setKey("MAG Identifier").build();
446 559
        }
447 560
        return pid.setValue(value)
448 561
                .setKey("URL").build();
449 562
    }
450 563

  
451

  
452 564
    public static OafProtos.Oaf createOrganizationFromJSON(final JsonObject affiliation) {
453 565
        final Map<String, FieldTypeProtos.Qualifier> affiliationIdentifiers = new HashMap<>();
454 566
        final List<String> magId = new ArrayList<>();
......
456 568
            if (StringUtils.contains(it.get("value").getAsString(), "academic.microsoft.com")) {
457 569
                affiliationIdentifiers.put(it.get("value").getAsString(), affiliationPIDType.get(MAG));
458 570
                magId.add(it.get("value").getAsString());
459
            }
460
            else
461
                affiliationIdentifiers.put( it.get("value").getAsString(), affiliationPIDType.get(it.get("schema").getAsString()));
571
            } else
572
                affiliationIdentifiers.put(it.get("value").getAsString(), affiliationPIDType.get(it.get("schema").getAsString()));
462 573
        });
463 574
        if (magId.size() > 0) {
464 575
            final String microsoftID = magId.get(0);
......
466 577
            oaf.setKind(KindProtos.Kind.entity);
467 578
            OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder();
468 579
            entity.setType(TypeProtos.Type.organization);
469
            entity.setId("20|microsoft___" + SEPARATOR +AbstractDNetXsltFunctions.md5(microsoftID));
470
            final String id =datasources.get(affiliation.get("provenance").getAsString()).getValue();
471
            final String name =datasources.get(affiliation.get("provenance").getAsString()).getKey();
580
            entity.setId("20|microsoft___" + SEPARATOR + AbstractDNetXsltFunctions.md5(microsoftID));
581
            final String id = datasources.get(affiliation.get("provenance").getAsString().toLowerCase()).getValue();
582
            final String name = datasources.get(affiliation.get("provenance").getAsString().toLowerCase()).getKey();
472 583
            if (StringUtils.isNotBlank(id) && StringUtils.isNotBlank(name)) {
473 584
                final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
474 585
                        .setValue(name)
......
503 614
                    .build());
504 615
            return oaf.build();
505 616
        }
506
        return  null;
617
        return null;
507 618
    }
508 619

  
509
    public static Pair<List<FieldTypeProtos.Author>, Collection<OafProtos.Oaf>>  createAuthorsOrganization(final JsonObject root) {
620
    public static Pair<List<FieldTypeProtos.Author>, Collection<OafProtos.Oaf>> createAuthorsOrganization(final JsonObject root) {
510 621

  
511 622
        final Map<String, OafProtos.Oaf> affiliations = new HashMap<>();
512 623

  
513 624
        List<JsonObject> authors = getArrayObjects(root, "authors");
514 625

  
515
        final AtomicInteger counter = new AtomicInteger();
626
        final AtomicInteger counter = new AtomicInteger(1);
516 627

  
517 628
        List<FieldTypeProtos.Author> collect = authors.stream().map(author -> {
518 629
            final String given = getStringValue(author, "given");
......
523 634
                fullname = String.format("%s %s", given, family);
524 635
            }
525 636

  
526
            if (StringUtils.isBlank(fullname)){
637
            if (!isValidAuthorName(fullname, null)) {
527 638
                return null;
528

  
529 639
            }
530 640
            final FieldTypeProtos.Author.Builder abuilder = FieldTypeProtos.Author.newBuilder();
531 641

  
......
554 664
                    Collectors.toMap(
555 665
                            FieldTypeProtos.KeyValue::getKey,
556 666
                            Function.identity(),
557
                            (a,b) -> a
667
                            (a, b) -> a
558 668
                    )).values().forEach(abuilder::addPid);
559 669
            abuilder.setRank(counter.getAndIncrement());
560 670

  
......
562 672

  
563 673
        }).filter(Objects::nonNull).collect(Collectors.toList());
564 674

  
565
        return new Pair<> ( collect,affiliations.values() );
675
        return new Pair<>(collect, affiliations.values());
566 676
    }
567 677

  
568

  
569

  
570

  
571

  
572

  
573 678
}
modules/dnet-mapreduce-jobs/branches/master/src/main/java/eu/dnetlib/data/mapreduce/hbase/dataimport/DOIBoostImportMapper.java
1 1
package eu.dnetlib.data.mapreduce.hbase.dataimport;
2 2

  
3
import java.io.IOException;
4
import java.util.List;
5

  
3 6
import com.google.common.base.Joiner;
4 7
import com.google.gson.JsonObject;
5 8
import com.google.gson.JsonParser;
6 9
import eu.dnetlib.actionmanager.actions.ActionFactory;
7 10
import eu.dnetlib.actionmanager.actions.AtomicAction;
8 11
import eu.dnetlib.actionmanager.common.Agent;
12
import eu.dnetlib.data.mapreduce.hbase.Reporter;
9 13
import org.apache.hadoop.io.LongWritable;
10 14
import org.apache.hadoop.io.Text;
11 15
import org.apache.hadoop.mapreduce.Mapper;
12 16

  
13
import java.io.IOException;
14
import java.util.List;
15

  
16 17
public class DOIBoostImportMapper extends Mapper<LongWritable, Text, Text, Text> {
17 18

  
18 19
    private String setName;
......
43 44
        final String inputJson = value.toString();
44 45
        final JsonObject rootElement = parser.parse(inputJson).getAsJsonObject();
45 46
        try {
46
            List<AtomicAction> atomicActions = DOIBoostToActions.generatePublicationActionsFromDump(rootElement, factory, setName, agent, invisible, onlyOrganization);
47
            if (atomicActions!= null) {
48
                for (AtomicAction action: atomicActions){
47
            List<AtomicAction> atomicActions =
48
                    DOIBoostToActions.generatePublicationActionsFromDump(rootElement, factory, setName, agent, invisible, onlyOrganization,
49
                            (Reporter) (counterGroup, counterName, delta) -> context.getCounter(counterGroup, counterName).increment(delta));
50
            if (atomicActions != null) {
51
                for (AtomicAction action : atomicActions) {
49 52
                    keyout.set(Joiner.on(SEPARATOR).join(action.getTargetRowKey(), action.getTargetColumnFamily(), action.getTargetColumn()));
50 53
                    valueOut.set(action.toJSON());
51 54
                    context.write(keyout, valueOut);

Also available in: Unified diff