Project

General

Profile

1 52912 sandro.lab
package eu.dnetlib.data.mapreduce.hbase.dataimport;
2
3
import com.google.gson.JsonArray;
4 52935 claudio.at
import com.google.gson.JsonElement;
5 52912 sandro.lab
import com.google.gson.JsonObject;
6
import eu.dnetlib.actionmanager.actions.ActionFactory;
7
import eu.dnetlib.actionmanager.actions.AtomicAction;
8
import eu.dnetlib.actionmanager.common.Agent;
9 52935 claudio.at
import eu.dnetlib.data.mapreduce.util.StreamUtils;
10 52912 sandro.lab
import eu.dnetlib.data.proto.RelMetadataProtos;
11
import eu.dnetlib.data.proto.RelTypeProtos;
12
import eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions;
13
import org.apache.commons.lang3.StringUtils;
14
15 53068 claudio.at
import java.util.ArrayList;
16
import java.util.List;
17
import java.util.Map;
18
import java.util.Objects;
19
20
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.*;
21 52912 sandro.lab
import static eu.dnetlib.data.proto.FieldTypeProtos.*;
22
import static eu.dnetlib.data.proto.KindProtos.Kind;
23
import static eu.dnetlib.data.proto.OafProtos.*;
24
import static eu.dnetlib.data.proto.ResultProtos.Result;
25
import static eu.dnetlib.data.proto.ResultProtos.Result.*;
26
import static eu.dnetlib.data.proto.ResultResultProtos.ResultResult;
27
import static eu.dnetlib.data.proto.TypeProtos.Type;
28
29
public class ScholixToActions {
30
31 52931 claudio.at
    public static List<AtomicAction> generateActionsFromScholix(final JsonObject rootElement, final Map<String, ScholExplorerConfiguration> conf,
32 52912 sandro.lab
                                                                final String setName, final Agent agent, ActionFactory factory, String nsPrefix, final String dsName,
33
                                                                final String dsId, String dateOfCollection) {
34
35
        final List<AtomicAction> actions = new ArrayList<>();
36
37
        final String typology = getStringValue(rootElement, "typology");
38
        final List<String> publisher = getArrayValues(rootElement, "publisher");
39
        final String abstractValue = getStringValue(rootElement, "abstract");
40
        final List<String> authors = getArrayValues(rootElement, "author");
41
        final List<String> dates = getArrayValues(rootElement, "date");
42 52935 claudio.at
43
        final JsonArray localIdentifiers = rootElement.getAsJsonArray("localIdentifier");
44 52912 sandro.lab
        final String dnetId = getStringValue(rootElement, "id").substring(17);
45
46 52931 claudio.at
        String title = "";
47 52912 sandro.lab
        if (rootElement.has("title") && rootElement.get("title").isJsonArray()) {
48
            StringBuilder ttl = new StringBuilder();
49
            getArrayValues(rootElement, "title").forEach(ttl::append);
50
            title = ttl.toString();
51
        } else {
52
            title = getStringValue(rootElement, "title");
53
        }
54 57193 sandro.lab
        if ("\"".equals(title) || title == null || StringUtils.isEmpty(title))
55
            return actions;
56
        if (title != null && title.charAt(0) == '"' && title.charAt(title.length() - 1) == '"' ) {
57 52912 sandro.lab
            title = title.substring(1, title.length() - 1);
58 52931 claudio.at
        }
59 52912 sandro.lab
60
        final Oaf.Builder oafBuilder = Oaf.newBuilder();
61 52935 claudio.at
62
        final boolean isVisible = StringUtils.isNotBlank(title) && StreamUtils.toStream(localIdentifiers.iterator())
63
                .map(JsonElement::getAsJsonObject)
64
                .anyMatch(o -> {
65
                    final String type = getStringValue(o, "type");
66
                    return StringUtils.isNotBlank(type) && conf.containsKey(type) && conf.get(type).isVisible();
67
                });
68 52912 sandro.lab
        oafBuilder.setDataInfo(
69
                DataInfo.newBuilder()
70
                        .setInvisible(!isVisible)
71
                        .setDeletedbyinference(false)
72
                        .setInferred(false)
73
                        .setTrust("0.9")
74
                        .setProvenanceaction(getQualifier("sysimport:actionset", "dnet:provenanceActions"))
75
                        .build());
76
        oafBuilder.setKind(Kind.entity);
77
        final String sourceId = String.format("50|%s::%s", nsPrefix, dnetId);
78 52931 claudio.at
        final KeyValue collectedFrom = KeyValue.newBuilder()
79
                .setValue(dsName)
80
                .setKey("10|openaire____::" + AbstractDNetXsltFunctions.md5(dsId))
81
                .build();
82
        final OafEntity.Builder oafEntityBuilder = OafEntity.newBuilder()
83
                .setType(Type.result)
84
                .setDateofcollection(dateOfCollection)
85
                .addCollectedfrom(collectedFrom)
86
                .setId(sourceId);
87
88 52935 claudio.at
        StreamUtils.toStream(localIdentifiers.iterator())
89
                .map(JsonElement::getAsJsonObject)
90
                .map(localIdentifier -> getPid(localIdentifier, conf))
91
                .filter(Objects::nonNull)
92
                .forEach(oafEntityBuilder::addPid);
93
94 52912 sandro.lab
        final Result.Builder result = Result.newBuilder();
95 52931 claudio.at
96
        final Metadata.Builder metadata = Metadata.newBuilder()
97
            .setResulttype(getQualifier(typology, "dnet:result_typologies"))
98
            .setLanguage(Qualifier.newBuilder()
99 52912 sandro.lab
                .setClassid("und")
100
                .setClassname("Undetermined")
101
                .setSchemeid("dent:languages")
102
                .setSchemename("dent:languages")
103
                .build());
104 52935 claudio.at
        if (StringUtils.isNotBlank(title)) {
105 52931 claudio.at
            metadata.addTitle(StructuredProperty.newBuilder()
106
                    .setValue(title)
107
                    .setQualifier(getQualifier("main title", "dnet:dataCite_title"))
108
                    .build());
109
        }
110 52912 sandro.lab
        if (publisher.size() > 0)
111
            metadata.setPublisher(StringField.newBuilder().setValue(publisher.get(0)).build());
112
        if (StringUtils.isNotEmpty(abstractValue)) {
113
            metadata.addDescription(StringField.newBuilder().setValue(abstractValue).build());
114
        }
115
116 53067 claudio.at
        dates.stream()
117
                .map(d -> {
118
                    if (d.length() == 4 && StringUtils.isNumeric(d)) {
119
                        return d + "-01-01";
120
                    } else {
121
                        return d;
122
                    }
123
                }).filter(d -> isValidDate(d))
124
                .forEach(d -> metadata.addRelevantdate(StructuredProperty.newBuilder()
125
                        .setValue(d)
126
                        .setQualifier(getQualifier("dnet:date", "dnet:date"))
127
                        .build()));
128
129 57186 sandro.lab
        if (rootElement.has("subject") && !rootElement.get("subject").isJsonNull()) {
130 52912 sandro.lab
            JsonArray subject = rootElement.getAsJsonArray("subject");
131 52935 claudio.at
            subject.forEach(it -> {
132
                    final JsonObject item = it.getAsJsonObject();
133
                    final String scheme = getStringValue(item, "scheme");
134
                    metadata.addSubject(StructuredProperty.newBuilder()
135
                            .setValue(Objects.requireNonNull(getStringValue(item, "value")))
136
                            .setQualifier(getQualifier(scheme, "dnet:subject"))
137
                            .build());
138
                }
139 52912 sandro.lab
            );
140
        }
141
        int i = 1;
142
        for (String it : authors) {
143
            metadata.addAuthor(Author.newBuilder()
144
                    .setFullname(it)
145
                    .setRank(i++)
146
                    .build());
147
        }
148
        result.setMetadata(metadata.build());
149
150 52935 claudio.at
        localIdentifiers.forEach(it -> {
151 52912 sandro.lab
152 52935 claudio.at
            final JsonObject localIdentifier = it.getAsJsonObject();
153
            final String pidType = getStringValue(localIdentifier, "type");
154
            final ScholExplorerConfiguration currentConf = conf.get(pidType);
155
            if (currentConf.getGeneratedUrl() != null) {
156
                final Instance.Builder instance = Instance.newBuilder();
157
                final String pidValue = getStringValue(localIdentifier, "id");
158
                instance.addUrl(String.format(currentConf.getGeneratedUrl(), pidValue));
159
                instance.setAccessright(Qualifier.newBuilder()
160
                        .setClassid("UNKNOWN")
161
                        .setClassname("not available")
162
                        .setSchemeid("dnet:access_modes")
163
                        .setSchemename("dnet:access_modes")
164
                        .build());
165 52912 sandro.lab
166 52935 claudio.at
                instance.setInstancetype(Qualifier.newBuilder()
167
                        .setClassid("0000")
168
                        .setClassname("Unknown")
169
                        .setSchemeid("dnet:publication_resource")
170
                        .setSchemename("dnet:publication_resource")
171
                        .build());
172
                instance.setHostedby(KeyValue.newBuilder()
173
                        .setKey("10|openaire____::55045bd2a65019fd8e6741a755395c8c")
174
                        .setValue("Unknown Repository")
175
                        .build());
176
177
                instance.setCollectedfrom(collectedFrom);
178
                result.addInstance(instance);
179
            }
180
        });
181
182 52912 sandro.lab
        generateExternalReference(extractRelations(rootElement, "externalRels"))
183
                .forEach(result::addExternalReference);
184
        oafEntityBuilder.setResult(result.build());
185
        oafBuilder.setEntity(oafEntityBuilder.build());
186
187 53035 claudio.at
        //System.out.println(JsonFormat.printToString(oafBuilder.build()));
188 52912 sandro.lab
189
        actions.add(factory.createAtomicAction(setName, agent, oafEntityBuilder.getId(), "result", "body", oafBuilder.build().toByteArray()));
190
191
        final List<JsonObject> doiRels = extractRelations(rootElement, "doiRels");
192
        doiRels.stream().map(it -> convertDoiRelations(it, factory, sourceId, nsPrefix, collectedFrom, setName, agent)).forEach(actions::addAll);
193
        return actions;
194
    }
195
196
197
    private static AtomicAction createResultResultRelation(final String source, final String target,
198
                                                           final KeyValue collectedFrom, final ResultResult resultResultRel, final String relClass, final String cfRelation, final ActionFactory factory, final String setName, final Agent agent) {
199
        final Oaf.Builder oaf = Oaf.newBuilder();
200
        oaf.setDataInfo(
201
                DataInfo.newBuilder()
202
                        .setDeletedbyinference(false)
203
                        .setInferred(false)
204
                        .setTrust("0.9")
205
                        .setProvenanceaction(getQualifier("sysimport:actionset", "dnet:provenanceActions"))
206
                        .build());
207
        oaf.setKind(Kind.relation);
208
        final OafRel.Builder relation = OafRel.newBuilder();
209
        relation.setSource(source);
210
        relation.setTarget(target);
211
        relation.setRelType(RelTypeProtos.RelType.resultResult);
212 53035 claudio.at
213
        if (StringUtils.contains(relClass.toLowerCase(), "supplement")) {
214
            relation.setSubRelType(RelTypeProtos.SubRelType.supplement);
215
        } else {
216
            relation.setSubRelType(RelTypeProtos.SubRelType.publicationDataset);
217
        }
218
219 52912 sandro.lab
        relation.setChild(false);
220
        relation.setResultResult(resultResultRel);
221
        relation.setRelClass(relClass);
222
        relation.addCollectedfrom(collectedFrom);
223
        oaf.setRel(relation.build());
224
225 52935 claudio.at
        //System.out.println(JsonFormat.printToString(oaf.build()));
226 52912 sandro.lab
        return factory.createAtomicAction(setName, agent, source, cfRelation, target, oaf.build().toByteArray());
227
    }
228
229
230
    private static List<AtomicAction> convertDoiRelations(final JsonObject doiRel, final ActionFactory factory, final String sourceId, final String nsPrefix, final KeyValue collectedFrom, final String setName, final Agent agent) {
231
        final String target = Objects.requireNonNull(getStringValue(doiRel, "dnetId")).substring(17);
232
        final String targetId = String.format("50|%s::%s", nsPrefix, target);
233
        final String relationSemantic = getStringValue(doiRel, "relationSemantic");
234
        String cfRelation;
235
        String cfInverseRelation;
236
        ResultResult.Builder resultRel = ResultResult.newBuilder();
237
        ResultResult.Builder resultInverseRel = ResultResult.newBuilder();
238
        String relClass;
239
        String inverseRelClass;
240 52931 claudio.at
241 52912 sandro.lab
        switch (relationSemantic) {
242
            case "isSupplementedBy": {
243
                cfRelation = "resultResult_supplement_isSupplementedBy";
244
                cfInverseRelation = "resultResult_supplement_isSupplementTo";
245
246
                relClass = ResultResult.Supplement.RelName.isSupplementedBy.toString();
247
                inverseRelClass = ResultResult.Supplement.RelName.isSupplementTo.toString();
248
                resultRel.setSupplement(ResultResult.Supplement.newBuilder()
249
                        .setRelMetadata(RelMetadataProtos.RelMetadata.newBuilder()
250
                                .setSemantics(getQualifier(relClass, "dnet:result_result_relations"))
251
                                .build())
252
                        .build());
253
                resultInverseRel.setSupplement(ResultResult.Supplement.newBuilder()
254
                        .setRelMetadata(RelMetadataProtos.RelMetadata.newBuilder()
255
                                .setSemantics(getQualifier(inverseRelClass, "dnet:result_result_relations"))
256
                                .build())
257
                        .build());
258
                break;
259
            }
260
            case "isSupplementTo": {
261
                cfRelation = "resultResult_supplement_isSupplementTo";
262
                cfInverseRelation = "resultResult_supplement_isSupplementedBy";
263
                inverseRelClass = ResultResult.Supplement.RelName.isSupplementedBy.toString();
264
                relClass = ResultResult.Supplement.RelName.isSupplementTo.toString();
265
                resultInverseRel.setSupplement(ResultResult.Supplement.newBuilder()
266
                        .setRelMetadata(RelMetadataProtos.RelMetadata.newBuilder()
267
                                .setSemantics(getQualifier(inverseRelClass, "dnet:result_result_relations"))
268
                                .build())
269
                        .build());
270
                resultRel.setSupplement(ResultResult.Supplement.newBuilder()
271
                        .setRelMetadata(RelMetadataProtos.RelMetadata.newBuilder()
272
                                .setSemantics(getQualifier(relClass, "dnet:result_result_relations"))
273
                                .build())
274
                        .build());
275
                break;
276
            }
277
            default: {
278
                cfRelation = "resultResult_publicationDataset_isRelatedTo";
279
                cfInverseRelation = "resultResult_publicationDataset_isRelatedTo";
280
                relClass = ResultResult.PublicationDataset.RelName.isRelatedTo.toString();
281
                inverseRelClass = relClass;
282
                resultInverseRel.setPublicationDataset(ResultResult.PublicationDataset.newBuilder()
283
                        .setRelMetadata(RelMetadataProtos.RelMetadata.newBuilder()
284
                                .setSemantics(getQualifier(relClass, "dnet:result_result_relations"))
285
                                .build())
286
                        .build());
287
                resultRel = resultInverseRel;
288
            }
289
        }
290
291 52931 claudio.at
        final List<AtomicAction> actions = new ArrayList<>();
292 52912 sandro.lab
        actions.add(createResultResultRelation(sourceId, targetId, collectedFrom, resultRel.build(), relClass, cfRelation, factory, setName, agent));
293
        actions.add(createResultResultRelation(targetId, sourceId, collectedFrom, resultInverseRel.build(), inverseRelClass, cfInverseRelation, factory, setName, agent));
294
295
        return actions;
296
    }
297
298
    private static List<ExternalReference> generateExternalReference(final List<JsonObject> jsonRels) {
299
        final List<ExternalReference> result = new ArrayList<>();
300
301
        jsonRels.forEach(it -> {
302
            ExternalReference.Builder builder = ExternalReference.newBuilder();
303
            if("url".equals(getStringValue(it.getAsJsonObject("id"), "schema"))) {
304
                builder.setUrl(Objects.requireNonNull(getStringValue(it.getAsJsonObject("id"), "identifier")));
305
            }
306
            result.add(builder
307
                    .setRefidentifier(Objects.requireNonNull(getStringValue(it.getAsJsonObject("id"), "identifier")))
308
                    .setSitename(Objects.requireNonNull(getStringValue(it, "collectedFrom")))
309
                    .setQualifier(Qualifier.newBuilder()
310
                            .setClassid(Objects.requireNonNull(getStringValue(it.getAsJsonObject("id"), "schema")))
311
                            .setClassname(Objects.requireNonNull(getStringValue(it.getAsJsonObject("id"), "schema")))
312
                            .setSchemename("dnet:externalReference_typologies")
313
                            .setSchemeid("dnet:externalReference_typologies")
314
                            .build())
315
                    .build());
316
        });
317
        return result;
318
    }
319
320
    private static List<JsonObject> extractRelations(final JsonObject rootElement, final String fieldType) {
321
        final List<JsonObject> result = new ArrayList<>();
322
        if (rootElement.has(fieldType) && rootElement.get(fieldType).isJsonArray()) {
323
            final JsonArray asJsonArray = rootElement.getAsJsonArray(fieldType);
324
            asJsonArray.forEach(it -> result.add(it.getAsJsonObject()));
325
        }
326
        return result;
327
    }
328
329
}