Revision 29807
Added by Marek Horst about 10 years ago
modules/icm-iis-import/trunk/src/main/java/eu/dnetlib/iis/importer/converter/CitationConverter.java | ||
---|---|---|
1 |
package eu.dnetlib.iis.importer.converter; |
|
2 |
|
|
3 |
import java.io.IOException; |
|
4 |
import java.util.ArrayList; |
|
5 |
import java.util.List; |
|
6 |
import java.util.SortedSet; |
|
7 |
|
|
8 |
import org.apache.hadoop.hbase.client.Result; |
|
9 |
|
|
10 |
import eu.dnetlib.data.proto.FieldTypeProtos.ExtraInfo; |
|
11 |
import eu.dnetlib.data.proto.OafProtos.Oaf; |
|
12 |
import eu.dnetlib.iis.citationmatching.schemas.Citation; |
|
13 |
import eu.dnetlib.iis.common.hbase.HBaseConstants; |
|
14 |
import eu.dnetlib.iis.common.model.extrainfo.ExtraInfoConstants; |
|
15 |
import eu.dnetlib.iis.common.model.extrainfo.citations.ComparableCitationEntry; |
|
16 |
import eu.dnetlib.iis.common.model.extrainfo.citations.TypedId; |
|
17 |
import eu.dnetlib.iis.common.model.extrainfo.converter.CitationsExtraInfoConverter; |
|
18 |
import eu.dnetlib.iis.importer.input.approver.ResultApprover; |
|
19 |
|
|
20 |
/** |
|
21 |
* Converter producing {@link Citation} objects based on {@link ExtraInfo} element holding citation |
|
22 |
* XML representation. |
|
23 |
* @author mhorst |
|
24 |
* |
|
25 |
*/ |
|
26 |
public class CitationConverter extends AbstractAvroConverter<Citation[]>{ |
|
27 |
|
|
28 |
/** |
|
29 |
* Citations XML blob converter. |
|
30 |
*/ |
|
31 |
CitationsExtraInfoConverter citationExtraInfoConverter; |
|
32 |
|
|
33 |
public CitationConverter(String encoding, ResultApprover resultApprover) { |
|
34 |
super(encoding, resultApprover); |
|
35 |
citationExtraInfoConverter = new CitationsExtraInfoConverter(); |
|
36 |
} |
|
37 |
|
|
38 |
@Override |
|
39 |
public Citation[] buildObject(Result hbaseResult, Oaf resolvedOafObject) |
|
40 |
throws IOException { |
|
41 |
if (resolvedOafObject!=null && resolvedOafObject.getEntity()!=null && |
|
42 |
resolvedOafObject.getEntity().getExtraInfoList()!=null) { |
|
43 |
String sourceId = resolvedOafObject.getEntity().getId(); |
|
44 |
List<Citation> results = new ArrayList<Citation>(); |
|
45 |
for (ExtraInfo currentExtraInfo : resolvedOafObject.getEntity().getExtraInfoList()) { |
|
46 |
if (ExtraInfoConstants.TYPOLOGY_CITATIONS.equals(currentExtraInfo.getTypology()) && |
|
47 |
currentExtraInfo.getValue()!=null) { |
|
48 |
SortedSet<ComparableCitationEntry> citationSet = citationExtraInfoConverter.deserialize( |
|
49 |
currentExtraInfo.getValue()); |
|
50 |
if (citationSet!=null && citationSet.size()>0) { |
|
51 |
for (ComparableCitationEntry currentEntry : citationSet) { |
|
52 |
for (TypedId currentTypedId : currentEntry.getIdentifiers()) { |
|
53 |
if (ExtraInfoConstants.CITATION_TYPE_OPENAIRE.equals(currentTypedId.getType())) { |
|
54 |
Citation.Builder citationBuilder = Citation.newBuilder(); |
|
55 |
citationBuilder.setSourceDocumentId(sourceId); |
|
56 |
citationBuilder.setDestinationDocumentId(currentTypedId.getValue()); |
|
57 |
citationBuilder.setConfidenceLevel(currentTypedId.getConfidenceLevel() |
|
58 |
/HBaseConstants.CONFIDENCE_TO_TRUST_LEVEL_FACTOR); |
|
59 |
results.add(citationBuilder.build()); |
|
60 |
} |
|
61 |
} |
|
62 |
} |
|
63 |
} |
|
64 |
} |
|
65 |
} |
|
66 |
return results.toArray(new Citation[results.size()]); |
|
67 |
} |
|
68 |
// fallback |
|
69 |
return null; |
|
70 |
} |
|
71 |
|
|
72 |
} |
|
0 | 73 |
modules/icm-iis-import/trunk/src/main/java/eu/dnetlib/iis/importer/mapred/IISDataImporterMapper.java | ||
---|---|---|
33 | 33 |
import eu.dnetlib.data.proto.ResultResultProtos.ResultResult.PublicationDataset; |
34 | 34 |
import eu.dnetlib.data.proto.TypeProtos; |
35 | 35 |
import eu.dnetlib.data.proto.TypeProtos.Type; |
36 |
import eu.dnetlib.iis.citationmatching.schemas.Citation; |
|
36 | 37 |
import eu.dnetlib.iis.common.ByteArrayUtils; |
37 | 38 |
import eu.dnetlib.iis.common.WorkflowRuntimeParameters; |
38 | 39 |
import eu.dnetlib.iis.common.hbase.HBaseConstants; |
39 | 40 |
import eu.dnetlib.iis.common.schemas.DocumentId; |
40 | 41 |
import eu.dnetlib.iis.core.javamapreduce.MultipleOutputs; |
41 | 42 |
import eu.dnetlib.iis.importer.OafHelper; |
43 |
import eu.dnetlib.iis.importer.converter.CitationConverter; |
|
42 | 44 |
import eu.dnetlib.iis.importer.converter.DeduplicationMappingConverter; |
43 | 45 |
import eu.dnetlib.iis.importer.converter.DocumentIdConverter; |
44 | 46 |
import eu.dnetlib.iis.importer.converter.DocumentMetadataConverter; |
... | ... | |
72 | 74 |
|
73 | 75 |
private static final String OUTPUT_NAME_DOCUMENT_META = "output.name.document_meta"; |
74 | 76 |
|
77 |
private static final String OUTPUT_NAME_CITATION = "output.name.citation"; |
|
78 |
|
|
75 | 79 |
private static final String OUTPUT_NAME_DOCUMENT_RELATION = "output.name.document_relation"; |
76 | 80 |
|
77 | 81 |
private static final String OUTPUT_NAME_DOCUMENT_PROJECT = "output.name.document_project"; |
... | ... | |
86 | 90 |
|
87 | 91 |
private String outputNameDocumentMeta; |
88 | 92 |
|
93 |
private String outputNameCitation; |
|
94 |
|
|
89 | 95 |
private String outputNameDocumentRelation; |
90 | 96 |
|
91 | 97 |
private String outputNameDocumentProject; |
... | ... | |
110 | 116 |
|
111 | 117 |
private DocumentMetadataConverter docMetaConverter; |
112 | 118 |
|
119 |
private CitationConverter citationConverter; |
|
120 |
|
|
113 | 121 |
private DocumentRelationConverter docRelationConverter; |
114 | 122 |
|
115 | 123 |
private DocumentToProjectConverter docProjectConverter; |
... | ... | |
122 | 130 |
|
123 | 131 |
private ProjectConverter projectConverter; |
124 | 132 |
|
125 |
// currently content import is disabled in this module |
|
126 |
// private ContentProviderService contentProviderService; |
|
127 |
|
|
128 | 133 |
@Override |
129 | 134 |
protected void setup(Context context) throws IOException, |
130 | 135 |
InterruptedException { |
... | ... | |
179 | 184 |
this.resultApprover, this.fieldApprover, |
180 | 185 |
getCollumnFamily(RelType.personResult, SubRelType.authorship, |
181 | 186 |
Authorship.RelName.hasAuthor.toString())); |
187 |
citationConverter = new CitationConverter(encoding, this.resultApprover); |
|
182 | 188 |
docRelationConverter = new DocumentRelationConverter( |
183 | 189 |
encoding, resultApprover, |
184 | 190 |
getCollumnFamily(RelType.resultResult, SubRelType.publicationDataset, |
... | ... | |
200 | 206 |
if (outputNameDocumentMeta==null) { |
201 | 207 |
throw new RuntimeException("document metadata output name not provided!"); |
202 | 208 |
} |
209 |
outputNameCitation = context.getConfiguration().get(OUTPUT_NAME_CITATION); |
|
210 |
if (outputNameCitation==null) { |
|
211 |
throw new RuntimeException("citation output name not provided!"); |
|
212 |
} |
|
203 | 213 |
outputNameDocumentRelation = context.getConfiguration().get(OUTPUT_NAME_DOCUMENT_RELATION); |
204 | 214 |
if (outputNameDocumentRelation==null) { |
205 | 215 |
throw new RuntimeException("document relation output name not provided!"); |
... | ... | |
263 | 273 |
if (resultApprover.approveBeforeBuilding(oafObj)) { |
264 | 274 |
mos.write(outputNameDocumentMeta, new AvroKey<DocumentMetadata>( |
265 | 275 |
docMetaConverter.buildObject(value, oafObj))); |
276 |
// handling citations retrieved from ExtraInfo XML blob |
|
277 |
Citation[] citations = citationConverter.buildObject(value, oafObj); |
|
278 |
if (citations!=null && citations.length>0) { |
|
279 |
for (Citation citation : citations) { |
|
280 |
mos.write(outputNameCitation, new AvroKey<Citation>(citation)); |
|
281 |
} |
|
282 |
} |
|
266 | 283 |
// handling resultResult relations, required for filtering out existing dataset relations from inferenced dataset relations |
267 | 284 |
DocumentRelation docRel = docRelationConverter.buildObject(value, oafObj); |
268 | 285 |
if (docRel!=null) { |
modules/icm-iis-import/trunk/src/main/resources/eu/dnetlib/iis/importer/mapred_import/oozie_app/workflow.xml | ||
---|---|---|
70 | 70 |
<description>document metadata output subdirectory name</description> |
71 | 71 |
</property> |
72 | 72 |
<property> |
73 |
<name>output_name_citation</name> |
|
74 |
<value>citation</value> |
|
75 |
<description>citation output subdirectory name</description> |
|
76 |
</property> |
|
77 |
<property> |
|
73 | 78 |
<name>output_name_document_relation</name> |
74 | 79 |
<value>docrelation</value> |
75 | 80 |
<description>document to document relation output subdirectory name</description> |
... | ... | |
262 | 267 |
<value>${output_name_document_meta}</value> |
263 | 268 |
</property> |
264 | 269 |
<property> |
270 |
<name>output.name.citation</name> |
|
271 |
<value>${output_name_citation}</value> |
|
272 |
</property> |
|
273 |
<property> |
|
265 | 274 |
<name>output.name.document_relation</name> |
266 | 275 |
<value>${output_name_document_relation}</value> |
267 | 276 |
</property> |
... | ... | |
304 | 313 |
<!-- ## Names of all output ports --> |
305 | 314 |
<property> |
306 | 315 |
<name>avro.mapreduce.multipleoutputs</name> |
307 |
<value>${output_name_document_meta} ${output_name_document_relation} ${output_name_document_project} ${output_name_dedup_mapping} ${output_name_dataset_id} ${output_name_person} ${output_name_project}</value> |
|
316 |
<value>${output_name_document_meta} ${output_name_citation} ${output_name_document_relation} ${output_name_document_project} ${output_name_dedup_mapping} ${output_name_dataset_id} ${output_name_person} ${output_name_project}</value>
|
|
308 | 317 |
</property> |
309 | 318 |
<!-- ## Output classes for all output ports --> |
310 | 319 |
<property> |
... | ... | |
313 | 322 |
<value>org.apache.avro.mapreduce.AvroKeyOutputFormat</value> |
314 | 323 |
</property> |
315 | 324 |
<property> |
325 |
<name>avro.mapreduce.multipleoutputs.namedOutput.${output_name_citation}.format |
|
326 |
</name> |
|
327 |
<value>org.apache.avro.mapreduce.AvroKeyOutputFormat</value> |
|
328 |
</property> |
|
329 |
<property> |
|
316 | 330 |
<name>avro.mapreduce.multipleoutputs.namedOutput.${output_name_document_relation}.format |
317 | 331 |
</name> |
318 | 332 |
<value>org.apache.avro.mapreduce.AvroKeyOutputFormat</value> |
... | ... | |
365 | 379 |
<value>eu.dnetlib.iis.importer.schemas.DocumentMetadata</value> |
366 | 380 |
</property> |
367 | 381 |
<property> |
382 |
<name>eu.dnetlib.iis.avro.multipleoutputs.class.${output_name_citation}</name> |
|
383 |
<value>eu.dnetlib.iis.citationmatching.schemas.Citation</value> |
|
384 |
</property> |
|
385 |
<property> |
|
368 | 386 |
<name>eu.dnetlib.iis.avro.multipleoutputs.class.${output_name_document_relation}</name> |
369 | 387 |
<value>eu.dnetlib.iis.importer.schemas.DocumentRelation</value> |
370 | 388 |
</property> |
Also available in: Unified diff
introducing shared citation ExtraData XML model in icm-iis-common, implementing citation importer in mapred_import workflow, implementing exporter module