Project

General

Profile

« Previous | Next » 

Revision 29807

introducing shared citation ExtraData XML model in icm-iis-common, implementing citation importer in mapred_import workflow, implementing exporter module

View differences:

modules/icm-iis-import/trunk/src/main/java/eu/dnetlib/iis/importer/converter/CitationConverter.java
1
package eu.dnetlib.iis.importer.converter;
2

  
3
import java.io.IOException;
4
import java.util.ArrayList;
5
import java.util.List;
6
import java.util.SortedSet;
7

  
8
import org.apache.hadoop.hbase.client.Result;
9

  
10
import eu.dnetlib.data.proto.FieldTypeProtos.ExtraInfo;
11
import eu.dnetlib.data.proto.OafProtos.Oaf;
12
import eu.dnetlib.iis.citationmatching.schemas.Citation;
13
import eu.dnetlib.iis.common.hbase.HBaseConstants;
14
import eu.dnetlib.iis.common.model.extrainfo.ExtraInfoConstants;
15
import eu.dnetlib.iis.common.model.extrainfo.citations.ComparableCitationEntry;
16
import eu.dnetlib.iis.common.model.extrainfo.citations.TypedId;
17
import eu.dnetlib.iis.common.model.extrainfo.converter.CitationsExtraInfoConverter;
18
import eu.dnetlib.iis.importer.input.approver.ResultApprover;
19

  
20
/**
21
 * Converter producing {@link Citation} objects based on {@link ExtraInfo} element holding citation
22
 * XML representation.
23
 * @author mhorst
24
 *
25
 */
26
public class CitationConverter extends AbstractAvroConverter<Citation[]>{
27

  
28
	/**
29
	 * Citations XML blob converter.
30
	 */
31
	CitationsExtraInfoConverter citationExtraInfoConverter;
32
	
33
	public CitationConverter(String encoding, ResultApprover resultApprover) {
34
		super(encoding, resultApprover);
35
		citationExtraInfoConverter = new CitationsExtraInfoConverter();
36
	}
37

  
38
	@Override
39
	public Citation[] buildObject(Result hbaseResult, Oaf resolvedOafObject)
40
			throws IOException {
41
		if (resolvedOafObject!=null && resolvedOafObject.getEntity()!=null &&
42
				resolvedOafObject.getEntity().getExtraInfoList()!=null) {
43
			String sourceId = resolvedOafObject.getEntity().getId();
44
			List<Citation> results = new ArrayList<Citation>();
45
			for (ExtraInfo currentExtraInfo : resolvedOafObject.getEntity().getExtraInfoList()) {
46
				if (ExtraInfoConstants.TYPOLOGY_CITATIONS.equals(currentExtraInfo.getTypology()) &&
47
						currentExtraInfo.getValue()!=null) {
48
					SortedSet<ComparableCitationEntry> citationSet = citationExtraInfoConverter.deserialize(
49
							currentExtraInfo.getValue());
50
					if (citationSet!=null && citationSet.size()>0) {
51
						for (ComparableCitationEntry currentEntry : citationSet) {
52
							for (TypedId currentTypedId : currentEntry.getIdentifiers()) {
53
								if (ExtraInfoConstants.CITATION_TYPE_OPENAIRE.equals(currentTypedId.getType())) {
54
									Citation.Builder citationBuilder = Citation.newBuilder();
55
									citationBuilder.setSourceDocumentId(sourceId);
56
									citationBuilder.setDestinationDocumentId(currentTypedId.getValue());
57
									citationBuilder.setConfidenceLevel(currentTypedId.getConfidenceLevel()
58
											/HBaseConstants.CONFIDENCE_TO_TRUST_LEVEL_FACTOR);
59
									results.add(citationBuilder.build());
60
								}
61
							}
62
						}
63
					}
64
				}
65
			}
66
			return results.toArray(new Citation[results.size()]);
67
		}
68
//		fallback
69
		return null;
70
	}
71

  
72
}
0 73

  
modules/icm-iis-import/trunk/src/main/java/eu/dnetlib/iis/importer/mapred/IISDataImporterMapper.java
33 33
import eu.dnetlib.data.proto.ResultResultProtos.ResultResult.PublicationDataset;
34 34
import eu.dnetlib.data.proto.TypeProtos;
35 35
import eu.dnetlib.data.proto.TypeProtos.Type;
36
import eu.dnetlib.iis.citationmatching.schemas.Citation;
36 37
import eu.dnetlib.iis.common.ByteArrayUtils;
37 38
import eu.dnetlib.iis.common.WorkflowRuntimeParameters;
38 39
import eu.dnetlib.iis.common.hbase.HBaseConstants;
39 40
import eu.dnetlib.iis.common.schemas.DocumentId;
40 41
import eu.dnetlib.iis.core.javamapreduce.MultipleOutputs;
41 42
import eu.dnetlib.iis.importer.OafHelper;
43
import eu.dnetlib.iis.importer.converter.CitationConverter;
42 44
import eu.dnetlib.iis.importer.converter.DeduplicationMappingConverter;
43 45
import eu.dnetlib.iis.importer.converter.DocumentIdConverter;
44 46
import eu.dnetlib.iis.importer.converter.DocumentMetadataConverter;
......
72 74
	
73 75
	private static final String OUTPUT_NAME_DOCUMENT_META = "output.name.document_meta";
74 76
	
77
	private static final String OUTPUT_NAME_CITATION = "output.name.citation";
78
	
75 79
	private static final String OUTPUT_NAME_DOCUMENT_RELATION = "output.name.document_relation";
76 80
	
77 81
	private static final String OUTPUT_NAME_DOCUMENT_PROJECT = "output.name.document_project";
......
86 90
	
87 91
	private String outputNameDocumentMeta;
88 92
	
93
	private String outputNameCitation;
94
	
89 95
	private String outputNameDocumentRelation;
90 96
	
91 97
	private String outputNameDocumentProject;
......
110 116
	
111 117
	private DocumentMetadataConverter docMetaConverter;
112 118
	
119
	private CitationConverter citationConverter;
120
	
113 121
	private DocumentRelationConverter docRelationConverter;
114 122
	
115 123
	private DocumentToProjectConverter docProjectConverter;
......
122 130
	
123 131
	private ProjectConverter projectConverter;
124 132
	
125
//	currently content import is disabled in this module
126
//	private ContentProviderService contentProviderService;
127
	
128 133
	@Override
129 134
	protected void setup(Context context) throws IOException,
130 135
			InterruptedException {
......
179 184
				this.resultApprover, this.fieldApprover,
180 185
				getCollumnFamily(RelType.personResult, SubRelType.authorship, 
181 186
						Authorship.RelName.hasAuthor.toString()));
187
		citationConverter = new CitationConverter(encoding, this.resultApprover);
182 188
		docRelationConverter = new DocumentRelationConverter(
183 189
				encoding, resultApprover, 
184 190
				getCollumnFamily(RelType.resultResult, SubRelType.publicationDataset, 
......
200 206
		if (outputNameDocumentMeta==null) {
201 207
			throw new RuntimeException("document metadata output name not provided!");
202 208
		}
209
		outputNameCitation = context.getConfiguration().get(OUTPUT_NAME_CITATION);
210
		if (outputNameCitation==null) {
211
			throw new RuntimeException("citation output name not provided!");
212
		}
203 213
		outputNameDocumentRelation = context.getConfiguration().get(OUTPUT_NAME_DOCUMENT_RELATION);
204 214
		if (outputNameDocumentRelation==null) {
205 215
			throw new RuntimeException("document relation output name not provided!");
......
263 273
		if (resultApprover.approveBeforeBuilding(oafObj)) {
264 274
			mos.write(outputNameDocumentMeta, new AvroKey<DocumentMetadata>(
265 275
					docMetaConverter.buildObject(value, oafObj)));
276
//			handling citations retrieved from ExtraInfo XML blob
277
			Citation[] citations = citationConverter.buildObject(value, oafObj);
278
			if (citations!=null && citations.length>0) {
279
				for (Citation citation : citations) {
280
					mos.write(outputNameCitation, new AvroKey<Citation>(citation));	
281
				}
282
			}
266 283
//			handling resultResult relations, required for filtering out existing dataset relations from inferenced dataset relations
267 284
			DocumentRelation docRel = docRelationConverter.buildObject(value, oafObj);
268 285
			if (docRel!=null) {
modules/icm-iis-import/trunk/src/main/resources/eu/dnetlib/iis/importer/mapred_import/oozie_app/workflow.xml
70 70
			<description>document metadata output subdirectory name</description>
71 71
		</property>
72 72
		<property>
73
			<name>output_name_citation</name>
74
			<value>citation</value>
75
			<description>citation output subdirectory name</description>
76
		</property>
77
		<property>
73 78
			<name>output_name_document_relation</name>
74 79
			<value>docrelation</value>
75 80
			<description>document to document relation output subdirectory name</description>
......
262 267
		            <value>${output_name_document_meta}</value>
263 268
		        </property>
264 269
		        <property>
270
		            <name>output.name.citation</name>
271
		            <value>${output_name_citation}</value>
272
		        </property>
273
		        <property>
265 274
		            <name>output.name.document_relation</name>
266 275
		            <value>${output_name_document_relation}</value>
267 276
		        </property>
......
304 313
				<!-- ## Names of all output ports -->
305 314
				<property>
306 315
					<name>avro.mapreduce.multipleoutputs</name>
307
					<value>${output_name_document_meta} ${output_name_document_relation} ${output_name_document_project} ${output_name_dedup_mapping} ${output_name_dataset_id} ${output_name_person} ${output_name_project}</value>
316
					<value>${output_name_document_meta} ${output_name_citation} ${output_name_document_relation} ${output_name_document_project} ${output_name_dedup_mapping} ${output_name_dataset_id} ${output_name_person} ${output_name_project}</value>
308 317
				</property>
309 318
				<!-- ## Output classes for all output ports -->
310 319
				<property>
......
313 322
					<value>org.apache.avro.mapreduce.AvroKeyOutputFormat</value>
314 323
				</property>
315 324
				<property>
325
					<name>avro.mapreduce.multipleoutputs.namedOutput.${output_name_citation}.format
326
					</name>
327
					<value>org.apache.avro.mapreduce.AvroKeyOutputFormat</value>
328
				</property>
329
				<property>
316 330
					<name>avro.mapreduce.multipleoutputs.namedOutput.${output_name_document_relation}.format
317 331
					</name>
318 332
					<value>org.apache.avro.mapreduce.AvroKeyOutputFormat</value>
......
365 379
					<value>eu.dnetlib.iis.importer.schemas.DocumentMetadata</value>
366 380
				</property>
367 381
				<property>
382
					<name>eu.dnetlib.iis.avro.multipleoutputs.class.${output_name_citation}</name>
383
					<value>eu.dnetlib.iis.citationmatching.schemas.Citation</value>
384
				</property>
385
				<property>
368 386
					<name>eu.dnetlib.iis.avro.multipleoutputs.class.${output_name_document_relation}</name>
369 387
					<value>eu.dnetlib.iis.importer.schemas.DocumentRelation</value>
370 388
				</property>

Also available in: Unified diff