Project

General

Profile

« Previous | Next » 

Revision 30986

Added by Marek Horst over 9 years ago

#757 fixing pmid and doi matching, fixing sourceDocumentId and destinationDocumentId generation

View differences:

CitationExtractorFunction.java
1 1
package eu.dnetlib.iis.ingest.pmc.citations;
2 2

  
3
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.DST_FIELD;
4
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.DST_IDS_FIELD;
5
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.DST_PMID_FIELD;
6
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.DST_DOI_FIELD;
7
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.POSITION_FIELD;
8
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.RAW_TEXT_FIELD;
9
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.SRC_FIELD;
3 10
import cascading.flow.FlowProcess;
4 11
import cascading.operation.BaseOperation;
5 12
import cascading.operation.Function;
......
7 14
import cascading.tuple.Fields;
8 15
import cascading.tuple.Tuple;
9 16

  
10
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.*;
11

  
12 17
/**
13
 * Extracts Resolved citations from an NLM file. Input pipe should contain a field named TEXT_FIELD. Result stream will
14
 * contain fields SRC_FIELD, POSITION_FIELD, DST_FIELD and DST_IDS_FIELD
18
 * Extracts Resolved citations from an NLM file. 
19
 * Input pipe should contain fields named TEXT_FIELD and ID_FIELD.
15 20
 *
16 21
 * @author Mateusz Fedoryszak (m.fedoryszak@icm.edu.pl)
17 22
 */
18 23
public class CitationExtractorFunction extends BaseOperation implements Function {
19 24
    private final static Fields fields =
20
            new Fields(SRC_FIELD, POSITION_FIELD, RAW_TEXT_FIELD, DST_FIELD, DST_IDS_FIELD);
21
    //private final Logger logger = Logger.getLogger(CitationExtractorFunction.class);
25
            new Fields(SRC_FIELD, POSITION_FIELD, RAW_TEXT_FIELD, 
26
            		DST_FIELD, DST_IDS_FIELD, DST_DOI_FIELD, DST_PMID_FIELD);
22 27

  
23 28
    public  CitationExtractorFunction() {
24 29
        super(2, fields);
......
27 32
    @Override
28 33
    public void operate(FlowProcess flowProcess, FunctionCall functionCall) {
29 34
        String text = functionCall.getArguments().getString(Constants.TEXT_FIELD);
35
        String oaId = functionCall.getArguments().getString(Constants.ID_FIELD);
30 36

  
31 37
        try {
32
            for (ResolvedCitation cit : ResolvedCitation.extractFromNlm(text)) {
38
            for (ResolvedCitation cit : ResolvedCitation.extractFromNlm(oaId, text)) {
33 39
                Tuple result = new Tuple();
34 40
                result.addString(cit.getSourceOaid());
35 41
                result.addInteger(cit.getPosition());
36 42
                result.addString(cit.getRawText());
37
                result.addString(cit.getTargetOaid());
43
                result.addString(null);
38 44
                result.addString(cit.getTargetIdsJson());
45
                result.addString(cit.getTargetDoi());
46
                result.addString(cit.getTargetPmid());
39 47
                functionCall.getOutputCollector().add(result);
40 48
            }
41 49
        } catch (Exception e) {
42
            //in case of parse exception
43
            //logger.error("Error while parsing NLM\n"+text, e);
50
        	throw new RuntimeException(e);
44 51
        }
45 52
    }
46 53
}

Also available in: Unified diff