Revision 30986
Added by Marek Horst over 9 years ago
CitationExtractorFunction.java | ||
---|---|---|
1 | 1 |
package eu.dnetlib.iis.ingest.pmc.citations; |
2 | 2 |
|
3 |
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.DST_FIELD; |
|
4 |
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.DST_IDS_FIELD; |
|
5 |
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.DST_PMID_FIELD; |
|
6 |
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.DST_DOI_FIELD; |
|
7 |
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.POSITION_FIELD; |
|
8 |
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.RAW_TEXT_FIELD; |
|
9 |
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.SRC_FIELD; |
|
3 | 10 |
import cascading.flow.FlowProcess; |
4 | 11 |
import cascading.operation.BaseOperation; |
5 | 12 |
import cascading.operation.Function; |
... | ... | |
7 | 14 |
import cascading.tuple.Fields; |
8 | 15 |
import cascading.tuple.Tuple; |
9 | 16 |
|
10 |
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.*; |
|
11 |
|
|
12 | 17 |
/** |
13 |
* Extracts Resolved citations from an NLM file. Input pipe should contain a field named TEXT_FIELD. Result stream will
|
|
14 |
* contain fields SRC_FIELD, POSITION_FIELD, DST_FIELD and DST_IDS_FIELD
|
|
18 |
* Extracts Resolved citations from an NLM file. |
|
19 |
* Input pipe should contain fields named TEXT_FIELD and ID_FIELD.
|
|
15 | 20 |
* |
16 | 21 |
* @author Mateusz Fedoryszak (m.fedoryszak@icm.edu.pl) |
17 | 22 |
*/ |
18 | 23 |
public class CitationExtractorFunction extends BaseOperation implements Function { |
19 | 24 |
private final static Fields fields = |
20 |
new Fields(SRC_FIELD, POSITION_FIELD, RAW_TEXT_FIELD, DST_FIELD, DST_IDS_FIELD);
|
|
21 |
//private final Logger logger = Logger.getLogger(CitationExtractorFunction.class);
|
|
25 |
new Fields(SRC_FIELD, POSITION_FIELD, RAW_TEXT_FIELD, |
|
26 |
DST_FIELD, DST_IDS_FIELD, DST_DOI_FIELD, DST_PMID_FIELD);
|
|
22 | 27 |
|
23 | 28 |
public CitationExtractorFunction() { |
24 | 29 |
super(2, fields); |
... | ... | |
27 | 32 |
@Override |
28 | 33 |
public void operate(FlowProcess flowProcess, FunctionCall functionCall) { |
29 | 34 |
String text = functionCall.getArguments().getString(Constants.TEXT_FIELD); |
35 |
String oaId = functionCall.getArguments().getString(Constants.ID_FIELD); |
|
30 | 36 |
|
31 | 37 |
try { |
32 |
for (ResolvedCitation cit : ResolvedCitation.extractFromNlm(text)) { |
|
38 |
for (ResolvedCitation cit : ResolvedCitation.extractFromNlm(oaId, text)) {
|
|
33 | 39 |
Tuple result = new Tuple(); |
34 | 40 |
result.addString(cit.getSourceOaid()); |
35 | 41 |
result.addInteger(cit.getPosition()); |
36 | 42 |
result.addString(cit.getRawText()); |
37 |
result.addString(cit.getTargetOaid());
|
|
43 |
result.addString(null);
|
|
38 | 44 |
result.addString(cit.getTargetIdsJson()); |
45 |
result.addString(cit.getTargetDoi()); |
|
46 |
result.addString(cit.getTargetPmid()); |
|
39 | 47 |
functionCall.getOutputCollector().add(result); |
40 | 48 |
} |
41 | 49 |
} catch (Exception e) { |
42 |
//in case of parse exception |
|
43 |
//logger.error("Error while parsing NLM\n"+text, e); |
|
50 |
throw new RuntimeException(e); |
|
44 | 51 |
} |
45 | 52 |
} |
46 | 53 |
} |
Also available in: Unified diff
#757 fixing pmid and doi matching, fixing sourceDocumentId and destinationDocumentId generation