Revision 30986
Added by Marek Horst about 10 years ago
modules/icm-iis-ingest-pmc/trunk/src/test/java/eu/dnetlib/iis/ingest/pmc/citations/ResolvedCitationTest.java | ||
---|---|---|
15 | 15 |
private static final String testXML = "/eu/dnetlib/iis/ingest/pmc/plaintext/document.nxml"; |
16 | 16 |
private static final String gappedXML = "/eu/dnetlib/iis/ingest/pmc/plaintext/document_gapped.nxml"; |
17 | 17 |
|
18 |
@Test |
|
19 |
public void testPmidToOaid() throws Exception { |
|
20 |
assertEquals("50|od_______908::0000194eb47642a5cda25359367b9754", ResolvedCitation.pmidToOaid("218258")); |
|
21 |
} |
|
18 |
// @Test
|
|
19 |
// public void testPmidToOaid() throws Exception {
|
|
20 |
// assertEquals("50|od_______908::0000194eb47642a5cda25359367b9754", ResolvedCitation.pmidToOaid("218258"));
|
|
21 |
// }
|
|
22 | 22 |
|
23 | 23 |
@Test |
24 | 24 |
public void testExtraction() throws Exception { |
25 |
String oaId = "some-oaid"; |
|
25 | 26 |
InputStream testIS = ClassLoader.class.getResourceAsStream(testXML); |
26 | 27 |
String text = IOUtils.toString(testIS); |
27 |
List<ResolvedCitation> citations = ResolvedCitation.extractFromNlm(text); |
|
28 |
List<ResolvedCitation> citations = ResolvedCitation.extractFromNlm(oaId, text);
|
|
28 | 29 |
assertEquals(34, citations.size()); |
29 | 30 |
|
30 | 31 |
ResolvedCitation citation10 = citations.get(9); |
31 |
assertEquals("19943949", citation10.getSourcePmid());
|
|
32 |
assertEquals(oaId, citation10.getSourceOaid());
|
|
32 | 33 |
assertEquals(10, citation10.getPosition()); |
33 | 34 |
assertEquals("15574825", citation10.getTargetPmid()); |
34 | 35 |
|
35 | 36 |
ResolvedCitation citation30 = citations.get(29); |
36 |
assertEquals("19943949", citation30.getSourcePmid());
|
|
37 |
assertEquals(oaId, citation30.getSourceOaid());
|
|
37 | 38 |
assertEquals(30, citation30.getPosition()); |
38 | 39 |
assertEquals("9395406", citation30.getTargetPmid()); |
39 | 40 |
|
... | ... | |
42 | 43 |
public void testGappedExtraction() throws Exception { |
43 | 44 |
InputStream testIS = ClassLoader.class.getResourceAsStream(gappedXML); |
44 | 45 |
String text = IOUtils.toString(testIS); |
45 |
List<ResolvedCitation> citations = ResolvedCitation.extractFromNlm(text); |
|
46 |
String oaId = "some-oaid"; |
|
47 |
List<ResolvedCitation> citations = ResolvedCitation.extractFromNlm(oaId, text); |
|
46 | 48 |
assertEquals(34, citations.size()); |
47 | 49 |
|
48 | 50 |
ResolvedCitation citation10 = citations.get(9); |
49 |
assertEquals("19943949", citation10.getSourcePmid());
|
|
51 |
assertEquals(oaId, citation10.getSourceOaid());
|
|
50 | 52 |
assertEquals(10, citation10.getPosition()); |
51 | 53 |
assertEquals("15574825", citation10.getTargetPmid()); |
52 | 54 |
String expectedRawText10 = "Charlebois, R, Doolittle, W. Computing prokaryotic gene ubiquity: rescuing the core from extinction. Genome Res. 2004; 14 (12): 2469-2477"; |
53 | 55 |
assertEquals(expectedRawText10, citation10.getRawText()); |
54 | 56 |
|
55 | 57 |
ResolvedCitation citation11 = citations.get(10); |
56 |
assertEquals("19943949", citation11.getSourcePmid());
|
|
58 |
assertEquals(oaId, citation11.getSourceOaid());
|
|
57 | 59 |
assertEquals(11, citation11.getPosition()); |
58 | 60 |
assertEquals(null, citation11.getTargetPmid()); |
59 | 61 |
String expectedRawText11 = "Carbone, A. Computational prediction of genomic functional cores specific to different microbes. J Mol Evol. 2006;63(6):733-746"; |
60 | 62 |
assertEquals(expectedRawText11, citation11.getRawText()); |
61 | 63 |
|
62 | 64 |
ResolvedCitation citation14 = citations.get(13); |
63 |
assertEquals("19943949", citation14.getSourcePmid());
|
|
65 |
assertEquals(oaId, citation14.getSourceOaid());
|
|
64 | 66 |
assertEquals(14, citation14.getPosition()); |
65 | 67 |
assertEquals("17370266", citation14.getTargetPmid()); |
66 | 68 |
String expectedRawText14 = "Danchin, A, Fang, G, Noria, S. The extant core bacterial proteome is an archive of the origin of life. Proteomics. 2007; 7 (6): 875-889"; |
modules/icm-iis-ingest-pmc/trunk/src/main/java/eu/dnetlib/iis/ingest/pmc/citations/ResolvedCitationsImporter.java | ||
---|---|---|
20 | 20 |
|
21 | 21 |
/** |
22 | 22 |
* @author Mateusz Fedoryszak (m.fedoryszak@icm.edu.pl) |
23 |
* @author mhorst |
|
23 | 24 |
*/ |
24 | 25 |
public class ResolvedCitationsImporter { |
25 | 26 |
public static void main(String[] args) throws IOException { |
26 | 27 |
String inPath = args[0]; |
27 | 28 |
String dedupPath = args[1]; |
28 |
String idsPath = args[2]; |
|
29 |
String outPath = args[3]; |
|
29 |
String pmidToOaidPath = args[2]; |
|
30 |
String doiToOaidPath = args[3]; |
|
31 |
String outPath = args[4]; |
|
30 | 32 |
|
31 | 33 |
Properties properties = new Properties(); |
32 | 34 |
AppProps.setApplicationJarClass(properties, ResolvedCitationsImporter.class); |
... | ... | |
37 | 39 |
|
38 | 40 |
Tap docTap = new Hfs(new AvroScheme(DocumentText.getClassSchema()), inPath); |
39 | 41 |
Tap dedupTap = new Hfs(new AvroScheme(DeduplicationMapping.getClassSchema()), dedupPath); |
40 |
Tap idsTap = new Hfs(new AvroScheme(DocumentId.getClassSchema()), idsPath); |
|
42 |
Tap pmidToOaidTap = new Hfs(new AvroScheme(DeduplicationMapping.getClassSchema()), pmidToOaidPath); |
|
43 |
Tap doiToOaidTap = new Hfs(new AvroScheme(DeduplicationMapping.getClassSchema()), doiToOaidPath); |
|
41 | 44 |
|
42 | 45 |
Pipe docPipe = new Pipe("doc"); |
43 | 46 |
Pipe dedupMapPipe = new Pipe("dedup"); |
44 |
Pipe existentDocIdsPipe = new Pipe("existent_ids"); |
|
47 |
Pipe pmidToOaidPipe = new Pipe("pmid_to_oaid"); |
|
48 |
Pipe doiToOaidPipe = new Pipe("doi_to_oaid"); |
|
45 | 49 |
|
46 | 50 |
Tap outTap = new Hfs(new PackedAvroScheme<Citation>(Citation.getClassSchema()), outPath); |
47 | 51 |
|
48 |
SubAssembly main = new ResolvedCitationsSubAssembly(docPipe, dedupMapPipe, existentDocIdsPipe); |
|
52 |
SubAssembly main = new ResolvedCitationsSubAssembly(docPipe, dedupMapPipe, |
|
53 |
pmidToOaidPipe, doiToOaidPipe); |
|
49 | 54 |
|
50 | 55 |
FlowDef flowDef = FlowDef.flowDef() |
51 | 56 |
.addSource(docPipe, docTap) |
52 | 57 |
.addSource(dedupMapPipe, dedupTap) |
53 |
.addSource(existentDocIdsPipe, idsTap) |
|
58 |
.addSource(pmidToOaidPipe, pmidToOaidTap) |
|
59 |
.addSource(doiToOaidPipe, doiToOaidTap) |
|
54 | 60 |
.addTailSink(main.getTails()[0], outTap); |
55 | 61 |
|
56 | 62 |
Flow flow = flowConnector.connect(flowDef); |
modules/icm-iis-ingest-pmc/trunk/src/main/java/eu/dnetlib/iis/ingest/pmc/citations/CitationReshaperFuncion.java | ||
---|---|---|
1 | 1 |
package eu.dnetlib.iis.ingest.pmc.citations; |
2 | 2 |
|
3 |
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.DST_FIELD; |
|
4 |
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.DST_IDS_FIELD; |
|
5 |
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.RAW_TEXT_FIELD; |
|
6 |
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.SRC_FIELD; |
|
7 |
|
|
8 |
import java.util.HashMap; |
|
9 |
import java.util.Iterator; |
|
10 |
import java.util.Map; |
|
11 |
|
|
12 |
import org.apache.commons.lang3.StringUtils; |
|
13 |
import org.codehaus.jettison.json.JSONException; |
|
14 |
import org.codehaus.jettison.json.JSONObject; |
|
15 |
|
|
3 | 16 |
import cascading.flow.FlowProcess; |
4 | 17 |
import cascading.operation.BaseOperation; |
5 | 18 |
import cascading.operation.Function; |
6 | 19 |
import cascading.operation.FunctionCall; |
7 | 20 |
import cascading.tuple.Tuple; |
8 |
|
|
9 | 21 |
import eu.dnetlib.iis.ingest.pmc.citations.schemas.Citation; |
10 |
import org.apache.commons.lang3.StringUtils; |
|
11 |
import org.codehaus.jettison.json.JSONException; |
|
12 |
import org.codehaus.jettison.json.JSONObject; |
|
13 | 22 |
|
14 |
import java.util.HashMap; |
|
15 |
import java.util.Iterator; |
|
16 |
import java.util.Map; |
|
17 |
|
|
18 |
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.*; |
|
19 |
|
|
20 | 23 |
/** |
21 | 24 |
* Converts tuple into a Citation. |
22 | 25 |
* |
23 | 26 |
* @see eu.dnetlib.iis.ingest.pmc.citations.schemas.Citation |
24 | 27 |
* |
25 | 28 |
* @author Mateusz Fedoryszak (m.fedoryszak@icm.edu.pl) |
29 |
* @author mhorst |
|
26 | 30 |
*/ |
27 | 31 |
public class CitationReshaperFuncion extends BaseOperation implements Function { |
28 | 32 |
@Override |
... | ... | |
35 | 39 |
cit.setRawText(rawText); |
36 | 40 |
} |
37 | 41 |
|
38 |
//This field is not empty for documents existent in OA+ |
|
39 |
String dstId = functionCall.getArguments().getString(ID_FIELD); |
|
42 |
String dstId = functionCall.getArguments().getString(DST_FIELD); |
|
40 | 43 |
if (StringUtils.isNotEmpty(dstId)) { |
41 | 44 |
cit.setDestinationDocumentId(dstId); |
42 | 45 |
} |
modules/icm-iis-ingest-pmc/trunk/src/main/java/eu/dnetlib/iis/ingest/pmc/citations/ResolvedCitationsSubAssembly.java | ||
---|---|---|
12 | 12 |
* The main workflow. |
13 | 13 |
* |
14 | 14 |
* @author Mateusz Fedoryszak (m.fedoryszak@icm.edu.pl) |
15 |
* @author mhorst |
|
16 |
* |
|
15 | 17 |
*/ |
16 | 18 |
public class ResolvedCitationsSubAssembly extends SubAssembly { |
17 |
public ResolvedCitationsSubAssembly(Pipe nlmsPipe, Pipe dedupMapPipe, Pipe existentDocsIdsPipe) { |
|
18 |
setPrevious(nlmsPipe, dedupMapPipe, existentDocsIdsPipe); |
|
19 |
public ResolvedCitationsSubAssembly(Pipe nlmsPipe, Pipe dedupMapPipe, |
|
20 |
Pipe pmidToOaidPipe, Pipe doiToOaidPipe) { |
|
21 |
setPrevious(nlmsPipe, dedupMapPipe, pmidToOaidPipe, doiToOaidPipe); |
|
19 | 22 |
|
20 |
// extract resolved citations from NLMs
|
|
23 |
// extract resolved citations from NLMs
|
|
21 | 24 |
Pipe citationsPipe = new Each(nlmsPipe, new CitationExtractorFunction(), Fields.RESULTS); |
22 | 25 |
|
23 |
// replace document ids for ids of deduplicated documents |
|
26 |
// translate PMID target identifiers to openaire identifier in DST_FIELD |
|
27 |
// grouping citations and pmid-to-oaid mapping entries by pmid |
|
28 |
Pipe matchingPipe = new CoGroup( |
|
29 |
citationsPipe, new Fields(DST_PMID_FIELD), |
|
30 |
pmidToOaidPipe, new Fields(ORIGINAL_ID_FIELD), |
|
31 |
new LeftJoin()); |
|
32 |
// replacing destination identifier with id from pmid-to-oaid mapping |
|
33 |
matchingPipe = new Each( |
|
34 |
matchingPipe, |
|
35 |
new ReplacerFunction(new Fields(DST_FIELD), new Fields(NEW_ID_FIELD), |
|
36 |
new Fields(SRC_FIELD, |
|
37 |
POSITION_FIELD, RAW_TEXT_FIELD, DST_IDS_FIELD, |
|
38 |
DST_DOI_FIELD)), |
|
39 |
Fields.RESULTS); |
|
40 |
// translate DOI target identifiers to openaire identifier in DST_FIELD |
|
41 |
// grouping citations and doi-to-oaid mapping entries by DOI |
|
42 |
matchingPipe = new CoGroup( |
|
43 |
matchingPipe, new Fields(DST_DOI_FIELD), |
|
44 |
doiToOaidPipe, new Fields(ORIGINAL_ID_FIELD), |
|
45 |
new LeftJoin()); |
|
46 |
// replacing destination identifier with id from doi-to-oaid mapping |
|
47 |
matchingPipe = new Each( |
|
48 |
matchingPipe, |
|
49 |
new ReplacerFunction(new Fields(DST_FIELD), new Fields(NEW_ID_FIELD), |
|
50 |
new Fields(SRC_FIELD, |
|
51 |
POSITION_FIELD, RAW_TEXT_FIELD, DST_IDS_FIELD)), |
|
52 |
Fields.RESULTS); |
|
53 |
|
|
54 |
// replace document ids for ids of deduplicated documents |
|
24 | 55 |
Pipe dedupPipe; |
25 |
dedupPipe = new CoGroup(citationsPipe, new Fields(SRC_FIELD), dedupMapPipe, |
|
56 |
// grouping citations and dedup entries by source id |
|
57 |
dedupPipe = new CoGroup(matchingPipe, new Fields(SRC_FIELD), dedupMapPipe, |
|
26 | 58 |
new Fields(ORIGINAL_ID_FIELD), new LeftJoin()); |
27 |
|
|
59 |
// replacing source identifier with id from dedup mapping |
|
28 | 60 |
dedupPipe = new Each( |
29 | 61 |
dedupPipe, |
30 |
new ReplacerFunction(new Fields(SRC_FIELD), new Fields(NEW_ID_FIELD), |
|
62 |
new ReplacerFunction( |
|
63 |
new Fields(SRC_FIELD), new Fields(NEW_ID_FIELD), |
|
31 | 64 |
new Fields(POSITION_FIELD, RAW_TEXT_FIELD, DST_FIELD, DST_IDS_FIELD)), |
32 | 65 |
Fields.RESULTS); |
33 |
|
|
66 |
// grouping citations and dedup entries by destination id |
|
34 | 67 |
dedupPipe = new CoGroup(dedupPipe, new Fields(DST_FIELD), dedupMapPipe, |
35 | 68 |
new Fields(ORIGINAL_ID_FIELD), new LeftJoin()); |
36 |
|
|
69 |
// replacing destination identifier with id from dedup mapping |
|
37 | 70 |
dedupPipe = new Each( |
38 | 71 |
dedupPipe, |
39 |
new ReplacerFunction(new Fields(DST_FIELD), new Fields(NEW_ID_FIELD), |
|
72 |
new ReplacerFunction( |
|
73 |
new Fields(DST_FIELD), new Fields(NEW_ID_FIELD), |
|
40 | 74 |
new Fields(POSITION_FIELD, RAW_TEXT_FIELD, SRC_FIELD, DST_IDS_FIELD)), |
41 | 75 |
Fields.RESULTS); |
76 |
|
|
77 |
// mark citations to existent documents |
|
78 |
// FIXME I guess we don't need that step anymore, since identifiers are not generated |
|
79 |
// Pipe existenceMarkedPipe = new CoGroup(dedupPipe, new Fields(DST_FIELD), existentDocsIdsPipe, |
|
80 |
// new Fields(ID_FIELD), new LeftJoin()); |
|
42 | 81 |
|
43 |
// Mark citations to existent documents |
|
44 |
Pipe existenceMarkedPipe = new CoGroup(dedupPipe, new Fields(DST_FIELD), existentDocsIdsPipe, |
|
45 |
new Fields(ID_FIELD), new LeftJoin()); |
|
82 |
Pipe outPipe = new Each(dedupPipe, new CitationReshaperFuncion(), Fields.RESULTS); |
|
46 | 83 |
|
47 |
Pipe outPipe = new Each(existenceMarkedPipe, new CitationReshaperFuncion(), Fields.RESULTS); |
|
48 |
|
|
49 | 84 |
setTails(outPipe); |
50 | 85 |
} |
51 | 86 |
} |
modules/icm-iis-ingest-pmc/trunk/src/main/java/eu/dnetlib/iis/ingest/pmc/citations/CitationExtractorFunction.java | ||
---|---|---|
1 | 1 |
package eu.dnetlib.iis.ingest.pmc.citations; |
2 | 2 |
|
3 |
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.DST_FIELD; |
|
4 |
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.DST_IDS_FIELD; |
|
5 |
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.DST_PMID_FIELD; |
|
6 |
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.DST_DOI_FIELD; |
|
7 |
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.POSITION_FIELD; |
|
8 |
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.RAW_TEXT_FIELD; |
|
9 |
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.SRC_FIELD; |
|
3 | 10 |
import cascading.flow.FlowProcess; |
4 | 11 |
import cascading.operation.BaseOperation; |
5 | 12 |
import cascading.operation.Function; |
... | ... | |
7 | 14 |
import cascading.tuple.Fields; |
8 | 15 |
import cascading.tuple.Tuple; |
9 | 16 |
|
10 |
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.*; |
|
11 |
|
|
12 | 17 |
/** |
13 |
* Extracts Resolved citations from an NLM file. Input pipe should contain a field named TEXT_FIELD. Result stream will
|
|
14 |
* contain fields SRC_FIELD, POSITION_FIELD, DST_FIELD and DST_IDS_FIELD
|
|
18 |
* Extracts Resolved citations from an NLM file. |
|
19 |
* Input pipe should contain fields named TEXT_FIELD and ID_FIELD.
|
|
15 | 20 |
* |
16 | 21 |
* @author Mateusz Fedoryszak (m.fedoryszak@icm.edu.pl) |
17 | 22 |
*/ |
18 | 23 |
public class CitationExtractorFunction extends BaseOperation implements Function { |
19 | 24 |
private final static Fields fields = |
20 |
new Fields(SRC_FIELD, POSITION_FIELD, RAW_TEXT_FIELD, DST_FIELD, DST_IDS_FIELD);
|
|
21 |
//private final Logger logger = Logger.getLogger(CitationExtractorFunction.class);
|
|
25 |
new Fields(SRC_FIELD, POSITION_FIELD, RAW_TEXT_FIELD, |
|
26 |
DST_FIELD, DST_IDS_FIELD, DST_DOI_FIELD, DST_PMID_FIELD);
|
|
22 | 27 |
|
23 | 28 |
public CitationExtractorFunction() { |
24 | 29 |
super(2, fields); |
... | ... | |
27 | 32 |
@Override |
28 | 33 |
public void operate(FlowProcess flowProcess, FunctionCall functionCall) { |
29 | 34 |
String text = functionCall.getArguments().getString(Constants.TEXT_FIELD); |
35 |
String oaId = functionCall.getArguments().getString(Constants.ID_FIELD); |
|
30 | 36 |
|
31 | 37 |
try { |
32 |
for (ResolvedCitation cit : ResolvedCitation.extractFromNlm(text)) { |
|
38 |
for (ResolvedCitation cit : ResolvedCitation.extractFromNlm(oaId, text)) {
|
|
33 | 39 |
Tuple result = new Tuple(); |
34 | 40 |
result.addString(cit.getSourceOaid()); |
35 | 41 |
result.addInteger(cit.getPosition()); |
36 | 42 |
result.addString(cit.getRawText()); |
37 |
result.addString(cit.getTargetOaid());
|
|
43 |
result.addString(null);
|
|
38 | 44 |
result.addString(cit.getTargetIdsJson()); |
45 |
result.addString(cit.getTargetDoi()); |
|
46 |
result.addString(cit.getTargetPmid()); |
|
39 | 47 |
functionCall.getOutputCollector().add(result); |
40 | 48 |
} |
41 | 49 |
} catch (Exception e) { |
42 |
//in case of parse exception |
|
43 |
//logger.error("Error while parsing NLM\n"+text, e); |
|
50 |
throw new RuntimeException(e); |
|
44 | 51 |
} |
45 | 52 |
} |
46 | 53 |
} |
modules/icm-iis-ingest-pmc/trunk/src/main/java/eu/dnetlib/iis/ingest/pmc/citations/Constants.java | ||
---|---|---|
2 | 2 |
|
3 | 3 |
/** |
4 | 4 |
* @author Mateusz Fedoryszak (m.fedoryszak@icm.edu.pl) |
5 |
* @author mhorst |
|
5 | 6 |
*/ |
6 | 7 |
public class Constants { |
7 | 8 |
public static final String SRC_FIELD = "sourceDocumentId"; |
... | ... | |
13 | 14 |
public static final String ID_FIELD = "id"; |
14 | 15 |
public static final String TEXT_FIELD = "text"; |
15 | 16 |
public static final String RAW_TEXT_FIELD = "rawText"; |
17 |
|
|
18 |
public static final String DST_DOI_FIELD = "destinationDoi"; |
|
19 |
public static final String DST_PMID_FIELD = "destinationPmid"; |
|
16 | 20 |
|
17 | 21 |
private Constants() { |
18 | 22 |
} |
modules/icm-iis-ingest-pmc/trunk/src/main/java/eu/dnetlib/iis/ingest/pmc/citations/ResolvedCitation.java | ||
---|---|---|
1 | 1 |
package eu.dnetlib.iis.ingest.pmc.citations; |
2 | 2 |
|
3 |
import eu.dnetlib.data.proto.TypeProtos.Type; |
|
4 |
import eu.dnetlib.data.transform.xml.AbstractDNetOafXsltFunctions; |
|
3 |
import java.util.ArrayList; |
|
4 |
import java.util.HashMap; |
|
5 |
import java.util.List; |
|
6 |
import java.util.Map; |
|
5 | 7 |
|
6 | 8 |
import org.apache.commons.io.IOUtils; |
7 | 9 |
import org.apache.commons.lang.StringUtils; |
... | ... | |
12 | 14 |
import pl.edu.icm.ceon.scala_commons.xml.XPathEvaluator; |
13 | 15 |
import scala.collection.JavaConversions; |
14 | 16 |
|
15 |
import java.util.ArrayList; |
|
16 |
import java.util.HashMap; |
|
17 |
import java.util.List; |
|
18 |
import java.util.Map; |
|
19 |
|
|
20 | 17 |
/** |
21 | 18 |
* A model of resolved citation from PMC. |
22 | 19 |
* |
23 | 20 |
* @author Mateusz Fedoryszak (m.fedoryszak@icm.edu.pl) |
24 | 21 |
*/ |
25 | 22 |
public class ResolvedCitation { |
26 |
private final String sourcePmid;
|
|
23 |
private final String sourceOaid;
|
|
27 | 24 |
private final int position; |
28 | 25 |
private final String rawText; |
29 | 26 |
private final Map<String, String> targetIds; |
30 | 27 |
|
31 |
private static final String PUBMED_NS_PREFIX = "od_______908"; |
|
32 |
private static final String PUBMED_ID_PREFIX = "oai:europepmc.org:"; |
|
33 | 28 |
private static final String PMID_KEY = "pmid"; |
29 |
private static final String DOI_KEY = "doi"; |
|
34 | 30 |
|
35 |
public ResolvedCitation(final String sourcePmid, final int position, final String rawText, final Map<String, String> targetIds) {
|
|
36 |
this.sourcePmid = sourcePmid;
|
|
31 |
public ResolvedCitation(final String sourceOaid, final int position, final String rawText, final Map<String, String> targetIds) {
|
|
32 |
this.sourceOaid = sourceOaid;
|
|
37 | 33 |
this.position = position; |
38 | 34 |
this.rawText = rawText; |
39 | 35 |
this.targetIds = targetIds; |
40 | 36 |
} |
41 | 37 |
|
42 | 38 |
/** |
43 |
* @return Source (citing) document PubMedID |
|
44 |
*/ |
|
45 |
public String getSourcePmid() { |
|
46 |
return sourcePmid; |
|
47 |
} |
|
48 |
|
|
49 |
/** |
|
50 | 39 |
* @return Source (citing) document OpenAIRE ID (before dedup) |
51 | 40 |
*/ |
52 | 41 |
public String getSourceOaid() { |
53 |
return pmidToOaid(sourcePmid);
|
|
42 |
return sourceOaid;
|
|
54 | 43 |
} |
55 | 44 |
|
56 | 45 |
public int getPosition() { |
... | ... | |
68 | 57 |
return targetIds.get(PMID_KEY); |
69 | 58 |
} |
70 | 59 |
|
60 |
|
|
71 | 61 |
/** |
72 |
* @return Target (cited) document OpenAIRE ID (before dedup)
|
|
62 |
* @return Target (cited) document DOI
|
|
73 | 63 |
*/ |
74 |
public String getTargetOaid() { |
|
75 |
final String pmid = getTargetPmid(); |
|
76 |
if (pmid != null) { |
|
77 |
return pmidToOaid(pmid); |
|
78 |
} else { |
|
79 |
return null; |
|
80 |
} |
|
64 |
public String getTargetDoi() { |
|
65 |
return targetIds.get(DOI_KEY); |
|
81 | 66 |
} |
82 |
|
|
67 |
|
|
83 | 68 |
public String getTargetIdsJson() { |
84 | 69 |
return new JSONObject(targetIds).toString(); |
85 | 70 |
} |
86 |
|
|
71 |
|
|
87 | 72 |
/** |
88 |
* Transforms PubMed ID into OpenAIRE ID (before dedup) |
|
89 |
*/ |
|
90 |
public static String pmidToOaid(final String pmid) { |
|
91 |
return AbstractDNetOafXsltFunctions.oafId( |
|
92 |
Type.result.name(), PUBMED_NS_PREFIX, PUBMED_ID_PREFIX + pmid); |
|
93 |
} |
|
94 |
|
|
95 |
/** |
|
96 | 73 |
* Parses NLM file and extracts resolved citations. |
97 | 74 |
*/ |
98 |
public static List<ResolvedCitation> extractFromNlm(final String text) { |
|
99 |
XPathEvaluator evaluator = XPathEvaluator.fromString(text); |
|
100 |
String id = evaluator.apply("/article/front/article-meta/article-id[@pub-id-type='pmid']"); |
|
75 |
public static List<ResolvedCitation> extractFromNlm(final String oaSourceId, final String text) { |
|
76 |
XPathEvaluator evaluator = XPathEvaluator.fromInputStream(IOUtils.toInputStream(text)); |
|
101 | 77 |
|
102 | 78 |
List<ResolvedCitation> result = new ArrayList<ResolvedCitation>(); |
103 | 79 |
int position = 1; |
... | ... | |
109 | 85 |
final String idValue = citId.getTextContent(); |
110 | 86 |
targetIds.put(idType, idValue); |
111 | 87 |
} |
112 |
result.add(new ResolvedCitation(id, position, rawTextGenerator(ref), targetIds));
|
|
88 |
result.add(new ResolvedCitation(oaSourceId, position, rawTextGenerator(ref), targetIds));
|
|
113 | 89 |
|
114 | 90 |
++position; |
115 | 91 |
} |
modules/icm-iis-ingest-pmc/trunk/src/main/java/eu/dnetlib/iis/ingest/pmc/citations/PmcXmlHandler.java | ||
---|---|---|
1 |
package eu.dnetlib.iis.ingest.pmc.citations; |
|
2 |
|
|
3 |
import java.util.Stack; |
|
4 |
|
|
5 |
import org.apache.log4j.Logger; |
|
6 |
import org.xml.sax.Attributes; |
|
7 |
import org.xml.sax.SAXException; |
|
8 |
import org.xml.sax.helpers.DefaultHandler; |
|
9 |
|
|
10 |
/** |
|
11 |
* PMC XML dump SAX handler. |
|
12 |
* Notice: writer is not being closed by handler. |
|
13 |
* Created outside, let it be closed outside as well. |
|
14 |
* |
|
15 |
* Currently extracts pmid identifiers only. |
|
16 |
* |
|
17 |
* @author mhorst |
|
18 |
* |
|
19 |
*/ |
|
20 |
public class PmcXmlHandler extends DefaultHandler { |
|
21 |
|
|
22 |
// elements |
|
23 |
private static final String ELEM_ARTICLE = "article"; |
|
24 |
// front article id |
|
25 |
// /article/front/article-meta/article-id[@pub-id-type='pmid'] |
|
26 |
private static final String ELEM_FRONT = "front"; |
|
27 |
private static final String ELEM_ARTICLE_META = "article-meta"; |
|
28 |
private static final String ELEM_ARTICLE_ID = "article-id"; |
|
29 |
// back citations |
|
30 |
private static final String ELEM_BACK = "back"; |
|
31 |
private static final String ELEM_REF_LIST = "ref-list"; |
|
32 |
private static final String ELEM_REF = "ref"; |
|
33 |
private static final String ELEM_PUB_ID = "pub-id"; |
|
34 |
// back citations meta |
|
35 |
private static final String ELEM_ARTICLE_TITLE = "article-title"; |
|
36 |
private static final String ELEM_SOURCE = "source"; |
|
37 |
private static final String ELEM_YEAR = "year"; |
|
38 |
private static final String ELEM_VOLUME = "volume"; |
|
39 |
private static final String ELEM_ISSUE = "issue"; |
|
40 |
private static final String ELEM_FPAGE = "fpage"; |
|
41 |
private static final String ELEM_LPAGE = "lpage"; |
|
42 |
// back citations author |
|
43 |
private static final String ELEM_NAME = "name"; |
|
44 |
private static final String ELEM_SURNAME = "surname"; |
|
45 |
private static final String ELEM_GIVEN_NAMES = "given-names"; |
|
46 |
// back citations contains text child |
|
47 |
private static final String ELEM_CITATION = "citation"; |
|
48 |
private static final String ELEM_ELEMENT_CITATION = "element-citation"; |
|
49 |
private static final String ELEM_MIXED_CITATION = "mixed-citation"; |
|
50 |
|
|
51 |
// attributes |
|
52 |
private static final String ATTR_PUB_TYPE = "pub-id-type"; |
|
53 |
|
|
54 |
private static final String PUB_ID_TYPE_PMID = "pmid"; |
|
55 |
|
|
56 |
|
|
57 |
private final Logger log = Logger.getLogger(this.getClass()); |
|
58 |
|
|
59 |
private Stack<String> parents; |
|
60 |
|
|
61 |
private StringBuilder currentValue = new StringBuilder(); |
|
62 |
|
|
63 |
/** |
|
64 |
* article pmid |
|
65 |
*/ |
|
66 |
private String articleId = null; |
|
67 |
|
|
68 |
private String articleIdType = null; |
|
69 |
|
|
70 |
boolean containsTextChild = false; |
|
71 |
|
|
72 |
private int counter = 0; |
|
73 |
|
|
74 |
/** |
|
75 |
* Default constructor. |
|
76 |
* @param receiver |
|
77 |
*/ |
|
78 |
public PmcXmlHandler() { |
|
79 |
super(); |
|
80 |
} |
|
81 |
|
|
82 |
@Override |
|
83 |
public void startDocument() throws SAXException { |
|
84 |
this.parents = new Stack<String>(); |
|
85 |
this.counter = 0; |
|
86 |
clearAllFields(); |
|
87 |
} |
|
88 |
|
|
89 |
@Override |
|
90 |
public void startElement(String uri, String localName, String qName, |
|
91 |
Attributes attributes) throws SAXException { |
|
92 |
if (isWithinElement(qName, ELEM_ARTICLE_ID, ELEM_ARTICLE_META)) { |
|
93 |
this.currentValue = new StringBuilder(); |
|
94 |
this.articleIdType = attributes.getValue(ATTR_PUB_TYPE); |
|
95 |
} |
|
96 |
this.parents.push(qName); |
|
97 |
} |
|
98 |
|
|
99 |
@Override |
|
100 |
public void endElement(String uri, String localName, String qName) |
|
101 |
throws SAXException { |
|
102 |
this.parents.pop(); |
|
103 |
if (isWithinElement(qName, ELEM_ARTICLE_ID, ELEM_ARTICLE_META) && |
|
104 |
PUB_ID_TYPE_PMID.equals(this.articleIdType)) { |
|
105 |
this.articleId = this.currentValue.toString().trim(); |
|
106 |
} |
|
107 |
// resetting current value; |
|
108 |
this.currentValue = null; |
|
109 |
} |
|
110 |
|
|
111 |
private void clearAllFields() { |
|
112 |
this.articleId = null; |
|
113 |
this.articleIdType = null; |
|
114 |
} |
|
115 |
|
|
116 |
boolean isWithinElement(String qName, |
|
117 |
String expectedElement, String expectedParent) { |
|
118 |
return qName.equals(expectedElement) && |
|
119 |
(expectedParent==null || !this.parents.isEmpty() && expectedParent.equals(this.parents.peek())); |
|
120 |
} |
|
121 |
|
|
122 |
@Override |
|
123 |
public void endDocument() throws SAXException { |
|
124 |
parents.clear(); |
|
125 |
parents = null; |
|
126 |
log.debug("total number of processed records: " + counter); |
|
127 |
} |
|
128 |
|
|
129 |
@Override |
|
130 |
public void characters(char[] ch, int start, int length) |
|
131 |
throws SAXException { |
|
132 |
if (this.currentValue!=null) { |
|
133 |
this.currentValue.append(ch, start, length); |
|
134 |
} |
|
135 |
} |
|
136 |
|
|
137 |
public String getArticleId() { |
|
138 |
return articleId; |
|
139 |
} |
|
140 |
|
|
141 |
} |
|
0 | 142 |
modules/icm-iis-ingest-pmc/trunk/src/main/java/eu/dnetlib/iis/ingest/pmc/citations/PmidToOaidMapper.java | ||
---|---|---|
1 |
package eu.dnetlib.iis.ingest.pmc.citations; |
|
2 |
|
|
3 |
import java.io.IOException; |
|
4 |
import java.io.StringReader; |
|
5 |
|
|
6 |
import javax.xml.parsers.ParserConfigurationException; |
|
7 |
import javax.xml.parsers.SAXParser; |
|
8 |
import javax.xml.parsers.SAXParserFactory; |
|
9 |
|
|
10 |
import org.apache.avro.mapred.AvroKey; |
|
11 |
import org.apache.hadoop.io.NullWritable; |
|
12 |
import org.apache.hadoop.mapreduce.Mapper; |
|
13 |
import org.apache.log4j.Logger; |
|
14 |
import org.xml.sax.InputSource; |
|
15 |
import org.xml.sax.SAXException; |
|
16 |
|
|
17 |
import eu.dnetlib.iis.importer.schemas.DeduplicationMapping; |
|
18 |
import eu.dnetlib.iis.metadataextraction.schemas.DocumentText; |
|
19 |
|
|
20 |
/** |
|
21 |
* Mapper class creating pmid to oaid mappings from PMC XML files. |
|
22 |
* @author mhorst |
|
23 |
*/ |
|
24 |
|
|
25 |
public class PmidToOaidMapper extends Mapper<AvroKey<DocumentText>, NullWritable, AvroKey<DeduplicationMapping>, NullWritable> { |
|
26 |
|
|
27 |
private final Logger log = Logger.getLogger(PmidToOaidMapper.class); |
|
28 |
|
|
29 |
@Override |
|
30 |
protected void map(AvroKey<DocumentText> key, NullWritable value, Context context) |
|
31 |
throws IOException, InterruptedException { |
|
32 |
DocumentText nlm = key.datum(); |
|
33 |
String pmid = null; |
|
34 |
if (nlm.getText() != null) { |
|
35 |
try { |
|
36 |
SAXParser saxParser = SAXParserFactory.newInstance().newSAXParser(); |
|
37 |
PmcXmlHandler pmcXmlHandler = new PmcXmlHandler(); |
|
38 |
saxParser.parse(new InputSource( |
|
39 |
new StringReader(nlm.getText().toString())), |
|
40 |
pmcXmlHandler); |
|
41 |
pmid = pmcXmlHandler.getArticleId(); |
|
42 |
} catch (ParserConfigurationException e) { |
|
43 |
log.error("Text extraction failed for id " + nlm.getId() + |
|
44 |
" and text: " + nlm.getText(), e); |
|
45 |
} catch (SAXException e) { |
|
46 |
log.error("Text extraction failed for id " + nlm.getId() + |
|
47 |
" and text: " + nlm.getText(), e); |
|
48 |
} |
|
49 |
if (pmid!=null && !pmid.isEmpty()) { |
|
50 |
DeduplicationMapping output = DeduplicationMapping.newBuilder() |
|
51 |
.setOriginalId(pmid) |
|
52 |
.setNewId(nlm.getId().toString()).build(); |
|
53 |
context.write(new AvroKey<DeduplicationMapping>(output), NullWritable.get()); |
|
54 |
} else { |
|
55 |
log.warn("null or empty pmid extracted for id " + nlm.getId() + |
|
56 |
" and text: " + nlm.getText()); |
|
57 |
} |
|
58 |
} |
|
59 |
} |
|
60 |
} |
|
0 | 61 |
modules/icm-iis-ingest-pmc/trunk/src/main/resources/eu/dnetlib/iis/ingest/pmc/idmapping/pmidtooaid/job.properties | ||
---|---|---|
1 |
input_document_nlm=/share/contents/xml/europePMC/2014-07-28 |
|
2 |
output=${workingDir}/output |
|
0 | 3 |
modules/icm-iis-ingest-pmc/trunk/src/main/resources/eu/dnetlib/iis/ingest/pmc/citations/job.properties | ||
---|---|---|
1 | 1 |
input_document_nlm=/share/contents/xml/europePMC/2014-07-28 |
2 | 2 |
input_dedup_map=/share/import/dedupmapping/2014-06-30 |
3 |
input_document_id=/share/transformers/importer/documentmetadata/idextractor/2014-07-10 |
|
3 |
input_pmid_to_oaid=/user/marek.horst/ingest/pmc/idmapping/pmidtooaid/working_dir/output |
|
4 |
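# FIXME: the input_doi_to_oaid path below is a placeholder copied from input_pmid_to_oaid; replace it with the real doi-to-oaid mapping location |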
|
5 |
input_doi_to_oaid=/user/marek.horst/ingest/pmc/idmapping/pmidtooaid/working_dir/output |
|
4 | 6 |
output_citation=${workingDir}/citation |
5 | 7 |
reduce_tasks=36 |
modules/icm-iis-ingest-pmc/trunk/src/main/resources/eu/dnetlib/iis/ingest/pmc/citations/oozie_app/workflow.xml | ||
---|---|---|
10 | 10 |
<description>mapping between original OpenAire IDs and deduplicated documents' IDs</description> |
11 | 11 |
</property> |
12 | 12 |
<property> |
13 |
<name>input_document_id</name>
|
|
14 |
<description>all existent document ids</description>
|
|
13 |
<name>input_pmid_to_oaid</name>
|
|
14 |
<description>pmid to oaid mappings</description>
|
|
15 | 15 |
</property> |
16 | 16 |
<property> |
17 |
<name>input_doi_to_oaid</name> |
|
18 |
<description>doi to oaid mappings</description> |
|
19 |
</property> |
|
20 |
<property> |
|
17 | 21 |
<name>output_citation</name> |
18 | 22 |
<description>extracted citations</description> |
19 | 23 |
</property> |
... | ... | |
43 | 47 |
<java-opts>-Dmapred.reduce.tasks=${reduce_tasks}</java-opts> |
44 | 48 |
<arg>${nameNode}${input_document_nlm}</arg> |
45 | 49 |
<arg>${nameNode}${input_dedup_map}</arg> |
46 |
<arg>${nameNode}${input_document_id}</arg> |
|
50 |
<arg>${nameNode}${input_pmid_to_oaid}</arg> |
|
51 |
<arg>${nameNode}${input_doi_to_oaid}</arg> |
|
47 | 52 |
<arg>${nameNode}${output_citation}</arg> |
48 | 53 |
</java> |
49 | 54 |
<ok to="end"/> |
Also available in: Unified diff
#757 fixing pmid and doi matching, fixing sourceDocumentId and destinationDocumentId generation