Project

General

Profile

« Previous | Next » 

Revision 30986

#757 fixing pmid and doi matching, fixing sourceDocumentId and destinationDocumentId generation

View differences:

modules/icm-iis-ingest-pmc/trunk/src/test/java/eu/dnetlib/iis/ingest/pmc/citations/ResolvedCitationTest.java
15 15
    private static final String testXML = "/eu/dnetlib/iis/ingest/pmc/plaintext/document.nxml";
16 16
    private static final String gappedXML = "/eu/dnetlib/iis/ingest/pmc/plaintext/document_gapped.nxml";
17 17

  
18
    @Test
19
	public void testPmidToOaid() throws Exception {
20
        assertEquals("50|od_______908::0000194eb47642a5cda25359367b9754", ResolvedCitation.pmidToOaid("218258"));
21
    }
18
//    @Test
19
//	public void testPmidToOaid() throws Exception {
20
//        assertEquals("50|od_______908::0000194eb47642a5cda25359367b9754", ResolvedCitation.pmidToOaid("218258"));
21
//    }
22 22

  
23 23
    @Test
24 24
    public void testExtraction() throws Exception {
25
    	String oaId = "some-oaid";
25 26
        InputStream testIS = ClassLoader.class.getResourceAsStream(testXML);
26 27
        String text = IOUtils.toString(testIS);
27
        List<ResolvedCitation> citations = ResolvedCitation.extractFromNlm(text);
28
        List<ResolvedCitation> citations = ResolvedCitation.extractFromNlm(oaId, text);
28 29
        assertEquals(34, citations.size());
29 30

  
30 31
        ResolvedCitation citation10 = citations.get(9);
31
        assertEquals("19943949", citation10.getSourcePmid());
32
        assertEquals(oaId, citation10.getSourceOaid());
32 33
        assertEquals(10, citation10.getPosition());
33 34
        assertEquals("15574825", citation10.getTargetPmid());
34 35

  
35 36
        ResolvedCitation citation30 = citations.get(29);
36
        assertEquals("19943949", citation30.getSourcePmid());
37
        assertEquals(oaId, citation30.getSourceOaid());
37 38
        assertEquals(30, citation30.getPosition());
38 39
        assertEquals("9395406", citation30.getTargetPmid());
39 40

  
......
42 43
    public void testGappedExtraction() throws Exception {
43 44
        InputStream testIS = ClassLoader.class.getResourceAsStream(gappedXML);
44 45
        String text = IOUtils.toString(testIS);
45
        List<ResolvedCitation> citations = ResolvedCitation.extractFromNlm(text);
46
        String oaId = "some-oaid";
47
        List<ResolvedCitation> citations = ResolvedCitation.extractFromNlm(oaId, text);
46 48
        assertEquals(34, citations.size());
47 49

  
48 50
        ResolvedCitation citation10 = citations.get(9);
49
        assertEquals("19943949", citation10.getSourcePmid());
51
        assertEquals(oaId, citation10.getSourceOaid());
50 52
        assertEquals(10, citation10.getPosition());
51 53
        assertEquals("15574825", citation10.getTargetPmid());
52 54
        String expectedRawText10 = "Charlebois, R, Doolittle, W. Computing prokaryotic gene ubiquity: rescuing the core from extinction. Genome Res. 2004; 14 (12): 2469-2477";
53 55
        assertEquals(expectedRawText10, citation10.getRawText());
54 56

  
55 57
        ResolvedCitation citation11 = citations.get(10);
56
        assertEquals("19943949", citation11.getSourcePmid());
58
        assertEquals(oaId, citation11.getSourceOaid());
57 59
        assertEquals(11, citation11.getPosition());
58 60
        assertEquals(null, citation11.getTargetPmid());
59 61
        String expectedRawText11 = "Carbone, A. Computational prediction of genomic functional cores specific to different microbes. J Mol Evol. 2006;63(6):733-746";
60 62
        assertEquals(expectedRawText11, citation11.getRawText());
61 63

  
62 64
        ResolvedCitation citation14 = citations.get(13);
63
        assertEquals("19943949", citation14.getSourcePmid());
65
        assertEquals(oaId, citation14.getSourceOaid());
64 66
        assertEquals(14, citation14.getPosition());
65 67
        assertEquals("17370266", citation14.getTargetPmid());
66 68
        String expectedRawText14 = "Danchin, A, Fang, G, Noria, S. The extant core bacterial proteome is an archive of the origin of life. Proteomics. 2007; 7 (6): 875-889";
modules/icm-iis-ingest-pmc/trunk/src/main/java/eu/dnetlib/iis/ingest/pmc/citations/ResolvedCitationsImporter.java
20 20

  
21 21
/**
22 22
 * @author Mateusz Fedoryszak (m.fedoryszak@icm.edu.pl)
23
 * @author mhorst
23 24
 */
24 25
public class ResolvedCitationsImporter {
25 26
    public static void main(String[] args) throws IOException {
26 27
        String inPath = args[0];
27 28
        String dedupPath = args[1];
28
        String idsPath = args[2];
29
        String outPath = args[3];
29
        String pmidToOaidPath = args[2];
30
        String doiToOaidPath = args[3];
31
        String outPath = args[4];
30 32

  
31 33
        Properties properties = new Properties();
32 34
        AppProps.setApplicationJarClass(properties, ResolvedCitationsImporter.class);
......
37 39

  
38 40
        Tap docTap = new Hfs(new AvroScheme(DocumentText.getClassSchema()), inPath);
39 41
        Tap dedupTap = new Hfs(new AvroScheme(DeduplicationMapping.getClassSchema()), dedupPath);
40
        Tap idsTap = new Hfs(new AvroScheme(DocumentId.getClassSchema()), idsPath);
42
        Tap pmidToOaidTap = new Hfs(new AvroScheme(DeduplicationMapping.getClassSchema()), pmidToOaidPath);
43
        Tap doiToOaidTap = new Hfs(new AvroScheme(DeduplicationMapping.getClassSchema()), doiToOaidPath);
41 44

  
42 45
        Pipe docPipe = new Pipe("doc");
43 46
        Pipe dedupMapPipe = new Pipe("dedup");
44
        Pipe existentDocIdsPipe = new Pipe("existent_ids");
47
        Pipe pmidToOaidPipe = new Pipe("pmid_to_oaid");
48
        Pipe doiToOaidPipe = new Pipe("doi_to_oaid");
45 49

  
46 50
        Tap outTap = new Hfs(new PackedAvroScheme<Citation>(Citation.getClassSchema()), outPath);
47 51

  
48
        SubAssembly main = new ResolvedCitationsSubAssembly(docPipe, dedupMapPipe, existentDocIdsPipe);
52
        SubAssembly main = new ResolvedCitationsSubAssembly(docPipe, dedupMapPipe, 
53
        		pmidToOaidPipe, doiToOaidPipe);
49 54

  
50 55
        FlowDef flowDef = FlowDef.flowDef()
51 56
                .addSource(docPipe, docTap)
52 57
                .addSource(dedupMapPipe, dedupTap)
53
                .addSource(existentDocIdsPipe, idsTap)
58
                .addSource(pmidToOaidPipe, pmidToOaidTap)
59
                .addSource(doiToOaidPipe, doiToOaidTap)
54 60
                .addTailSink(main.getTails()[0], outTap);
55 61

  
56 62
        Flow flow = flowConnector.connect(flowDef);
modules/icm-iis-ingest-pmc/trunk/src/main/java/eu/dnetlib/iis/ingest/pmc/citations/CitationReshaperFuncion.java
1 1
package eu.dnetlib.iis.ingest.pmc.citations;
2 2

  
3
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.DST_FIELD;
4
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.DST_IDS_FIELD;
5
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.RAW_TEXT_FIELD;
6
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.SRC_FIELD;
7

  
8
import java.util.HashMap;
9
import java.util.Iterator;
10
import java.util.Map;
11

  
12
import org.apache.commons.lang3.StringUtils;
13
import org.codehaus.jettison.json.JSONException;
14
import org.codehaus.jettison.json.JSONObject;
15

  
3 16
import cascading.flow.FlowProcess;
4 17
import cascading.operation.BaseOperation;
5 18
import cascading.operation.Function;
6 19
import cascading.operation.FunctionCall;
7 20
import cascading.tuple.Tuple;
8

  
9 21
import eu.dnetlib.iis.ingest.pmc.citations.schemas.Citation;
10
import org.apache.commons.lang3.StringUtils;
11
import org.codehaus.jettison.json.JSONException;
12
import org.codehaus.jettison.json.JSONObject;
13 22

  
14
import java.util.HashMap;
15
import java.util.Iterator;
16
import java.util.Map;
17

  
18
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.*;
19

  
20 23
/**
21 24
 * Converts tuple into a Citation.
22 25
 *
23 26
 * @see eu.dnetlib.iis.ingest.pmc.citations.schemas.Citation
24 27
 *
25 28
 * @author Mateusz Fedoryszak (m.fedoryszak@icm.edu.pl)
29
 * @author mhorst
26 30
 */
27 31
public class CitationReshaperFuncion extends BaseOperation implements Function {
28 32
    @Override
......
35 39
            cit.setRawText(rawText);
36 40
        }
37 41

  
38
        //This field is not empty for documents existent in OA+
39
        String dstId = functionCall.getArguments().getString(ID_FIELD);
42
        String dstId = functionCall.getArguments().getString(DST_FIELD);
40 43
        if (StringUtils.isNotEmpty(dstId)) {
41 44
            cit.setDestinationDocumentId(dstId);
42 45
        }
modules/icm-iis-ingest-pmc/trunk/src/main/java/eu/dnetlib/iis/ingest/pmc/citations/ResolvedCitationsSubAssembly.java
12 12
 * The main workflow.
13 13
 *
14 14
 * @author Mateusz Fedoryszak (m.fedoryszak@icm.edu.pl)
15
 * @author mhorst
16
 * 
15 17
 */
16 18
public class ResolvedCitationsSubAssembly extends SubAssembly {
17
    public ResolvedCitationsSubAssembly(Pipe nlmsPipe, Pipe dedupMapPipe, Pipe existentDocsIdsPipe) {
18
        setPrevious(nlmsPipe, dedupMapPipe, existentDocsIdsPipe);
19
    public ResolvedCitationsSubAssembly(Pipe nlmsPipe, Pipe dedupMapPipe, 
20
    		Pipe pmidToOaidPipe, Pipe doiToOaidPipe) {
21
        setPrevious(nlmsPipe, dedupMapPipe, pmidToOaidPipe, doiToOaidPipe);
19 22

  
20
        // extract resolved citations from NLMs
23
//      extract resolved citations from NLMs
21 24
        Pipe citationsPipe = new Each(nlmsPipe, new CitationExtractorFunction(), Fields.RESULTS);
22 25

  
23
        // replace document ids for ids of deduplicated documents
26
//      translate PMID target identifiers to openaire identifier in DST_FIELD
27
//      grouping citations and pmid-to-oaid mapping entries by pmid
28
        Pipe matchingPipe = new CoGroup(
29
        		citationsPipe, new Fields(DST_PMID_FIELD), 
30
        		pmidToOaidPipe, new Fields(ORIGINAL_ID_FIELD), 
31
        		new LeftJoin());
32
//		replacing destination identifier with the id from the pmid-to-oaid mapping
33
        matchingPipe = new Each(
34
        		matchingPipe,
35
                new ReplacerFunction(new Fields(DST_FIELD), new Fields(NEW_ID_FIELD),
36
                        new Fields(SRC_FIELD, 
37
                        		POSITION_FIELD, RAW_TEXT_FIELD, DST_IDS_FIELD,
38
                        		DST_DOI_FIELD)),
39
                Fields.RESULTS);
40
//      translate DOI target identifiers to openaire identifier in DST_FIELD
41
//      grouping citations and doi-to-oaid mapping entries by DOI
42
        matchingPipe = new CoGroup(
43
        		matchingPipe, new Fields(DST_DOI_FIELD), 
44
        		doiToOaidPipe, new Fields(ORIGINAL_ID_FIELD), 
45
        		new LeftJoin());
46
//		replacing destination identifier with the id from the doi-to-oaid mapping
47
        matchingPipe = new Each(
48
        		matchingPipe,
49
                new ReplacerFunction(new Fields(DST_FIELD), new Fields(NEW_ID_FIELD),
50
                        new Fields(SRC_FIELD,
51
                        		POSITION_FIELD, RAW_TEXT_FIELD, DST_IDS_FIELD)),
52
                Fields.RESULTS);
53
        
54
//		replace document ids for ids of deduplicated documents
24 55
        Pipe dedupPipe;
25
        dedupPipe = new CoGroup(citationsPipe, new Fields(SRC_FIELD), dedupMapPipe,
56
//      grouping citations and dedup entries by source id
57
        dedupPipe = new CoGroup(matchingPipe, new Fields(SRC_FIELD), dedupMapPipe,
26 58
                new Fields(ORIGINAL_ID_FIELD), new LeftJoin());
27

  
59
//		replacing source identifier with id from dedup mapping
28 60
        dedupPipe = new Each(
29 61
                dedupPipe,
30
                new ReplacerFunction(new Fields(SRC_FIELD), new Fields(NEW_ID_FIELD),
62
                new ReplacerFunction(
63
                		new Fields(SRC_FIELD), new Fields(NEW_ID_FIELD),
31 64
                        new Fields(POSITION_FIELD, RAW_TEXT_FIELD, DST_FIELD, DST_IDS_FIELD)),
32 65
                Fields.RESULTS);
33

  
66
//      grouping citations and dedup entries by destination id
34 67
        dedupPipe = new CoGroup(dedupPipe, new Fields(DST_FIELD), dedupMapPipe,
35 68
                new Fields(ORIGINAL_ID_FIELD), new LeftJoin());
36

  
69
//		replacing destination identifier with id from dedup mapping
37 70
        dedupPipe = new Each(
38 71
                dedupPipe,
39
                new ReplacerFunction(new Fields(DST_FIELD), new Fields(NEW_ID_FIELD),
72
                new ReplacerFunction(
73
                		new Fields(DST_FIELD), new Fields(NEW_ID_FIELD),
40 74
                        new Fields(POSITION_FIELD, RAW_TEXT_FIELD, SRC_FIELD, DST_IDS_FIELD)),
41 75
                Fields.RESULTS);
76
        
77
//      mark citations to existent documents
78
//      FIXME this step is probably no longer needed, since identifiers are no longer generated
79
//      Pipe existenceMarkedPipe = new CoGroup(dedupPipe, new Fields(DST_FIELD), existentDocsIdsPipe,
80
//              new Fields(ID_FIELD), new LeftJoin());
42 81

  
43
        // Mark citations to existent documents
44
        Pipe existenceMarkedPipe = new CoGroup(dedupPipe, new Fields(DST_FIELD), existentDocsIdsPipe,
45
                new Fields(ID_FIELD), new LeftJoin());
82
        Pipe outPipe = new Each(dedupPipe, new CitationReshaperFuncion(), Fields.RESULTS);
46 83

  
47
        Pipe outPipe = new Each(existenceMarkedPipe, new CitationReshaperFuncion(), Fields.RESULTS);
48

  
49 84
        setTails(outPipe);
50 85
    }
51 86
}
modules/icm-iis-ingest-pmc/trunk/src/main/java/eu/dnetlib/iis/ingest/pmc/citations/CitationExtractorFunction.java
1 1
package eu.dnetlib.iis.ingest.pmc.citations;
2 2

  
3
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.DST_FIELD;
4
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.DST_IDS_FIELD;
5
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.DST_PMID_FIELD;
6
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.DST_DOI_FIELD;
7
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.POSITION_FIELD;
8
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.RAW_TEXT_FIELD;
9
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.SRC_FIELD;
3 10
import cascading.flow.FlowProcess;
4 11
import cascading.operation.BaseOperation;
5 12
import cascading.operation.Function;
......
7 14
import cascading.tuple.Fields;
8 15
import cascading.tuple.Tuple;
9 16

  
10
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.*;
11

  
12 17
/**
13
 * Extracts Resolved citations from an NLM file. Input pipe should contain a field named TEXT_FIELD. Result stream will
14
 * contain fields SRC_FIELD, POSITION_FIELD, DST_FIELD and DST_IDS_FIELD
18
 * Extracts Resolved citations from an NLM file. 
19
 * Input pipe should contain fields named TEXT_FIELD and ID_FIELD.
15 20
 *
16 21
 * @author Mateusz Fedoryszak (m.fedoryszak@icm.edu.pl)
17 22
 */
18 23
public class CitationExtractorFunction extends BaseOperation implements Function {
19 24
    private final static Fields fields =
20
            new Fields(SRC_FIELD, POSITION_FIELD, RAW_TEXT_FIELD, DST_FIELD, DST_IDS_FIELD);
21
    //private final Logger logger = Logger.getLogger(CitationExtractorFunction.class);
25
            new Fields(SRC_FIELD, POSITION_FIELD, RAW_TEXT_FIELD, 
26
            		DST_FIELD, DST_IDS_FIELD, DST_DOI_FIELD, DST_PMID_FIELD);
22 27

  
23 28
    public  CitationExtractorFunction() {
24 29
        super(2, fields);
......
27 32
    @Override
28 33
    public void operate(FlowProcess flowProcess, FunctionCall functionCall) {
29 34
        String text = functionCall.getArguments().getString(Constants.TEXT_FIELD);
35
        String oaId = functionCall.getArguments().getString(Constants.ID_FIELD);
30 36

  
31 37
        try {
32
            for (ResolvedCitation cit : ResolvedCitation.extractFromNlm(text)) {
38
            for (ResolvedCitation cit : ResolvedCitation.extractFromNlm(oaId, text)) {
33 39
                Tuple result = new Tuple();
34 40
                result.addString(cit.getSourceOaid());
35 41
                result.addInteger(cit.getPosition());
36 42
                result.addString(cit.getRawText());
37
                result.addString(cit.getTargetOaid());
43
                result.addString(null);
38 44
                result.addString(cit.getTargetIdsJson());
45
                result.addString(cit.getTargetDoi());
46
                result.addString(cit.getTargetPmid());
39 47
                functionCall.getOutputCollector().add(result);
40 48
            }
41 49
        } catch (Exception e) {
42
            //in case of parse exception
43
            //logger.error("Error while parsing NLM\n"+text, e);
50
        	throw new RuntimeException(e);
44 51
        }
45 52
    }
46 53
}
modules/icm-iis-ingest-pmc/trunk/src/main/java/eu/dnetlib/iis/ingest/pmc/citations/Constants.java
2 2

  
3 3
/**
4 4
 * @author Mateusz Fedoryszak (m.fedoryszak@icm.edu.pl)
5
 * @author mhorst
5 6
 */
6 7
public class Constants {
7 8
    public static final String SRC_FIELD = "sourceDocumentId";
......
13 14
    public static final String ID_FIELD = "id";
14 15
    public static final String TEXT_FIELD = "text";
15 16
    public static final String RAW_TEXT_FIELD = "rawText";
17
    
18
    public static final String DST_DOI_FIELD = "destinationDoi";
19
    public static final String DST_PMID_FIELD = "destinationPmid";
16 20

  
17 21
    private Constants() {
18 22
    }
modules/icm-iis-ingest-pmc/trunk/src/main/java/eu/dnetlib/iis/ingest/pmc/citations/ResolvedCitation.java
1 1
package eu.dnetlib.iis.ingest.pmc.citations;
2 2

  
3
import eu.dnetlib.data.proto.TypeProtos.Type;
4
import eu.dnetlib.data.transform.xml.AbstractDNetOafXsltFunctions;
3
import java.util.ArrayList;
4
import java.util.HashMap;
5
import java.util.List;
6
import java.util.Map;
5 7

  
6 8
import org.apache.commons.io.IOUtils;
7 9
import org.apache.commons.lang.StringUtils;
......
12 14
import pl.edu.icm.ceon.scala_commons.xml.XPathEvaluator;
13 15
import scala.collection.JavaConversions;
14 16

  
15
import java.util.ArrayList;
16
import java.util.HashMap;
17
import java.util.List;
18
import java.util.Map;
19

  
20 17
/**
21 18
 * A model of resolved citation from PMC.
22 19
 *
23 20
 * @author Mateusz Fedoryszak (m.fedoryszak@icm.edu.pl)
24 21
 */
25 22
public class ResolvedCitation {
26
    private final String sourcePmid;
23
    private final String sourceOaid;
27 24
    private final int position;
28 25
    private final String rawText;
29 26
    private final Map<String, String> targetIds;
30 27

  
31
    private static final String PUBMED_NS_PREFIX = "od_______908";
32
    private static final String PUBMED_ID_PREFIX = "oai:europepmc.org:";
33 28
    private static final String PMID_KEY = "pmid";
29
    private static final String DOI_KEY = "doi";
34 30

  
35
    public ResolvedCitation(final String sourcePmid, final int position, final String rawText, final Map<String, String> targetIds) {
36
        this.sourcePmid = sourcePmid;
31
    public ResolvedCitation(final String sourceOaid, final int position, final String rawText, final Map<String, String> targetIds) {
32
        this.sourceOaid = sourceOaid;
37 33
        this.position = position;
38 34
        this.rawText = rawText;
39 35
        this.targetIds = targetIds;
40 36
    }
41 37

  
42 38
    /**
43
     * @return Source (citing) document PubMedID
44
     */
45
    public String getSourcePmid() {
46
        return sourcePmid;
47
    }
48

  
49
    /**
50 39
     * @return Source (citing) document OpenAIRE ID (before dedup)
51 40
     */
52 41
    public String getSourceOaid() {
53
        return pmidToOaid(sourcePmid);
42
        return sourceOaid;
54 43
    }
55 44

  
56 45
    public int getPosition() {
......
68 57
        return targetIds.get(PMID_KEY);
69 58
    }
70 59

  
60

  
71 61
    /**
72
     * @return Target (cited) document OpenAIRE ID (before dedup)
62
     * @return Target (cited) document DOI
73 63
     */
74
    public String getTargetOaid() {
75
        final String pmid = getTargetPmid();
76
        if (pmid != null) {
77
            return pmidToOaid(pmid);
78
        } else {
79
            return null;
80
        }
64
    public String getTargetDoi() {
65
        return targetIds.get(DOI_KEY);
81 66
    }
82

  
67
    
83 68
    public String getTargetIdsJson() {
84 69
        return new JSONObject(targetIds).toString();
85 70
    }
86

  
71
    
87 72
    /**
88
     * Transforms PubMed ID into OpenAIRE ID (before dedup)
89
     */
90
    public static String pmidToOaid(final String pmid) {
91
        return AbstractDNetOafXsltFunctions.oafId(
92
        		Type.result.name(), PUBMED_NS_PREFIX, PUBMED_ID_PREFIX + pmid);
93
    }
94

  
95
    /**
96 73
     * Parses NLM file and extracts resolved citations.
97 74
     */
98
    public static List<ResolvedCitation> extractFromNlm(final String text) {
99
        XPathEvaluator evaluator = XPathEvaluator.fromString(text);
100
        String id = evaluator.apply("/article/front/article-meta/article-id[@pub-id-type='pmid']");
75
    public static List<ResolvedCitation> extractFromNlm(final String oaSourceId, final String text) {
76
        XPathEvaluator evaluator = XPathEvaluator.fromInputStream(IOUtils.toInputStream(text));
101 77

  
102 78
        List<ResolvedCitation> result = new ArrayList<ResolvedCitation>();
103 79
        int position = 1;
......
109 85
                final String idValue = citId.getTextContent();
110 86
                targetIds.put(idType, idValue);
111 87
            }
112
            result.add(new ResolvedCitation(id, position, rawTextGenerator(ref), targetIds));
88
            result.add(new ResolvedCitation(oaSourceId, position, rawTextGenerator(ref), targetIds));
113 89

  
114 90
            ++position;
115 91
        }
modules/icm-iis-ingest-pmc/trunk/src/main/java/eu/dnetlib/iis/ingest/pmc/citations/PmcXmlHandler.java
1
package eu.dnetlib.iis.ingest.pmc.citations;
2

  
3
import java.util.Stack;
4

  
5
import org.apache.log4j.Logger;
6
import org.xml.sax.Attributes;
7
import org.xml.sax.SAXException;
8
import org.xml.sax.helpers.DefaultHandler;
9

  
10
/**
11
 * PMC XML dump SAX handler.
12
 * Notice: writer is not being closed by handler.
13
 * Created outside, let it be closed outside as well.
14
 * 
15
 * Currently extracts pmid identifiers only.
16
 * 
17
 * @author mhorst
18
 *
19
 */
20
public class PmcXmlHandler extends DefaultHandler {
21

  
22
//	elements
23
	private static final String ELEM_ARTICLE = "article";
24
//	front article id
25
//	/article/front/article-meta/article-id[@pub-id-type='pmid']
26
	private static final String ELEM_FRONT = "front";
27
	private static final String ELEM_ARTICLE_META = "article-meta";
28
	private static final String ELEM_ARTICLE_ID = "article-id";
29
//	back citations
30
	private static final String ELEM_BACK = "back";
31
	private static final String ELEM_REF_LIST = "ref-list";
32
	private static final String ELEM_REF = "ref";
33
	private static final String ELEM_PUB_ID = "pub-id";
34
//	back citations meta
35
	private static final String ELEM_ARTICLE_TITLE = "article-title";
36
	private static final String ELEM_SOURCE = "source";
37
	private static final String ELEM_YEAR = "year";
38
	private static final String ELEM_VOLUME = "volume";
39
	private static final String ELEM_ISSUE = "issue";
40
	private static final String ELEM_FPAGE = "fpage";
41
	private static final String ELEM_LPAGE = "lpage";
42
//	back citations author
43
	private static final String ELEM_NAME = "name";
44
	private static final String ELEM_SURNAME = "surname";
45
	private static final String ELEM_GIVEN_NAMES = "given-names";
46
//	back citations contains text child
47
	private static final String ELEM_CITATION = "citation";
48
	private static final String ELEM_ELEMENT_CITATION = "element-citation";
49
	private static final String ELEM_MIXED_CITATION = "mixed-citation";
50

  
51
//	attributes
52
	private static final String ATTR_PUB_TYPE = "pub-id-type";
53
	
54
	private static final String PUB_ID_TYPE_PMID = "pmid";
55

  
56
	
57
	private final Logger log = Logger.getLogger(this.getClass());
58
	
59
	private Stack<String> parents;
60
	
61
	private StringBuilder currentValue = new StringBuilder();
62
	
63
	/**
64
	 * PMID of the article, read from the article-id element whose pub-id-type is "pmid".
65
	 */
66
	private String articleId = null;
67
	
68
	private String articleIdType = null;
69
	
70
	boolean containsTextChild = false;
71
	
72
	private int counter = 0;
73
	
74
	/**
75
	 * Default constructor.
76
	 * Takes no arguments; per-document state is reset in {@link #startDocument()}.
77
	 */
78
	public PmcXmlHandler() {
79
		super();
80
	}
81
	
82
	@Override
83
	public void startDocument() throws SAXException {
84
		this.parents = new Stack<String>();
85
		this.counter = 0;
86
		clearAllFields();
87
	}
88

  
89
	@Override
90
	public void startElement(String uri, String localName, String qName,
91
			Attributes attributes) throws SAXException {
92
		if (isWithinElement(qName, ELEM_ARTICLE_ID, ELEM_ARTICLE_META)) {
93
			this.currentValue = new StringBuilder();
94
			this.articleIdType = attributes.getValue(ATTR_PUB_TYPE);
95
		}
96
		this.parents.push(qName);
97
	}
98

  
99
	@Override
100
	public void endElement(String uri, String localName, String qName)
101
			throws SAXException {
102
		this.parents.pop();
103
		if (isWithinElement(qName, ELEM_ARTICLE_ID, ELEM_ARTICLE_META) &&
104
				PUB_ID_TYPE_PMID.equals(this.articleIdType)) {
105
			this.articleId = this.currentValue.toString().trim();
106
		}
107
//		reset the current value so that characters() stops accumulating text
108
		this.currentValue = null;
109
	}
110

  
111
	private void clearAllFields() {
112
		this.articleId = null;
113
		this.articleIdType = null;
114
	}
115
	
116
	boolean isWithinElement(String qName,
117
			String expectedElement, String expectedParent) {
118
		return qName.equals(expectedElement) && 
119
				(expectedParent==null || !this.parents.isEmpty() && expectedParent.equals(this.parents.peek()));
120
	}
121
	
122
	@Override
123
	public void endDocument() throws SAXException {
124
		parents.clear();
125
		parents = null;
126
		log.debug("total number of processed records: " + counter);
127
	}
128

  
129
	@Override
130
	public void characters(char[] ch, int start, int length)
131
			throws SAXException {
132
		if (this.currentValue!=null) {
133
			this.currentValue.append(ch, start, length);
134
		}
135
	}
136

  
137
	public String getArticleId() {
138
		return articleId;
139
	}
140
	
141
}
0 142

  
modules/icm-iis-ingest-pmc/trunk/src/main/java/eu/dnetlib/iis/ingest/pmc/citations/PmidToOaidMapper.java
1
package eu.dnetlib.iis.ingest.pmc.citations;
2

  
3
import java.io.IOException;
4
import java.io.StringReader;
5

  
6
import javax.xml.parsers.ParserConfigurationException;
7
import javax.xml.parsers.SAXParser;
8
import javax.xml.parsers.SAXParserFactory;
9

  
10
import org.apache.avro.mapred.AvroKey;
11
import org.apache.hadoop.io.NullWritable;
12
import org.apache.hadoop.mapreduce.Mapper;
13
import org.apache.log4j.Logger;
14
import org.xml.sax.InputSource;
15
import org.xml.sax.SAXException;
16

  
17
import eu.dnetlib.iis.importer.schemas.DeduplicationMapping;
18
import eu.dnetlib.iis.metadataextraction.schemas.DocumentText;
19

  
20
/**
21
 * Mapper class creating pmid to oaid mappings from PMC XML files.
22
 * @author mhorst
23
 */
24

  
25
public class PmidToOaidMapper extends Mapper<AvroKey<DocumentText>, NullWritable, AvroKey<DeduplicationMapping>, NullWritable> {
26
	
27
	private final Logger log = Logger.getLogger(PmidToOaidMapper.class);
28
		
29
    @Override
30
	protected void map(AvroKey<DocumentText> key, NullWritable value, Context context) 
31
            throws IOException, InterruptedException {
32
        DocumentText nlm = key.datum();
33
        String pmid = null;
34
        if (nlm.getText() != null) {
35
            try {
36
    			SAXParser saxParser = SAXParserFactory.newInstance().newSAXParser();
37
    			PmcXmlHandler pmcXmlHandler = new PmcXmlHandler();
38
				saxParser.parse(new InputSource(
39
						new StringReader(nlm.getText().toString())), 
40
						pmcXmlHandler);
41
				pmid = pmcXmlHandler.getArticleId();
42
            } catch (ParserConfigurationException e) {
43
            	log.error("Text extraction failed for id " + nlm.getId() + 
44
            			" and text: " + nlm.getText(), e);
45
			} catch (SAXException e) {
46
				log.error("Text extraction failed for id " + nlm.getId() + 
47
            			" and text: " + nlm.getText(), e);
48
			}
49
            if (pmid!=null && !pmid.isEmpty()) {
50
            	DeduplicationMapping output = DeduplicationMapping.newBuilder()
51
                        .setOriginalId(pmid)
52
                        .setNewId(nlm.getId().toString()).build();    
53
            	context.write(new AvroKey<DeduplicationMapping>(output), NullWritable.get());    
54
            } else {
55
            	log.warn("null or empty pmid extracted for id " + nlm.getId() + 
56
            			" and text: " + nlm.getText());
57
            }
58
        }
59
	}
60
}
0 61

  
modules/icm-iis-ingest-pmc/trunk/src/main/resources/eu/dnetlib/iis/ingest/pmc/idmapping/pmidtooaid/job.properties
1
input_document_nlm=/share/contents/xml/europePMC/2014-07-28
2
output=${workingDir}/output
0 3

  
modules/icm-iis-ingest-pmc/trunk/src/main/resources/eu/dnetlib/iis/ingest/pmc/citations/job.properties
1 1
input_document_nlm=/share/contents/xml/europePMC/2014-07-28
2 2
input_dedup_map=/share/import/dedupmapping/2014-06-30
3
input_document_id=/share/transformers/importer/documentmetadata/idextractor/2014-07-10
3
input_pmid_to_oaid=/user/marek.horst/ingest/pmc/idmapping/pmidtooaid/working_dir/output
4
#FIXME: the path below is a copy of input_pmid_to_oaid; replace it with the real doi-to-oaid mapping location
5
input_doi_to_oaid=/user/marek.horst/ingest/pmc/idmapping/pmidtooaid/working_dir/output
4 6
output_citation=${workingDir}/citation
5 7
reduce_tasks=36
modules/icm-iis-ingest-pmc/trunk/src/main/resources/eu/dnetlib/iis/ingest/pmc/citations/oozie_app/workflow.xml
10 10
            <description>mapping between original OpenAire IDs and deduplicated documents' IDs</description>
11 11
        </property>
12 12
        <property>
13
            <name>input_document_id</name>
14
            <description>all existent document ids</description>
13
            <name>input_pmid_to_oaid</name>
14
            <description>pmid to oaid mappings</description>
15 15
        </property>
16 16
        <property>
17
            <name>input_doi_to_oaid</name>
18
            <description>doi to oaid mappings</description>
19
        </property>
20
        <property>
17 21
            <name>output_citation</name>
18 22
            <description>extracted citations</description>
19 23
        </property>
......
43 47
            <java-opts>-Dmapred.reduce.tasks=${reduce_tasks}</java-opts>
44 48
            <arg>${nameNode}${input_document_nlm}</arg>
45 49
            <arg>${nameNode}${input_dedup_map}</arg>
46
            <arg>${nameNode}${input_document_id}</arg>
50
            <arg>${nameNode}${input_pmid_to_oaid}</arg>
51
            <arg>${nameNode}${input_doi_to_oaid}</arg>
47 52
            <arg>${nameNode}${output_citation}</arg>
48 53
        </java>
49 54
        <ok to="end"/>

Also available in: Unified diff