Revision 31116
Added by Marek Horst about 10 years ago
modules/icm-iis-ingest-pmc/trunk/src/test/java/eu/dnetlib/iis/ingest/pmc/citations/PmcXmlHandlerTest.java | ||
---|---|---|
1 |
package eu.dnetlib.iis.ingest.pmc.citations; |
|
2 |
|
|
3 |
import static org.junit.Assert.assertEquals; |
|
4 |
|
|
5 |
import java.io.InputStream; |
|
6 |
|
|
7 |
import javax.xml.parsers.SAXParser; |
|
8 |
import javax.xml.parsers.SAXParserFactory; |
|
9 |
|
|
10 |
import org.junit.Test; |
|
11 |
|
|
12 |
/** |
|
13 |
* {@link PmcXmlHandler} test class. |
|
14 |
* @author mhorst |
|
15 |
* |
|
16 |
*/ |
|
17 |
public class PmcXmlHandlerTest { |
|
18 |
|
|
19 |
@Test |
|
20 |
public void testParsing() throws Exception { |
|
21 |
String filePath = "/eu/dnetlib/iis/ingest/pmc/citations/data/pmc_example.xml"; |
|
22 |
InputStream inputStream = null; |
|
23 |
try { |
|
24 |
SAXParser saxParser = SAXParserFactory.newInstance().newSAXParser(); |
|
25 |
PmcXmlHandler pmcXmlHandler = new PmcXmlHandler(); |
|
26 |
saxParser.parse(inputStream = PmcXmlHandler.class.getResourceAsStream(filePath), |
|
27 |
pmcXmlHandler); |
|
28 |
assertEquals("10629213", pmcXmlHandler.getArticleId()); |
|
29 |
assertEquals("correction", pmcXmlHandler.getArticleType()); |
|
30 |
} finally { |
|
31 |
if (inputStream!=null) { |
|
32 |
inputStream.close(); |
|
33 |
} |
|
34 |
} |
|
35 |
} |
|
36 |
} |
|
0 | 37 |
modules/icm-iis-ingest-pmc/trunk/src/test/resources/eu/dnetlib/iis/ingest/pmc/citations/data/pmc_example.xml | ||
---|---|---|
1 |
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="correction"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Cell Biol</journal-id><journal-title>The Journal of Cell Biology</journal-title><issn pub-type="ppub">0021-9525</issn><issn pub-type="epub">1540-8140</issn><publisher><publisher-name>The Rockefeller University Press</publisher-name></publisher></journal-meta><article-meta><article-id pub-id-type="pmcid">2174284</article-id><article-id pub-id-type="publisher-id">cor1</article-id><article-id pub-id-type="pmid">10629213</article-id><article-categories><subj-group subj-group-type="heading"><subject>Correction</subject></subj-group></article-categories><pub-date pub-type="ppub"><day>24</day><month>1</month><year>2000</year></pub-date><volume>148</volume><issue>2</issue><fpage>1</fpage><lpage>1</lpage><permissions><copyright-statement>© 2000 The Rockefeller University Press</copyright-statement><copyright-year>2000</copyright-year><copyright-holder>The Rockefeller University Press</copyright-holder></permissions><related-article related-article-type="corrected-article" vol="145" page="515" id="N0x2b4fc80N0x3b573b0" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="10225953" ext-link-type="pubmed"/></article-meta></front><body><p><italic>Streptomyces carpinensis</italic> (Fig. 3 C) was incorrectly listed in the legend as <italic>Streptomyces antibioticus</italic>. This error does not affect the conclusions of the paper.</p></body></article> |
|
0 | 2 |
modules/icm-iis-ingest-pmc/trunk/src/main/java/eu/dnetlib/iis/ingest/pmc/citations/PmcXmlHandler.java | ||
---|---|---|
50 | 50 |
|
51 | 51 |
// attributes |
52 | 52 |
private static final String ATTR_PUB_TYPE = "pub-id-type"; |
53 |
private static final String ATTR_ARTICLE_TYPE = "article-type"; |
|
53 | 54 |
|
54 | 55 |
private static final String PUB_ID_TYPE_PMID = "pmid"; |
55 | 56 |
|
... | ... | |
67 | 68 |
|
68 | 69 |
private String articleIdType = null; |
69 | 70 |
|
71 |
private String articleType = null; |
|
72 |
|
|
70 | 73 |
boolean containsTextChild = false; |
71 | 74 |
|
72 |
private int counter = 0;
|
|
75 |
boolean rootElement = true;
|
|
73 | 76 |
|
74 | 77 |
/** |
75 | 78 |
* Default constructor. |
... | ... | |
82 | 85 |
@Override |
83 | 86 |
public void startDocument() throws SAXException { |
84 | 87 |
this.parents = new Stack<String>(); |
85 |
this.counter = 0; |
|
86 | 88 |
clearAllFields(); |
87 | 89 |
} |
88 | 90 |
|
89 | 91 |
@Override |
90 | 92 |
public void startElement(String uri, String localName, String qName, |
91 | 93 |
Attributes attributes) throws SAXException { |
92 |
if (isWithinElement(qName, ELEM_ARTICLE_ID, ELEM_ARTICLE_META)) { |
|
94 |
if (rootElement) { |
|
95 |
rootElement = false; |
|
96 |
this.articleType = attributes.getValue(ATTR_ARTICLE_TYPE); |
|
97 |
} else if (isWithinElement(qName, ELEM_ARTICLE_ID, ELEM_ARTICLE_META)) { |
|
93 | 98 |
this.currentValue = new StringBuilder(); |
94 | 99 |
this.articleIdType = attributes.getValue(ATTR_PUB_TYPE); |
95 | 100 |
} |
... | ... | |
111 | 116 |
private void clearAllFields() { |
112 | 117 |
this.articleId = null; |
113 | 118 |
this.articleIdType = null; |
119 |
this.rootElement = true; |
|
114 | 120 |
} |
115 | 121 |
|
116 | 122 |
boolean isWithinElement(String qName, |
... | ... | |
123 | 129 |
public void endDocument() throws SAXException { |
124 | 130 |
parents.clear(); |
125 | 131 |
parents = null; |
126 |
log.debug("total number of processed records: " + counter); |
|
127 | 132 |
} |
128 | 133 |
|
129 | 134 |
@Override |
... | ... | |
137 | 142 |
public String getArticleId() { |
138 | 143 |
return articleId; |
139 | 144 |
} |
145 |
|
|
146 |
public String getArticleType() { |
|
147 |
return articleType; |
|
148 |
} |
|
140 | 149 |
|
141 | 150 |
} |
Also available in: Unified diff
#757 Introduce article type extraction along with a unit test. The article type will be required for filtering out PMC duplicates and keeping only records of the proper types.