Project

General

Profile

« Previous | Next » 

Revision 31116

#757 introducing article type extraction along with a unit test. The article type will be required for filtering out PMC duplicates and keeping only records of the proper types.

View differences:

modules/icm-iis-ingest-pmc/trunk/src/test/java/eu/dnetlib/iis/ingest/pmc/citations/PmcXmlHandlerTest.java
1
package eu.dnetlib.iis.ingest.pmc.citations;
2

  
3
import static org.junit.Assert.assertEquals;
4

  
5
import java.io.InputStream;
6

  
7
import javax.xml.parsers.SAXParser;
8
import javax.xml.parsers.SAXParserFactory;
9

  
10
import org.junit.Test;
11

  
12
/**
13
 * {@link PmcXmlHandler} test class.
14
 * @author mhorst
15
 *
16
 */
17
public class PmcXmlHandlerTest {
18

  
19
	@Test
20
	public void testParsing() throws Exception {
21
		String filePath = "/eu/dnetlib/iis/ingest/pmc/citations/data/pmc_example.xml";
22
		InputStream inputStream = null;
23
		try {
24
			SAXParser saxParser = SAXParserFactory.newInstance().newSAXParser();
25
			PmcXmlHandler pmcXmlHandler = new PmcXmlHandler();
26
			saxParser.parse(inputStream = PmcXmlHandler.class.getResourceAsStream(filePath), 
27
					pmcXmlHandler);
28
			assertEquals("10629213", pmcXmlHandler.getArticleId());
29
			assertEquals("correction", pmcXmlHandler.getArticleType());
30
		} finally {
31
			if (inputStream!=null) {
32
				inputStream.close();
33
			}
34
		}
35
	}
36
}
0 37

  
modules/icm-iis-ingest-pmc/trunk/src/test/resources/eu/dnetlib/iis/ingest/pmc/citations/data/pmc_example.xml
1
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="correction"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Cell Biol</journal-id><journal-title>The Journal of Cell Biology</journal-title><issn pub-type="ppub">0021-9525</issn><issn pub-type="epub">1540-8140</issn><publisher><publisher-name>The Rockefeller University Press</publisher-name></publisher></journal-meta><article-meta><article-id pub-id-type="pmcid">2174284</article-id><article-id pub-id-type="publisher-id">cor1</article-id><article-id pub-id-type="pmid">10629213</article-id><article-categories><subj-group subj-group-type="heading"><subject>Correction</subject></subj-group></article-categories><pub-date pub-type="ppub"><day>24</day><month>1</month><year>2000</year></pub-date><volume>148</volume><issue>2</issue><fpage>1</fpage><lpage>1</lpage><permissions><copyright-statement>&#x000a9; 2000 The Rockefeller University Press</copyright-statement><copyright-year>2000</copyright-year><copyright-holder>The Rockefeller University Press</copyright-holder></permissions><related-article related-article-type="corrected-article" vol="145" page="515" id="N0x2b4fc80N0x3b573b0" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="10225953" ext-link-type="pubmed"/></article-meta></front><body><p><italic>Streptomyces carpinensis</italic> (Fig. 3 C) was incorrectly listed in the legend as <italic>Streptomyces antibioticus</italic>. This error does not affect the conclusions of the paper.</p></body></article>
0 2

  
modules/icm-iis-ingest-pmc/trunk/src/main/java/eu/dnetlib/iis/ingest/pmc/citations/PmcXmlHandler.java
50 50

  
51 51
//	attributes
52 52
	private static final String ATTR_PUB_TYPE = "pub-id-type";
53
	private static final String ATTR_ARTICLE_TYPE = "article-type";
53 54
	
54 55
	private static final String PUB_ID_TYPE_PMID = "pmid";
55 56

  
......
67 68
	
68 69
	private String articleIdType = null;
69 70
	
71
	private String articleType = null;
72
	
70 73
	boolean containsTextChild = false;
71 74
	
72
	private int counter = 0;
75
	boolean rootElement = true;
73 76
	
74 77
	/**
75 78
	 * Default constructor.
......
82 85
	@Override
83 86
	public void startDocument() throws SAXException {
84 87
		this.parents = new Stack<String>();
85
		this.counter = 0;
86 88
		clearAllFields();
87 89
	}
88 90

  
89 91
	@Override
90 92
	public void startElement(String uri, String localName, String qName,
91 93
			Attributes attributes) throws SAXException {
92
		if (isWithinElement(qName, ELEM_ARTICLE_ID, ELEM_ARTICLE_META)) {
94
		if (rootElement) {
95
			rootElement = false;
96
			this.articleType = attributes.getValue(ATTR_ARTICLE_TYPE);
97
		} else if (isWithinElement(qName, ELEM_ARTICLE_ID, ELEM_ARTICLE_META)) {
93 98
			this.currentValue = new StringBuilder();
94 99
			this.articleIdType = attributes.getValue(ATTR_PUB_TYPE);
95 100
		}
......
111 116
	private void clearAllFields() {
112 117
		this.articleId = null;
113 118
		this.articleIdType = null;
119
		this.rootElement = true;
114 120
	}
115 121
	
116 122
	boolean isWithinElement(String qName,
......
123 129
	public void endDocument() throws SAXException {
124 130
		parents.clear();
125 131
		parents = null;
126
		log.debug("total number of processed records: " + counter);
127 132
	}
128 133

  
129 134
	@Override
......
137 142
	public String getArticleId() {
138 143
		return articleId;
139 144
	}
145

  
146
	public String getArticleType() {
147
		return articleType;
148
	}
140 149
	
141 150
}

Also available in: Unified diff