Project

General

Profile

1
package eu.dnetlib.iis.ingest.pmc.metadata;
2

    
3
import java.util.ArrayList;
4
import java.util.HashMap;
5
import java.util.List;
6
import java.util.Stack;
7

    
8
import org.apache.commons.lang.StringUtils;
9
import org.apache.log4j.Logger;
10
import org.jdom.Element;
11
import org.xml.sax.Attributes;
12
import org.xml.sax.SAXException;
13
import org.xml.sax.helpers.DefaultHandler;
14

    
15
import pl.edu.icm.cermine.metadata.affiliation.CRFAffiliationParser;
16
import eu.dnetlib.iis.common.affiliation.AffiliationBuilder;
17
import eu.dnetlib.iis.ingest.pmc.metadata.schemas.ExtractedDocumentMetadata;
18
import eu.dnetlib.iis.ingest.pmc.metadata.schemas.Range;
19
import eu.dnetlib.iis.ingest.pmc.metadata.schemas.ReferenceBasicMetadata;
20
import eu.dnetlib.iis.ingest.pmc.metadata.schemas.ReferenceMetadata;
21
import eu.dnetlib.iis.metadataextraction.schemas.Affiliation;
22

    
23

    
24
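/**
 * SAX handler for PMC XML records. Extracted metadata (entity type, journal,
 * PMID, page range, affiliations and references) is written into the
 * {@link ExtractedDocumentMetadata} builder supplied at construction time.
 * 
 * @author mhorst
 *
 */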
30
public class PmcXmlHandler extends DefaultHandler {
31

    
32
//	front journal
33
	private static final String ELEM_JOURNAL_TITLE = "journal-title";
34
	private static final String ELEM_JOURNAL_TITLE_GROUP = "journal-title-group";
35
//	front article
36
	private static final String ELEM_ARTICLE_META = "article-meta";
37
	private static final String ELEM_ARTICLE_ID = "article-id";
38
	private static final String ELEM_AFFILIATION = "aff";
39
	private static final String ELEM_LABEL = "label";
40
	
41
//	back citations
42
	private static final String ELEM_REF_LIST = "ref-list";
43
	private static final String ELEM_REF = "ref";
44
	private static final String ELEM_PUB_ID = "pub-id";
45
//	back citations meta
46
	private static final String ELEM_ARTICLE_TITLE = "article-title";
47
	private static final String ELEM_SOURCE = "source";
48
	private static final String ELEM_YEAR = "year";
49
	private static final String ELEM_VOLUME = "volume";
50
	private static final String ELEM_ISSUE = "issue";
51
	private static final String ELEM_FPAGE = "fpage";
52
	private static final String ELEM_LPAGE = "lpage";
53
//	back citations author
54
	private static final String ELEM_NAME = "name";
55
	private static final String ELEM_SURNAME = "surname";
56
	private static final String ELEM_GIVEN_NAMES = "given-names";
57
//	back citations contains text child
58
	private static final String ELEM_CITATION = "citation";
59
	private static final String ELEM_ELEMENT_CITATION = "element-citation";
60
	private static final String ELEM_MIXED_CITATION = "mixed-citation";
61
//	attributes
62
	private static final String PUB_ID_TYPE = "pub-id-type";
63
	private static final String ATTR_ARTICLE_TYPE = "article-type";
64
	
65
	private static final String PUB_ID_TYPE_PMID = "pmid";
66
	
67
	private final Logger log = Logger.getLogger(this.getClass());
68
	
69
	private Stack<String> parents;
70
	
71
	private StringBuilder currentValue = new StringBuilder();
72
	
73
	private ReferenceMetadata.Builder currentRefMetaBuilder;
74
	
75
	private String currentSurname = null;
76
	private String currentGivenNames = null;
77
	
78
	private List<CharSequence> currentRefAuthorList;
79
	private StringBuffer currentReferenceText;
80
	private boolean currentReferenceTextExplicitlySet = false;
81
	private String currentReferenceIdType = null;
82
	
83
	private String currentArticleIdType = null;
84
	
85
	boolean containsTextChild = false;
86
	
87
	boolean rootElement = true;
88
	
89
	private final ExtractedDocumentMetadata.Builder builder;
90
	
91
	/**
	 * Creates a handler storing all extracted metadata in the given builder.
	 * 
	 * @param builder record builder populated while the document is parsed
	 */
	public PmcXmlHandler(ExtractedDocumentMetadata.Builder builder) {
		this.builder = builder;
	}
99
	
100
	@Override
101
	public void startDocument() throws SAXException {
102
		this.parents = new Stack<String>();
103
		clearAllFields();
104
	}
105

    
106
	@Override
107
	public void startElement(String uri, String localName, String qName,
108
			Attributes attributes) throws SAXException {
109
		if (rootElement) {
110
			rootElement = false;
111
			builder.setEntityType(attributes.getValue(ATTR_ARTICLE_TYPE));
112
		} else if (isWithinElement(qName, ELEM_JOURNAL_TITLE, ELEM_JOURNAL_TITLE_GROUP)) {
113
			this.currentValue = new StringBuilder();
114
		} else if (isWithinElement(qName, ELEM_ARTICLE_ID, ELEM_ARTICLE_META)) {
115
			this.currentArticleIdType = attributes.getValue(PUB_ID_TYPE);
116
			this.currentValue = new StringBuilder();
117
		} else if (isWithinElement(qName, ELEM_FPAGE, ELEM_ARTICLE_META) ||
118
				isWithinElement(qName, ELEM_LPAGE, ELEM_ARTICLE_META)) {
119
			this.currentValue = new StringBuilder();
120
		} else if (hasAmongParents(qName, ELEM_AFFILIATION, this.parents, ELEM_ARTICLE_META)) {
121
			this.currentValue = new StringBuilder();
122
		} else if (hasAmongParents(qName, ELEM_ARTICLE_TITLE, this.parents, ELEM_REF, ELEM_REF_LIST) ||
123
				hasAmongParents(qName, ELEM_SOURCE, this.parents, ELEM_REF, ELEM_REF_LIST) ||
124
				hasAmongParents(qName, ELEM_YEAR, this.parents, ELEM_REF, ELEM_REF_LIST) ||
125
				hasAmongParents(qName, ELEM_VOLUME, this.parents, ELEM_REF, ELEM_REF_LIST) ||
126
				hasAmongParents(qName, ELEM_ISSUE, this.parents, ELEM_REF, ELEM_REF_LIST) ||
127
				hasAmongParents(qName, ELEM_FPAGE, this.parents, ELEM_REF, ELEM_REF_LIST) ||
128
				hasAmongParents(qName, ELEM_LPAGE, this.parents, ELEM_REF, ELEM_REF_LIST)) {
129
			this.currentValue = new StringBuilder();
130
		} else if (isWithinElement(qName, ELEM_SURNAME, ELEM_NAME) ||
131
				isWithinElement(qName, ELEM_GIVEN_NAMES, ELEM_NAME)) {
132
			this.currentValue = new StringBuilder();
133
		} else if (isWithinElement(qName, ELEM_PUB_ID, ELEM_CITATION) ||
134
				isWithinElement(qName, ELEM_PUB_ID, ELEM_ELEMENT_CITATION) ||
135
				isWithinElement(qName, ELEM_PUB_ID, ELEM_MIXED_CITATION)) {
136
			this.currentReferenceIdType = attributes.getValue(PUB_ID_TYPE);
137
			this.currentValue = new StringBuilder();
138
		} else if (isWithinElement(qName, ELEM_REF, ELEM_REF_LIST)) {
139
			this.currentRefMetaBuilder = ReferenceMetadata.newBuilder();
140
			this.currentRefAuthorList = new ArrayList<CharSequence>();
141
			this.currentReferenceText = new StringBuffer();
142
			ReferenceBasicMetadata.Builder basicMetaBuilder = ReferenceBasicMetadata.newBuilder();
143
			basicMetaBuilder.setExternalIds(new HashMap<CharSequence, CharSequence>());
144
			this.currentRefMetaBuilder.setBasicMetadata(basicMetaBuilder.build());
145
		}
146
		this.parents.push(qName);
147
	}
148

    
149
	@Override
150
	public void endElement(String uri, String localName, String qName)
151
			throws SAXException {
152
		try {
153
		this.parents.pop();
154
		if (isWithinElement(qName, ELEM_JOURNAL_TITLE, ELEM_JOURNAL_TITLE_GROUP)) {
155
			builder.setJournal(this.currentValue.toString().trim());
156
		} else if (isWithinElement(qName, ELEM_ARTICLE_ID, ELEM_ARTICLE_META) &&
157
				PUB_ID_TYPE_PMID.equals(this.currentArticleIdType)) {
158
			builder.setPmid(this.currentValue.toString().trim());
159
		} else if (isWithinElement(qName, ELEM_FPAGE, ELEM_ARTICLE_META)) {
160
			if (builder.getPages()==null) {
161
				builder.setPages(Range.newBuilder().build());
162
			}
163
			builder.getPages().setStart(this.currentValue.toString().trim());
164
		} else if (isWithinElement(qName, ELEM_LPAGE, ELEM_ARTICLE_META)) {
165
			if (builder.getPages()==null) {
166
				builder.setPages(Range.newBuilder().build());
167
			}
168
			builder.getPages().setEnd(this.currentValue.toString().trim());
169
			
170
		} else if (hasAmongParents(qName, ELEM_AFFILIATION, this.parents, ELEM_ARTICLE_META)) {
171
			CRFAffiliationParser affiliationParser = new CRFAffiliationParser();
172
			String affStr = this.currentValue.toString();
173
			if (affStr.trim().length()>0) {
174
				try {
175
					Element parsedAffiliation = affiliationParser.parse(affStr);
176
					if (parsedAffiliation!=null) {
177
						if (builder.getAffiliations()==null) {
178
							builder.setAffiliations(new ArrayList<Affiliation>());
179
						}
180
						Affiliation aff = AffiliationBuilder.build(parsedAffiliation);
181
						if (aff.getRawText().length()>0) {
182
							builder.getAffiliations().add(aff);	
183
						} else {
184
							aff.setRawText(affStr);
185
						}
186
					}	
187
				} catch (IndexOutOfBoundsException e) {
188
//					FIXME remove this catch block when upgrading cermine version
189
					log.error("exception occurred when parsing affiliation: " + affStr, e);
190
				}
191
			}
192
		} else if (hasAmongParents(qName, ELEM_ARTICLE_TITLE, this.parents, ELEM_REF, ELEM_REF_LIST)) {
193
			currentRefMetaBuilder.getBasicMetadata().setTitle(this.currentValue.toString());
194
		} else if (hasAmongParents(qName, ELEM_SOURCE, this.parents, ELEM_REF, ELEM_REF_LIST)) {
195
			currentRefMetaBuilder.getBasicMetadata().setSource(this.currentValue.toString());
196
		} else if (hasAmongParents(qName, ELEM_YEAR, this.parents, ELEM_REF, ELEM_REF_LIST)) {
197
			currentRefMetaBuilder.getBasicMetadata().setYear(this.currentValue.toString());
198
		} else if (hasAmongParents(qName, ELEM_VOLUME, this.parents, ELEM_REF, ELEM_REF_LIST)) {
199
			currentRefMetaBuilder.getBasicMetadata().setVolume(this.currentValue.toString());
200
		} else if (hasAmongParents(qName, ELEM_ISSUE, this.parents, ELEM_REF, ELEM_REF_LIST)) {
201
			currentRefMetaBuilder.getBasicMetadata().setIssue(this.currentValue.toString());
202
		} else if (hasAmongParents(qName, ELEM_FPAGE, this.parents, ELEM_REF, ELEM_REF_LIST)) {
203
			if (currentRefMetaBuilder.getBasicMetadata().getPages()==null) {
204
				currentRefMetaBuilder.getBasicMetadata().setPages(Range.newBuilder().build());
205
			}
206
			currentRefMetaBuilder.getBasicMetadata().getPages().setStart(this.currentValue.toString());
207
		} else if (hasAmongParents(qName, ELEM_LPAGE, this.parents, ELEM_REF, ELEM_REF_LIST)) {
208
			if (currentRefMetaBuilder.getBasicMetadata().getPages()==null) {
209
				currentRefMetaBuilder.getBasicMetadata().setPages(Range.newBuilder().build());
210
			}
211
			currentRefMetaBuilder.getBasicMetadata().getPages().setEnd(this.currentValue.toString());
212
		} else if (hasAmongParents(qName, ELEM_PUB_ID, this.parents, ELEM_REF, ELEM_REF_LIST)) {
213
			if (this.currentReferenceIdType!=null) {
214
				currentRefMetaBuilder.getBasicMetadata().getExternalIds().put(
215
						this.currentReferenceIdType, this.currentValue.toString());	
216
			}
217
		} else if (isWithinElement(qName, ELEM_SURNAME, ELEM_NAME)) {
218
			this.currentSurname = this.currentValue.toString();
219
		} else if (isWithinElement(qName, ELEM_GIVEN_NAMES, ELEM_NAME)) {
220
			this.currentGivenNames = this.currentValue.toString();
221
		}  else if (hasAmongParents(qName, ELEM_NAME, this.parents, ELEM_REF)) {
222
//			in element-citation names are nested in person-group
223
			this.currentRefAuthorList.add(
224
					this.currentSurname + ", " + this.currentGivenNames);
225
			this.currentSurname = null;
226
			this.currentGivenNames = null;
227
		} else if (isWithinElement(qName, ELEM_CITATION, ELEM_REF) ||
228
				isWithinElement(qName, ELEM_ELEMENT_CITATION, ELEM_REF) ||
229
				isWithinElement(qName, ELEM_MIXED_CITATION, ELEM_REF)) {
230
			if (!this.currentRefMetaBuilder.hasText() && 
231
					this.currentReferenceTextExplicitlySet && 
232
					this.currentReferenceText!=null && this.currentReferenceText.length()>0) {
233
				String trimmedRefText = this.currentReferenceText.toString().trim().replaceAll(" +", " ");
234
				if (!trimmedRefText.isEmpty()) {
235
					this.currentRefMetaBuilder.setText(trimmedRefText);
236
				}
237
			}
238
		} else if (isWithinElement(qName, ELEM_REF, ELEM_REF_LIST)) {
239
			if (this.builder.getReferences()==null) {
240
				this.builder.setReferences(new ArrayList<ReferenceMetadata>());
241
			}
242
			this.currentRefMetaBuilder.setPosition(this.builder.getReferences().size()+1);
243

    
244
			if (this.currentRefAuthorList!=null && this.currentRefAuthorList.size()>0) {
245
				this.currentRefMetaBuilder.getBasicMetadata().setAuthors(this.currentRefAuthorList);	
246
			}
247

    
248
			if (!this.currentRefMetaBuilder.hasText()) {
249
				this.currentRefMetaBuilder.setText(generateReferenceRawText(
250
						this.currentRefMetaBuilder.getBasicMetadata()));
251
			}
252
			this.builder.getReferences().add(this.currentRefMetaBuilder.build());
253
//			reference fields cleanup
254
			this.currentRefMetaBuilder = null;
255
			this.currentRefAuthorList = null;
256
			this.currentReferenceText = null;
257
			this.currentReferenceTextExplicitlySet = false;
258
			this.currentReferenceIdType = null;
259
		}
260
		} catch (Exception e) {
261
//			FIXME remote this catch
262
			throw new RuntimeException("unexpected exception while processing doc: " + 
263
					builder.getId(), e);
264
		}
265
	}
266

    
267
	@Override
268
	public void endDocument() throws SAXException {
269
		parents.clear();
270
		parents = null;
271
	}
272

    
273
	@Override
274
	public void characters(char[] ch, int start, int length)
275
			throws SAXException {
276
		String currentElement = this.parents.pop();
277
		try {
278
//			skipping affiliation position element
279
			if (isWithinElement(currentElement, ELEM_LABEL, ELEM_AFFILIATION)) {
280
				return;
281
			}
282
			
283
			this.currentValue.append(ch, start, length);
284
//			handing reference text
285
			if (hasAmongParents(this.parents, ELEM_REF)) {
286
				if (isWithinElement(currentElement, ELEM_CITATION, ELEM_REF) ||
287
						isWithinElement(currentElement, ELEM_ELEMENT_CITATION, ELEM_REF) ||
288
						isWithinElement(currentElement, ELEM_MIXED_CITATION, ELEM_REF)) {
289
//					citation element contents
290
					char[] chunk = new char[length];
291
					System.arraycopy(ch, start, chunk, 0, length);
292
					if (containsNonWhiteCharacter(chunk)) {
293
						this.currentReferenceTextExplicitlySet = true;
294
					}
295
				}
296
				if (this.currentReferenceText.length()>0 &&
297
						isAlphanumeric(ch[start]) && 
298
						isAlphanumeric(this.currentReferenceText.charAt(
299
								this.currentReferenceText.length()-1))) {
300
//					adding missing space separator between two alphanumeric characters
301
					this.currentReferenceText.append(' ');
302
				}
303
				this.currentReferenceText.append(ch, start, length);
304
			}
305
		} finally {
306
			this.parents.push(currentElement);
307
		}
308
	}
309
	
310
	private void clearAllFields() {
311
		this.currentArticleIdType = null;
312
		this.rootElement = true;
313
	}
314
	
315
	static boolean isAlphanumeric(char c) {
316
	        return !(c < 0x30 || (c >= 0x3a && c <= 0x40) || (c > 0x5a && c <= 0x60) || c > 0x7a);
317
	}
318
	
319
	boolean isWithinElement(String qName,
320
			String expectedElement, String expectedParent) {
321
		return qName.equals(expectedElement) && 
322
				(expectedParent==null || !this.parents.isEmpty() && expectedParent.equals(this.parents.peek()));
323
	}
324
	
325
	public static boolean hasAmongParents(String qName,
326
			String expectedElement, Stack<String> parentStack, String... expectedParents) {
327
		if (qName.equals(expectedElement)) {
328
			return hasAmongParents(parentStack, expectedParents);
329
		} else {
330
			return false;	
331
		}
332
	}
333
	
334
	public static boolean hasAmongParents(Stack<String> parentStack, String... expectedParents) {
335
		if (expectedParents.length <= parentStack.size()) {
336
			int startIterationIdx = 0;
337
			for (String currentParent : expectedParents) {
338
				boolean found = false;
339
				for (int i=startIterationIdx; i<parentStack.size(); i++) {
340
//					iteration starts from the bottom while we want to check from top
341
					if (currentParent.equals(parentStack.get(parentStack.size()-(i+1)))) {
342
						startIterationIdx = i+1;
343
						found = true;
344
						break;
345
					}
346
				}
347
				if (!found) {
348
					return false;
349
				}
350
			}
351
			return true;
352
		}
353
		return false;
354
	}
355
	
356
	static boolean containsNonWhiteCharacter(char[] ch) {
357
		if (ch!=null && ch.length>0) {
358
			for (char currentCh : ch) {
359
				if (!Character.isWhitespace(currentCh)) {
360
					return true;
361
				}
362
			}
363
		}
364
		return false;
365
	}
366
	
367
	public static String generateReferenceRawText(ReferenceBasicMetadata refMeta) {
368
        String authors = refMeta.getAuthors()!=null?
369
        		StringUtils.join(refMeta.getAuthors(), ", "):"";
370
        String title = refMeta.getTitle()!=null?refMeta.getTitle().toString():null;
371
        String source = refMeta.getSource()!=null?refMeta.getSource().toString():null;
372
        String year = refMeta.getYear()!=null?refMeta.getYear().toString():null;
373
        String volume = refMeta.getVolume()!=null?refMeta.getVolume().toString():null;
374
        String issue = refMeta.getIssue()!=null?refMeta.getIssue().toString():null;
375
        String fpage = refMeta.getPages()!=null && refMeta.getPages().getStart()!=null
376
        		?refMeta.getPages().getStart().toString():null;
377
        String lpage = refMeta.getPages()!=null && refMeta.getPages().getEnd()!=null
378
        		?refMeta.getPages().getEnd().toString():null;
379

    
380
        StringBuilder builder = new StringBuilder();
381

    
382
        if (StringUtils.isNotBlank(authors)) {
383
            builder.append(authors);
384
            builder.append(". ");
385
        }
386
        if (StringUtils.isNotBlank(title)) {
387
            builder.append(title);
388
            builder.append(". ");
389
        }
390
        if (StringUtils.isNotBlank(source)) {
391
            builder.append(source);
392
            builder.append(". ");
393
        }
394
        if (StringUtils.isNotBlank(year)) {
395
            builder.append(year);
396
        }
397
        if (StringUtils.isNotBlank(volume)) {
398
            builder.append("; ");
399
            builder.append(volume);
400
        }
401
        if (StringUtils.isNotBlank(issue)) {
402
            builder.append(" (");
403
            builder.append(issue);
404
            builder.append(")");
405
        }
406
        if (StringUtils.isNotBlank(fpage)) {
407
            builder.append(": ");
408
            builder.append(fpage);
409
        }
410
        if (StringUtils.isNotBlank(lpage)) {
411
            builder.append("-");
412
            builder.append(lpage);
413
        }
414
        return builder.toString();
415
    }
416
}
(2-2/2)