1
|
package eu.dnetlib.iis.ingest.pmc.metadata;
|
2
|
|
3
|
import java.util.ArrayList;
|
4
|
import java.util.HashMap;
|
5
|
import java.util.List;
|
6
|
import java.util.Stack;
|
7
|
|
8
|
import org.apache.commons.lang.StringUtils;
|
9
|
import org.apache.log4j.Logger;
|
10
|
import org.jdom.Element;
|
11
|
import org.xml.sax.Attributes;
|
12
|
import org.xml.sax.SAXException;
|
13
|
import org.xml.sax.helpers.DefaultHandler;
|
14
|
|
15
|
import pl.edu.icm.cermine.metadata.affiliation.CRFAffiliationParser;
|
16
|
import eu.dnetlib.iis.common.affiliation.AffiliationBuilder;
|
17
|
import eu.dnetlib.iis.ingest.pmc.metadata.schemas.ExtractedDocumentMetadata;
|
18
|
import eu.dnetlib.iis.ingest.pmc.metadata.schemas.Range;
|
19
|
import eu.dnetlib.iis.ingest.pmc.metadata.schemas.ReferenceBasicMetadata;
|
20
|
import eu.dnetlib.iis.ingest.pmc.metadata.schemas.ReferenceMetadata;
|
21
|
import eu.dnetlib.iis.metadataextraction.schemas.Affiliation;
|
22
|
|
23
|
|
24
|
/**
|
25
|
* PMC XML SAX handler.
|
26
|
*
|
27
|
* @author mhorst
|
28
|
*
|
29
|
*/
|
30
|
public class PmcXmlHandler extends DefaultHandler {
|
31
|
|
32
|
// front journal
|
33
|
private static final String ELEM_JOURNAL_TITLE = "journal-title";
|
34
|
private static final String ELEM_JOURNAL_TITLE_GROUP = "journal-title-group";
|
35
|
// front article
|
36
|
private static final String ELEM_ARTICLE_META = "article-meta";
|
37
|
private static final String ELEM_ARTICLE_ID = "article-id";
|
38
|
private static final String ELEM_AFFILIATION = "aff";
|
39
|
private static final String ELEM_LABEL = "label";
|
40
|
|
41
|
// back citations
|
42
|
private static final String ELEM_REF_LIST = "ref-list";
|
43
|
private static final String ELEM_REF = "ref";
|
44
|
private static final String ELEM_PUB_ID = "pub-id";
|
45
|
// back citations meta
|
46
|
private static final String ELEM_ARTICLE_TITLE = "article-title";
|
47
|
private static final String ELEM_SOURCE = "source";
|
48
|
private static final String ELEM_YEAR = "year";
|
49
|
private static final String ELEM_VOLUME = "volume";
|
50
|
private static final String ELEM_ISSUE = "issue";
|
51
|
private static final String ELEM_FPAGE = "fpage";
|
52
|
private static final String ELEM_LPAGE = "lpage";
|
53
|
// back citations author
|
54
|
private static final String ELEM_NAME = "name";
|
55
|
private static final String ELEM_SURNAME = "surname";
|
56
|
private static final String ELEM_GIVEN_NAMES = "given-names";
|
57
|
// back citations contains text child
|
58
|
private static final String ELEM_CITATION = "citation";
|
59
|
private static final String ELEM_ELEMENT_CITATION = "element-citation";
|
60
|
private static final String ELEM_MIXED_CITATION = "mixed-citation";
|
61
|
// attributes
|
62
|
private static final String PUB_ID_TYPE = "pub-id-type";
|
63
|
private static final String ATTR_ARTICLE_TYPE = "article-type";
|
64
|
|
65
|
private static final String PUB_ID_TYPE_PMID = "pmid";
|
66
|
|
67
|
private final Logger log = Logger.getLogger(this.getClass());
|
68
|
|
69
|
private Stack<String> parents;
|
70
|
|
71
|
private StringBuilder currentValue = new StringBuilder();
|
72
|
|
73
|
private ReferenceMetadata.Builder currentRefMetaBuilder;
|
74
|
|
75
|
private String currentSurname = null;
|
76
|
private String currentGivenNames = null;
|
77
|
|
78
|
private List<CharSequence> currentRefAuthorList;
|
79
|
private StringBuffer currentReferenceText;
|
80
|
private boolean currentReferenceTextExplicitlySet = false;
|
81
|
private String currentReferenceIdType = null;
|
82
|
|
83
|
private String currentArticleIdType = null;
|
84
|
|
85
|
boolean containsTextChild = false;
|
86
|
|
87
|
boolean rootElement = true;
|
88
|
|
89
|
private final ExtractedDocumentMetadata.Builder builder;
|
90
|
|
91
|
/**
 * Creates a handler writing all extracted metadata into the given builder.
 *
 * @param builder document metadata builder populated while the XML is parsed
 */
public PmcXmlHandler(ExtractedDocumentMetadata.Builder builder) {
    super();
    this.builder = builder;
}
|
99
|
|
100
|
@Override
|
101
|
public void startDocument() throws SAXException {
|
102
|
this.parents = new Stack<String>();
|
103
|
clearAllFields();
|
104
|
}
|
105
|
|
106
|
@Override
|
107
|
public void startElement(String uri, String localName, String qName,
|
108
|
Attributes attributes) throws SAXException {
|
109
|
if (rootElement) {
|
110
|
rootElement = false;
|
111
|
builder.setEntityType(attributes.getValue(ATTR_ARTICLE_TYPE));
|
112
|
} else if (isWithinElement(qName, ELEM_JOURNAL_TITLE, ELEM_JOURNAL_TITLE_GROUP)) {
|
113
|
this.currentValue = new StringBuilder();
|
114
|
} else if (isWithinElement(qName, ELEM_ARTICLE_ID, ELEM_ARTICLE_META)) {
|
115
|
this.currentArticleIdType = attributes.getValue(PUB_ID_TYPE);
|
116
|
this.currentValue = new StringBuilder();
|
117
|
} else if (isWithinElement(qName, ELEM_FPAGE, ELEM_ARTICLE_META) ||
|
118
|
isWithinElement(qName, ELEM_LPAGE, ELEM_ARTICLE_META)) {
|
119
|
this.currentValue = new StringBuilder();
|
120
|
} else if (hasAmongParents(qName, ELEM_AFFILIATION, this.parents, ELEM_ARTICLE_META)) {
|
121
|
this.currentValue = new StringBuilder();
|
122
|
} else if (hasAmongParents(qName, ELEM_ARTICLE_TITLE, this.parents, ELEM_REF, ELEM_REF_LIST) ||
|
123
|
hasAmongParents(qName, ELEM_SOURCE, this.parents, ELEM_REF, ELEM_REF_LIST) ||
|
124
|
hasAmongParents(qName, ELEM_YEAR, this.parents, ELEM_REF, ELEM_REF_LIST) ||
|
125
|
hasAmongParents(qName, ELEM_VOLUME, this.parents, ELEM_REF, ELEM_REF_LIST) ||
|
126
|
hasAmongParents(qName, ELEM_ISSUE, this.parents, ELEM_REF, ELEM_REF_LIST) ||
|
127
|
hasAmongParents(qName, ELEM_FPAGE, this.parents, ELEM_REF, ELEM_REF_LIST) ||
|
128
|
hasAmongParents(qName, ELEM_LPAGE, this.parents, ELEM_REF, ELEM_REF_LIST)) {
|
129
|
this.currentValue = new StringBuilder();
|
130
|
} else if (isWithinElement(qName, ELEM_SURNAME, ELEM_NAME) ||
|
131
|
isWithinElement(qName, ELEM_GIVEN_NAMES, ELEM_NAME)) {
|
132
|
this.currentValue = new StringBuilder();
|
133
|
} else if (isWithinElement(qName, ELEM_PUB_ID, ELEM_CITATION) ||
|
134
|
isWithinElement(qName, ELEM_PUB_ID, ELEM_ELEMENT_CITATION) ||
|
135
|
isWithinElement(qName, ELEM_PUB_ID, ELEM_MIXED_CITATION)) {
|
136
|
this.currentReferenceIdType = attributes.getValue(PUB_ID_TYPE);
|
137
|
this.currentValue = new StringBuilder();
|
138
|
} else if (isWithinElement(qName, ELEM_REF, ELEM_REF_LIST)) {
|
139
|
this.currentRefMetaBuilder = ReferenceMetadata.newBuilder();
|
140
|
this.currentRefAuthorList = new ArrayList<CharSequence>();
|
141
|
this.currentReferenceText = new StringBuffer();
|
142
|
ReferenceBasicMetadata.Builder basicMetaBuilder = ReferenceBasicMetadata.newBuilder();
|
143
|
basicMetaBuilder.setExternalIds(new HashMap<CharSequence, CharSequence>());
|
144
|
this.currentRefMetaBuilder.setBasicMetadata(basicMetaBuilder.build());
|
145
|
}
|
146
|
this.parents.push(qName);
|
147
|
}
|
148
|
|
149
|
@Override
|
150
|
public void endElement(String uri, String localName, String qName)
|
151
|
throws SAXException {
|
152
|
try {
|
153
|
this.parents.pop();
|
154
|
if (isWithinElement(qName, ELEM_JOURNAL_TITLE, ELEM_JOURNAL_TITLE_GROUP)) {
|
155
|
builder.setJournal(this.currentValue.toString().trim());
|
156
|
} else if (isWithinElement(qName, ELEM_ARTICLE_ID, ELEM_ARTICLE_META) &&
|
157
|
PUB_ID_TYPE_PMID.equals(this.currentArticleIdType)) {
|
158
|
builder.setPmid(this.currentValue.toString().trim());
|
159
|
} else if (isWithinElement(qName, ELEM_FPAGE, ELEM_ARTICLE_META)) {
|
160
|
if (builder.getPages()==null) {
|
161
|
builder.setPages(Range.newBuilder().build());
|
162
|
}
|
163
|
builder.getPages().setStart(this.currentValue.toString().trim());
|
164
|
} else if (isWithinElement(qName, ELEM_LPAGE, ELEM_ARTICLE_META)) {
|
165
|
if (builder.getPages()==null) {
|
166
|
builder.setPages(Range.newBuilder().build());
|
167
|
}
|
168
|
builder.getPages().setEnd(this.currentValue.toString().trim());
|
169
|
|
170
|
} else if (hasAmongParents(qName, ELEM_AFFILIATION, this.parents, ELEM_ARTICLE_META)) {
|
171
|
CRFAffiliationParser affiliationParser = new CRFAffiliationParser();
|
172
|
String affStr = this.currentValue.toString();
|
173
|
if (affStr.trim().length()>0) {
|
174
|
try {
|
175
|
Element parsedAffiliation = affiliationParser.parse(affStr);
|
176
|
if (parsedAffiliation!=null) {
|
177
|
if (builder.getAffiliations()==null) {
|
178
|
builder.setAffiliations(new ArrayList<Affiliation>());
|
179
|
}
|
180
|
Affiliation aff = AffiliationBuilder.build(parsedAffiliation);
|
181
|
if (aff.getRawText().length()>0) {
|
182
|
builder.getAffiliations().add(aff);
|
183
|
} else {
|
184
|
aff.setRawText(affStr);
|
185
|
}
|
186
|
}
|
187
|
} catch (IndexOutOfBoundsException e) {
|
188
|
// FIXME remove this catch block when upgrading cermine version
|
189
|
log.error("exception occurred when parsing affiliation: " + affStr, e);
|
190
|
}
|
191
|
}
|
192
|
} else if (hasAmongParents(qName, ELEM_ARTICLE_TITLE, this.parents, ELEM_REF, ELEM_REF_LIST)) {
|
193
|
currentRefMetaBuilder.getBasicMetadata().setTitle(this.currentValue.toString());
|
194
|
} else if (hasAmongParents(qName, ELEM_SOURCE, this.parents, ELEM_REF, ELEM_REF_LIST)) {
|
195
|
currentRefMetaBuilder.getBasicMetadata().setSource(this.currentValue.toString());
|
196
|
} else if (hasAmongParents(qName, ELEM_YEAR, this.parents, ELEM_REF, ELEM_REF_LIST)) {
|
197
|
currentRefMetaBuilder.getBasicMetadata().setYear(this.currentValue.toString());
|
198
|
} else if (hasAmongParents(qName, ELEM_VOLUME, this.parents, ELEM_REF, ELEM_REF_LIST)) {
|
199
|
currentRefMetaBuilder.getBasicMetadata().setVolume(this.currentValue.toString());
|
200
|
} else if (hasAmongParents(qName, ELEM_ISSUE, this.parents, ELEM_REF, ELEM_REF_LIST)) {
|
201
|
currentRefMetaBuilder.getBasicMetadata().setIssue(this.currentValue.toString());
|
202
|
} else if (hasAmongParents(qName, ELEM_FPAGE, this.parents, ELEM_REF, ELEM_REF_LIST)) {
|
203
|
if (currentRefMetaBuilder.getBasicMetadata().getPages()==null) {
|
204
|
currentRefMetaBuilder.getBasicMetadata().setPages(Range.newBuilder().build());
|
205
|
}
|
206
|
currentRefMetaBuilder.getBasicMetadata().getPages().setStart(this.currentValue.toString());
|
207
|
} else if (hasAmongParents(qName, ELEM_LPAGE, this.parents, ELEM_REF, ELEM_REF_LIST)) {
|
208
|
if (currentRefMetaBuilder.getBasicMetadata().getPages()==null) {
|
209
|
currentRefMetaBuilder.getBasicMetadata().setPages(Range.newBuilder().build());
|
210
|
}
|
211
|
currentRefMetaBuilder.getBasicMetadata().getPages().setEnd(this.currentValue.toString());
|
212
|
} else if (hasAmongParents(qName, ELEM_PUB_ID, this.parents, ELEM_REF, ELEM_REF_LIST)) {
|
213
|
if (this.currentReferenceIdType!=null) {
|
214
|
currentRefMetaBuilder.getBasicMetadata().getExternalIds().put(
|
215
|
this.currentReferenceIdType, this.currentValue.toString());
|
216
|
}
|
217
|
} else if (isWithinElement(qName, ELEM_SURNAME, ELEM_NAME)) {
|
218
|
this.currentSurname = this.currentValue.toString();
|
219
|
} else if (isWithinElement(qName, ELEM_GIVEN_NAMES, ELEM_NAME)) {
|
220
|
this.currentGivenNames = this.currentValue.toString();
|
221
|
} else if (hasAmongParents(qName, ELEM_NAME, this.parents, ELEM_REF)) {
|
222
|
// in element-citation names are nested in person-group
|
223
|
this.currentRefAuthorList.add(
|
224
|
this.currentSurname + ", " + this.currentGivenNames);
|
225
|
this.currentSurname = null;
|
226
|
this.currentGivenNames = null;
|
227
|
} else if (isWithinElement(qName, ELEM_CITATION, ELEM_REF) ||
|
228
|
isWithinElement(qName, ELEM_ELEMENT_CITATION, ELEM_REF) ||
|
229
|
isWithinElement(qName, ELEM_MIXED_CITATION, ELEM_REF)) {
|
230
|
if (!this.currentRefMetaBuilder.hasText() &&
|
231
|
this.currentReferenceTextExplicitlySet &&
|
232
|
this.currentReferenceText!=null && this.currentReferenceText.length()>0) {
|
233
|
String trimmedRefText = this.currentReferenceText.toString().trim().replaceAll(" +", " ");
|
234
|
if (!trimmedRefText.isEmpty()) {
|
235
|
this.currentRefMetaBuilder.setText(trimmedRefText);
|
236
|
}
|
237
|
}
|
238
|
} else if (isWithinElement(qName, ELEM_REF, ELEM_REF_LIST)) {
|
239
|
if (this.builder.getReferences()==null) {
|
240
|
this.builder.setReferences(new ArrayList<ReferenceMetadata>());
|
241
|
}
|
242
|
this.currentRefMetaBuilder.setPosition(this.builder.getReferences().size()+1);
|
243
|
|
244
|
if (this.currentRefAuthorList!=null && this.currentRefAuthorList.size()>0) {
|
245
|
this.currentRefMetaBuilder.getBasicMetadata().setAuthors(this.currentRefAuthorList);
|
246
|
}
|
247
|
|
248
|
if (!this.currentRefMetaBuilder.hasText()) {
|
249
|
this.currentRefMetaBuilder.setText(generateReferenceRawText(
|
250
|
this.currentRefMetaBuilder.getBasicMetadata()));
|
251
|
}
|
252
|
this.builder.getReferences().add(this.currentRefMetaBuilder.build());
|
253
|
// reference fields cleanup
|
254
|
this.currentRefMetaBuilder = null;
|
255
|
this.currentRefAuthorList = null;
|
256
|
this.currentReferenceText = null;
|
257
|
this.currentReferenceTextExplicitlySet = false;
|
258
|
this.currentReferenceIdType = null;
|
259
|
}
|
260
|
} catch (Exception e) {
|
261
|
// FIXME remote this catch
|
262
|
throw new RuntimeException("unexpected exception while processing doc: " +
|
263
|
builder.getId(), e);
|
264
|
}
|
265
|
}
|
266
|
|
267
|
@Override
|
268
|
public void endDocument() throws SAXException {
|
269
|
parents.clear();
|
270
|
parents = null;
|
271
|
}
|
272
|
|
273
|
@Override
|
274
|
public void characters(char[] ch, int start, int length)
|
275
|
throws SAXException {
|
276
|
String currentElement = this.parents.pop();
|
277
|
try {
|
278
|
// skipping affiliation position element
|
279
|
if (isWithinElement(currentElement, ELEM_LABEL, ELEM_AFFILIATION)) {
|
280
|
return;
|
281
|
}
|
282
|
|
283
|
this.currentValue.append(ch, start, length);
|
284
|
// handing reference text
|
285
|
if (hasAmongParents(this.parents, ELEM_REF)) {
|
286
|
if (isWithinElement(currentElement, ELEM_CITATION, ELEM_REF) ||
|
287
|
isWithinElement(currentElement, ELEM_ELEMENT_CITATION, ELEM_REF) ||
|
288
|
isWithinElement(currentElement, ELEM_MIXED_CITATION, ELEM_REF)) {
|
289
|
// citation element contents
|
290
|
char[] chunk = new char[length];
|
291
|
System.arraycopy(ch, start, chunk, 0, length);
|
292
|
if (containsNonWhiteCharacter(chunk)) {
|
293
|
this.currentReferenceTextExplicitlySet = true;
|
294
|
}
|
295
|
}
|
296
|
if (this.currentReferenceText.length()>0 &&
|
297
|
isAlphanumeric(ch[start]) &&
|
298
|
isAlphanumeric(this.currentReferenceText.charAt(
|
299
|
this.currentReferenceText.length()-1))) {
|
300
|
// adding missing space separator between two alphanumeric characters
|
301
|
this.currentReferenceText.append(' ');
|
302
|
}
|
303
|
this.currentReferenceText.append(ch, start, length);
|
304
|
}
|
305
|
} finally {
|
306
|
this.parents.push(currentElement);
|
307
|
}
|
308
|
}
|
309
|
|
310
|
private void clearAllFields() {
|
311
|
this.currentArticleIdType = null;
|
312
|
this.rootElement = true;
|
313
|
}
|
314
|
|
315
|
static boolean isAlphanumeric(char c) {
|
316
|
return !(c < 0x30 || (c >= 0x3a && c <= 0x40) || (c > 0x5a && c <= 0x60) || c > 0x7a);
|
317
|
}
|
318
|
|
319
|
boolean isWithinElement(String qName,
|
320
|
String expectedElement, String expectedParent) {
|
321
|
return qName.equals(expectedElement) &&
|
322
|
(expectedParent==null || !this.parents.isEmpty() && expectedParent.equals(this.parents.peek()));
|
323
|
}
|
324
|
|
325
|
public static boolean hasAmongParents(String qName,
|
326
|
String expectedElement, Stack<String> parentStack, String... expectedParents) {
|
327
|
if (qName.equals(expectedElement)) {
|
328
|
return hasAmongParents(parentStack, expectedParents);
|
329
|
} else {
|
330
|
return false;
|
331
|
}
|
332
|
}
|
333
|
|
334
|
public static boolean hasAmongParents(Stack<String> parentStack, String... expectedParents) {
|
335
|
if (expectedParents.length <= parentStack.size()) {
|
336
|
int startIterationIdx = 0;
|
337
|
for (String currentParent : expectedParents) {
|
338
|
boolean found = false;
|
339
|
for (int i=startIterationIdx; i<parentStack.size(); i++) {
|
340
|
// iteration starts from the bottom while we want to check from top
|
341
|
if (currentParent.equals(parentStack.get(parentStack.size()-(i+1)))) {
|
342
|
startIterationIdx = i+1;
|
343
|
found = true;
|
344
|
break;
|
345
|
}
|
346
|
}
|
347
|
if (!found) {
|
348
|
return false;
|
349
|
}
|
350
|
}
|
351
|
return true;
|
352
|
}
|
353
|
return false;
|
354
|
}
|
355
|
|
356
|
static boolean containsNonWhiteCharacter(char[] ch) {
|
357
|
if (ch!=null && ch.length>0) {
|
358
|
for (char currentCh : ch) {
|
359
|
if (!Character.isWhitespace(currentCh)) {
|
360
|
return true;
|
361
|
}
|
362
|
}
|
363
|
}
|
364
|
return false;
|
365
|
}
|
366
|
|
367
|
public static String generateReferenceRawText(ReferenceBasicMetadata refMeta) {
|
368
|
String authors = refMeta.getAuthors()!=null?
|
369
|
StringUtils.join(refMeta.getAuthors(), ", "):"";
|
370
|
String title = refMeta.getTitle()!=null?refMeta.getTitle().toString():null;
|
371
|
String source = refMeta.getSource()!=null?refMeta.getSource().toString():null;
|
372
|
String year = refMeta.getYear()!=null?refMeta.getYear().toString():null;
|
373
|
String volume = refMeta.getVolume()!=null?refMeta.getVolume().toString():null;
|
374
|
String issue = refMeta.getIssue()!=null?refMeta.getIssue().toString():null;
|
375
|
String fpage = refMeta.getPages()!=null && refMeta.getPages().getStart()!=null
|
376
|
?refMeta.getPages().getStart().toString():null;
|
377
|
String lpage = refMeta.getPages()!=null && refMeta.getPages().getEnd()!=null
|
378
|
?refMeta.getPages().getEnd().toString():null;
|
379
|
|
380
|
StringBuilder builder = new StringBuilder();
|
381
|
|
382
|
if (StringUtils.isNotBlank(authors)) {
|
383
|
builder.append(authors);
|
384
|
builder.append(". ");
|
385
|
}
|
386
|
if (StringUtils.isNotBlank(title)) {
|
387
|
builder.append(title);
|
388
|
builder.append(". ");
|
389
|
}
|
390
|
if (StringUtils.isNotBlank(source)) {
|
391
|
builder.append(source);
|
392
|
builder.append(". ");
|
393
|
}
|
394
|
if (StringUtils.isNotBlank(year)) {
|
395
|
builder.append(year);
|
396
|
}
|
397
|
if (StringUtils.isNotBlank(volume)) {
|
398
|
builder.append("; ");
|
399
|
builder.append(volume);
|
400
|
}
|
401
|
if (StringUtils.isNotBlank(issue)) {
|
402
|
builder.append(" (");
|
403
|
builder.append(issue);
|
404
|
builder.append(")");
|
405
|
}
|
406
|
if (StringUtils.isNotBlank(fpage)) {
|
407
|
builder.append(": ");
|
408
|
builder.append(fpage);
|
409
|
}
|
410
|
if (StringUtils.isNotBlank(lpage)) {
|
411
|
builder.append("-");
|
412
|
builder.append(lpage);
|
413
|
}
|
414
|
return builder.toString();
|
415
|
}
|
416
|
}
|