package eu.dnetlib.data.mdstore.modular;

import java.io.ByteArrayInputStream;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import java.util.Stack;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import static eu.dnetlib.data.mdstore.modular.MDStoreConstants.*;

/**
|
19
|
* This method outperforms SimpleRecordParser by a vast amount, especially since we are just getting stuff in the
|
20
|
* header.
|
21
|
*
|
22
|
* @author marko
|
23
|
*
|
24
|
*/
|
25
|
public class StreamingRecordParser implements RecordParser {
|
26
|
|
27
|
private static final Log log = LogFactory.getLog(StreamingRecordParser.class);
|
28
|
private long ts;
|
29
|
|
30
|
@Override
|
31
|
public Map<String, String> parseRecord(String record) {
|
32
|
|
33
|
try {
|
34
|
XMLInputFactory factory = XMLInputFactory.newInstance();
|
35
|
XMLStreamReader parser = factory.createXMLStreamReader(new ByteArrayInputStream(record.getBytes()));
|
36
|
|
37
|
HashMap<String, String> res = new HashMap<String, String>();
|
38
|
res.put(TIMESTAMP, String.valueOf(getTimestamp()));
|
39
|
|
40
|
Stack<String> elementStack = new Stack<String>();
|
41
|
elementStack.push("/");
|
42
|
|
43
|
while (parser.hasNext()) {
|
44
|
int event = parser.next();
|
45
|
|
46
|
if (event == XMLStreamConstants.END_ELEMENT) {
|
47
|
elementStack.pop();
|
48
|
} else if (event == XMLStreamConstants.START_ELEMENT) {
|
49
|
final String localName = parser.getLocalName();
|
50
|
elementStack.push(localName);
|
51
|
|
52
|
if (OBJIDENTIFIER.equals(localName)) {
|
53
|
parser.next();
|
54
|
|
55
|
res.put(ID, parser.getText().trim());
|
56
|
|
57
|
} else if ("identifier".equals(localName) && "efgEntity".equals(grandParent(elementStack))) {
|
58
|
if (!res.containsKey("originalId")) {
|
59
|
parser.next();
|
60
|
// log.info("ZZZZZZ OK: found identifier at right depth " + elementStack);
|
61
|
res.put("originalId", parser.getText().trim());
|
62
|
}
|
63
|
}
|
64
|
|
65
|
else if ("identifier".equals(localName)) {
|
66
|
|
67
|
// log.info("ZZZZZZ: found identifier not at right depth " + elementStack + " grand parent " + grandParent(elementStack));
|
68
|
}
|
69
|
|
70
|
if (res.containsKey(ID) && res.containsKey("originalId"))
|
71
|
return res;
|
72
|
}
|
73
|
}
|
74
|
return res;
|
75
|
} catch (XMLStreamException e) {
|
76
|
throw new IllegalStateException(e);
|
77
|
}
|
78
|
|
79
|
}
|
80
|
|
81
|
private String grandParent(Stack<String> elementStack) {
|
82
|
if (elementStack.size() <= 3)
|
83
|
return "";
|
84
|
return elementStack.get(elementStack.size() - 3);
|
85
|
}
|
86
|
|
87
|
@Override
|
88
|
public void setTimestamp(final long ts) {
|
89
|
this.ts = ts;
|
90
|
log.debug("RecordParser date set to "+ts);
|
91
|
}
|
92
|
|
93
|
@Override
|
94
|
public long getTimestamp() {
|
95
|
return ts;
|
96
|
}
|
97
|
|
98
|
}
|