Project

General

Profile

1
package eu.dnetlib.dli.parser;
2

    
3
import java.io.ByteArrayInputStream;
4
import java.util.List;
5
import java.util.Stack;
6
import javax.xml.stream.XMLInputFactory;
7
import javax.xml.stream.XMLStreamConstants;
8
import javax.xml.stream.XMLStreamReader;
9

    
10
import com.google.common.collect.Lists;
11
import eu.dnetlib.resolver.model.*;
12
import org.apache.commons.lang3.StringUtils;
13
import org.apache.commons.logging.Log;
14
import org.apache.commons.logging.LogFactory;
15

    
16
/**
17
 * This method outperforms SimpleRecordParser by a vast amount, especially since we are just getting stuff in the header.
18
 *
19
 * @author sandro
20
 */
21
public class DLIRecordParser {
22

    
23
	private static final Log log = LogFactory.getLog(DLIRecordParser.class);
24

    
25
	public ResolvedObject parseRecord(final String record) {
26
		try {
27
			XMLInputFactory factory = XMLInputFactory.newInstance();
28
			XMLStreamReader parser = factory.createXMLStreamReader(new ByteArrayInputStream(record.getBytes()));
29

    
30
			ResolvedObject object = new ResolvedObject();
31
			Stack<String> elementStack = new Stack<>();
32
			elementStack.push("/");
33
			List<String> titles = Lists.newArrayList();
34
			List<String> authors = Lists.newArrayList();
35
			List<String> relatedAuthors = null;
36
			boolean insideRelation = false;
37
			ObjectRelation currentRelation = null;
38
			ResolvedObject currentExtraInfo = null;
39
			List<ObjectRelation> relations = Lists.newArrayList();
40
			List<ObjectProvenance> datasources = Lists.newArrayList();
41

    
42
			while (parser.hasNext()) {
43
				int event = parser.next();
44
				if (event == XMLStreamConstants.END_ELEMENT) {
45
					final String localName = parser.getLocalName();
46

    
47
					// CLOSE TAG relation
48

    
49
					if (localName.equals("relatedauthors")) {
50
						if (currentExtraInfo != null && relatedAuthors != null) {
51
							currentExtraInfo.setAuthors(relatedAuthors);
52
							relatedAuthors.clear();
53
							relatedAuthors = null;
54
						}
55
					}
56
					if (localName.equals("relation")) {
57
						log.debug("found closed tag relation");
58
						insideRelation = false;
59
						if (currentRelation != null) {
60
							if (currentExtraInfo != null) {
61
								currentExtraInfo.setPid(currentRelation.getTargetPID().getId());
62
								currentExtraInfo.setPidType(currentRelation.getTargetPID().getType());
63
								currentExtraInfo.setDatasourceProvenance(object.getDatasourceProvenance());
64
								currentRelation.setExtraInfo(currentExtraInfo);
65
							}
66
							if (object.getPid() == null || object.getPid().isEmpty()) {
67
								log.error("ERROR the DOI is empty");
68
							}
69
							currentRelation.setSourcePid(object.getPid());
70
							if (!StringUtils.isBlank(object.getPid()) && !StringUtils.isBlank(object.getPidType())) {
71
								currentRelation.setSourceRecordId(object.getIdentifier());
72
							}
73

    
74
							log.debug("Adding new relation to the object");
75
							relations.add(currentRelation);
76
							currentRelation = null;
77
						}
78
					}
79
					elementStack.pop();
80
				} else if (event == XMLStreamConstants.START_ELEMENT) {
81
					final String localName = parser.getLocalName();
82
					elementStack.push(localName);
83

    
84
					// LOCAL IDENTIFIER TAG
85
					if ("localIdentifier".equals(localName)) {
86
						log.debug("found open tag localIdentifier");
87
						String type = "";
88
						for (int i = 0; i < parser.getAttributeCount(); i++) {
89
							if (parser.getAttributeLocalName(i).equals("type")) {
90
								type = parser.getAttributeValue(i);
91
								break;
92
							}
93
						}
94
						parser.next();
95
						if (parser.hasText()) {
96
							object.setPid(parser.getText().trim());
97
							object.setPidType(type);
98
						}
99
						// TITLE TAG
100
					} else if ("title".equals(localName)) {
101
						log.debug("found open tag title");
102
						parser.next();
103
						if (parser.hasText()) {
104
							titles.add(parser.getText().trim());
105
						}
106
						// DATASOURCE PROVENANCE TAG
107
					} else if ("datasource".equals(localName)) {
108

    
109
						String completionStatus = "";
110
						String provisionMode = "";
111
						log.debug("found open tag datasources");
112
						for (int i = 0; i < parser.getAttributeCount(); i++) {
113
							if (parser.getAttributeLocalName(i).equals("completionStatus")) {
114
								completionStatus = parser.getAttributeValue(i);
115
							} else if (parser.getAttributeLocalName(i).equals("provisionMode")) {
116
								provisionMode = parser.getAttributeValue(i);
117
							}
118
						}
119
						parser.next();
120
						if (parser.hasText()) {
121
							String dt = parser.getText().trim();
122

    
123
							ObjectProvenance pr = new ObjectProvenance(dt, ObjectProvisionMode.fromString(provisionMode).toString(),
124
									CompletionStatus.fromString(completionStatus).toString(), null, null, true);
125
							datasources.add(pr);
126
						}
127
						// INSIDE the tag Person
128
					} else if ("completionStatus".equals(localName)) {
129
						log.debug("found open tag completionStatus");
130
						parser.next();
131
						if (parser.hasText()) {
132
							object.setCompletionStatus(CompletionStatus.fromString(parser.getText().trim()).toString());
133
						}
134
					} else if ("fullname".equals(localName)) {
135
						log.debug("found open tag fullname");
136
						parser.next();
137
						if (parser.hasText()) {
138
							authors.add(parser.getText().trim());
139
						}
140

    
141
						// INSIDE THE TAG RELATION
142
					} else if ("relation".equals(localName)) {
143
						log.debug("found open tag relation");
144
						insideRelation = true;
145
						currentRelation = new ObjectRelation();
146
						currentExtraInfo = new ResolvedObject();
147
					} else if ("pid".equals(localName) && insideRelation) {
148
						PID currentPid = new PID();
149
						for (int i = 0; i < parser.getAttributeCount(); i++) {
150
							if (parser.getAttributeLocalName(i).equals("type")) {
151
								currentPid.setType(parser.getAttributeValue(i));
152
								break;
153
							}
154
						}
155

    
156
						try {
157
							parser.next();
158
							if (parser.hasText()) {
159
								currentPid.setId(parser.getText().trim());
160
								currentRelation.setTargetPID(currentPid);
161
							}
162

    
163
						} catch (Exception e) {
164
							log.error("skipped doi");
165
						}
166
						// INSIDE THE TAG TYPEOFRELATION
167
					} else if ("typeOfRelation".equals(localName) && insideRelation) {
168
						log.debug("found open tag typeOfRelation");
169
						parser.next();
170
						if (parser.hasText()) {
171
							currentRelation.setRelationSemantics(parser.getText().trim());
172
						}
173
						// INSIDE THE TAG OBJECT TYPE
174
					} else if ("relatedtype".equals(localName) && insideRelation) {
175
						log.debug("found open tag relatedtype");
176
						parser.next();
177
						if (parser.hasText()) {
178
							currentExtraInfo.setType(ObjectType.valueOf(parser.getText().trim()));
179
						}
180
					} else if ("relatedtitle".equals(localName) && insideRelation) {
181
						log.debug("found open tag relatedtype");
182
						parser.next();
183
						if (parser.hasText()) {
184
							currentExtraInfo.setTitles(Lists.newArrayList(parser.getText().trim()));
185
						}
186
					} else if ("relatedauthor".equals(localName) && insideRelation) {
187
						log.debug("found open tag relatedtype");
188
						parser.next();
189
						if (parser.hasText()) {
190
							if (relatedAuthors == null)
191
								relatedAuthors = Lists.newArrayList();
192
							relatedAuthors.add(parser.getText().trim());
193
						}
194
					} else if ("objectType".equals(localName)) {
195
						log.debug("found open tag objectType");
196
						parser.next();
197
						if (parser.hasText()) {
198
							object.setType(ObjectType.valueOf(parser.getText().trim()));
199
						}
200

    
201
						// INSIDE THE TAG DATE
202
					} else if ("date".equals(localName)) {
203
						log.debug("found open tag date");
204
						parser.next();
205
						if (parser.hasText()) {
206
							object.setDate(parser.getText().trim());
207
						}
208
					}
209
				}
210
			}
211
			object.setAuthors(authors);
212
			object.setTitles(titles);
213
			object.setDatasourceProvenance(datasources);
214
			object.setRelations(relations);
215
			return object;
216
		} catch (Exception e) {
217
			log.error("Error on parsing record" + record, e);
218
			return null;
219
		}
220

    
221
	}
222

    
223
}
    (1-1/1)