Project

General

Profile

1
package eu.dnetlib.dli.parser;
2

    
3
import java.io.ByteArrayInputStream;
4
import java.util.List;
5
import java.util.Stack;
6
import javax.xml.stream.XMLInputFactory;
7
import javax.xml.stream.XMLStreamConstants;
8
import javax.xml.stream.XMLStreamReader;
9

    
10
import com.google.common.collect.Lists;
11
import eu.dnetlib.dli.resolver.model.*;
12
import eu.dnetlib.pid.resolver.model.ObjectProvenance;
13
import eu.dnetlib.pid.resolver.model.ObjectRelation;
14
import eu.dnetlib.pid.resolver.model.ObjectType;
15
import eu.dnetlib.pid.resolver.model.PID;
16
import org.apache.commons.lang3.StringUtils;
17
import org.apache.commons.logging.Log;
18
import org.apache.commons.logging.LogFactory;
19

    
20
/**
21
 * This method outperforms SimpleRecordParser by a vast amount, especially since we are just getting stuff in the header.
22
 *
23
 * @author sandro
24
 */
25
public class DLIRecordParser {
26

    
27
	private static final Log log = LogFactory.getLog(DLIRecordParser.class);
28

    
29
    public DLIResolvedObject parseRecord(final String record) {
30
        try {
31
			XMLInputFactory factory = XMLInputFactory.newInstance();
32
			XMLStreamReader parser = factory.createXMLStreamReader(new ByteArrayInputStream(record.getBytes()));
33

    
34
            DLIResolvedObject object = new DLIResolvedObject();
35
            Stack<String> elementStack = new Stack<>();
36
			elementStack.push("/");
37
			List<String> titles = Lists.newArrayList();
38
			List<String> authors = Lists.newArrayList();
39
			List<String> relatedAuthors = null;
40
			boolean insideRelation = false;
41
            DLIObjectRelation currentRelation = null;
42
            DLIResolvedObject currentExtraInfo = null;
43
            List<ObjectRelation> relations = Lists.newArrayList();
44
            List<ObjectProvenance> datasources = Lists.newArrayList();
45

    
46
			while (parser.hasNext()) {
47
				int event = parser.next();
48
				if (event == XMLStreamConstants.END_ELEMENT) {
49
					final String localName = parser.getLocalName();
50

    
51
					// CLOSE TAG relation
52

    
53
					if (localName.equals("relatedauthors")) {
54
						if (currentExtraInfo != null && relatedAuthors != null) {
55
							currentExtraInfo.setAuthors(relatedAuthors);
56
							relatedAuthors.clear();
57
							relatedAuthors = null;
58
						}
59
					}
60
					if (localName.equals("relation")) {
61
						log.debug("found closed tag relation");
62
						insideRelation = false;
63
						if (currentRelation != null) {
64
							if (currentExtraInfo != null) {
65
								currentExtraInfo.setPid(currentRelation.getTargetPID().getId());
66
								currentExtraInfo.setPidType(currentRelation.getTargetPID().getType());
67
								currentExtraInfo.setDatasourceProvenance(object.getDatasourceProvenance());
68
								currentRelation.setExtraInfo(currentExtraInfo);
69
							}
70
							if (object.getPid() == null || object.getPid().isEmpty()) {
71
								log.error("ERROR the DOI is empty");
72
							}
73
							currentRelation.setSourcePid(object.getPid());
74
							if (!StringUtils.isBlank(object.getPid()) && !StringUtils.isBlank(object.getPidType())) {
75
								currentRelation.setSourceRecordId(object.getIdentifier());
76
							}
77

    
78
							log.debug("Adding new relation to the object");
79
							relations.add(currentRelation);
80
							currentRelation = null;
81
						}
82
					}
83
					elementStack.pop();
84
				} else if (event == XMLStreamConstants.START_ELEMENT) {
85
					final String localName = parser.getLocalName();
86
					elementStack.push(localName);
87

    
88
					// LOCAL IDENTIFIER TAG
89
					if ("localIdentifier".equals(localName)) {
90
						log.debug("found open tag localIdentifier");
91
						String type = "";
92
						for (int i = 0; i < parser.getAttributeCount(); i++) {
93
							if (parser.getAttributeLocalName(i).equals("type")) {
94
								type = parser.getAttributeValue(i);
95
								break;
96
							}
97
						}
98
						parser.next();
99
						if (parser.hasText()) {
100
							object.setPid(parser.getText().trim());
101
							object.setPidType(type);
102
						}
103
						// TITLE TAG
104
					} else if ("title".equals(localName)) {
105
						log.debug("found open tag title");
106
						parser.next();
107
						if (parser.hasText()) {
108
							titles.add(parser.getText().trim());
109
						}
110
						// DATASOURCE PROVENANCE TAG
111
					} else if ("datasource".equals(localName)) {
112

    
113
						String completionStatus = "";
114
						String provisionMode = "";
115
						log.debug("found open tag datasources");
116
						for (int i = 0; i < parser.getAttributeCount(); i++) {
117
							if (parser.getAttributeLocalName(i).equals("completionStatus")) {
118
								completionStatus = parser.getAttributeValue(i);
119
							} else if (parser.getAttributeLocalName(i).equals("provisionMode")) {
120
								provisionMode = parser.getAttributeValue(i);
121
							}
122
						}
123
						parser.next();
124
						if (parser.hasText()) {
125
							String dt = parser.getText().trim();
126

    
127
                            DLIObjectProvenance pr = new DLIObjectProvenance(dt, ObjectProvisionMode.fromString(provisionMode).toString(),
128
                                    CompletionStatus.fromString(completionStatus).toString(), null, null, true);
129
							datasources.add(pr);
130
						}
131
						// INSIDE the tag Person
132
					} else if ("completionStatus".equals(localName)) {
133
						log.debug("found open tag completionStatus");
134
						parser.next();
135
						if (parser.hasText()) {
136
							object.setCompletionStatus(CompletionStatus.fromString(parser.getText().trim()).toString());
137
						}
138
					} else if ("fullname".equals(localName)) {
139
						log.debug("found open tag fullname");
140
						parser.next();
141
						if (parser.hasText()) {
142
							authors.add(parser.getText().trim());
143
						}
144

    
145
						// INSIDE THE TAG RELATION
146
					} else if ("relation".equals(localName)) {
147
						log.debug("found open tag relation");
148
						insideRelation = true;
149
                        currentRelation = new DLIObjectRelation();
150
                        currentExtraInfo = new DLIResolvedObject();
151
                    } else if ("pid".equals(localName) && insideRelation) {
152
						PID currentPid = new PID();
153
						for (int i = 0; i < parser.getAttributeCount(); i++) {
154
							if (parser.getAttributeLocalName(i).equals("type")) {
155
								currentPid.setType(parser.getAttributeValue(i));
156
								break;
157
							}
158
						}
159

    
160
						try {
161
							parser.next();
162
							if (parser.hasText()) {
163
								currentPid.setId(parser.getText().trim());
164
								currentRelation.setTargetPID(currentPid);
165
							}
166

    
167
						} catch (Exception e) {
168
							log.error("skipped doi");
169
						}
170
						// INSIDE THE TAG TYPEOFRELATION
171
					} else if ("typeOfRelation".equals(localName) && insideRelation) {
172
						log.debug("found open tag typeOfRelation");
173
						parser.next();
174
						if (parser.hasText()) {
175
							currentRelation.setRelationSemantics(parser.getText().trim());
176
						}
177
						// INSIDE THE TAG OBJECT TYPE
178
					} else if ("relatedtype".equals(localName) && insideRelation) {
179
						log.debug("found open tag relatedtype");
180
						parser.next();
181
						if (parser.hasText()) {
182
							currentExtraInfo.setType(ObjectType.valueOf(parser.getText().trim()));
183
						}
184
					} else if ("relatedtitle".equals(localName) && insideRelation) {
185
						log.debug("found open tag relatedtype");
186
						parser.next();
187
						if (parser.hasText()) {
188
							currentExtraInfo.setTitles(Lists.newArrayList(parser.getText().trim()));
189
						}
190
					} else if ("relatedauthor".equals(localName) && insideRelation) {
191
						log.debug("found open tag relatedtype");
192
						parser.next();
193
						if (parser.hasText()) {
194
							if (relatedAuthors == null)
195
								relatedAuthors = Lists.newArrayList();
196
							relatedAuthors.add(parser.getText().trim());
197
						}
198
					} else if ("objectType".equals(localName)) {
199
						log.debug("found open tag objectType");
200
						parser.next();
201
						if (parser.hasText()) {
202
							object.setType(ObjectType.valueOf(parser.getText().trim()));
203
						}
204

    
205
						// INSIDE THE TAG DATE
206
					} else if ("date".equals(localName)) {
207
						log.debug("found open tag date");
208
						parser.next();
209
						if (parser.hasText()) {
210
							object.setDate(parser.getText().trim());
211
						}
212
					}
213
				}
214
			}
215
			object.setAuthors(authors);
216
			object.setTitles(titles);
217
			object.setDatasourceProvenance(datasources);
218
			object.setRelations(relations);
219
			return object;
220
		} catch (Exception e) {
221
			log.error("Error on parsing record" + record, e);
222
			return null;
223
		}
224

    
225
	}
226

    
227
}
    (1-1/1)