Project

General

Profile

1
package eu.dnetlib.dli.parser;
2

    
3
import java.io.ByteArrayInputStream;
4
import java.util.List;
5
import java.util.Stack;
6
import javax.xml.stream.XMLInputFactory;
7
import javax.xml.stream.XMLStreamConstants;
8
import javax.xml.stream.XMLStreamReader;
9

    
10
import com.google.common.collect.Lists;
11
import eu.dnetlib.dli.resolver.model.*;
12
import eu.dnetlib.pid.resolver.model.ObjectType;
13
import org.apache.commons.lang3.StringUtils;
14
import org.apache.commons.logging.Log;
15
import org.apache.commons.logging.LogFactory;
16

    
17
/**
18
 * This method outperforms SimpleRecordParser by a vast amount, especially since we are just getting stuff in the header.
19
 *
20
 * @author sandro
21
 */
22
public class DLIRecordParser {
23

    
24
	private static final Log log = LogFactory.getLog(DLIRecordParser.class);
25

    
26
    public DLIResolvedObject parseRecord(final String record) {
27
        try {
28
			XMLInputFactory factory = XMLInputFactory.newInstance();
29
			XMLStreamReader parser = factory.createXMLStreamReader(new ByteArrayInputStream(record.getBytes()));
30

    
31
            DLIResolvedObject object = new DLIResolvedObject();
32
            Stack<String> elementStack = new Stack<>();
33
			elementStack.push("/");
34
			List<String> titles = Lists.newArrayList();
35
			List<String> authors = Lists.newArrayList();
36
			List<String> relatedAuthors = null;
37
			boolean insideRelation = false;
38
            DLIObjectRelation currentRelation = null;
39
            DLIResolvedObject currentExtraInfo = null;
40
            List<DLIObjectRelation> relations = Lists.newArrayList();
41
            List<DLIObjectProvenance> datasources = Lists.newArrayList();
42

    
43
			while (parser.hasNext()) {
44
				int event = parser.next();
45
				if (event == XMLStreamConstants.END_ELEMENT) {
46
					final String localName = parser.getLocalName();
47

    
48
					// CLOSE TAG relation
49

    
50
					if (localName.equals("relatedauthors")) {
51
						if (currentExtraInfo != null && relatedAuthors != null) {
52
							currentExtraInfo.setAuthors(relatedAuthors);
53
							relatedAuthors.clear();
54
							relatedAuthors = null;
55
						}
56
					}
57
					if (localName.equals("relation")) {
58
						log.debug("found closed tag relation");
59
						insideRelation = false;
60
						if (currentRelation != null) {
61
							if (currentExtraInfo != null) {
62
								currentExtraInfo.setPid(currentRelation.getTargetPID().getId());
63
								currentExtraInfo.setPidType(currentRelation.getTargetPID().getType());
64
								currentExtraInfo.setDatasourceProvenance(object.getDatasourceProvenance());
65
								currentRelation.setExtraInfo(currentExtraInfo);
66
							}
67
							if (object.getPid() == null || object.getPid().isEmpty()) {
68
								log.error("ERROR the DOI is empty");
69
							}
70
							currentRelation.setSourcePid(object.getPid());
71
							if (!StringUtils.isBlank(object.getPid()) && !StringUtils.isBlank(object.getPidType())) {
72
								currentRelation.setSourceRecordId(object.getIdentifier());
73
							}
74

    
75
							log.debug("Adding new relation to the object");
76
							relations.add(currentRelation);
77
							currentRelation = null;
78
						}
79
					}
80
					elementStack.pop();
81
				} else if (event == XMLStreamConstants.START_ELEMENT) {
82
					final String localName = parser.getLocalName();
83
					elementStack.push(localName);
84

    
85
					// LOCAL IDENTIFIER TAG
86
					if ("localIdentifier".equals(localName)) {
87
						log.debug("found open tag localIdentifier");
88
						String type = "";
89
						for (int i = 0; i < parser.getAttributeCount(); i++) {
90
							if (parser.getAttributeLocalName(i).equals("type")) {
91
								type = parser.getAttributeValue(i);
92
								break;
93
							}
94
						}
95
						parser.next();
96
						if (parser.hasText()) {
97
							object.setPid(parser.getText().trim());
98
							object.setPidType(type);
99
						}
100
						// TITLE TAG
101
					} else if ("title".equals(localName)) {
102
						log.debug("found open tag title");
103
						parser.next();
104
						if (parser.hasText()) {
105
							titles.add(parser.getText().trim());
106
						}
107
						// DATASOURCE PROVENANCE TAG
108
					} else if ("datasource".equals(localName)) {
109

    
110
						String completionStatus = "";
111
						String provisionMode = "";
112
						log.debug("found open tag datasources");
113
						for (int i = 0; i < parser.getAttributeCount(); i++) {
114
							if (parser.getAttributeLocalName(i).equals("completionStatus")) {
115
								completionStatus = parser.getAttributeValue(i);
116
							} else if (parser.getAttributeLocalName(i).equals("provisionMode")) {
117
								provisionMode = parser.getAttributeValue(i);
118
							}
119
						}
120
						parser.next();
121
						if (parser.hasText()) {
122
							String dt = parser.getText().trim();
123

    
124
                            DLIObjectProvenance pr = new DLIObjectProvenance(dt, ObjectProvisionMode.fromString(provisionMode).toString(),
125
                                    CompletionStatus.fromString(completionStatus).toString(), null, null, true);
126
							datasources.add(pr);
127
						}
128
						// INSIDE the tag Person
129
					} else if ("completionStatus".equals(localName)) {
130
						log.debug("found open tag completionStatus");
131
						parser.next();
132
						if (parser.hasText()) {
133
							object.setCompletionStatus(CompletionStatus.fromString(parser.getText().trim()).toString());
134
						}
135
					} else if ("fullname".equals(localName)) {
136
						log.debug("found open tag fullname");
137
						parser.next();
138
						if (parser.hasText()) {
139
							authors.add(parser.getText().trim());
140
						}
141

    
142
						// INSIDE THE TAG RELATION
143
					} else if ("relation".equals(localName)) {
144
						log.debug("found open tag relation");
145
						insideRelation = true;
146
                        currentRelation = new DLIObjectRelation();
147
                        currentExtraInfo = new DLIResolvedObject();
148
                    } else if ("pid".equals(localName) && insideRelation) {
149
						PID currentPid = new PID();
150
						for (int i = 0; i < parser.getAttributeCount(); i++) {
151
							if (parser.getAttributeLocalName(i).equals("type")) {
152
								currentPid.setType(parser.getAttributeValue(i));
153
								break;
154
							}
155
						}
156

    
157
						try {
158
							parser.next();
159
							if (parser.hasText()) {
160
								currentPid.setId(parser.getText().trim());
161
								currentRelation.setTargetPID(currentPid);
162
							}
163

    
164
						} catch (Exception e) {
165
							log.error("skipped doi");
166
						}
167
						// INSIDE THE TAG TYPEOFRELATION
168
					} else if ("typeOfRelation".equals(localName) && insideRelation) {
169
						log.debug("found open tag typeOfRelation");
170
						parser.next();
171
						if (parser.hasText()) {
172
							currentRelation.setRelationSemantics(parser.getText().trim());
173
						}
174
						// INSIDE THE TAG OBJECT TYPE
175
					} else if ("relatedtype".equals(localName) && insideRelation) {
176
						log.debug("found open tag relatedtype");
177
						parser.next();
178
						if (parser.hasText()) {
179
							currentExtraInfo.setType(ObjectType.valueOf(parser.getText().trim()));
180
						}
181
					} else if ("relatedtitle".equals(localName) && insideRelation) {
182
						log.debug("found open tag relatedtype");
183
						parser.next();
184
						if (parser.hasText()) {
185
							currentExtraInfo.setTitles(Lists.newArrayList(parser.getText().trim()));
186
						}
187
					} else if ("relatedauthor".equals(localName) && insideRelation) {
188
						log.debug("found open tag relatedtype");
189
						parser.next();
190
						if (parser.hasText()) {
191
							if (relatedAuthors == null)
192
								relatedAuthors = Lists.newArrayList();
193
							relatedAuthors.add(parser.getText().trim());
194
						}
195
					} else if ("objectType".equals(localName)) {
196
						log.debug("found open tag objectType");
197
						parser.next();
198
						if (parser.hasText()) {
199
							object.setType(ObjectType.valueOf(parser.getText().trim()));
200
						}
201

    
202
						// INSIDE THE TAG DATE
203
					} else if ("date".equals(localName)) {
204
						log.debug("found open tag date");
205
						parser.next();
206
						if (parser.hasText()) {
207
							object.setDate(parser.getText().trim());
208
						}
209
					}
210
				}
211
			}
212
			object.setAuthors(authors);
213
			object.setTitles(titles);
214
			object.setDatasourceProvenance(datasources);
215
			object.setRelations(relations);
216
			return object;
217
		} catch (Exception e) {
218
			log.error("Error on parsing record" + record, e);
219
			return null;
220
		}
221

    
222
	}
223

    
224
}
    (1-1/1)