1
|
package eu.dnetlib.data.oai.store.parser;
|
2
|
|
3
|
import java.io.StringReader;
|
4
|
import java.util.List;
|
5
|
import java.util.Map.Entry;
|
6
|
|
7
|
import com.google.common.base.Function;
|
8
|
import com.google.common.collect.ArrayListMultimap;
|
9
|
import com.google.common.collect.Iterables;
|
10
|
import com.google.common.collect.Multimap;
|
11
|
import eu.dnetlib.data.information.oai.publisher.PublisherField;
|
12
|
import eu.dnetlib.data.information.oai.publisher.conf.OAIConfigurationExistReader;
|
13
|
import org.apache.commons.lang3.StringUtils;
|
14
|
import org.apache.commons.logging.Log;
|
15
|
import org.apache.commons.logging.LogFactory;
|
16
|
import org.dom4j.Document;
|
17
|
import org.dom4j.DocumentException;
|
18
|
import org.dom4j.Node;
|
19
|
import org.dom4j.io.SAXReader;
|
20
|
|
21
|
/**
|
22
|
* An instance of this class can parse an XML record and extract the information needed to store the record in a publisher store.
|
23
|
*
|
24
|
*
|
25
|
* @author alessia
|
26
|
*
|
27
|
*/
|
28
|
public class PublisherRecordParser {
|
29
|
|
30
|
private static final Log log = LogFactory.getLog(PublisherRecordParser.class); // NOPMD by marko on 11/24/08 5:02 PM
|
31
|
|
32
|
/**
|
33
|
* List of the indices of the target store.
|
34
|
*/
|
35
|
private List<PublisherField> storeIndices;
|
36
|
|
37
|
private final SAXReader saxReader = new SAXReader();
|
38
|
|
39
|
/**
|
40
|
* Parses the record and returns a map where a key is the name of an index, the value is the value in the record at the xpath specificed
|
41
|
* in this.storeIndices.
|
42
|
*
|
43
|
* @param record
|
44
|
* the XML string to parse.
|
45
|
* @return a Multimap describing the values to be indexed for this record.
|
46
|
*/
|
47
|
@SuppressWarnings({ "unchecked", "rawtypes" })
|
48
|
public Multimap<String, String> parseRecord(final String record, final String source) {
|
49
|
Multimap<String, String> recordProps = ArrayListMultimap.create();
|
50
|
try {
|
51
|
Document doc = this.saxReader.read(new StringReader(record));
|
52
|
if(StringUtils.isNotBlank(source)) recordProps.put(OAIConfigurationExistReader.SET_FIELD, source);
|
53
|
for (PublisherField field : this.storeIndices) {
|
54
|
for (Entry<String, String> indexEntry : field.getSources().entries()) {
|
55
|
// each xpath can return a list of nodes or strings, depending on the xpath
|
56
|
List xPathResult = doc.selectNodes(indexEntry.getValue());
|
57
|
if ((xPathResult != null) && !xPathResult.isEmpty()) {
|
58
|
if (containsStrings(xPathResult)) {
|
59
|
recordProps.putAll(field.getFieldName(), xPathResult);
|
60
|
} else {
|
61
|
if (containsNodes(xPathResult)) {
|
62
|
recordProps.putAll(field.getFieldName(), Iterables.transform(xPathResult, new Function<Object, String>() {
|
63
|
|
64
|
@Override
|
65
|
public String apply(final Object obj) {
|
66
|
if (obj == null) return "";
|
67
|
Node node = (Node) obj;
|
68
|
return node.getText();
|
69
|
}
|
70
|
}));
|
71
|
}
|
72
|
}
|
73
|
}
|
74
|
}
|
75
|
}
|
76
|
|
77
|
} catch (DocumentException e) {
|
78
|
log.fatal("Can't parse record");
|
79
|
recordProps = null;
|
80
|
}
|
81
|
return recordProps;
|
82
|
|
83
|
}
|
84
|
|
85
|
@SuppressWarnings("rawtypes")
|
86
|
private boolean containsStrings(final List objects) {
|
87
|
Object first = objects.get(0);
|
88
|
return first instanceof String;
|
89
|
}
|
90
|
|
91
|
@SuppressWarnings("rawtypes")
|
92
|
private boolean containsNodes(final List objects) {
|
93
|
Object first = objects.get(0);
|
94
|
return first instanceof Node;
|
95
|
}
|
96
|
|
97
|
public List<PublisherField> getStoreIndices() {
|
98
|
return storeIndices;
|
99
|
}
|
100
|
|
101
|
public void setStoreIndices(final List<PublisherField> storeIndices) {
|
102
|
this.storeIndices = storeIndices;
|
103
|
}
|
104
|
|
105
|
public SAXReader getSaxReader() {
|
106
|
return saxReader;
|
107
|
}
|
108
|
|
109
|
public PublisherRecordParser(final List<PublisherField> storeIndices) {
|
110
|
super();
|
111
|
this.storeIndices = storeIndices;
|
112
|
}
|
113
|
|
114
|
public PublisherRecordParser() {
|
115
|
super();
|
116
|
// TODO Auto-generated constructor stub
|
117
|
}
|
118
|
|
119
|
}
|