1 |
26600
|
sandro.lab
|
package eu.dnetlib.data.oai.store.parser;
|
2 |
|
|
|
3 |
|
|
import java.io.StringReader;
|
4 |
|
|
import java.util.List;
|
5 |
|
|
import java.util.Map.Entry;
|
6 |
|
|
|
7 |
44514
|
alessia.ba
|
import com.google.common.base.Function;
|
8 |
|
|
import com.google.common.collect.ArrayListMultimap;
|
9 |
|
|
import com.google.common.collect.Iterables;
|
10 |
|
|
import com.google.common.collect.Multimap;
|
11 |
|
|
import eu.dnetlib.data.information.oai.publisher.PublisherField;
|
12 |
|
|
import eu.dnetlib.data.information.oai.publisher.conf.OAIConfigurationExistReader;
|
13 |
50872
|
alessia.ba
|
import org.apache.commons.lang3.StringUtils;
|
14 |
26600
|
sandro.lab
|
import org.apache.commons.logging.Log;
|
15 |
|
|
import org.apache.commons.logging.LogFactory;
|
16 |
|
|
import org.dom4j.Document;
|
17 |
|
|
import org.dom4j.DocumentException;
|
18 |
|
|
import org.dom4j.Node;
|
19 |
|
|
import org.dom4j.io.SAXReader;
|
20 |
|
|
|
21 |
|
|
/**
|
22 |
|
|
* An instance of this class can parse an XML record and extract the information needed to store the record in a publisher store.
|
23 |
|
|
*
|
24 |
|
|
*
|
25 |
|
|
* @author alessia
|
26 |
|
|
*
|
27 |
|
|
*/
|
28 |
|
|
public class PublisherRecordParser {
|
29 |
|
|
|
30 |
|
|
private static final Log log = LogFactory.getLog(PublisherRecordParser.class); // NOPMD by marko on 11/24/08 5:02 PM
|
31 |
|
|
|
32 |
|
|
/**
|
33 |
|
|
* List of the indices of the target store.
|
34 |
|
|
*/
|
35 |
|
|
private List<PublisherField> storeIndices;
|
36 |
|
|
|
37 |
|
|
private final SAXReader saxReader = new SAXReader();
|
38 |
|
|
|
39 |
|
|
/**
|
40 |
|
|
* Parses the record and returns a map where a key is the name of an index, the value is the value in the record at the xpath specificed
|
41 |
|
|
* in this.storeIndices.
|
42 |
|
|
*
|
43 |
|
|
* @param record
|
44 |
|
|
* the XML string to parse.
|
45 |
|
|
* @return a Multimap describing the values to be indexed for this record.
|
46 |
|
|
*/
|
47 |
|
|
@SuppressWarnings({ "unchecked", "rawtypes" })
|
48 |
44514
|
alessia.ba
|
public Multimap<String, String> parseRecord(final String record, final String source) {
|
49 |
26600
|
sandro.lab
|
Multimap<String, String> recordProps = ArrayListMultimap.create();
|
50 |
|
|
try {
|
51 |
|
|
Document doc = this.saxReader.read(new StringReader(record));
|
52 |
44514
|
alessia.ba
|
if(StringUtils.isNotBlank(source)) recordProps.put(OAIConfigurationExistReader.SET_FIELD, source);
|
53 |
26600
|
sandro.lab
|
for (PublisherField field : this.storeIndices) {
|
54 |
|
|
for (Entry<String, String> indexEntry : field.getSources().entries()) {
|
55 |
|
|
// each xpath can return a list of nodes or strings, depending on the xpath
|
56 |
|
|
List xPathResult = doc.selectNodes(indexEntry.getValue());
|
57 |
|
|
if ((xPathResult != null) && !xPathResult.isEmpty()) {
|
58 |
|
|
if (containsStrings(xPathResult)) {
|
59 |
|
|
recordProps.putAll(field.getFieldName(), xPathResult);
|
60 |
|
|
} else {
|
61 |
|
|
if (containsNodes(xPathResult)) {
|
62 |
|
|
recordProps.putAll(field.getFieldName(), Iterables.transform(xPathResult, new Function<Object, String>() {
|
63 |
|
|
|
64 |
|
|
@Override
|
65 |
|
|
public String apply(final Object obj) {
|
66 |
|
|
if (obj == null) return "";
|
67 |
|
|
Node node = (Node) obj;
|
68 |
|
|
return node.getText();
|
69 |
|
|
}
|
70 |
|
|
}));
|
71 |
|
|
}
|
72 |
|
|
}
|
73 |
|
|
}
|
74 |
|
|
}
|
75 |
|
|
}
|
76 |
|
|
|
77 |
|
|
} catch (DocumentException e) {
|
78 |
|
|
log.fatal("Can't parse record");
|
79 |
|
|
recordProps = null;
|
80 |
|
|
}
|
81 |
|
|
return recordProps;
|
82 |
|
|
|
83 |
|
|
}
|
84 |
|
|
|
85 |
|
|
@SuppressWarnings("rawtypes")
|
86 |
|
|
private boolean containsStrings(final List objects) {
|
87 |
|
|
Object first = objects.get(0);
|
88 |
|
|
return first instanceof String;
|
89 |
|
|
}
|
90 |
|
|
|
91 |
|
|
@SuppressWarnings("rawtypes")
|
92 |
|
|
private boolean containsNodes(final List objects) {
|
93 |
|
|
Object first = objects.get(0);
|
94 |
|
|
return first instanceof Node;
|
95 |
|
|
}
|
96 |
|
|
|
97 |
|
|
public List<PublisherField> getStoreIndices() {
|
98 |
|
|
return storeIndices;
|
99 |
|
|
}
|
100 |
|
|
|
101 |
|
|
public void setStoreIndices(final List<PublisherField> storeIndices) {
|
102 |
|
|
this.storeIndices = storeIndices;
|
103 |
|
|
}
|
104 |
|
|
|
105 |
|
|
public SAXReader getSaxReader() {
|
106 |
|
|
return saxReader;
|
107 |
|
|
}
|
108 |
|
|
|
109 |
|
|
public PublisherRecordParser(final List<PublisherField> storeIndices) {
|
110 |
|
|
super();
|
111 |
|
|
this.storeIndices = storeIndices;
|
112 |
|
|
}
|
113 |
|
|
|
114 |
|
|
public PublisherRecordParser() {
|
115 |
|
|
super();
|
116 |
|
|
// TODO Auto-generated constructor stub
|
117 |
|
|
}
|
118 |
|
|
|
119 |
|
|
}
|