Project

General

Profile

1 42181 sandro.lab
package eu.dnetlib.oai.parser;
2
3
import java.io.StringReader;
4
import java.util.List;
5
import java.util.Map.Entry;
6
7 45455 alessia.ba
import com.google.common.collect.ArrayListMultimap;
8
import com.google.common.collect.Iterables;
9
import com.google.common.collect.Multimap;
10
import eu.dnetlib.oai.PublisherField;
11 42181 sandro.lab
import org.apache.commons.logging.Log;
12
import org.apache.commons.logging.LogFactory;
13
import org.dom4j.Document;
14
import org.dom4j.DocumentException;
15
import org.dom4j.Node;
16
import org.dom4j.io.SAXReader;
17
18
/**
19
 * An instance of this class can parse an XML record and extract the information needed to store the record in a publisher store.
20
 *
21
 * @author alessia
22
 */
23
public class PublisherRecordParser {
24
25
	private static final Log log = LogFactory.getLog(PublisherRecordParser.class); // NOPMD by marko on 11/24/08 5:02 PM
26 45455 alessia.ba
27 42181 sandro.lab
	/**
28
	 * List of the indices of the target store.
29
	 */
30
	private List<PublisherField> storeIndices;
31
32 45455 alessia.ba
	private final SAXReader saxReader = new SAXReader();
33 42181 sandro.lab
34
	/**
35
	 * Parses the record and returns a map where a key is the name of an index, the value is the value in the record at the xpath specificed
36
	 * in this.storeIndices.
37
	 *
38 42184 michele.ar
	 * @param record
39
	 *            the XML string to parse.
40 42181 sandro.lab
	 * @return a Multimap describing the values to be indexed for this record.
41
	 */
42
	@SuppressWarnings({ "unchecked", "rawtypes" })
43
	public Multimap<String, String> parseRecord(final String record) {
44
		Multimap<String, String> recordProps = ArrayListMultimap.create();
45
		try {
46 45455 alessia.ba
			Document doc = this.saxReader.read(new StringReader(record));
47
			for (PublisherField field : this.storeIndices) {
48
				for (Entry<String, String> indexEntry : field.getSources().entries()) {
49 42181 sandro.lab
					// each xpath can return a list of nodes or strings, depending on the xpath
50 45455 alessia.ba
					List xPathResult = doc.selectNodes(indexEntry.getValue());
51 42181 sandro.lab
					if ((xPathResult != null) && !xPathResult.isEmpty()) {
52
						if (containsStrings(xPathResult)) {
53
							recordProps.putAll(field.getFieldName(), xPathResult);
54
						} else {
55
							if (containsNodes(xPathResult)) {
56 45455 alessia.ba
								recordProps.putAll(field.getFieldName(), Iterables.transform(xPathResult, obj -> {
57
									if (obj == null) return "";
58
									Node node = (Node) obj;
59
									return node.getText();
60 42181 sandro.lab
								}));
61
							}
62
						}
63
					}
64
				}
65
			}
66
67 45455 alessia.ba
		} catch (DocumentException e) {
68 42181 sandro.lab
			log.fatal("Can't parse record");
69
			recordProps = null;
70
		}
71
		return recordProps;
72
73
	}
74
75
	@SuppressWarnings("rawtypes")
76
	private boolean containsStrings(final List objects) {
77 45455 alessia.ba
		Object first = objects.get(0);
78 42181 sandro.lab
		return first instanceof String;
79
	}
80
81
	@SuppressWarnings("rawtypes")
82
	private boolean containsNodes(final List objects) {
83 45455 alessia.ba
		Object first = objects.get(0);
84 42181 sandro.lab
		return first instanceof Node;
85
	}
86
87
	public List<PublisherField> getStoreIndices() {
88 45455 alessia.ba
		return storeIndices;
89 42181 sandro.lab
	}
90
91
	public void setStoreIndices(final List<PublisherField> storeIndices) {
92
		this.storeIndices = storeIndices;
93
	}
94
95
	public SAXReader getSaxReader() {
96 45455 alessia.ba
		return saxReader;
97 42181 sandro.lab
	}
98
99 45455 alessia.ba
	public PublisherRecordParser(final List<PublisherField> storeIndices) {
100
		super();
101
		this.storeIndices = storeIndices;
102
	}
103
104
	public PublisherRecordParser() {
105
		super();
106
		// TODO Auto-generated constructor stub
107
	}
108
109
110 42181 sandro.lab
}