Project

General

Profile

1 26600 sandro.lab
package eu.dnetlib.data.oai.store.parser;
2
3
import java.io.StringReader;
4
import java.util.List;
5
import java.util.Map.Entry;
6
7 44514 alessia.ba
import com.google.common.base.Function;
8
import com.google.common.collect.ArrayListMultimap;
9
import com.google.common.collect.Iterables;
10
import com.google.common.collect.Multimap;
11
import eu.dnetlib.data.information.oai.publisher.PublisherField;
12
import eu.dnetlib.data.information.oai.publisher.conf.OAIConfigurationExistReader;
13 50872 alessia.ba
import org.apache.commons.lang3.StringUtils;
14 26600 sandro.lab
import org.apache.commons.logging.Log;
15
import org.apache.commons.logging.LogFactory;
16
import org.dom4j.Document;
17
import org.dom4j.DocumentException;
18
import org.dom4j.Node;
19
import org.dom4j.io.SAXReader;
20
21
/**
22
 * An instance of this class can parse an XML record and extract the information needed to store the record in a publisher store.
23
 *
24
 *
25
 * @author alessia
26
 *
27
 */
28
public class PublisherRecordParser {
29
30
	private static final Log log = LogFactory.getLog(PublisherRecordParser.class); // NOPMD by marko on 11/24/08 5:02 PM
31
32
	/**
33
	 * List of the indices of the target store.
34
	 */
35
	private List<PublisherField> storeIndices;
36
37
	private final SAXReader saxReader = new SAXReader();
38
39
	/**
40
	 * Parses the record and returns a map where a key is the name of an index, the value is the value in the record at the xpath specificed
41
	 * in this.storeIndices.
42
	 *
43
	 * @param record
44
	 *            the XML string to parse.
45
	 * @return a Multimap describing the values to be indexed for this record.
46
	 */
47
	@SuppressWarnings({ "unchecked", "rawtypes" })
48 44514 alessia.ba
	public Multimap<String, String> parseRecord(final String record, final String source) {
49 26600 sandro.lab
		Multimap<String, String> recordProps = ArrayListMultimap.create();
50
		try {
51
			Document doc = this.saxReader.read(new StringReader(record));
52 44514 alessia.ba
			if(StringUtils.isNotBlank(source)) recordProps.put(OAIConfigurationExistReader.SET_FIELD, source);
53 26600 sandro.lab
			for (PublisherField field : this.storeIndices) {
54
				for (Entry<String, String> indexEntry : field.getSources().entries()) {
55
					// each xpath can return a list of nodes or strings, depending on the xpath
56
					List xPathResult = doc.selectNodes(indexEntry.getValue());
57
					if ((xPathResult != null) && !xPathResult.isEmpty()) {
58
						if (containsStrings(xPathResult)) {
59
							recordProps.putAll(field.getFieldName(), xPathResult);
60
						} else {
61
							if (containsNodes(xPathResult)) {
62
								recordProps.putAll(field.getFieldName(), Iterables.transform(xPathResult, new Function<Object, String>() {
63
64
									@Override
65
									public String apply(final Object obj) {
66
										if (obj == null) return "";
67
										Node node = (Node) obj;
68
										return node.getText();
69
									}
70
								}));
71
							}
72
						}
73
					}
74
				}
75
			}
76
77
		} catch (DocumentException e) {
78
			log.fatal("Can't parse record");
79
			recordProps = null;
80
		}
81
		return recordProps;
82
83
	}
84
85
	@SuppressWarnings("rawtypes")
86
	private boolean containsStrings(final List objects) {
87
		Object first = objects.get(0);
88
		return first instanceof String;
89
	}
90
91
	@SuppressWarnings("rawtypes")
92
	private boolean containsNodes(final List objects) {
93
		Object first = objects.get(0);
94
		return first instanceof Node;
95
	}
96
97
	public List<PublisherField> getStoreIndices() {
98
		return storeIndices;
99
	}
100
101
	public void setStoreIndices(final List<PublisherField> storeIndices) {
102
		this.storeIndices = storeIndices;
103
	}
104
105
	public SAXReader getSaxReader() {
106
		return saxReader;
107
	}
108
109
	public PublisherRecordParser(final List<PublisherField> storeIndices) {
110
		super();
111
		this.storeIndices = storeIndices;
112
	}
113
114
	public PublisherRecordParser() {
115
		super();
116
		// TODO Auto-generated constructor stub
117
	}
118
119
}