Project

General

Profile

1
package eu.dnetlib.oai.parser;
2

    
3
import java.io.StringReader;
4
import java.util.List;
5
import java.util.Map.Entry;
6

    
7
import org.apache.commons.logging.Log;
8
import org.apache.commons.logging.LogFactory;
9
import org.dom4j.Document;
10
import org.dom4j.DocumentException;
11
import org.dom4j.Node;
12
import org.dom4j.io.SAXReader;
13

    
14
import com.google.common.base.Function;
15
import com.google.common.collect.ArrayListMultimap;
16
import com.google.common.collect.Iterables;
17
import com.google.common.collect.Multimap;
18

    
19
import eu.dnetlib.oai.PublisherField;
20

    
21
/**
22
 * An instance of this class can parse an XML record and extract the information needed to store the record in a publisher store.
23
 *
24
 * @author alessia
25
 */
26
public class PublisherRecordParser {
27

    
28
	private static final Log log = LogFactory.getLog(PublisherRecordParser.class); // NOPMD by marko on 11/24/08 5:02 PM
29
	private final SAXReader saxReader = new SAXReader();
30
	/**
31
	 * List of the indices of the target store.
32
	 */
33
	private List<PublisherField> storeIndices;
34

    
35
	public PublisherRecordParser(final List<PublisherField> storeIndices) {
36
		super();
37
		this.storeIndices = storeIndices;
38
	}
39

    
40
	public PublisherRecordParser() {
41
		super();
42
		// TODO Auto-generated constructor stub
43
	}
44

    
45
	/**
46
	 * Parses the record and returns a map where a key is the name of an index, the value is the value in the record at the xpath specificed
47
	 * in this.storeIndices.
48
	 *
49
	 * @param record
50
	 *            the XML string to parse.
51
	 * @return a Multimap describing the values to be indexed for this record.
52
	 */
53
	@SuppressWarnings({ "unchecked", "rawtypes" })
54
	public Multimap<String, String> parseRecord(final String record) {
55
		Multimap<String, String> recordProps = ArrayListMultimap.create();
56
		try {
57
			final Document doc = this.saxReader.read(new StringReader(record));
58
			for (final PublisherField field : this.storeIndices) {
59
				for (final Entry<String, String> indexEntry : field.getSources().entries()) {
60
					// each xpath can return a list of nodes or strings, depending on the xpath
61
					final List xPathResult = doc.selectNodes(indexEntry.getValue());
62
					if ((xPathResult != null) && !xPathResult.isEmpty()) {
63
						if (containsStrings(xPathResult)) {
64
							recordProps.putAll(field.getFieldName(), xPathResult);
65
						} else {
66
							if (containsNodes(xPathResult)) {
67
								recordProps.putAll(field.getFieldName(), Iterables.transform(xPathResult, new Function<Object, String>() {
68

    
69
									@Override
70
									public String apply(final Object obj) {
71
										if (obj == null) { return ""; }
72
										final Node node = (Node) obj;
73
										return node.getText();
74
									}
75
								}));
76
							}
77
						}
78
					}
79
				}
80
			}
81

    
82
		} catch (final DocumentException e) {
83
			log.fatal("Can't parse record");
84
			recordProps = null;
85
		}
86
		return recordProps;
87

    
88
	}
89

    
90
	@SuppressWarnings("rawtypes")
91
	private boolean containsStrings(final List objects) {
92
		final Object first = objects.get(0);
93
		return first instanceof String;
94
	}
95

    
96
	@SuppressWarnings("rawtypes")
97
	private boolean containsNodes(final List objects) {
98
		final Object first = objects.get(0);
99
		return first instanceof Node;
100
	}
101

    
102
	public List<PublisherField> getStoreIndices() {
103
		return this.storeIndices;
104
	}
105

    
106
	public void setStoreIndices(final List<PublisherField> storeIndices) {
107
		this.storeIndices = storeIndices;
108
	}
109

    
110
	public SAXReader getSaxReader() {
111
		return this.saxReader;
112
	}
113

    
114
}
(2-2/2)