Project

General

Profile

« Previous | Next » 

Revision 35322

[maven-release-plugin] copy for tag dnet-index-solr-common-1.1.1

View differences:

modules/dnet-index-solr-common/tags/dnet-index-solr-common-1.1.1/deploy.info
1
{"type_source": "SVN", "goal": "package -U -T 4C source:jar", "url": "http://svn-public.driver.research-infrastructures.eu/driver/dnet40/modules/dnet-index-solr-common/trunk/", "deploy_repository": "dnet4-snapshots", "version": "4", "mail": "sandro.labruzzo@isti.cnr.it,michele.artini@isti.cnr.it, claudio.atzori@isti.cnr.it, alessia.bardi@isti.cnr.it", "deploy_repository_url": "http://maven.research-infrastructures.eu/nexus/content/repositories/dnet4-snapshots", "name": "dnet-index-solr-common"}
modules/dnet-index-solr-common/tags/dnet-index-solr-common-1.1.1/src/main/java/eu/dnetlib/functionality/index/solr/feed/StreamingInputDocumentFactory.java
1
package eu.dnetlib.functionality.index.solr.feed;
2

  
3
import java.io.StringReader;
4
import java.io.StringWriter;
5
import java.util.Iterator;
6
import java.util.List;
7

  
8
import javax.xml.stream.XMLEventFactory;
9
import javax.xml.stream.XMLEventReader;
10
import javax.xml.stream.XMLEventWriter;
11
import javax.xml.stream.XMLInputFactory;
12
import javax.xml.stream.XMLOutputFactory;
13
import javax.xml.stream.XMLStreamException;
14
import javax.xml.stream.events.Namespace;
15
import javax.xml.stream.events.StartElement;
16
import javax.xml.stream.events.XMLEvent;
17

  
18
import org.apache.solr.common.SolrInputDocument;
19

  
20
import com.google.common.collect.Lists;
21

  
22
/**
23
 * Optimized version of the document parser, drop in replacement of InputDocumentFactory.
24
 *
25
 * <p>
26
 * Faster because:
27
 * </p>
28
 * <ul>
29
 * <li>Doesn't create a DOM for the full document</li>
30
 * <li>Doesn't execute xpaths agains the DOM</li>
31
 * <li>Quickly serialize the 'result' element directly in a string.</li>
32
 * <li>Uses less memory: less pressure on GC and allows more threads to process this in parallel</li>
33
 * </ul>
34
 *
35
 * <p>
36
 * This class is fully reentrant and can be invoked in parallel.
37
 * </p>
38
 *
39
 * @author marko
40
 *
41
 */
42
public class StreamingInputDocumentFactory extends InputDocumentFactory {
43

  
44
	protected static final String DEFAULTDNETRESULT = "dnetResult";
45

  
46
	protected static final String TARGETFIELDS = "targetFields";
47

  
48
	protected static final String INDEX_RECORD_ID_ELEMENT = "indexRecordIdentifier";
49

  
50
	protected static final String ROOT_ELEMENT = "indexRecord";
51

  
52
	protected ThreadLocal<XMLInputFactory> inputFactory = new ThreadLocal<XMLInputFactory>() {
53

  
54
		@Override
55
		protected XMLInputFactory initialValue() {
56
			return XMLInputFactory.newInstance();
57
		}
58
	};
59

  
60
	protected ThreadLocal<XMLOutputFactory> outputFactory = new ThreadLocal<XMLOutputFactory>() {
61

  
62
		@Override
63
		protected XMLOutputFactory initialValue() {
64
			return XMLOutputFactory.newInstance();
65
		}
66
	};
67

  
68
	protected ThreadLocal<XMLEventFactory> eventFactory = new ThreadLocal<XMLEventFactory>() {
69

  
70
		@Override
71
		protected XMLEventFactory initialValue() {
72
			return XMLEventFactory.newInstance();
73
		}
74
	};
75

  
76
	/**
77
	 * {@inheritDoc}
78
	 *
79
	 * @see eu.dnetlib.functionality.index.solr.feed.InputDocumentFactory#parseDocument(eu.dnetlib.functionality.index.solr.feed.IndexDocument,
80
	 *      java.lang.String)
81
	 */
82
	@Override
83
	public SolrInputDocument parseDocument(final String version, final String inputDocument, final String dsId, final String resultName) {
84

  
85
		final StringWriter results = new StringWriter();
86
		final List<Namespace> nsList = Lists.newLinkedList();
87
		try {
88

  
89
			XMLEventReader parser = inputFactory.get().createXMLEventReader(new StringReader(inputDocument));
90

  
91
			final SolrInputDocument indexDocument = new SolrInputDocument();
92

  
93
			while (parser.hasNext()) {
94
				final XMLEvent event = parser.nextEvent();
95
				if ((event != null) && event.isStartElement()) {
96
					final String localName = event.asStartElement().getName().getLocalPart();
97

  
98
					if (ROOT_ELEMENT.equals(localName)) {
99
						nsList.addAll(getNamespaces(event));
100
					} else if (INDEX_RECORD_ID_ELEMENT.equals(localName)) {
101
						final XMLEvent text = parser.nextEvent();
102
						String recordId = getText(text);
103
						indexDocument.addField(INDEX_RECORD_ID, recordId);
104
					} else if (TARGETFIELDS.equals(localName)) {
105
						parseTargetFields(indexDocument, parser);
106
					} else if (resultName.equals(localName)) {
107
						copyResult(indexDocument, results, parser, nsList, resultName);
108
					}
109
				}
110
			}
111

  
112
			if (version != null) {
113
				indexDocument.addField(DS_VERSION, version);
114
			}
115

  
116
			if (dsId != null) {
117
				indexDocument.addField(DS_ID, dsId);
118
			}
119

  
120
			if (!indexDocument.containsKey(INDEX_RECORD_ID)) {
121
				indexDocument.clear();
122
				System.err.println("missing indexrecord id:\n" + inputDocument);
123
			}
124

  
125
			return indexDocument;
126
		} catch (XMLStreamException e) {
127
			return new SolrInputDocument();
128
		}
129
	}
130

  
131
	private List<Namespace> getNamespaces(final XMLEvent event) {
132
		final List<Namespace> res = Lists.newLinkedList();
133
		@SuppressWarnings("unchecked")
134
		Iterator<Namespace> nsIter = event.asStartElement().getNamespaces();
135
		while (nsIter.hasNext()) {
136
			Namespace ns = nsIter.next();
137
			res.add(ns);
138
		}
139
		return res;
140
	}
141

  
142
	/**
143
	 * Parse the targetFields block and add fields to the solr document.
144
	 *
145
	 * @param indexDocument
146
	 * @param parser
147
	 * @throws XMLStreamException
148
	 */
149
	protected void parseTargetFields(final SolrInputDocument indexDocument, final XMLEventReader parser) throws XMLStreamException {
150

  
151
		boolean hasFields = false;
152

  
153
		while (parser.hasNext()) {
154
			final XMLEvent targetEvent = parser.nextEvent();
155
			if (targetEvent.isEndElement() && targetEvent.asEndElement().getName().getLocalPart().equals(TARGETFIELDS)) {
156
				break;
157
			}
158

  
159
			if (targetEvent.isStartElement()) {
160
				final String fieldName = targetEvent.asStartElement().getName().getLocalPart();
161
				final XMLEvent text = parser.nextEvent();
162

  
163
				String data = getText(text);
164

  
165
				addField(indexDocument, fieldName, data);
166
				hasFields = true;
167
			}
168
		}
169

  
170
		if (!hasFields) {
171
			indexDocument.clear();
172
		}
173
	}
174

  
175
	/**
176
	 * Copy the /indexRecord/result element and children, preserving namespace declarations etc.
177
	 *
178
	 * @param indexDocument
179
	 * @param results
180
	 * @param parser
181
	 * @param nsList
182
	 * @throws XMLStreamException
183
	 */
184
	protected void copyResult(final SolrInputDocument indexDocument,
185
			final StringWriter results,
186
			final XMLEventReader parser,
187
			final List<Namespace> nsList,
188
			final String dnetResult) throws XMLStreamException {
189
		final XMLEventWriter writer = outputFactory.get().createXMLEventWriter(results);
190

  
191
		for (Namespace ns : nsList) {
192
			eventFactory.get().createNamespace(ns.getPrefix(), ns.getNamespaceURI());
193
		}
194

  
195
		StartElement newRecord = eventFactory.get().createStartElement("", null, RESULT, null, nsList.iterator());
196

  
197
		// new root record
198
		writer.add(newRecord);
199

  
200
		// copy the rest as it is
201
		while (parser.hasNext()) {
202
			final XMLEvent resultEvent = parser.nextEvent();
203

  
204
			// TODO: replace with depth tracking instead of close tag tracking.
205
			if (resultEvent.isEndElement() && resultEvent.asEndElement().getName().getLocalPart().equals(dnetResult)) {
206
				writer.add(eventFactory.get().createEndElement("", null, RESULT));
207
				break;
208
			}
209

  
210
			writer.add(resultEvent);
211
		}
212
		writer.close();
213

  
214
		indexDocument.addField(INDEX_RESULT, results.toString());
215
	}
216

  
217
	/**
218
	 * Helper used to add a field to a solr doc. It avoids to add empy fields
219
	 *
220
	 * @param indexDocument
221
	 * @param field
222
	 * @param value
223
	 */
224
	private final void addField(final SolrInputDocument indexDocument, final String field, final String value) {
225
		String cleaned = value.trim();
226
		if (!cleaned.isEmpty()) {
227
			// log.info("\n\n adding field " + field.toLowerCase() + " value: " + cleaned + "\n");
228
			indexDocument.addField(field.toLowerCase(), cleaned);
229
		}
230
	}
231

  
232
	/**
233
	 * Helper used to get the string from a text element.
234
	 *
235
	 * @param text
236
	 * @return
237
	 */
238
	protected final String getText(final XMLEvent text) {
239
		if (text.isEndElement()) // log.warn("skipping because isEndOfElement " + text.asEndElement().getName().getLocalPart());
240
			return "";
241

  
242
		return text.asCharacters().getData();
243
	}
244

  
245
}
modules/dnet-index-solr-common/tags/dnet-index-solr-common-1.1.1/src/main/java/eu/dnetlib/functionality/index/solr/feed/InputDocumentFactory.java
1
package eu.dnetlib.functionality.index.solr.feed;
2

  
3
import java.text.ParseException;
4
import java.text.SimpleDateFormat;
5
import java.util.Arrays;
6
import java.util.List;
7

  
8
import javax.xml.stream.XMLStreamException;
9

  
10
import org.apache.solr.common.SolrInputDocument;
11
import org.dom4j.DocumentException;
12

  
13
/**
14
 *
15
 * @author claudio
16
 *
17
 */
18
public abstract class InputDocumentFactory {
19

  
20
	public static final String INDEX_FIELD_PREFIX = "__";
21

  
22
	public static final String DS_VERSION = INDEX_FIELD_PREFIX + "dsversion";
23

  
24
	public static final String DS_ID = INDEX_FIELD_PREFIX + "dsid";
25

  
26
	public static final String RESULT = "result";
27

  
28
	public static final String INDEX_RESULT = INDEX_FIELD_PREFIX + RESULT;
29

  
30
	public static final String INDEX_RECORD_ID = INDEX_FIELD_PREFIX + "indexrecordidentifier";
31

  
32
	private static final String outFormat = new String("yyyy-MM-dd'T'hh:mm:ss'Z'");
33

  
34
	private final static List<String> dateFormats = Arrays.asList("yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "dd-MM-yyyy", "dd/MM/yyyy", "yyyy");
35

  
36
	public abstract SolrInputDocument parseDocument(final String version, String inputDocument, String dsId, final String resultName) throws XMLStreamException;
37

  
38
	/**
39
	 * method return a solr-compatible string representation of a date
40
	 *
41
	 * @param date
42
	 * @return
43
	 * @throws DocumentException
44
	 * @throws ParseException
45
	 */
46
	public static String getParsedDateField(final String date) {
47
		for (String formatString : dateFormats) {
48
			try {
49
				return new SimpleDateFormat(outFormat).format(new SimpleDateFormat(formatString).parse(date));
50
			} catch (ParseException e) {}
51
		}
52
		throw new IllegalStateException("unable to parse date: " + date);
53
	}
54

  
55
	public String parseDate(final String date) {
56
		return getParsedDateField(date);
57
	}
58

  
59
}
modules/dnet-index-solr-common/tags/dnet-index-solr-common-1.1.1/pom.xml
1
<?xml version="1.0" encoding="UTF-8"?>
2
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
3
	<!-- The empty relativePath makes Maven resolve the parent from the repositories instead of looking for it on the local file system. -->
	<parent>
4
		<groupId>eu.dnetlib</groupId>
5
		<artifactId>dnet-hadoop-parent</artifactId>
6
		<version>1.0.0</version>
7
		<relativePath />
8
	</parent>
9
	<modelVersion>4.0.0</modelVersion>
10
	<groupId>eu.dnetlib</groupId>
11
	<artifactId>dnet-index-solr-common</artifactId>
12
	<version>1.1.1</version>
13
	<scm>
14
		<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet40/modules/dnet-index-solr-common/tags/dnet-index-solr-common-1.1.1</developerConnection>
15
	</scm>
16
	<!-- The version properties used below are not defined in this file. NOTE(review): presumably inherited from the parent POM; confirm. -->
	<dependencies>
17
		<!-- SolrJ provides SolrInputDocument. The Woodstox StAX implementation it would bring in transitively is kept off the classpath. -->
		<!-- NOTE(review): presumably excluded so that the StAX factories resolve to another implementation; confirm the reason. -->
		<dependency>
18
			<groupId>org.apache.solr</groupId>
19
			<artifactId>solr-solrj</artifactId>
20
			<version>${apache.solr.version}</version>
21
			<exclusions>
22
				<exclusion>
23
					<artifactId>wstx-asl</artifactId>
24
					<groupId>org.codehaus.woodstox</groupId>
25
				</exclusion>
26
			</exclusions>
27
		</dependency>
28
		<!-- NOTE(review): among the sources listed with this POM only an import of org.dom4j.DocumentException is visible; verify this dependency is still needed. -->
		<dependency>
29
			<groupId>dom4j</groupId>
30
			<artifactId>dom4j</artifactId>
31
			<version>1.6.1</version>
32
		</dependency>
33
		<!-- Guava provides the Lists helper used by StreamingInputDocumentFactory. -->
		<dependency>
34
			<groupId>com.google.guava</groupId>
35
			<artifactId>guava</artifactId>
36
			<version>${google.guava.version}</version>
37
		</dependency>
38

  
39
	</dependencies>
40
</project>

Also available in: Unified diff