Project

General

Profile

« Previous | Next » 

Revision 57157

Added by Enrico Ottonello over 4 years ago

solr 772 integration

View differences:

StreamingInputDocumentFactory.java
2 2

  
3 3
import java.io.StringReader;
4 4
import java.io.StringWriter;
5
import java.util.HashMap;
5 6
import java.util.Iterator;
6
import java.util.LinkedList;
7 7
import java.util.List;
8
import javax.xml.stream.*;
8

  
9
import javax.xml.stream.XMLEventFactory;
10
import javax.xml.stream.XMLEventReader;
11
import javax.xml.stream.XMLEventWriter;
12
import javax.xml.stream.XMLInputFactory;
13
import javax.xml.stream.XMLOutputFactory;
14
import javax.xml.stream.XMLStreamException;
9 15
import javax.xml.stream.events.Namespace;
10 16
import javax.xml.stream.events.StartElement;
11 17
import javax.xml.stream.events.XMLEvent;
12 18

  
13
import eu.dnetlib.index.solr.feed.ResultTransformer.Mode;
14 19
import org.apache.solr.common.SolrInputDocument;
15 20

  
21
import com.google.common.collect.Lists;
22

  
23
import eu.dnetlib.index.solr.feed.ResultTransformer.Mode;
24

  
16 25
/**
17 26
 * Optimized version of the document parser, drop in replacement of InputDocumentFactory.
27
 *
18 28
 * <p>
19
 * <p>
20 29
 * Faster because:
21 30
 * </p>
22 31
 * <ul>
......
25 34
 * <li>Quickly serialize the 'result' element directly in a string.</li>
26 35
 * <li>Uses less memory: less pressure on GC and allows more threads to process this in parallel</li>
27 36
 * </ul>
37
 *
28 38
 * <p>
29
 * <p>
30 39
 * This class is fully reentrant and can be invoked in parallel.
31 40
 * </p>
32 41
 *
33 42
 * @author marko
43
 *
34 44
 */
35

  
36 45
public class StreamingInputDocumentFactory extends InputDocumentFactory {
37 46

  
38 47
	protected static final String DEFAULTDNETRESULT = "dnetResult";
......
43 52

  
44 53
	protected static final String ROOT_ELEMENT = "indexRecord";
45 54

  
55
	protected static final int MAX_FIELD_LENGTH = 25000;
56

  
46 57
	protected ThreadLocal<XMLInputFactory> inputFactory = new ThreadLocal<XMLInputFactory>() {
47 58

  
48 59
		@Override
......
67 78
		}
68 79
	};
69 80

  
70
	/**
71
	 * {@inheritDoc}
72
	 */
73 81
	@Override
74 82
	public SolrInputDocument parseDocument(final String version, final String inputDocument, final String dsId, final String resultName)
75 83
			throws XMLStreamException {
76 84
		return parseDocument(version, inputDocument, dsId, resultName, null);
77 85
	}
78 86

  
79
	/**
80
	 * {@inheritDoc}
81
	 * <p>
82
	 * String, com.google.common.base.Function)
83
	 */
84 87
	@Override
85 88
	public SolrInputDocument parseDocument(final String version,
86 89
			final String inputDocument,
......
89 92
			final ResultTransformer resultTransformer) {
90 93

  
91 94
		final StringWriter results = new StringWriter();
92
		final List<Namespace> nsList = new LinkedList<>();
95
		final List<Namespace> nsList = Lists.newLinkedList();
93 96
		try {
94 97

  
95 98
			XMLEventReader parser = inputFactory.get().createXMLEventReader(new StringReader(inputDocument));
96 99

  
97
			final SolrInputDocument indexDocument = new SolrInputDocument();
100
			final SolrInputDocument indexDocument = new SolrInputDocument(new HashMap<>());
98 101

  
99 102
			while (parser.hasNext()) {
100 103
				final XMLEvent event = parser.nextEvent();
......
137 140
	}
138 141

  
139 142
	private List<Namespace> getNamespaces(final XMLEvent event) {
140
		final List<Namespace> res = new LinkedList<>();
143
		final List<Namespace> res = Lists.newLinkedList();
141 144
		@SuppressWarnings("unchecked")
142 145
		Iterator<Namespace> nsIter = event.asStartElement().getNamespaces();
143 146
		while (nsIter.hasNext()) {
......
246 249
	 * Helper used to get the string from a text element.
247 250
	 *
248 251
	 * @param text
249
	 * @return
252
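	 * @return the character data of the event, cut to at most MAX_FIELD_LENGTH characters; an empty string if the event is an end element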
250 253
	 */
251 254
	protected final String getText(final XMLEvent text) {
252 255
		if (text.isEndElement()) // log.warn("skipping because isEndOfElement " + text.asEndElement().getName().getLocalPart());
253 256
			return "";
254 257

  
255
		return text.asCharacters().getData();
258
		final String data = text.asCharacters().getData();
259
		if (data != null && data.length() > MAX_FIELD_LENGTH) {
260
			return data.substring(0, MAX_FIELD_LENGTH);
261
		}
262

  
263
		return data;
256 264
	}
257 265

  
258 266
}

Also available in: Unified diff