Revision 57157
Added by Enrico Ottonello over 4 years ago
StreamingInputDocumentFactory.java | ||
---|---|---|
2 | 2 |
|
3 | 3 |
import java.io.StringReader; |
4 | 4 |
import java.io.StringWriter; |
5 |
import java.util.HashMap; |
|
5 | 6 |
import java.util.Iterator; |
6 |
import java.util.LinkedList; |
|
7 | 7 |
import java.util.List; |
8 |
import javax.xml.stream.*; |
|
8 |
|
|
9 |
import javax.xml.stream.XMLEventFactory; |
|
10 |
import javax.xml.stream.XMLEventReader; |
|
11 |
import javax.xml.stream.XMLEventWriter; |
|
12 |
import javax.xml.stream.XMLInputFactory; |
|
13 |
import javax.xml.stream.XMLOutputFactory; |
|
14 |
import javax.xml.stream.XMLStreamException; |
|
9 | 15 |
import javax.xml.stream.events.Namespace; |
10 | 16 |
import javax.xml.stream.events.StartElement; |
11 | 17 |
import javax.xml.stream.events.XMLEvent; |
12 | 18 |
|
13 |
import eu.dnetlib.index.solr.feed.ResultTransformer.Mode; |
|
14 | 19 |
import org.apache.solr.common.SolrInputDocument; |
15 | 20 |
|
21 |
import com.google.common.collect.Lists; |
|
22 |
|
|
23 |
import eu.dnetlib.index.solr.feed.ResultTransformer.Mode; |
|
24 |
|
|
16 | 25 |
/** |
17 | 26 |
* Optimized version of the document parser, drop in replacement of InputDocumentFactory. |
27 |
* |
|
18 | 28 |
* <p> |
19 |
* <p> |
|
20 | 29 |
* Faster because: |
21 | 30 |
* </p> |
22 | 31 |
* <ul> |
... | ... | |
25 | 34 |
* <li>Quickly serialize the 'result' element directly in a string.</li> |
26 | 35 |
* <li>Uses less memory: less pressure on GC and allows more threads to process this in parallel</li> |
27 | 36 |
* </ul> |
37 |
* |
|
28 | 38 |
* <p> |
29 |
* <p> |
|
30 | 39 |
* This class is fully reentrant and can be invoked in parallel. |
31 | 40 |
* </p> |
32 | 41 |
* |
33 | 42 |
* @author marko |
43 |
* |
|
34 | 44 |
*/ |
35 |
|
|
36 | 45 |
public class StreamingInputDocumentFactory extends InputDocumentFactory { |
37 | 46 |
|
38 | 47 |
protected static final String DEFAULTDNETRESULT = "dnetResult"; |
... | ... | |
43 | 52 |
|
44 | 53 |
protected static final String ROOT_ELEMENT = "indexRecord"; |
45 | 54 |
|
55 |
protected static final int MAX_FIELD_LENGTH = 25000; |
|
56 |
|
|
46 | 57 |
protected ThreadLocal<XMLInputFactory> inputFactory = new ThreadLocal<XMLInputFactory>() { |
47 | 58 |
|
48 | 59 |
@Override |
... | ... | |
67 | 78 |
} |
68 | 79 |
}; |
69 | 80 |
|
70 |
/** |
|
71 |
* {@inheritDoc} |
|
72 |
*/ |
|
73 | 81 |
@Override |
74 | 82 |
public SolrInputDocument parseDocument(final String version, final String inputDocument, final String dsId, final String resultName) |
75 | 83 |
throws XMLStreamException { |
76 | 84 |
return parseDocument(version, inputDocument, dsId, resultName, null); |
77 | 85 |
} |
78 | 86 |
|
79 |
/** |
|
80 |
* {@inheritDoc} |
|
81 |
* <p> |
|
82 |
* String, com.google.common.base.Function) |
|
83 |
*/ |
|
84 | 87 |
@Override |
85 | 88 |
public SolrInputDocument parseDocument(final String version, |
86 | 89 |
final String inputDocument, |
... | ... | |
89 | 92 |
final ResultTransformer resultTransformer) { |
90 | 93 |
|
91 | 94 |
final StringWriter results = new StringWriter(); |
92 |
final List<Namespace> nsList = new LinkedList<>();
|
|
95 |
final List<Namespace> nsList = Lists.newLinkedList();
|
|
93 | 96 |
try { |
94 | 97 |
|
95 | 98 |
XMLEventReader parser = inputFactory.get().createXMLEventReader(new StringReader(inputDocument)); |
96 | 99 |
|
97 |
final SolrInputDocument indexDocument = new SolrInputDocument(); |
|
100 |
final SolrInputDocument indexDocument = new SolrInputDocument(new HashMap<>());
|
|
98 | 101 |
|
99 | 102 |
while (parser.hasNext()) { |
100 | 103 |
final XMLEvent event = parser.nextEvent(); |
... | ... | |
137 | 140 |
} |
138 | 141 |
|
139 | 142 |
private List<Namespace> getNamespaces(final XMLEvent event) { |
140 |
final List<Namespace> res = new LinkedList<>();
|
|
143 |
final List<Namespace> res = Lists.newLinkedList();
|
|
141 | 144 |
@SuppressWarnings("unchecked") |
142 | 145 |
Iterator<Namespace> nsIter = event.asStartElement().getNamespaces(); |
143 | 146 |
while (nsIter.hasNext()) { |
... | ... | |
246 | 249 |
* Helper used to get the string from a text element. |
247 | 250 |
* |
248 | 251 |
* @param text the XML event whose character data is extracted |
249 |
* @return |
|
252 |
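* @return the character data of the event, cut to at most MAX_FIELD_LENGTH characters; an empty string if the event is an end element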
|
|
250 | 253 |
*/ |
251 | 254 |
protected final String getText(final XMLEvent text) { |
252 | 255 |
if (text.isEndElement()) // log.warn("skipping because isEndOfElement " + text.asEndElement().getName().getLocalPart()); |
253 | 256 |
return ""; |
254 | 257 |
|
255 |
return text.asCharacters().getData(); |
|
258 |
final String data = text.asCharacters().getData(); |
|
259 |
if (data != null && data.length() > MAX_FIELD_LENGTH) { |
|
260 |
return data.substring(0, MAX_FIELD_LENGTH); |
|
261 |
} |
|
262 |
|
|
263 |
return data; |
|
256 | 264 |
} |
257 | 265 |
|
258 | 266 |
} |
Also available in: Unified diff
solr 772 integration