Project

General

Profile

« Previous | Next » 

Revision 55768

[maven-release-plugin] copy for tag dnet-index-solr-common-2.3.4

View differences:

modules/dnet-index-solr-common/tags/dnet-index-solr-common-2.3.4/deploy.info
1
{"type_source": "SVN", "goal": "package -U -T 4C source:jar", "url": "http://svn-public.driver.research-infrastructures.eu/driver/dnet45/modules/dnet-index-solr-common/trunk/", "deploy_repository": "dnet45-snapshots", "version": "4", "mail": "sandro.labruzzo@isti.cnr.it,michele.artini@isti.cnr.it, claudio.atzori@isti.cnr.it, alessia.bardi@isti.cnr.it", "deploy_repository_url": "http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-snapshots", "name": "dnet-index-solr-common"}
modules/dnet-index-solr-common/tags/dnet-index-solr-common-2.3.4/src/main/java/eu/dnetlib/functionality/index/solr/feed/StreamingInputDocumentFactory.java
1
package eu.dnetlib.functionality.index.solr.feed;
2

  
3
import java.io.StringReader;
4
import java.io.StringWriter;
5
import java.util.HashMap;
6
import java.util.Iterator;
7
import java.util.List;
8
import javax.xml.stream.*;
9
import javax.xml.stream.events.Namespace;
10
import javax.xml.stream.events.StartElement;
11
import javax.xml.stream.events.XMLEvent;
12

  
13
import com.google.common.collect.Lists;
14
import eu.dnetlib.functionality.index.solr.feed.ResultTransformer.Mode;
15
import org.apache.solr.common.SolrInputDocument;
16

  
17
/**
18
 * Optimized version of the document parser, drop in replacement of InputDocumentFactory.
19
 *
20
 * <p>
21
 * Faster because:
22
 * </p>
23
 * <ul>
24
 * <li>Doesn't create a DOM for the full document</li>
25
 * <li>Doesn't execute xpaths agains the DOM</li>
26
 * <li>Quickly serialize the 'result' element directly in a string.</li>
27
 * <li>Uses less memory: less pressure on GC and allows more threads to process this in parallel</li>
28
 * </ul>
29
 *
30
 * <p>
31
 * This class is fully reentrant and can be invoked in parallel.
32
 * </p>
33
 *
34
 * @author marko
35
 *
36
 */
37
public class StreamingInputDocumentFactory extends InputDocumentFactory {
38

  
39
	protected static final String DEFAULTDNETRESULT = "dnetResult";
40

  
41
	protected static final String TARGETFIELDS = "targetFields";
42

  
43
	protected static final String INDEX_RECORD_ID_ELEMENT = "indexRecordIdentifier";
44

  
45
	protected static final String ROOT_ELEMENT = "indexRecord";
46

  
47
	protected static final int MAX_FIELD_LENGTH = 25000;
48

  
49
	protected ThreadLocal<XMLInputFactory> inputFactory = new ThreadLocal<XMLInputFactory>() {
50

  
51
		@Override
52
		protected XMLInputFactory initialValue() {
53
			return XMLInputFactory.newInstance();
54
		}
55
	};
56

  
57
	protected ThreadLocal<XMLOutputFactory> outputFactory = new ThreadLocal<XMLOutputFactory>() {
58

  
59
		@Override
60
		protected XMLOutputFactory initialValue() {
61
			return XMLOutputFactory.newInstance();
62
		}
63
	};
64

  
65
	protected ThreadLocal<XMLEventFactory> eventFactory = new ThreadLocal<XMLEventFactory>() {
66

  
67
		@Override
68
		protected XMLEventFactory initialValue() {
69
			return XMLEventFactory.newInstance();
70
		}
71
	};
72

  
73
	@Override
74
	public SolrInputDocument parseDocument(final String version, final String inputDocument, final String dsId, final String resultName)
75
			throws XMLStreamException {
76
		return parseDocument(version, inputDocument, dsId, resultName, null);
77
	}
78

  
79
	@Override
80
	public SolrInputDocument parseDocument(final String version,
81
			final String inputDocument,
82
			final String dsId,
83
			final String resultName,
84
			final ResultTransformer resultTransformer) {
85

  
86
		final StringWriter results = new StringWriter();
87
		final List<Namespace> nsList = Lists.newLinkedList();
88
		try {
89

  
90
			XMLEventReader parser = inputFactory.get().createXMLEventReader(new StringReader(inputDocument));
91

  
92
			final SolrInputDocument indexDocument = new SolrInputDocument(new HashMap<>());
93

  
94
			while (parser.hasNext()) {
95
				final XMLEvent event = parser.nextEvent();
96
				if ((event != null) && event.isStartElement()) {
97
					final String localName = event.asStartElement().getName().getLocalPart();
98

  
99
					if (ROOT_ELEMENT.equals(localName)) {
100
						nsList.addAll(getNamespaces(event));
101
					} else if (INDEX_RECORD_ID_ELEMENT.equals(localName)) {
102
						final XMLEvent text = parser.nextEvent();
103
						String recordId = getText(text);
104
						indexDocument.addField(INDEX_RECORD_ID, recordId);
105
					} else if (TARGETFIELDS.equals(localName)) {
106
						parseTargetFields(indexDocument, parser);
107
					} else if (resultName.equals(localName)) {
108
						if (resultTransformer == null || !(Mode.empty.equals(resultTransformer.getMode()))) {
109
							copyResult(indexDocument, results, parser, nsList, resultName, resultTransformer);
110
						}
111
					}
112
				}
113
			}
114

  
115
			if (version != null) {
116
				indexDocument.addField(DS_VERSION, version);
117
			}
118

  
119
			if (dsId != null) {
120
				indexDocument.addField(DS_ID, dsId);
121
			}
122

  
123
			if (!indexDocument.containsKey(INDEX_RECORD_ID)) {
124
				indexDocument.clear();
125
				System.err.println("missing indexrecord id:\n" + inputDocument);
126
			}
127

  
128
			return indexDocument;
129
		} catch (XMLStreamException e) {
130
			return new SolrInputDocument();
131
		}
132
	}
133

  
134
	private List<Namespace> getNamespaces(final XMLEvent event) {
135
		final List<Namespace> res = Lists.newLinkedList();
136
		@SuppressWarnings("unchecked")
137
		Iterator<Namespace> nsIter = event.asStartElement().getNamespaces();
138
		while (nsIter.hasNext()) {
139
			Namespace ns = nsIter.next();
140
			res.add(ns);
141
		}
142
		return res;
143
	}
144

  
145
	/**
146
	 * Parse the targetFields block and add fields to the solr document.
147
	 *
148
	 * @param indexDocument
149
	 * @param parser
150
	 * @throws XMLStreamException
151
	 */
152
	protected void parseTargetFields(final SolrInputDocument indexDocument, final XMLEventReader parser) throws XMLStreamException {
153

  
154
		boolean hasFields = false;
155

  
156
		while (parser.hasNext()) {
157
			final XMLEvent targetEvent = parser.nextEvent();
158
			if (targetEvent.isEndElement() && targetEvent.asEndElement().getName().getLocalPart().equals(TARGETFIELDS)) {
159
				break;
160
			}
161

  
162
			if (targetEvent.isStartElement()) {
163
				final String fieldName = targetEvent.asStartElement().getName().getLocalPart();
164
				final XMLEvent text = parser.nextEvent();
165

  
166
				String data = getText(text);
167

  
168
				addField(indexDocument, fieldName, data);
169
				hasFields = true;
170
			}
171
		}
172

  
173
		if (!hasFields) {
174
			indexDocument.clear();
175
		}
176
	}
177

  
178
	/**
179
	 * Copy the /indexRecord/result element and children, preserving namespace declarations etc.
180
	 *
181
	 * @param indexDocument
182
	 * @param results
183
	 * @param parser
184
	 * @param nsList
185
	 * @throws XMLStreamException
186
	 */
187
	protected void copyResult(final SolrInputDocument indexDocument,
188
			final StringWriter results,
189
			final XMLEventReader parser,
190
			final List<Namespace> nsList,
191
			final String dnetResult,
192
			final ResultTransformer resultTransformer) throws XMLStreamException {
193
		final XMLEventWriter writer = outputFactory.get().createXMLEventWriter(results);
194

  
195
		for (Namespace ns : nsList) {
196
			eventFactory.get().createNamespace(ns.getPrefix(), ns.getNamespaceURI());
197
		}
198

  
199
		StartElement newRecord = eventFactory.get().createStartElement("", null, RESULT, null, nsList.iterator());
200

  
201
		// new root record
202
		writer.add(newRecord);
203

  
204
		// copy the rest as it is
205
		while (parser.hasNext()) {
206
			final XMLEvent resultEvent = parser.nextEvent();
207

  
208
			// TODO: replace with depth tracking instead of close tag tracking.
209
			if (resultEvent.isEndElement() && resultEvent.asEndElement().getName().getLocalPart().equals(dnetResult)) {
210
				writer.add(eventFactory.get().createEndElement("", null, RESULT));
211
				break;
212
			}
213

  
214
			writer.add(resultEvent);
215
		}
216
		writer.close();
217

  
218
		if (resultTransformer != null) {
219
			indexDocument.addField(INDEX_RESULT, resultTransformer.apply(results.toString()));
220
		} else {
221
			indexDocument.addField(INDEX_RESULT, results.toString());
222
		}
223
	}
224

  
225
	/**
226
	 * Helper used to add a field to a solr doc. It avoids to add empy fields
227
	 *
228
	 * @param indexDocument
229
	 * @param field
230
	 * @param value
231
	 */
232
	private final void addField(final SolrInputDocument indexDocument, final String field, final String value) {
233
		String cleaned = value.trim();
234
		if (!cleaned.isEmpty()) {
235
			// log.info("\n\n adding field " + field.toLowerCase() + " value: " + cleaned + "\n");
236
			indexDocument.addField(field.toLowerCase(), cleaned);
237
		}
238
	}
239

  
240
	/**
241
	 * Helper used to get the string from a text element.
242
	 *
243
	 * @param text
244
	 * @return the element text value
245
	 */
246
	protected final String getText(final XMLEvent text) {
247
		if (text.isEndElement()) // log.warn("skipping because isEndOfElement " + text.asEndElement().getName().getLocalPart());
248
			return "";
249

  
250
		final String data = text.asCharacters().getData();
251
		if (data != null && data.length() > MAX_FIELD_LENGTH) {
252
			return data.substring(0, MAX_FIELD_LENGTH);
253
		}
254

  
255
		return data;
256
	}
257

  
258
}
modules/dnet-index-solr-common/tags/dnet-index-solr-common-2.3.4/src/main/java/eu/dnetlib/functionality/index/solr/feed/ResultTransformer.java
1
package eu.dnetlib.functionality.index.solr.feed;
2

  
3
import com.google.common.base.Function;
4

  
5
/**
6
 * Created by claudio on 17/11/15.
7
 */
8
public abstract class ResultTransformer implements Function<String, String> {
9

  
10
	public enum Mode {compress, empty, xslt, base64}
11

  
12
	protected Mode mode;
13

  
14
	public ResultTransformer(final Mode mode) {
15
		this.mode = mode;
16
	}
17

  
18
	public Mode getMode() {
19
		return mode;
20
	}
21

  
22
	public void setMode(final Mode mode) {
23
		this.mode = mode;
24
	}
25

  
26
}
modules/dnet-index-solr-common/tags/dnet-index-solr-common-2.3.4/src/main/java/eu/dnetlib/functionality/index/solr/feed/InputDocumentFactory.java
1
package eu.dnetlib.functionality.index.solr.feed;
2

  
3
import java.text.ParseException;
4
import java.text.SimpleDateFormat;
5
import java.util.Arrays;
6
import java.util.List;
7
import javax.xml.stream.XMLStreamException;
8

  
9
import org.apache.solr.common.SolrInputDocument;
10
import org.dom4j.DocumentException;
11

  
12
/**
13
 *
14
 * @author claudio
15
 *
16
 */
17
public abstract class InputDocumentFactory {
18

  
19
	public static final String INDEX_FIELD_PREFIX = "__";
20

  
21
	public static final String DS_VERSION = INDEX_FIELD_PREFIX + "dsversion";
22

  
23
	public static final String DS_ID = INDEX_FIELD_PREFIX + "dsid";
24

  
25
	public static final String RESULT = "result";
26

  
27
	public static final String INDEX_RESULT = INDEX_FIELD_PREFIX + RESULT;
28

  
29
	public static final String INDEX_RECORD_ID = INDEX_FIELD_PREFIX + "indexrecordidentifier";
30

  
31
	private static final String outFormat = new String("yyyy-MM-dd'T'hh:mm:ss'Z'");
32

  
33
	private final static List<String> dateFormats = Arrays.asList("yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "dd-MM-yyyy", "dd/MM/yyyy", "yyyy");
34

  
35
	public abstract SolrInputDocument parseDocument(final String version,
36
			final String inputDocument,
37
			final String dsId,
38
			final String resultName) throws XMLStreamException;
39

  
40
	public abstract SolrInputDocument parseDocument(final String version,
41
			final String inputDocument,
42
			final String dsId,
43
			final String resultName,
44
			final ResultTransformer resultTransformer) throws XMLStreamException;
45

  
46
	/**
47
	 * method return a solr-compatible string representation of a date
48
	 *
49
	 * @param date
50
	 * @return the formatted date string
51
	 * @throws DocumentException
52
	 * @throws ParseException
53
	 */
54
	public static String getParsedDateField(final String date) {
55
		for (String formatString : dateFormats) {
56
			try {
57
				return new SimpleDateFormat(outFormat).format(new SimpleDateFormat(formatString).parse(date));
58
			} catch (ParseException e) {}
59
		}
60
		throw new IllegalStateException("unable to parse date: " + date);
61
	}
62

  
63
	public String parseDate(final String date) {
64
		return getParsedDateField(date);
65
	}
66

  
67
}
modules/dnet-index-solr-common/tags/dnet-index-solr-common-2.3.4/src/main/java/eu/dnetlib/functionality/index/utils/ZkServers.java
1
package eu.dnetlib.functionality.index.utils;
2

  
3
import com.google.common.base.Splitter;
4
import com.google.common.collect.Lists;
5
import org.apache.commons.lang3.StringUtils;
6
import org.apache.commons.logging.Log;
7
import org.apache.commons.logging.LogFactory;
8

  
9
import java.util.List;
10
import java.util.Optional;
11

  
12
public class ZkServers {
13

  
14
    private static final Log log = LogFactory.getLog(ZkServers.class);
15
    public static final String SEPARATOR = "/";
16

  
17
    private List<String> hosts;
18

  
19
    private Optional<String> chroot;
20

  
21
    public static ZkServers newInstance(final String zkUrl) {
22

  
23
        //quorum0:2182,quorum1:2182,quorum2:2182,quorum3:2182,quorum4:2182/solr-dev-openaire
24
        String urls = zkUrl;
25
        final Optional<String> chRoot = Optional.of(SEPARATOR + StringUtils.substringAfterLast(zkUrl, SEPARATOR));
26
        if (chRoot.isPresent() && StringUtils.isNotBlank(chRoot.get())) {
27
            log.debug(String.format("found zk chroot %s", chRoot));
28
            urls = zkUrl.replace(chRoot.get(), "");
29
        }
30

  
31
        final List<String> urlList = Lists.newArrayList(Splitter.on(",").omitEmptyStrings().split(urls));
32
        log.debug(String.format("zk urls %s", zkUrl));
33

  
34
        return new ZkServers(urlList, chRoot);
35
    }
36

  
37
    public ZkServers(List<String> hosts, Optional<String> chroot) {
38
        this.hosts = hosts;
39
        this.chroot = chroot;
40
    }
41

  
42
    public List<String> getHosts() {
43
        return hosts;
44
    }
45

  
46
    public void setHosts(List<String> hosts) {
47
        this.hosts = hosts;
48
    }
49

  
50
    public Optional<String> getChroot() {
51
        return chroot;
52
    }
53

  
54
    public void setChroot(Optional<String> chroot) {
55
        this.chroot = chroot;
56
    }
57
}
58

  
modules/dnet-index-solr-common/tags/dnet-index-solr-common-2.3.4/pom.xml
1
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven descriptor of the dnet-index-solr-common module, release 2.3.4 -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
	<parent>
		<groupId>eu.dnetlib</groupId>
		<artifactId>dnet45-parent</artifactId>
		<version>1.0.0</version>
		<relativePath />
	</parent>
	<modelVersion>4.0.0</modelVersion>
	<groupId>eu.dnetlib</groupId>
	<artifactId>dnet-index-solr-common</artifactId>
	<version>2.3.4</version>
	<scm>
		<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet45/modules/dnet-index-solr-common/tags/dnet-index-solr-common-2.3.4</developerConnection>
	</scm>
	<dependencies>
		<!-- Solr client library, without the two transitive artifacts excluded below -->
		<dependency>
			<groupId>org.apache.solr</groupId>
			<artifactId>solr-solrj</artifactId>
			<version>${apache.solr.version}</version>
			<exclusions>
				<exclusion>
					<groupId>org.codehaus.woodstox</groupId>
					<artifactId>wstx-asl</artifactId>
				</exclusion>
				<exclusion>
					<groupId>org.slf4j</groupId>
					<artifactId>jcl-over-slf4j</artifactId>
				</exclusion>
			</exclusions>
		</dependency>
		<dependency>
			<groupId>dom4j</groupId>
			<artifactId>dom4j</artifactId>
			<version>${dom4j.version}</version>
		</dependency>
		<dependency>
			<groupId>com.google.guava</groupId>
			<artifactId>guava</artifactId>
			<version>${google.guava.version}</version>
		</dependency>
		<dependency>
			<groupId>commons-logging</groupId>
			<artifactId>commons-logging</artifactId>
			<version>${commons.logging.version}</version>
		</dependency>
		<!-- the only dependency whose version is pinned here rather than through a property -->
		<dependency>
			<groupId>org.apache.commons</groupId>
			<artifactId>commons-lang3</artifactId>
			<version>3.5</version>
		</dependency>

	</dependencies>
</project>

Also available in: Unified diff