Project

General

Profile

1 26600 sandro.lab
package eu.dnetlib.data.mapreduce.hbase.index;
2
3 48892 claudio.at
import java.io.ByteArrayOutputStream;
4 35129 claudio.at
import java.io.IOException;
5 48892 claudio.at
import java.nio.charset.StandardCharsets;
6 35129 claudio.at
import java.util.List;
7
import java.util.Map.Entry;
8 48892 claudio.at
import java.util.zip.GZIPOutputStream;
9 35129 claudio.at
10 49096 claudio.at
import com.google.common.collect.Lists;
11
import eu.dnetlib.data.mapreduce.JobParams;
12
import eu.dnetlib.data.proto.TypeProtos.Type;
13
import eu.dnetlib.functionality.index.solr.feed.InputDocumentFactory;
14
import eu.dnetlib.functionality.index.solr.feed.StreamingInputDocumentFactory;
15
import eu.dnetlib.miscutils.datetime.HumanTime;
16
import eu.dnetlib.miscutils.functional.xml.ApplyXslt;
17 26600 sandro.lab
import org.apache.commons.codec.binary.Base64;
18 37562 claudio.at
import org.apache.commons.lang.exception.ExceptionUtils;
19
import org.apache.commons.logging.Log;
20
import org.apache.commons.logging.LogFactory;
21 26600 sandro.lab
import org.apache.hadoop.conf.Configuration;
22
import org.apache.hadoop.io.Text;
23
import org.apache.hadoop.mapreduce.Mapper;
24
import org.apache.solr.client.solrj.SolrServerException;
25 49565 claudio.at
import org.apache.solr.client.solrj.impl.CloudSolrServer;
26 32832 claudio.at
import org.apache.solr.client.solrj.response.SolrPingResponse;
27 26600 sandro.lab
import org.apache.solr.client.solrj.response.UpdateResponse;
28
import org.apache.solr.common.SolrInputDocument;
29
30
public class IndexFeedMapper extends Mapper<Text, Text, Text, Text> {
31
32 37562 claudio.at
	private static final Log log = LogFactory.getLog(IndexFeedMapper.class); // NOPMD by marko on 11/24/08 5:02 PM
33 49096 claudio.at
	public static final String DNET_RESULT = "dnetResult";
34 37562 claudio.at
35 35129 claudio.at
	private InputDocumentFactory documentFactory;
36 26600 sandro.lab
37 49565 claudio.at
	private CloudSolrServer solrServer;
38 26600 sandro.lab
39 35129 claudio.at
	private String version;
40 26600 sandro.lab
41 35129 claudio.at
	private String dsId;
42 26600 sandro.lab
43 35129 claudio.at
	private int shutdownWaitTime = 10000;
44 26600 sandro.lab
45 35129 claudio.at
	private int bufferFlushThreshold = 100;
46 26600 sandro.lab
47 35129 claudio.at
	private ApplyXslt dmfToRecord;
48 26600 sandro.lab
49 35129 claudio.at
	private List<SolrInputDocument> buffer;
50 26600 sandro.lab
51 35129 claudio.at
	private int backoffTimeMs = 5000;
52 26600 sandro.lab
53 35129 claudio.at
	private boolean simulation = false;
54 26600 sandro.lab
55 37562 claudio.at
	private final static int MAX_INIT_RETRIES = 10;
56
57
	private final static int MAX_FEED_RETRIES = 10;
58
59 49096 claudio.at
	private boolean compress = false;
60
61 35129 claudio.at
	@Override
62
	protected void setup(final Context context) throws IOException, InterruptedException {
63 26600 sandro.lab
64 35129 claudio.at
		logConfiguration(context.getConfiguration());
65 26600 sandro.lab
66 35129 claudio.at
		dsId = context.getConfiguration().get(JobParams.INDEX_DSID);
67
		shutdownWaitTime = Integer.parseInt(context.getConfiguration().get(JobParams.INDEX_SHUTDOWN_WAIT));
68
		bufferFlushThreshold = Integer.parseInt(context.getConfiguration().get(JobParams.INDEX_BUFFER_FLUSH_TRESHOLD));
69
		documentFactory = new StreamingInputDocumentFactory();
70
		version = InputDocumentFactory.getParsedDateField(context.getConfiguration().get(JobParams.INDEX_FEED_TIME));
71
		buffer = Lists.newArrayList();
72
		simulation = Boolean.parseBoolean(context.getConfiguration().get(JobParams.INDEX_FEED_SIMULATION_MODE));
73 30834 claudio.at
74 49096 claudio.at
		compress = context.getConfiguration().getBoolean(JobParams.INDEX_FEED_COMPRESS_RESULT, false);
75
76 40039 alessia.ba
		final String xslt = new String(Base64.decodeBase64(context.getConfiguration().get(JobParams.INDEX_XSLT)));
77 30834 claudio.at
78 40039 alessia.ba
		log.info("got xslt: \n" + xslt);
79 37562 claudio.at
		log.info("got version: " + version);
80
		log.info("simulation: " + simulation);
81
		log.info("buffer size: " + bufferFlushThreshold);
82 26600 sandro.lab
83 40039 alessia.ba
		dmfToRecord = new ApplyXslt(xslt);
84 26600 sandro.lab
85 35129 claudio.at
		final String baseURL = context.getConfiguration().get(JobParams.INDEX_SOLR_URL);
86 37562 claudio.at
		log.info("solr server baseURL: " + baseURL);
87 26600 sandro.lab
88 35129 claudio.at
		final String collection = context.getConfiguration().get(JobParams.INDEX_SOLR_COLLECTION);
89 37562 claudio.at
		log.info("solr server collection: " + collection);
90 26600 sandro.lab
91 37562 claudio.at
		int count = 0;
92
		while (count <= MAX_INIT_RETRIES) {
93 35129 claudio.at
			try {
94 37562 claudio.at
				count++;
95 42591 claudio.at
				log.info("initializing solr server...");
96 49565 claudio.at
				solrServer = new CloudSolrServer(baseURL);
97 26600 sandro.lab
98 42591 claudio.at
				solrServer.connect();
99 26600 sandro.lab
100 42591 claudio.at
				solrServer.setParallelUpdates(true);
101
				solrServer.setDefaultCollection(collection);
102 26600 sandro.lab
103 42591 claudio.at
				final SolrPingResponse rsp = solrServer.ping();
104 26600 sandro.lab
105 35129 claudio.at
				if (rsp.getStatus() != 0) throw new SolrServerException("bad init status: " + rsp.getStatus());
106
				else {
107
					break;
108
				}
109 26600 sandro.lab
110 35129 claudio.at
			} catch (final Throwable e) {
111 42591 claudio.at
				if (solrServer != null) {
112 49565 claudio.at
					solrServer.shutdown();
113 35129 claudio.at
				}
114 42591 claudio.at
				context.getCounter("index init", e.getMessage()).increment(1);
115
				log.error(String.format("failed to init solr client wait %dms, error:\n%s", backoffTimeMs, ExceptionUtils.getStackTrace(e)));
116 37562 claudio.at
117 35129 claudio.at
				Thread.sleep(backoffTimeMs);
118
			}
119
		}
120 37562 claudio.at
121
		if (count >= MAX_INIT_RETRIES) throw new IOException("reached max retries trying to connect to solr server: " + MAX_INIT_RETRIES);
122 35129 claudio.at
	}
123 26600 sandro.lab
124 35129 claudio.at
	@Override
125
	protected void map(final Text key, final Text value, final Context context) throws IOException, InterruptedException {
126 26600 sandro.lab
127 35129 claudio.at
		String indexRecord = "";
128
		SolrInputDocument doc = null;
129 26600 sandro.lab
130 35129 claudio.at
		try {
131
			indexRecord = dmfToRecord.evaluate(value.toString());
132 48892 claudio.at
133 49565 claudio.at
			doc = documentFactory.parseDocument(version, indexRecord, dsId, DNET_RESULT);
134 49096 claudio.at
135 37352 claudio.at
			if ((doc == null) || doc.isEmpty()) throw new EmptySolrDocumentException();
136
137 35975 claudio.at
		} catch (final Throwable e) {
138 37352 claudio.at
			context.getCounter("index feed", "skipped records").increment(1);
139 35129 claudio.at
			handleError(key, value, context, indexRecord, doc, e);
140 35975 claudio.at
			return;
141 35129 claudio.at
		}
142 37562 claudio.at
		int count = 0;
143
		while (count <= MAX_FEED_RETRIES) {
144
			count++;
145 35129 claudio.at
			try {
146
				addDocument(context, doc);
147
				return;
148
			} catch (final Throwable e) {
149
				context.getCounter("index feed", "retries").increment(1);
150
				handleError(key, value, context, indexRecord, doc, e);
151 37562 claudio.at
				log.info(String.format("failed to feed documents, waiting %dms", backoffTimeMs));
152 35129 claudio.at
				Thread.sleep(backoffTimeMs);
153
			}
154
		}
155
	}
156 26600 sandro.lab
157 49029 claudio.at
	public byte[] zip(final String s) {
158 48892 claudio.at
		if ((s == null) || (s.length() == 0)) {
159
			throw new IllegalArgumentException("Cannot zip null or empty string");
160
		}
161
162
		try (ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream()) {
163
			try (GZIPOutputStream gzipOutputStream = new GZIPOutputStream(byteArrayOutputStream)) {
164
				gzipOutputStream.write(s.getBytes(StandardCharsets.UTF_8));
165
			}
166
			return byteArrayOutputStream.toByteArray();
167
		} catch(IOException e) {
168
			throw new RuntimeException("Failed to zip content", e);
169
		}
170
	}
171
172 36271 claudio.at
	private void addDocument(final Context context, final SolrInputDocument doc) throws SolrServerException, IOException, EmptySolrDocumentException {
173
		buffer.add(doc);
174
		if (buffer.size() >= bufferFlushThreshold) {
175
			doAdd(buffer, context);
176 35129 claudio.at
		}
177
	}
178 32832 claudio.at
179 35129 claudio.at
	private void doAdd(final List<SolrInputDocument> buffer, final Context context) throws SolrServerException, IOException {
180
		if (!simulation) {
181
			final long start = System.currentTimeMillis();
182 42591 claudio.at
			final UpdateResponse rsp = solrServer.add(buffer);
183 35129 claudio.at
			final long stop = System.currentTimeMillis() - start;
184 37562 claudio.at
			log.info("feed time for " + buffer.size() + " records : " + HumanTime.exactly(stop) + "\n");
185 32832 claudio.at
186 35129 claudio.at
			final int status = rsp.getStatus();
187 35975 claudio.at
188 35129 claudio.at
			context.getCounter("index feed", "status code: " + status).increment(buffer.size());
189 32832 claudio.at
190 35129 claudio.at
			if (status != 0) throw new SolrServerException("bad status: " + status);
191 35975 claudio.at
192
			for (final SolrInputDocument doc : buffer) {
193
				context.getCounter("index entity", getEntityType(doc)).increment(1);
194
			}
195 35129 claudio.at
		}
196
		buffer.clear();
197
	}
198 32832 claudio.at
199 35129 claudio.at
	@Override
200
	protected void cleanup(final Context context) throws IOException, InterruptedException {
201
		super.cleanup(context);
202
		try {
203
			if (!buffer.isEmpty()) {
204
				doAdd(buffer, context);
205
			}
206 37562 claudio.at
			log.info("\nwaiting " + shutdownWaitTime + "ms before shutdown");
207 35129 claudio.at
			Thread.sleep(shutdownWaitTime);
208 49565 claudio.at
			solrServer.shutdown();
209 35129 claudio.at
		} catch (final SolrServerException e) {
210 37562 claudio.at
			log.error("couldn't shutdown server " + e.getMessage());
211 35129 claudio.at
		}
212
	}
213 32832 claudio.at
214 35129 claudio.at
	private void handleError(final Text key, final Text value, final Context context, final String indexRecord, final SolrInputDocument doc, final Throwable e)
215
			throws IOException, InterruptedException {
216 42591 claudio.at
		context.getCounter("index feed", e.getClass().getName()).increment(1);
217 35129 claudio.at
		context.write(key, printRottenRecord(context.getTaskAttemptID().toString(), value, indexRecord, doc));
218 37352 claudio.at
		// e.printStackTrace(System.err);
219 35129 claudio.at
	}
220 32832 claudio.at
221 35129 claudio.at
	private Text printRottenRecord(final String taskid, final Text value, final String indexRecord, final SolrInputDocument doc) {
222
		return new Text("\n**********************************\n" + "task: " + taskid + "\n"
223
				+ check("original", value.toString() + check("indexRecord", indexRecord) + check("solrDoc", doc)));
224
	}
225 32832 claudio.at
226 35129 claudio.at
	private String check(final String label, final Object value) {
227
		if ((value != null) && !value.toString().isEmpty()) return "\n " + label + ":\n" + value + "\n";
228
		return "\n";
229
	}
230 32832 claudio.at
231 35129 claudio.at
	private void logConfiguration(final Configuration conf) {
232 37562 claudio.at
		log.info("job configutation #################");
233 35129 claudio.at
		for (final Entry<String, String> e : conf) {
234 37562 claudio.at
			log.info("'" + e.getKey() + "' : '" + e.getValue() + "'");
235 35129 claudio.at
		}
236 37562 claudio.at
		log.info("end of job configutation #################\n\n");
237 35129 claudio.at
	}
238 32832 claudio.at
239 35975 claudio.at
	private String getEntityType(final SolrInputDocument doc) {
240 37352 claudio.at
		if (!doc.containsKey("oaftype")) return "unknown";
241 35975 claudio.at
242
		final Type type = Type.valueOf(doc.getFieldValue("oaftype").toString());
243
		switch (type) {
244
		case result:
245 37352 claudio.at
			if (!doc.containsKey("resulttypeid")) return "result";
246 35975 claudio.at
			return doc.getFieldValue("resulttypeid").toString();
247
		default:
248
			return type.toString();
249
		}
250
	}
251
252 26600 sandro.lab
}