package eu.dnetlib.data.mapreduce.hbase.index;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map.Entry;
import java.util.zip.GZIPOutputStream;

import com.google.common.collect.Lists;
import eu.dnetlib.data.mapreduce.JobParams;
import eu.dnetlib.data.proto.TypeProtos.Type;
import eu.dnetlib.functionality.index.solr.feed.InputDocumentFactory;
import eu.dnetlib.functionality.index.solr.feed.ResultTransformer;
import eu.dnetlib.functionality.index.solr.feed.ResultTransformer.Mode;
import eu.dnetlib.functionality.index.solr.feed.StreamingInputDocumentFactory;
import eu.dnetlib.miscutils.datetime.HumanTime;
import eu.dnetlib.miscutils.functional.xml.ApplyXslt;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.lang.exception.ExceptionUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.response.SolrPingResponse;
import org.apache.solr.client.solrj.response.UpdateResponse;
import org.apache.solr.common.SolrInputDocument;
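
/**
 * Mapper that feeds index records into a SolrCloud collection. Each input value holds an
 * XML record that is transformed with an XSLT, parsed into a SolrInputDocument and sent to
 * Solr in buffered batches. Records that fail transformation or feeding are counted and
 * written back to the task output as "rotten" records for later inspection.
 */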
public class IndexFeedMapper extends Mapper<Text, Text, Text, Text> {

	private static final Log log = LogFactory.getLog(IndexFeedMapper.class); // NOPMD by marko on 11/24/08 5:02 PM

	public static final String DNET_RESULT = "dnetResult";

	private InputDocumentFactory documentFactory;

	private CloudSolrClient solrServer;

	private String version;

	private String dsId;

	private int shutdownWaitTime = 10000;

	private int bufferFlushThreshold = 100;

	private ApplyXslt dmfToRecord;

	private List<SolrInputDocument> buffer;

	private int backoffTimeMs = 5000;

	private boolean simulation = false;

	private final static int MAX_INIT_RETRIES = 10;

	private final static int MAX_FEED_RETRIES = 10;

	private boolean compress = false;
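
	/**
	 * Reads the feeding parameters from the job configuration and connects to the
	 * SolrCloud cluster, retrying with a fixed backoff up to MAX_INIT_RETRIES times.
	 */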
	@Override
	protected void setup(final Context context) throws IOException, InterruptedException {

		logConfiguration(context.getConfiguration());

		dsId = context.getConfiguration().get(JobParams.INDEX_DSID);
		shutdownWaitTime = Integer.parseInt(context.getConfiguration().get(JobParams.INDEX_SHUTDOWN_WAIT));
		bufferFlushThreshold = Integer.parseInt(context.getConfiguration().get(JobParams.INDEX_BUFFER_FLUSH_TRESHOLD));
		documentFactory = new StreamingInputDocumentFactory();
		version = InputDocumentFactory.getParsedDateField(context.getConfiguration().get(JobParams.INDEX_FEED_TIME));
		buffer = Lists.newArrayList();
		simulation = Boolean.parseBoolean(context.getConfiguration().get(JobParams.INDEX_FEED_SIMULATION_MODE));

		compress = context.getConfiguration().getBoolean(JobParams.INDEX_FEED_COMPRESS_RESULT, false);

		final String xslt = new String(Base64.decodeBase64(context.getConfiguration().get(JobParams.INDEX_XSLT)), StandardCharsets.UTF_8);

		log.info("got xslt: \n" + xslt);
		log.info("got version: " + version);
		log.info("simulation: " + simulation);
		log.info("buffer size: " + bufferFlushThreshold);

		dmfToRecord = new ApplyXslt(xslt);

		final String baseURL = context.getConfiguration().get(JobParams.INDEX_SOLR_URL);
		log.info("solr server baseURL: " + baseURL);

		final String collection = context.getConfiguration().get(JobParams.INDEX_SOLR_COLLECTION);
		log.info("solr server collection: " + collection);

		int count = 0;
		boolean connected = false;
		while (!connected && (count < MAX_INIT_RETRIES)) {
			try {
				count++;
				log.info("initializing solr server...");
				solrServer = new CloudSolrClient.Builder()
						.withZkHost(baseURL)
						.build();

				solrServer.connect();

				solrServer.setParallelUpdates(true);
				solrServer.setDefaultCollection(collection);

				final SolrPingResponse rsp = solrServer.ping();

				if (rsp.getStatus() != 0) throw new SolrServerException("bad init status: " + rsp.getStatus());

				connected = true;
			} catch (final Throwable e) {
				if (solrServer != null) {
					solrServer.close();
				}
				context.getCounter("index init", e.getMessage()).increment(1);
				log.error(String.format("failed to init solr client, waiting %dms, error:\n%s", backoffTimeMs, ExceptionUtils.getStackTrace(e)));

				Thread.sleep(backoffTimeMs);
			}
		}

		if (!connected) throw new IOException("reached max retries trying to connect to solr server: " + MAX_INIT_RETRIES);
	}
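
	/**
	 * Transforms the input XML record via XSLT, parses it into a SolrInputDocument
	 * (optionally gzip+base64 compressing the stored result payload) and feeds it to the
	 * buffer, retrying up to MAX_FEED_RETRIES times; unrecoverable records are counted
	 * and written out as "rotten" records.
	 */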
	@Override
	protected void map(final Text key, final Text value, final Context context) throws IOException, InterruptedException {

		String indexRecord = "";
		SolrInputDocument doc = null;

		try {
			indexRecord = dmfToRecord.evaluate(value.toString());

			if (compress) {
				doc = documentFactory.parseDocument(version, indexRecord, dsId, DNET_RESULT, new ResultTransformer(Mode.base64) {
					@Override
					public String apply(final String s) {
						return org.apache.solr.common.util.Base64.byteArrayToBase64(zip(s));
					}
				});
			} else {
				doc = documentFactory.parseDocument(version, indexRecord, dsId, DNET_RESULT);
			}

			if ((doc == null) || doc.isEmpty()) throw new EmptySolrDocumentException();

		} catch (final Throwable e) {
			context.getCounter("index feed", "skipped records").increment(1);
			handleError(key, value, context, indexRecord, doc, e);
			return;
		}
		int count = 0;
		while (count < MAX_FEED_RETRIES) {
			count++;
			try {
				addDocument(context, doc);
				return;
			} catch (final Throwable e) {
				context.getCounter("index feed", "retries").increment(1);
				handleError(key, value, context, indexRecord, doc, e);
				log.info(String.format("failed to feed documents, waiting %dms", backoffTimeMs));
				Thread.sleep(backoffTimeMs);
			}
		}
	}
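
	/**
	 * Gzip-compresses the UTF-8 bytes of the given string; callers base64-encode the
	 * result (see the compress branch in {@code map}).
	 */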
	public byte[] zip(final String s) {
		if ((s == null) || (s.length() == 0)) {
			throw new IllegalArgumentException("Cannot zip null or empty string");
		}

		try (ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream()) {
			try (GZIPOutputStream gzipOutputStream = new GZIPOutputStream(byteArrayOutputStream)) {
				gzipOutputStream.write(s.getBytes(StandardCharsets.UTF_8));
			}
			return byteArrayOutputStream.toByteArray();
		} catch (final IOException e) {
			throw new RuntimeException("Failed to zip content", e);
		}
	}
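
	/**
	 * Buffers the document and flushes the buffer to Solr once it reaches
	 * bufferFlushThreshold entries.
	 */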
	private void addDocument(final Context context, final SolrInputDocument doc) throws SolrServerException, IOException, EmptySolrDocumentException {
		buffer.add(doc);
		if (buffer.size() >= bufferFlushThreshold) {
			doAdd(buffer, context);
		}
	}
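
	/**
	 * Sends the buffered documents to Solr (unless in simulation mode), updates the feed
	 * counters by update status and entity type, and clears the buffer.
	 */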
	private void doAdd(final List<SolrInputDocument> buffer, final Context context) throws SolrServerException, IOException {
		if (!simulation) {
			final long start = System.currentTimeMillis();
			final UpdateResponse rsp = solrServer.add(buffer);
			final long stop = System.currentTimeMillis() - start;
			log.info("feed time for " + buffer.size() + " records : " + HumanTime.exactly(stop) + "\n");

			final int status = rsp.getStatus();

			context.getCounter("index feed", "status code: " + status).increment(buffer.size());

			if (status != 0) throw new SolrServerException("bad status: " + status);

			for (final SolrInputDocument doc : buffer) {
				context.getCounter("index entity", getEntityType(doc)).increment(1);
			}
		}
		buffer.clear();
	}
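
	/**
	 * Flushes any remaining buffered documents, waits shutdownWaitTime ms and closes the
	 * Solr client.
	 */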
	@Override
	protected void cleanup(final Context context) throws IOException, InterruptedException {
		super.cleanup(context);
		try {
			if (!buffer.isEmpty()) {
				doAdd(buffer, context);
			}
			log.info("\nwaiting " + shutdownWaitTime + "ms before shutdown");
			Thread.sleep(shutdownWaitTime);
			solrServer.close();
		} catch (final SolrServerException e) {
			log.error("couldn't shutdown server: " + e.getMessage());
		}
	}
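
	/**
	 * Counts the error by exception type and writes the offending record to the task
	 * output for later inspection.
	 */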
	private void handleError(final Text key, final Text value, final Context context, final String indexRecord, final SolrInputDocument doc, final Throwable e)
			throws IOException, InterruptedException {
		context.getCounter("index feed", e.getClass().getName()).increment(1);
		context.write(key, printRottenRecord(context.getTaskAttemptID().toString(), value, indexRecord, doc));
		// e.printStackTrace(System.err);
	}
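
	/**
	 * Formats a failed record (original value, transformed index record and Solr
	 * document, when available) for the task output.
	 */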
	private Text printRottenRecord(final String taskid, final Text value, final String indexRecord, final SolrInputDocument doc) {
		return new Text("\n**********************************\n" + "task: " + taskid + "\n"
				+ check("original", value.toString()) + check("indexRecord", indexRecord) + check("solrDoc", doc));
	}

	private String check(final String label, final Object value) {
		if ((value != null) && !value.toString().isEmpty()) return "\n " + label + ":\n" + value + "\n";
		return "\n";
	}
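
	/** Dumps the full job configuration to the task log. */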
	private void logConfiguration(final Configuration conf) {
		log.info("job configuration #################");
		for (final Entry<String, String> e : conf) {
			log.info("'" + e.getKey() + "' : '" + e.getValue() + "'");
		}
		log.info("end of job configuration #################\n\n");
	}
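
	/**
	 * Resolves the entity type of the document from its "oaftype" field; result entities
	 * are further qualified by their "resulttypeid".
	 */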
	private String getEntityType(final SolrInputDocument doc) {
		if (!doc.containsKey("oaftype")) return "unknown";

		final Type type = Type.valueOf(doc.getFieldValue("oaftype").toString());
		switch (type) {
		case result:
			if (!doc.containsKey("resulttypeid")) return "result";
			return doc.getFieldValue("resulttypeid").toString();
		default:
			return type.toString();
		}
	}
}