package eu.dnetlib.data.mapreduce.hbase.index;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map.Entry;
import java.util.zip.GZIPOutputStream;

import com.google.common.collect.Lists;
import eu.dnetlib.data.mapreduce.JobParams;
import eu.dnetlib.data.proto.TypeProtos.Type;
import eu.dnetlib.functionality.index.solr.feed.InputDocumentFactory;
import eu.dnetlib.functionality.index.solr.feed.StreamingInputDocumentFactory;
import eu.dnetlib.functionality.index.utils.ZkServers;
import eu.dnetlib.miscutils.datetime.HumanTime;
import eu.dnetlib.miscutils.functional.xml.ApplyXslt;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.lang.exception.ExceptionUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.response.SolrPingResponse;
import org.apache.solr.client.solrj.response.UpdateResponse;
import org.apache.solr.common.SolrInputDocument;
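
/**
 * Hadoop mapper that feeds records into a SolrCloud index: each input value is an
 * XML record that is transformed via XSLT into an index document, parsed into a
 * {@link SolrInputDocument}, buffered, and sent to the cluster in batches.
 * Records that cannot be transformed or fed are written to the map output as
 * "rotten records" for later inspection.
 */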
public class IndexFeedMapper extends Mapper<Text, Text, Text, Text> {

	private static final Log log = LogFactory.getLog(IndexFeedMapper.class); // NOPMD by marko on 11/24/08 5:02 PM

	public static final String DNET_RESULT = "dnetResult";

	private InputDocumentFactory documentFactory;

	private CloudSolrClient solrClient;

	private String version;

	private String dsId;

	private int shutdownWaitTime = 10000;

	private int bufferFlushThreshold = 100;

	private ApplyXslt dmfToRecord;

	private List<SolrInputDocument> buffer;

	private int backoffTimeMs = 5000;

	private boolean simulation = false;

	private static final int MAX_INIT_RETRIES = 10;

	private static final int MAX_FEED_RETRIES = 10;

	private boolean compress = false;
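
	/**
	 * Reads the feeding parameters from the job configuration, prepares the XSLT
	 * transformer, and connects to the SolrCloud cluster, retrying up to
	 * MAX_INIT_RETRIES times with a fixed backoff before giving up.
	 */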
	@Override
	protected void setup(final Context context) throws IOException, InterruptedException {

		logConfiguration(context.getConfiguration());

		dsId = context.getConfiguration().get(JobParams.INDEX_DSID);
		shutdownWaitTime = Integer.parseInt(context.getConfiguration().get(JobParams.INDEX_SHUTDOWN_WAIT));
		bufferFlushThreshold = Integer.parseInt(context.getConfiguration().get(JobParams.INDEX_BUFFER_FLUSH_TRESHOLD));
		documentFactory = new StreamingInputDocumentFactory();
		version = InputDocumentFactory.getParsedDateField(context.getConfiguration().get(JobParams.INDEX_FEED_TIME));
		buffer = Lists.newArrayList();
		simulation = Boolean.parseBoolean(context.getConfiguration().get(JobParams.INDEX_FEED_SIMULATION_MODE));

		compress = context.getConfiguration().getBoolean(JobParams.INDEX_FEED_COMPRESS_RESULT, false);

		final String xslt = new String(Base64.decodeBase64(context.getConfiguration().get(JobParams.INDEX_XSLT)), StandardCharsets.UTF_8);

		log.info("got xslt: \n" + xslt);
		log.info("got version: " + version);
		log.info("simulation: " + simulation);
		log.info("buffer size: " + bufferFlushThreshold);

		dmfToRecord = new ApplyXslt(xslt);
		log.info("using transformer: " + dmfToRecord.getTransformer().getClass());

		final String baseURL = context.getConfiguration().get(JobParams.INDEX_SOLR_URL);
		log.info("solr server baseURL: " + baseURL);

		final String collection = context.getConfiguration().get(JobParams.INDEX_SOLR_COLLECTION);
		log.info("solr server collection: " + collection);

		int count = 0;
		while (count <= MAX_INIT_RETRIES) {
			try {
				count++;
				log.info("initializing solr server...");
				final ZkServers zk = ZkServers.newInstance(baseURL);
				solrClient = new CloudSolrClient.Builder(zk.getHosts(), zk.getChroot())
						.withParallelUpdates(true)
						.build();

				solrClient.connect();
				solrClient.setDefaultCollection(collection);

				final SolrPingResponse rsp = solrClient.ping();

				if (rsp.getStatus() != 0) throw new SolrServerException("bad init status: " + rsp.getStatus());
				else {
					break;
				}

			} catch (final Throwable e) {
				if (solrClient != null) {
					solrClient.close();
				}
				context.getCounter("index init", e.getMessage()).increment(1);
				log.error(String.format("failed to init solr client, waiting %dms, error:\n%s", backoffTimeMs, ExceptionUtils.getStackTrace(e)));

				Thread.sleep(backoffTimeMs);
			}
		}

		// count exceeds MAX_INIT_RETRIES only when every attempt failed; a successful
		// break on the last attempt leaves count == MAX_INIT_RETRIES and must not throw
		if (count > MAX_INIT_RETRIES) throw new IOException("reached max retries trying to connect to solr server: " + MAX_INIT_RETRIES);
	}
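
	/**
	 * Applies the XSLT to the input record and parses the result into a
	 * {@link SolrInputDocument}; records yielding a null or empty document are
	 * counted as skipped and dumped via handleError(). Feeding is retried up to
	 * MAX_FEED_RETRIES times; after that the record is dropped without rethrowing.
	 */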
	@Override
	protected void map(final Text key, final Text value, final Context context) throws IOException, InterruptedException {

		String indexRecord = "";
		SolrInputDocument doc = null;

		try {
			indexRecord = dmfToRecord.evaluate(value.toString());

			doc = documentFactory.parseDocument(version, indexRecord, dsId, DNET_RESULT);

			if ((doc == null) || doc.isEmpty()) throw new EmptySolrDocumentException();

		} catch (final Throwable e) {
			context.getCounter("index feed", "skipped records").increment(1);
			handleError(key, value, context, indexRecord, doc, e);
			return;
		}

		int count = 0;
		while (count <= MAX_FEED_RETRIES) {
			count++;
			try {
				addDocument(context, doc);
				return;
			} catch (final Throwable e) {
				context.getCounter("index feed", "retries").increment(1);
				handleError(key, value, context, indexRecord, doc, e);
				log.info(String.format("failed to feed documents, attempt %s, waiting %dms", count, backoffTimeMs));
				Thread.sleep(backoffTimeMs);
			}
		}
	}
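
	/**
	 * GZip-compresses the given string (UTF-8 bytes). Note: the compress flag read
	 * in setup() and this helper are not referenced elsewhere in this class as
	 * shown; presumably they serve a compressed-result variant of the feeding job.
	 */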
	public byte[] zip(final String s) {
		if ((s == null) || (s.length() == 0)) {
			throw new IllegalArgumentException("Cannot zip null or empty string");
		}

		try (ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream()) {
			try (GZIPOutputStream gzipOutputStream = new GZIPOutputStream(byteArrayOutputStream)) {
				gzipOutputStream.write(s.getBytes(StandardCharsets.UTF_8));
			}
			return byteArrayOutputStream.toByteArray();
		} catch (final IOException e) {
			throw new RuntimeException("Failed to zip content", e);
		}
	}
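
	/**
	 * Buffers the document and flushes the buffer to Solr once it reaches
	 * bufferFlushThreshold.
	 */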
	private void addDocument(final Context context, final SolrInputDocument doc) throws SolrServerException, IOException, EmptySolrDocumentException {
		buffer.add(doc);
		if (buffer.size() >= bufferFlushThreshold) {
			doAdd(buffer, context);
		}
	}
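
	/**
	 * Sends the buffered documents to Solr (unless in simulation mode), records
	 * timing and per-entity counters, and clears the buffer on success. A non-zero
	 * response status is escalated as a SolrServerException so the caller can retry
	 * with the buffer intact.
	 */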
	private void doAdd(final List<SolrInputDocument> buffer, final Context context) throws SolrServerException, IOException {
		if (!simulation) {
			final long start = System.currentTimeMillis();
			final UpdateResponse rsp = solrClient.add(buffer);
			final long stop = System.currentTimeMillis() - start;
			log.info("feed time for " + buffer.size() + " records : " + HumanTime.exactly(stop) + "\n");

			final int status = rsp.getStatus();

			context.getCounter("index feed", "status code: " + status).increment(buffer.size());

			if (status != 0) throw new SolrServerException("bad status: " + status);

			for (final SolrInputDocument doc : buffer) {
				context.getCounter("index entity", getEntityType(doc)).increment(1);
			}
		}
		buffer.clear();
	}
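
	/**
	 * Flushes any residual buffered documents, waits shutdownWaitTime ms to let the
	 * cluster settle, then closes the client.
	 */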
	@Override
	protected void cleanup(final Context context) throws IOException, InterruptedException {
		super.cleanup(context);
		try {
			if (!buffer.isEmpty()) {
				doAdd(buffer, context);
			}
			log.info("\nwaiting " + shutdownWaitTime + "ms before shutdown");
			Thread.sleep(shutdownWaitTime);
			solrClient.close();
		} catch (final SolrServerException e) {
			log.error("couldn't shutdown server: " + e.getMessage());
		}
	}
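
	/**
	 * Counts the failure by exception type and emits the offending record (original
	 * value, transformed record and parsed document) to the map output.
	 */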
	private void handleError(final Text key, final Text value, final Context context, final String indexRecord, final SolrInputDocument doc, final Throwable e)
			throws IOException, InterruptedException {
		context.getCounter("index feed", e.getClass().getName()).increment(1);
		context.write(key, printRottenRecord(context.getTaskAttemptID().toString(), value, indexRecord, doc));
		// e.printStackTrace(System.err);
	}
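
	/**
	 * Formats the intermediate representations of a failed record into a single
	 * human-readable Text value.
	 */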
	private Text printRottenRecord(final String taskid, final Text value, final String indexRecord, final SolrInputDocument doc) {
		return new Text("\n**********************************\n" + "task: " + taskid + "\n"
				+ check("original", value.toString()) + check("indexRecord", indexRecord) + check("solrDoc", doc));
	}
	private String check(final String label, final Object value) {
		if ((value != null) && !value.toString().isEmpty()) return "\n " + label + ":\n" + value + "\n";
		return "\n";
	}
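
	/** Dumps every key/value pair of the job configuration to the task log. */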
	private void logConfiguration(final Configuration conf) {
		log.info("job configuration #################");
		for (final Entry<String, String> e : conf) {
			log.info("'" + e.getKey() + "' : '" + e.getValue() + "'");
		}
		log.info("end of job configuration #################\n\n");
	}
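
	/**
	 * Derives the entity-type counter name from the "oaftype" field; result
	 * entities are further split by "resulttypeid".
	 */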
	private String getEntityType(final SolrInputDocument doc) {
		if (!doc.containsKey("oaftype")) return "unknown";

		final Type type = Type.valueOf(doc.getFieldValue("oaftype").toString());
		switch (type) {
		case result:
			if (!doc.containsKey("resulttypeid")) return "unknownresulttype";
			return doc.getFieldValue("resulttypeid").toString();
		default:
			return type.toString();
		}
	}

}