Project

General

Profile

1
package eu.dnetlib.data.mapreduce.hbase.index;
2

    
3
import java.io.IOException;
4
import java.util.List;
5
import java.util.Map;
6
import java.util.Map.Entry;
7

    
8
import org.apache.commons.collections.MapUtils;
9
import org.apache.commons.logging.Log;
10
import org.apache.commons.logging.LogFactory;
11
import org.apache.hadoop.conf.Configuration;
12
import org.apache.hadoop.hbase.client.Result;
13
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
14
import org.apache.hadoop.hbase.mapreduce.TableMapper;
15
import org.apache.hadoop.hbase.util.Bytes;
16
import org.apache.hadoop.io.Text;
17
import org.apache.solr.client.solrj.SolrServerException;
18
import org.apache.solr.client.solrj.impl.CloudSolrClient;
19
import org.apache.solr.client.solrj.response.SolrPingResponse;
20
import org.apache.solr.client.solrj.response.UpdateResponse;
21
import org.apache.solr.common.SolrInputDocument;
22
import org.dom4j.DocumentException;
23

    
24
import com.google.common.collect.Lists;
25
import com.googlecode.protobuf.format.JsonFormat;
26

    
27
import eu.dnetlib.data.mapreduce.JobParams;
28
import eu.dnetlib.data.mapreduce.util.DedupUtils;
29
import eu.dnetlib.data.proto.OafProtos.Oaf;
30
import eu.dnetlib.data.transform.SolrProtoMapper;
31
import eu.dnetlib.functionality.index.solr.feed.InputDocumentFactory;
32
import eu.dnetlib.miscutils.datetime.HumanTime;
33

    
34
public class DedupIndexFeedMapper extends TableMapper<Text, Text> {
35

    
36
	private static final Log log = LogFactory.getLog(DedupIndexFeedMapper.class); // NOPMD by marko on 11/24/08 5:02 PM
37

    
38
	private CloudSolrClient solrServer;
39

    
40
	private String dsId;
41

    
42
	private String version;
43

    
44
	private int shutdownWaitTime = 10000;
45

    
46
	private int bufferFlushThreshold = 100;
47

    
48
	private List<SolrInputDocument> buffer;
49

    
50
	private int backoffTimeMs = 5000;
51

    
52
	private boolean simulation = false;
53

    
54
	private String entityType = null;
55

    
56
	private String actionset = null;
57

    
58
	private SolrProtoMapper mapper = null;
59

    
60
	private final static int MAX_RETRIES = 10;
61

    
62
	@Override
63
	protected void setup(final Context context) throws IOException, InterruptedException {
64

    
65
		logConfiguration(context.getConfiguration());
66

    
67
		shutdownWaitTime = Integer.parseInt(context.getConfiguration().get(JobParams.INDEX_SHUTDOWN_WAIT));
68
		bufferFlushThreshold = Integer.parseInt(context.getConfiguration().get(JobParams.INDEX_BUFFER_FLUSH_TRESHOLD));
69
		dsId = context.getConfiguration().get(JobParams.INDEX_DSID);
70
		version = InputDocumentFactory.getParsedDateField(context.getConfiguration().get(JobParams.INDEX_FEED_TIME));
71
		buffer = Lists.newArrayList();
72
		simulation = Boolean.parseBoolean(context.getConfiguration().get(JobParams.INDEX_FEED_SIMULATION_MODE));
73
		entityType = context.getConfiguration().get("entityType");
74
		actionset = context.getConfiguration().get("actionset");
75

    
76
		final String fields = context.getConfiguration().get("index.fields");
77

    
78
		log.info("got fields: \n" + fields);
79
		log.info("got dsId: " + dsId);
80
		log.info("got version: " + version);
81
		log.info("simulation: " + simulation);
82
		log.info("entityType: " + entityType);
83
		log.info("actionset: " + actionset);
84
		log.info("buffer size: " + bufferFlushThreshold);
85

    
86
		try {
87
			mapper = new SolrProtoMapper(fields);
88
		} catch (final DocumentException e) {
89
			log.error("unable to parse fields: " + fields);
90
			throw new IllegalArgumentException(e);
91
		}
92

    
93
		final String baseURL = context.getConfiguration().get(JobParams.INDEX_SOLR_URL);
94
		log.info("solr server baseURL: " + baseURL);
95

    
96
		final String collection = context.getConfiguration().get(JobParams.INDEX_SOLR_COLLECTION);
97
		log.info("solr server collection: " + collection);
98

    
99
		while (true) {
100
			try {
101
				log.info("initializing solr server...");
102
				solrServer = new CloudSolrClient.Builder()
103
					.withZkHost(baseURL)
104
					.build();
105

    
106
				solrServer.connect();
107

    
108
				solrServer.setParallelUpdates(true);
109
				solrServer.setDefaultCollection(collection);
110

    
111
				final SolrPingResponse rsp = solrServer.ping();
112

    
113
				if (rsp.getStatus() != 0) throw new SolrServerException("bad init status: " + rsp.getStatus());
114
				else {
115
					break;
116
				}
117

    
118
			} catch (final Throwable e) {
119
				if (solrServer != null) {
120
					solrServer.close();
121
				}
122
				context.getCounter("index init", e.getMessage()).increment(1);
123
				log.info(String.format("failed to init solr client wait %dms", backoffTimeMs));
124
				Thread.sleep(backoffTimeMs);
125
			}
126
		}
127
	}
128

    
129
	@Override
130
	protected void map(final ImmutableBytesWritable key, final Result value, final Context context) throws IOException, InterruptedException {
131

    
132
		SolrInputDocument doc = null;
133

    
134
		final Map<byte[], byte[]> bMap = value.getFamilyMap(Bytes.toBytes(entityType));
135

    
136
		if (MapUtils.isEmpty(bMap) || !bMap.containsKey(DedupUtils.BODY_B)) {
137
			context.getCounter(entityType, "missing body");
138
			return;
139
		}
140

    
141
		final Oaf oaf = Oaf.parseFrom(bMap.get(DedupUtils.BODY_B));
142

    
143
		try {
144
			doc = getDocument(oaf);
145
		} catch (final Throwable e) {
146
			handleError(key, new JsonFormat().printToString(oaf), context, doc, e);
147
			return;
148
		}
149

    
150
		int retries = 0;
151
		while (retries < MAX_RETRIES) {
152
			try {
153
				addDocument(context, doc);
154
				return;
155
			} catch (final Throwable e) {
156
				retries++;
157
				context.getCounter("index feed", "retries").increment(1);
158
				handleError(key, new JsonFormat().printToString(oaf), context, doc, e);
159
				log.info(String.format("failed to feed documents, waiting %dms", backoffTimeMs));
160
				Thread.sleep(backoffTimeMs);
161
			}
162
		}
163
		if (retries >= MAX_RETRIES)
164
			throw new IOException("too many retries: " + retries);
165
	}
166

    
167
	private SolrInputDocument getDocument(final Oaf oaf) throws DocumentException {
168
		final SolrInputDocument document = mapper.map(oaf, version, dsId, actionset);
169
		document.addField("actionset", actionset);
170
		return document;
171
	}
172

    
173
	private void addDocument(final Context context, final SolrInputDocument doc) throws SolrServerException, IOException {
174
		if (!doc.isEmpty()) {
175

    
176
			buffer.add(doc);
177
			if (buffer.size() >= bufferFlushThreshold) {
178
				doAdd(buffer, context);
179
				// Thread.sleep(100);
180
			}
181
		} else {
182
			context.getCounter("index feed", "skipped records").increment(1);
183
		}
184
	}
185

    
186
	private void doAdd(final List<SolrInputDocument> buffer, final Context context) throws SolrServerException, IOException {
187
		if (!simulation) {
188
			final long start = System.currentTimeMillis();
189
			final UpdateResponse rsp = solrServer.add(buffer);
190
			final long stop = System.currentTimeMillis() - start;
191
			log.info("feed time for " + buffer.size() + " records : " + HumanTime.exactly(stop) + "\n");
192

    
193
			final int status = rsp.getStatus();
194
			context.getCounter("index feed", "status code: " + status).increment(buffer.size());
195

    
196
			if (status != 0) throw new SolrServerException("bad status: " + status);
197
		}
198
		buffer.clear();
199
	}
200

    
201
	@Override
202
	protected void cleanup(final Context context) throws IOException, InterruptedException {
203
		super.cleanup(context);
204
		try {
205
			if (!buffer.isEmpty()) {
206
				doAdd(buffer, context);
207
			}
208
			log.info("\nwaiting " + shutdownWaitTime + "ms before shutdown");
209
			Thread.sleep(shutdownWaitTime);
210
			solrServer.close();
211
		} catch (final SolrServerException e) {
212
			System.err.println("couldn't shutdown server " + e.getMessage());
213
		}
214
	}
215

    
216
	private void handleError(final ImmutableBytesWritable key, final String value, final Context context, final SolrInputDocument doc, final Throwable e)
217
			throws IOException, InterruptedException {
218
		context.getCounter("index feed", e.getClass().getName()).increment(1);
219
		context.write(new Text(key.copyBytes()), printRottenRecord(context.getTaskAttemptID().toString(), value, doc));
220
		// e.printStackTrace(System.err);
221
	}
222

    
223
	private Text printRottenRecord(final String taskid, final String value, final SolrInputDocument doc) {
224
		return new Text("\n**********************************\n" + "task: " + taskid + "\n"
225
				+ check("original", value.toString() + check("solrDoc", doc)));
226
	}
227

    
228
	private String check(final String label, final Object value) {
229
		if ((value != null) && !value.toString().isEmpty()) return "\n " + label + ":\n" + value + "\n";
230
		return "\n";
231
	}
232

    
233
	private void logConfiguration(final Configuration conf) {
234
		log.info("job configutation #################");
235
		for (final Entry<String, String> e : conf) {
236
			log.info("'" + e.getKey() + "' : '" + e.getValue() + "'");
237
		}
238
		log.info("end of job configutation #################\n\n");
239
	}
240

    
241
}
(1-1/7)