package eu.dnetlib.data.mapreduce.hbase.index;

import com.google.common.collect.Lists;
import com.googlecode.protobuf.format.JsonFormat;
import eu.dnetlib.data.mapreduce.JobParams;
import eu.dnetlib.data.mapreduce.util.DedupUtils;
import eu.dnetlib.data.proto.OafProtos.Oaf;
import eu.dnetlib.data.transform.SolrProtoMapper;
import eu.dnetlib.functionality.index.solr.feed.InputDocumentFactory;
import eu.dnetlib.functionality.index.utils.ZkServers;
import eu.dnetlib.miscutils.datetime.HumanTime;
import org.apache.commons.collections.MapUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.response.SolrPingResponse;
import org.apache.solr.client.solrj.response.UpdateResponse;
import org.apache.solr.common.SolrInputDocument;
import org.dom4j.DocumentException;

import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
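/**
 * Mapper that feeds deduplicated entity records from an HBase table into a
 * SolrCloud index. Each row's body column is parsed as an {@link Oaf} proto,
 * mapped to a {@link SolrInputDocument} and buffered; the buffer is flushed in
 * batches of {@code bufferFlushThreshold} documents. Records that fail to be
 * mapped or fed are emitted as "rotten" records for offline inspection.
 */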
public class DedupIndexFeedMapper extends TableMapper<Text, Text> {

	private static final Log log = LogFactory.getLog(DedupIndexFeedMapper.class); // NOPMD by marko on 11/24/08 5:02 PM

	private CloudSolrClient solrClient;

	private String dsId;

	private String version;

	private int shutdownWaitTime = 10000;

	private int bufferFlushThreshold = 100;

	private List<SolrInputDocument> buffer;

	private int backoffTimeMs = 5000;

	private boolean simulation = false;

	private String entityType = null;

	private String actionset = null;

	private SolrProtoMapper mapper = null;

	private static final int MAX_RETRIES = 10;

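	/**
	 * Reads the job parameters, parses the index field definitions and opens the
	 * SolrCloud client, retrying with a fixed backoff until the collection
	 * answers a ping with status 0.
	 */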
	@Override
	protected void setup(final Context context) throws IOException, InterruptedException {

		logConfiguration(context.getConfiguration());

		shutdownWaitTime = Integer.parseInt(context.getConfiguration().get(JobParams.INDEX_SHUTDOWN_WAIT));
		bufferFlushThreshold = Integer.parseInt(context.getConfiguration().get(JobParams.INDEX_BUFFER_FLUSH_TRESHOLD));
		dsId = context.getConfiguration().get(JobParams.INDEX_DSID);
		version = InputDocumentFactory.getParsedDateField(context.getConfiguration().get(JobParams.INDEX_FEED_TIME));
		buffer = Lists.newArrayList();
		simulation = Boolean.parseBoolean(context.getConfiguration().get(JobParams.INDEX_FEED_SIMULATION_MODE));
		entityType = context.getConfiguration().get("entityType");
		actionset = context.getConfiguration().get("actionset");

		final String fields = context.getConfiguration().get("index.fields");

		log.info("got fields: \n" + fields);
		log.info("got dsId: " + dsId);
		log.info("got version: " + version);
		log.info("simulation: " + simulation);
		log.info("entityType: " + entityType);
		log.info("actionset: " + actionset);
		log.info("buffer size: " + bufferFlushThreshold);

		try {
			mapper = new SolrProtoMapper(fields);
		} catch (final DocumentException e) {
			log.error("unable to parse fields: " + fields);
			throw new IllegalArgumentException(e);
		}

		final String baseURL = context.getConfiguration().get(JobParams.INDEX_SOLR_URL);
		log.info("solr server baseURL: " + baseURL);

		final String collection = context.getConfiguration().get(JobParams.INDEX_SOLR_COLLECTION);
		log.info("solr server collection: " + collection);

		while (true) {
			try {
				log.info("initializing solr server...");

				final ZkServers zk = ZkServers.newInstance(baseURL);
				solrClient = new CloudSolrClient.Builder(zk.getHosts(), zk.getChroot())
						.withParallelUpdates(true)
						.build();

				solrClient.connect();
				solrClient.setDefaultCollection(collection);

				final SolrPingResponse rsp = solrClient.ping();

				if (rsp.getStatus() != 0) {
					throw new SolrServerException("bad init status: " + rsp.getStatus());
				}
				break;
			} catch (final Throwable e) {
				if (solrClient != null) {
					solrClient.close();
				}
				context.getCounter("index init", e.getMessage()).increment(1);
				log.info(String.format("failed to init solr client, waiting %dms", backoffTimeMs));
				Thread.sleep(backoffTimeMs);
			}
		}
	}

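	/**
	 * Maps a single HBase row: extracts the body column for the configured
	 * entity type, converts it to a Solr document and buffers it for feeding,
	 * retrying failed feed operations up to MAX_RETRIES times before failing
	 * the task.
	 */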
	@Override
	protected void map(final ImmutableBytesWritable key, final Result value, final Context context) throws IOException, InterruptedException {

		SolrInputDocument doc = null;

		final Map<byte[], byte[]> bMap = value.getFamilyMap(Bytes.toBytes(entityType));

		if (MapUtils.isEmpty(bMap) || !bMap.containsKey(DedupUtils.BODY_B)) {
			context.getCounter(entityType, "missing body").increment(1);
			return;
		}

		final Oaf oaf = Oaf.parseFrom(bMap.get(DedupUtils.BODY_B));

		try {
			doc = getDocument(oaf);
		} catch (final Throwable e) {
			handleError(key, new JsonFormat().printToString(oaf), context, doc, e);
			return;
		}

		int retries = 0;
		while (retries < MAX_RETRIES) {
			try {
				addDocument(context, doc);
				return;
			} catch (final Throwable e) {
				retries++;
				context.getCounter("index feed", "retries").increment(1);
				handleError(key, new JsonFormat().printToString(oaf), context, doc, e);
				log.info(String.format("failed to feed documents, waiting %dms", backoffTimeMs));
				Thread.sleep(backoffTimeMs);
			}
		}
		// the loop only exits via return on success, so reaching this point means all retries failed
		throw new IOException("too many retries: " + retries);
	}

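	/**
	 * Converts an Oaf proto into a Solr document according to the configured
	 * field mapping, tagging it with the configured actionset.
	 */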
	private SolrInputDocument getDocument(final Oaf oaf) throws DocumentException {
		final SolrInputDocument document = mapper.map(oaf, version, dsId, actionset);
		document.addField("actionset", actionset);
		return document;
	}

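	/**
	 * Buffers a non-empty document and flushes the buffer once it reaches the
	 * configured threshold; empty documents are only counted as skipped.
	 */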
	private void addDocument(final Context context, final SolrInputDocument doc) throws SolrServerException, IOException {
		if (!doc.isEmpty()) {
			buffer.add(doc);
			if (buffer.size() >= bufferFlushThreshold) {
				doAdd(buffer, context);
			}
		} else {
			context.getCounter("index feed", "skipped records").increment(1);
		}
	}

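	/**
	 * Sends the buffered documents to Solr (skipped in simulation mode) and
	 * clears the buffer; a non-zero response status is raised as an error.
	 */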
	private void doAdd(final List<SolrInputDocument> buffer, final Context context) throws SolrServerException, IOException {
		if (!simulation) {
			final long start = System.currentTimeMillis();
			final UpdateResponse rsp = solrClient.add(buffer);
			final long stop = System.currentTimeMillis() - start;
			log.info("feed time for " + buffer.size() + " records: " + HumanTime.exactly(stop) + "\n");

			final int status = rsp.getStatus();
			context.getCounter("index feed", "status code: " + status).increment(buffer.size());

			if (status != 0) throw new SolrServerException("bad status: " + status);
		}
		buffer.clear();
	}

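	/**
	 * Flushes any documents still in the buffer, then waits shutdownWaitTime
	 * milliseconds before closing the Solr client.
	 */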
	@Override
	protected void cleanup(final Context context) throws IOException, InterruptedException {
		super.cleanup(context);
		try {
			if (!buffer.isEmpty()) {
				doAdd(buffer, context);
			}
			log.info("\nwaiting " + shutdownWaitTime + "ms before shutdown");
			Thread.sleep(shutdownWaitTime);
			solrClient.close();
		} catch (final SolrServerException e) {
			log.error("couldn't shutdown server: " + e.getMessage());
		}
	}

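	/**
	 * Counts the failure and emits the offending record (row key, original
	 * value and the partially built Solr document, if any) to the task output.
	 */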
	private void handleError(final ImmutableBytesWritable key, final String value, final Context context, final SolrInputDocument doc, final Throwable e)
			throws IOException, InterruptedException {
		context.getCounter("index feed", e.getClass().getName()).increment(1);
		context.write(new Text(key.copyBytes()), printRottenRecord(context.getTaskAttemptID().toString(), value, doc));
	}

	private Text printRottenRecord(final String taskid, final String value, final SolrInputDocument doc) {
		return new Text("\n**********************************\n" + "task: " + taskid + "\n"
				+ check("original", value) + check("solrDoc", doc));
	}

	private String check(final String label, final Object value) {
		if ((value != null) && !value.toString().isEmpty()) return "\n " + label + ":\n" + value + "\n";
		return "\n";
	}

	private void logConfiguration(final Configuration conf) {
		log.info("job configuration #################");
		for (final Entry<String, String> e : conf) {
			log.info("'" + e.getKey() + "' : '" + e.getValue() + "'");
		}
		log.info("end of job configuration #################\n\n");
	}

}