Project

General

Profile

1
package eu.dnetlib.data.mapreduce.hbase.index;
2

    
3
import java.io.IOException;
4
import java.util.List;
5
import java.util.Map.Entry;
6

    
7
import org.apache.commons.codec.binary.Base64;
8
import org.apache.commons.lang.exception.ExceptionUtils;
9
import org.apache.commons.logging.Log;
10
import org.apache.commons.logging.LogFactory;
11
import org.apache.hadoop.conf.Configuration;
12
import org.apache.hadoop.io.Text;
13
import org.apache.hadoop.mapreduce.Mapper;
14
import org.apache.solr.client.solrj.SolrServerException;
15
import org.apache.solr.client.solrj.impl.CloudSolrServer;
16
import org.apache.solr.client.solrj.response.SolrPingResponse;
17
import org.apache.solr.client.solrj.response.UpdateResponse;
18
import org.apache.solr.common.SolrInputDocument;
19

    
20
import com.google.common.collect.Lists;
21

    
22
import eu.dnetlib.data.mapreduce.JobParams;
23
import eu.dnetlib.data.proto.TypeProtos.Type;
24
import eu.dnetlib.functionality.index.solr.feed.InputDocumentFactory;
25
import eu.dnetlib.functionality.index.solr.feed.StreamingInputDocumentFactory;
26
import eu.dnetlib.miscutils.datetime.HumanTime;
27
import eu.dnetlib.miscutils.functional.xml.ApplyXslt;
28

    
29
public class IndexFeedMapper extends Mapper<Text, Text, Text, Text> {
30

    
31
	private static final Log log = LogFactory.getLog(IndexFeedMapper.class); // NOPMD by marko on 11/24/08 5:02 PM
32

    
33
	private InputDocumentFactory documentFactory;
34

    
35
	private CloudSolrServer solrServer;
36

    
37
	private String version;
38

    
39
	private String dsId;
40

    
41
	private int shutdownWaitTime = 10000;
42

    
43
	private int bufferFlushThreshold = 100;
44

    
45
	private ApplyXslt dmfToRecord;
46

    
47
	private List<SolrInputDocument> buffer;
48

    
49
	private int backoffTimeMs = 5000;
50

    
51
	private boolean simulation = false;
52

    
53
	private final static int MAX_INIT_RETRIES = 10;
54

    
55
	private final static int MAX_FEED_RETRIES = 10;
56

    
57
	@Override
58
	protected void setup(final Context context) throws IOException, InterruptedException {
59

    
60
		logConfiguration(context.getConfiguration());
61

    
62
		dsId = context.getConfiguration().get(JobParams.INDEX_DSID);
63
		shutdownWaitTime = Integer.parseInt(context.getConfiguration().get(JobParams.INDEX_SHUTDOWN_WAIT));
64
		bufferFlushThreshold = Integer.parseInt(context.getConfiguration().get(JobParams.INDEX_BUFFER_FLUSH_TRESHOLD));
65
		documentFactory = new StreamingInputDocumentFactory();
66
		version = InputDocumentFactory.getParsedDateField(context.getConfiguration().get(JobParams.INDEX_FEED_TIME));
67
		buffer = Lists.newArrayList();
68
		simulation = Boolean.parseBoolean(context.getConfiguration().get(JobParams.INDEX_FEED_SIMULATION_MODE));
69

    
70
		final String xslt = new String(Base64.decodeBase64(context.getConfiguration().get(JobParams.INDEX_XSLT)));
71

    
72
		log.info("got xslt: \n" + xslt);
73
		log.info("got version: " + version);
74
		log.info("simulation: " + simulation);
75
		log.info("buffer size: " + bufferFlushThreshold);
76

    
77
		dmfToRecord = new ApplyXslt(xslt);
78

    
79
		final String baseURL = context.getConfiguration().get(JobParams.INDEX_SOLR_URL);
80
		log.info("solr server baseURL: " + baseURL);
81

    
82
		final String collection = context.getConfiguration().get(JobParams.INDEX_SOLR_COLLECTION);
83
		log.info("solr server collection: " + collection);
84

    
85
		int count = 0;
86
		while (count <= MAX_INIT_RETRIES) {
87
			try {
88
				count++;
89
				log.info("initializing solr server...");
90
				solrServer = new CloudSolrServer(baseURL);
91

    
92
				solrServer.connect();
93

    
94
				solrServer.setParallelUpdates(true);
95
				solrServer.setDefaultCollection(collection);
96

    
97
				final SolrPingResponse rsp = solrServer.ping();
98

    
99
				if (rsp.getStatus() != 0) throw new SolrServerException("bad init status: " + rsp.getStatus());
100
				else {
101
					break;
102
				}
103

    
104
			} catch (final Throwable e) {
105
				if (solrServer != null) {
106
					solrServer.shutdown();
107
				}
108
				context.getCounter("index init", e.getMessage()).increment(1);
109
				log.error(String.format("failed to init solr client wait %dms, error:\n%s", backoffTimeMs, ExceptionUtils.getStackTrace(e)));
110

    
111
				Thread.sleep(backoffTimeMs);
112
			}
113
		}
114

    
115
		if (count >= MAX_INIT_RETRIES) throw new IOException("reached max retries trying to connect to solr server: " + MAX_INIT_RETRIES);
116
	}
117

    
118
	@Override
119
	protected void map(final Text key, final Text value, final Context context) throws IOException, InterruptedException {
120

    
121
		String indexRecord = "";
122
		SolrInputDocument doc = null;
123

    
124
		try {
125
			indexRecord = dmfToRecord.evaluate(value.toString());
126
			doc = documentFactory.parseDocument(version, indexRecord, dsId, "dnetResult");
127
			if ((doc == null) || doc.isEmpty()) throw new EmptySolrDocumentException();
128

    
129
		} catch (final Throwable e) {
130
			context.getCounter("index feed", "skipped records").increment(1);
131
			handleError(key, value, context, indexRecord, doc, e);
132
			return;
133
		}
134
		int count = 0;
135
		while (count <= MAX_FEED_RETRIES) {
136
			count++;
137
			try {
138
				addDocument(context, doc);
139
				return;
140
			} catch (final Throwable e) {
141
				context.getCounter("index feed", "retries").increment(1);
142
				handleError(key, value, context, indexRecord, doc, e);
143
				log.info(String.format("failed to feed documents, waiting %dms", backoffTimeMs));
144
				Thread.sleep(backoffTimeMs);
145
			}
146
		}
147
	}
148

    
149
	private void addDocument(final Context context, final SolrInputDocument doc) throws SolrServerException, IOException, EmptySolrDocumentException {
150
		buffer.add(doc);
151
		if (buffer.size() >= bufferFlushThreshold) {
152
			doAdd(buffer, context);
153
		}
154
	}
155

    
156
	private void doAdd(final List<SolrInputDocument> buffer, final Context context) throws SolrServerException, IOException {
157
		if (!simulation) {
158
			final long start = System.currentTimeMillis();
159
			final UpdateResponse rsp = solrServer.add(buffer);
160
			final long stop = System.currentTimeMillis() - start;
161
			log.info("feed time for " + buffer.size() + " records : " + HumanTime.exactly(stop) + "\n");
162

    
163
			final int status = rsp.getStatus();
164

    
165
			context.getCounter("index feed", "status code: " + status).increment(buffer.size());
166

    
167
			if (status != 0) throw new SolrServerException("bad status: " + status);
168

    
169
			for (final SolrInputDocument doc : buffer) {
170
				context.getCounter("index entity", getEntityType(doc)).increment(1);
171
			}
172
		}
173
		buffer.clear();
174
	}
175

    
176
	@Override
177
	protected void cleanup(final Context context) throws IOException, InterruptedException {
178
		super.cleanup(context);
179
		try {
180
			if (!buffer.isEmpty()) {
181
				doAdd(buffer, context);
182
			}
183
			log.info("\nwaiting " + shutdownWaitTime + "ms before shutdown");
184
			Thread.sleep(shutdownWaitTime);
185
			solrServer.shutdown();
186
		} catch (final SolrServerException e) {
187
			log.error("couldn't shutdown server " + e.getMessage());
188
		}
189
	}
190

    
191
	private void handleError(final Text key, final Text value, final Context context, final String indexRecord, final SolrInputDocument doc, final Throwable e)
192
			throws IOException, InterruptedException {
193
		context.getCounter("index feed", e.getClass().getName()).increment(1);
194
		context.write(key, printRottenRecord(context.getTaskAttemptID().toString(), value, indexRecord, doc));
195
		// e.printStackTrace(System.err);
196
	}
197

    
198
	private Text printRottenRecord(final String taskid, final Text value, final String indexRecord, final SolrInputDocument doc) {
199
		return new Text("\n**********************************\n" + "task: " + taskid + "\n"
200
				+ check("original", value.toString() + check("indexRecord", indexRecord) + check("solrDoc", doc)));
201
	}
202

    
203
	private String check(final String label, final Object value) {
204
		if ((value != null) && !value.toString().isEmpty()) return "\n " + label + ":\n" + value + "\n";
205
		return "\n";
206
	}
207

    
208
	private void logConfiguration(final Configuration conf) {
209
		log.info("job configutation #################");
210
		for (final Entry<String, String> e : conf) {
211
			log.info("'" + e.getKey() + "' : '" + e.getValue() + "'");
212
		}
213
		log.info("end of job configutation #################\n\n");
214
	}
215

    
216
	private String getEntityType(final SolrInputDocument doc) {
217
		if (!doc.containsKey("oaftype")) return "unknown";
218

    
219
		final Type type = Type.valueOf(doc.getFieldValue("oaftype").toString());
220
		switch (type) {
221
		case result:
222
			if (!doc.containsKey("resulttypeid")) return "result";
223
			return doc.getFieldValue("resulttypeid").toString();
224
		default:
225
			return type.toString();
226
		}
227
	}
228

    
229
}
(5-5/7)