1
|
package eu.dnetlib.data.mapreduce.hbase.index;
|
2
|
|
3
|
import java.io.ByteArrayOutputStream;
|
4
|
import java.io.IOException;
|
5
|
import java.nio.charset.StandardCharsets;
|
6
|
import java.util.List;
|
7
|
import java.util.Map.Entry;
|
8
|
import java.util.zip.GZIPOutputStream;
|
9
|
|
10
|
import com.google.common.collect.Lists;
|
11
|
import eu.dnetlib.data.mapreduce.JobParams;
|
12
|
import eu.dnetlib.data.proto.TypeProtos.Type;
|
13
|
import eu.dnetlib.functionality.index.solr.feed.InputDocumentFactory;
|
14
|
import eu.dnetlib.functionality.index.solr.feed.StreamingInputDocumentFactory;
|
15
|
import eu.dnetlib.miscutils.datetime.HumanTime;
|
16
|
import eu.dnetlib.miscutils.functional.xml.ApplyXslt;
|
17
|
import org.apache.commons.codec.binary.Base64;
|
18
|
import org.apache.commons.lang.exception.ExceptionUtils;
|
19
|
import org.apache.commons.logging.Log;
|
20
|
import org.apache.commons.logging.LogFactory;
|
21
|
import org.apache.hadoop.conf.Configuration;
|
22
|
import org.apache.hadoop.io.Text;
|
23
|
import org.apache.hadoop.mapreduce.Mapper;
|
24
|
import org.apache.solr.client.solrj.SolrServerException;
|
25
|
import org.apache.solr.client.solrj.impl.CloudSolrServer;
|
26
|
import org.apache.solr.client.solrj.response.SolrPingResponse;
|
27
|
import org.apache.solr.client.solrj.response.UpdateResponse;
|
28
|
import org.apache.solr.common.SolrInputDocument;
|
29
|
|
30
|
public class IndexFeedMapper extends Mapper<Text, Text, Text, Text> {
|
31
|
|
32
|
private static final Log log = LogFactory.getLog(IndexFeedMapper.class); // NOPMD by marko on 11/24/08 5:02 PM
|
33
|
public static final String DNET_RESULT = "dnetResult";
|
34
|
|
35
|
private InputDocumentFactory documentFactory;
|
36
|
|
37
|
private CloudSolrServer solrServer;
|
38
|
|
39
|
private String version;
|
40
|
|
41
|
private String dsId;
|
42
|
|
43
|
private int shutdownWaitTime = 10000;
|
44
|
|
45
|
private int bufferFlushThreshold = 100;
|
46
|
|
47
|
private ApplyXslt dmfToRecord;
|
48
|
|
49
|
private List<SolrInputDocument> buffer;
|
50
|
|
51
|
private int backoffTimeMs = 5000;
|
52
|
|
53
|
private boolean simulation = false;
|
54
|
|
55
|
private final static int MAX_INIT_RETRIES = 10;
|
56
|
|
57
|
private final static int MAX_FEED_RETRIES = 10;
|
58
|
|
59
|
private boolean compress = false;
|
60
|
|
61
|
@Override
|
62
|
protected void setup(final Context context) throws IOException, InterruptedException {
|
63
|
|
64
|
logConfiguration(context.getConfiguration());
|
65
|
|
66
|
dsId = context.getConfiguration().get(JobParams.INDEX_DSID);
|
67
|
shutdownWaitTime = Integer.parseInt(context.getConfiguration().get(JobParams.INDEX_SHUTDOWN_WAIT));
|
68
|
bufferFlushThreshold = Integer.parseInt(context.getConfiguration().get(JobParams.INDEX_BUFFER_FLUSH_TRESHOLD));
|
69
|
documentFactory = new StreamingInputDocumentFactory();
|
70
|
version = InputDocumentFactory.getParsedDateField(context.getConfiguration().get(JobParams.INDEX_FEED_TIME));
|
71
|
buffer = Lists.newArrayList();
|
72
|
simulation = Boolean.parseBoolean(context.getConfiguration().get(JobParams.INDEX_FEED_SIMULATION_MODE));
|
73
|
|
74
|
compress = context.getConfiguration().getBoolean(JobParams.INDEX_FEED_COMPRESS_RESULT, false);
|
75
|
|
76
|
final String xslt = new String(Base64.decodeBase64(context.getConfiguration().get(JobParams.INDEX_XSLT)));
|
77
|
|
78
|
log.info("got xslt: \n" + xslt);
|
79
|
log.info("got version: " + version);
|
80
|
log.info("simulation: " + simulation);
|
81
|
log.info("buffer size: " + bufferFlushThreshold);
|
82
|
|
83
|
dmfToRecord = new ApplyXslt(xslt);
|
84
|
|
85
|
final String baseURL = context.getConfiguration().get(JobParams.INDEX_SOLR_URL);
|
86
|
log.info("solr server baseURL: " + baseURL);
|
87
|
|
88
|
final String collection = context.getConfiguration().get(JobParams.INDEX_SOLR_COLLECTION);
|
89
|
log.info("solr server collection: " + collection);
|
90
|
|
91
|
int count = 0;
|
92
|
while (count <= MAX_INIT_RETRIES) {
|
93
|
try {
|
94
|
count++;
|
95
|
log.info("initializing solr server...");
|
96
|
solrServer = new CloudSolrServer(baseURL);
|
97
|
|
98
|
solrServer.connect();
|
99
|
|
100
|
solrServer.setParallelUpdates(true);
|
101
|
solrServer.setDefaultCollection(collection);
|
102
|
|
103
|
final SolrPingResponse rsp = solrServer.ping();
|
104
|
|
105
|
if (rsp.getStatus() != 0) throw new SolrServerException("bad init status: " + rsp.getStatus());
|
106
|
else {
|
107
|
break;
|
108
|
}
|
109
|
|
110
|
} catch (final Throwable e) {
|
111
|
if (solrServer != null) {
|
112
|
solrServer.shutdown();
|
113
|
}
|
114
|
context.getCounter("index init", e.getMessage()).increment(1);
|
115
|
log.error(String.format("failed to init solr client wait %dms, error:\n%s", backoffTimeMs, ExceptionUtils.getStackTrace(e)));
|
116
|
|
117
|
Thread.sleep(backoffTimeMs);
|
118
|
}
|
119
|
}
|
120
|
|
121
|
if (count >= MAX_INIT_RETRIES) throw new IOException("reached max retries trying to connect to solr server: " + MAX_INIT_RETRIES);
|
122
|
}
|
123
|
|
124
|
@Override
|
125
|
protected void map(final Text key, final Text value, final Context context) throws IOException, InterruptedException {
|
126
|
|
127
|
String indexRecord = "";
|
128
|
SolrInputDocument doc = null;
|
129
|
|
130
|
try {
|
131
|
indexRecord = dmfToRecord.evaluate(value.toString());
|
132
|
|
133
|
doc = documentFactory.parseDocument(version, indexRecord, dsId, DNET_RESULT);
|
134
|
|
135
|
if ((doc == null) || doc.isEmpty()) throw new EmptySolrDocumentException();
|
136
|
|
137
|
} catch (final Throwable e) {
|
138
|
context.getCounter("index feed", "skipped records").increment(1);
|
139
|
handleError(key, value, context, indexRecord, doc, e);
|
140
|
return;
|
141
|
}
|
142
|
int count = 0;
|
143
|
while (count <= MAX_FEED_RETRIES) {
|
144
|
count++;
|
145
|
try {
|
146
|
addDocument(context, doc);
|
147
|
return;
|
148
|
} catch (final Throwable e) {
|
149
|
context.getCounter("index feed", "retries").increment(1);
|
150
|
handleError(key, value, context, indexRecord, doc, e);
|
151
|
log.info(String.format("failed to feed documents, waiting %dms", backoffTimeMs));
|
152
|
Thread.sleep(backoffTimeMs);
|
153
|
}
|
154
|
}
|
155
|
}
|
156
|
|
157
|
public byte[] zip(final String s) {
|
158
|
if ((s == null) || (s.length() == 0)) {
|
159
|
throw new IllegalArgumentException("Cannot zip null or empty string");
|
160
|
}
|
161
|
|
162
|
try (ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream()) {
|
163
|
try (GZIPOutputStream gzipOutputStream = new GZIPOutputStream(byteArrayOutputStream)) {
|
164
|
gzipOutputStream.write(s.getBytes(StandardCharsets.UTF_8));
|
165
|
}
|
166
|
return byteArrayOutputStream.toByteArray();
|
167
|
} catch(IOException e) {
|
168
|
throw new RuntimeException("Failed to zip content", e);
|
169
|
}
|
170
|
}
|
171
|
|
172
|
private void addDocument(final Context context, final SolrInputDocument doc) throws SolrServerException, IOException, EmptySolrDocumentException {
|
173
|
buffer.add(doc);
|
174
|
if (buffer.size() >= bufferFlushThreshold) {
|
175
|
doAdd(buffer, context);
|
176
|
}
|
177
|
}
|
178
|
|
179
|
private void doAdd(final List<SolrInputDocument> buffer, final Context context) throws SolrServerException, IOException {
|
180
|
if (!simulation) {
|
181
|
final long start = System.currentTimeMillis();
|
182
|
final UpdateResponse rsp = solrServer.add(buffer);
|
183
|
final long stop = System.currentTimeMillis() - start;
|
184
|
log.info("feed time for " + buffer.size() + " records : " + HumanTime.exactly(stop) + "\n");
|
185
|
|
186
|
final int status = rsp.getStatus();
|
187
|
|
188
|
context.getCounter("index feed", "status code: " + status).increment(buffer.size());
|
189
|
|
190
|
if (status != 0) throw new SolrServerException("bad status: " + status);
|
191
|
|
192
|
for (final SolrInputDocument doc : buffer) {
|
193
|
context.getCounter("index entity", getEntityType(doc)).increment(1);
|
194
|
}
|
195
|
}
|
196
|
buffer.clear();
|
197
|
}
|
198
|
|
199
|
@Override
|
200
|
protected void cleanup(final Context context) throws IOException, InterruptedException {
|
201
|
super.cleanup(context);
|
202
|
try {
|
203
|
if (!buffer.isEmpty()) {
|
204
|
doAdd(buffer, context);
|
205
|
}
|
206
|
log.info("\nwaiting " + shutdownWaitTime + "ms before shutdown");
|
207
|
Thread.sleep(shutdownWaitTime);
|
208
|
solrServer.shutdown();
|
209
|
} catch (final SolrServerException e) {
|
210
|
log.error("couldn't shutdown server " + e.getMessage());
|
211
|
}
|
212
|
}
|
213
|
|
214
|
private void handleError(final Text key, final Text value, final Context context, final String indexRecord, final SolrInputDocument doc, final Throwable e)
|
215
|
throws IOException, InterruptedException {
|
216
|
context.getCounter("index feed", e.getClass().getName()).increment(1);
|
217
|
context.write(key, printRottenRecord(context.getTaskAttemptID().toString(), value, indexRecord, doc));
|
218
|
// e.printStackTrace(System.err);
|
219
|
}
|
220
|
|
221
|
private Text printRottenRecord(final String taskid, final Text value, final String indexRecord, final SolrInputDocument doc) {
|
222
|
return new Text("\n**********************************\n" + "task: " + taskid + "\n"
|
223
|
+ check("original", value.toString() + check("indexRecord", indexRecord) + check("solrDoc", doc)));
|
224
|
}
|
225
|
|
226
|
private String check(final String label, final Object value) {
|
227
|
if ((value != null) && !value.toString().isEmpty()) return "\n " + label + ":\n" + value + "\n";
|
228
|
return "\n";
|
229
|
}
|
230
|
|
231
|
private void logConfiguration(final Configuration conf) {
|
232
|
log.info("job configutation #################");
|
233
|
for (final Entry<String, String> e : conf) {
|
234
|
log.info("'" + e.getKey() + "' : '" + e.getValue() + "'");
|
235
|
}
|
236
|
log.info("end of job configutation #################\n\n");
|
237
|
}
|
238
|
|
239
|
private String getEntityType(final SolrInputDocument doc) {
|
240
|
if (!doc.containsKey("oaftype")) return "unknown";
|
241
|
|
242
|
final Type type = Type.valueOf(doc.getFieldValue("oaftype").toString());
|
243
|
switch (type) {
|
244
|
case result:
|
245
|
if (!doc.containsKey("resulttypeid")) return "result";
|
246
|
return doc.getFieldValue("resulttypeid").toString();
|
247
|
default:
|
248
|
return type.toString();
|
249
|
}
|
250
|
}
|
251
|
|
252
|
}
|