1 |
36164
|
claudio.at
|
package eu.dnetlib.data.mapreduce.hbase.index;
|
2 |
|
|
|
3 |
|
|
import java.io.IOException;
|
4 |
|
|
import java.util.List;
|
5 |
|
|
import java.util.Map;
|
6 |
|
|
import java.util.Map.Entry;
|
7 |
|
|
|
8 |
49029
|
claudio.at
|
import com.google.common.collect.Lists;
|
9 |
|
|
import com.googlecode.protobuf.format.JsonFormat;
|
10 |
|
|
import eu.dnetlib.data.mapreduce.JobParams;
|
11 |
|
|
import eu.dnetlib.data.mapreduce.util.DedupUtils;
|
12 |
|
|
import eu.dnetlib.data.proto.OafProtos.Oaf;
|
13 |
|
|
import eu.dnetlib.data.transform.SolrProtoMapper;
|
14 |
|
|
import eu.dnetlib.functionality.index.solr.feed.InputDocumentFactory;
|
15 |
|
|
import eu.dnetlib.miscutils.datetime.HumanTime;
|
16 |
36164
|
claudio.at
|
import org.apache.commons.collections.MapUtils;
|
17 |
|
|
import org.apache.commons.logging.Log;
|
18 |
|
|
import org.apache.commons.logging.LogFactory;
|
19 |
|
|
import org.apache.hadoop.conf.Configuration;
|
20 |
|
|
import org.apache.hadoop.hbase.client.Result;
|
21 |
|
|
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
|
22 |
|
|
import org.apache.hadoop.hbase.mapreduce.TableMapper;
|
23 |
|
|
import org.apache.hadoop.hbase.util.Bytes;
|
24 |
|
|
import org.apache.hadoop.io.Text;
|
25 |
|
|
import org.apache.solr.client.solrj.SolrServerException;
|
26 |
49565
|
claudio.at
|
import org.apache.solr.client.solrj.impl.CloudSolrServer;
|
27 |
36164
|
claudio.at
|
import org.apache.solr.client.solrj.response.SolrPingResponse;
|
28 |
|
|
import org.apache.solr.client.solrj.response.UpdateResponse;
|
29 |
|
|
import org.apache.solr.common.SolrInputDocument;
|
30 |
|
|
import org.dom4j.DocumentException;
|
31 |
|
|
|
32 |
|
|
public class DedupIndexFeedMapper extends TableMapper<Text, Text> {
|
33 |
|
|
|
34 |
|
|
private static final Log log = LogFactory.getLog(DedupIndexFeedMapper.class); // NOPMD by marko on 11/24/08 5:02 PM
|
35 |
|
|
|
36 |
49565
|
claudio.at
|
private CloudSolrServer solrServer;
|
37 |
36164
|
claudio.at
|
|
38 |
|
|
private String dsId;
|
39 |
|
|
|
40 |
|
|
private String version;
|
41 |
|
|
|
42 |
|
|
private int shutdownWaitTime = 10000;
|
43 |
|
|
|
44 |
|
|
private int bufferFlushThreshold = 100;
|
45 |
|
|
|
46 |
|
|
private List<SolrInputDocument> buffer;
|
47 |
|
|
|
48 |
|
|
private int backoffTimeMs = 5000;
|
49 |
|
|
|
50 |
|
|
private boolean simulation = false;
|
51 |
|
|
|
52 |
|
|
private String entityType = null;
|
53 |
|
|
|
54 |
36670
|
claudio.at
|
private String actionset = null;
|
55 |
|
|
|
56 |
37517
|
claudio.at
|
private SolrProtoMapper mapper = null;
|
57 |
36164
|
claudio.at
|
|
58 |
36670
|
claudio.at
|
private final static int MAX_RETRIES = 10;
|
59 |
|
|
|
60 |
36164
|
claudio.at
|
@Override
|
61 |
|
|
protected void setup(final Context context) throws IOException, InterruptedException {
|
62 |
|
|
|
63 |
|
|
logConfiguration(context.getConfiguration());
|
64 |
|
|
|
65 |
|
|
shutdownWaitTime = Integer.parseInt(context.getConfiguration().get(JobParams.INDEX_SHUTDOWN_WAIT));
|
66 |
|
|
bufferFlushThreshold = Integer.parseInt(context.getConfiguration().get(JobParams.INDEX_BUFFER_FLUSH_TRESHOLD));
|
67 |
|
|
dsId = context.getConfiguration().get(JobParams.INDEX_DSID);
|
68 |
|
|
version = InputDocumentFactory.getParsedDateField(context.getConfiguration().get(JobParams.INDEX_FEED_TIME));
|
69 |
|
|
buffer = Lists.newArrayList();
|
70 |
|
|
simulation = Boolean.parseBoolean(context.getConfiguration().get(JobParams.INDEX_FEED_SIMULATION_MODE));
|
71 |
|
|
entityType = context.getConfiguration().get("entityType");
|
72 |
36670
|
claudio.at
|
actionset = context.getConfiguration().get("actionset");
|
73 |
36164
|
claudio.at
|
|
74 |
|
|
final String fields = context.getConfiguration().get("index.fields");
|
75 |
|
|
|
76 |
|
|
log.info("got fields: \n" + fields);
|
77 |
|
|
log.info("got dsId: " + dsId);
|
78 |
|
|
log.info("got version: " + version);
|
79 |
|
|
log.info("simulation: " + simulation);
|
80 |
|
|
log.info("entityType: " + entityType);
|
81 |
36670
|
claudio.at
|
log.info("actionset: " + actionset);
|
82 |
36164
|
claudio.at
|
log.info("buffer size: " + bufferFlushThreshold);
|
83 |
|
|
|
84 |
|
|
try {
|
85 |
37517
|
claudio.at
|
mapper = new SolrProtoMapper(fields);
|
86 |
36164
|
claudio.at
|
} catch (final DocumentException e) {
|
87 |
|
|
log.error("unable to parse fields: " + fields);
|
88 |
|
|
throw new IllegalArgumentException(e);
|
89 |
|
|
}
|
90 |
|
|
|
91 |
|
|
final String baseURL = context.getConfiguration().get(JobParams.INDEX_SOLR_URL);
|
92 |
|
|
log.info("solr server baseURL: " + baseURL);
|
93 |
|
|
|
94 |
|
|
final String collection = context.getConfiguration().get(JobParams.INDEX_SOLR_COLLECTION);
|
95 |
|
|
log.info("solr server collection: " + collection);
|
96 |
|
|
|
97 |
|
|
while (true) {
|
98 |
|
|
try {
|
99 |
|
|
log.info("initializing solr server...");
|
100 |
49565
|
claudio.at
|
solrServer = new CloudSolrServer(baseURL);
|
101 |
36164
|
claudio.at
|
|
102 |
|
|
solrServer.connect();
|
103 |
|
|
|
104 |
|
|
solrServer.setParallelUpdates(true);
|
105 |
|
|
solrServer.setDefaultCollection(collection);
|
106 |
|
|
|
107 |
|
|
final SolrPingResponse rsp = solrServer.ping();
|
108 |
|
|
|
109 |
|
|
if (rsp.getStatus() != 0) throw new SolrServerException("bad init status: " + rsp.getStatus());
|
110 |
|
|
else {
|
111 |
|
|
break;
|
112 |
|
|
}
|
113 |
|
|
|
114 |
|
|
} catch (final Throwable e) {
|
115 |
|
|
if (solrServer != null) {
|
116 |
49565
|
claudio.at
|
solrServer.shutdown();
|
117 |
36164
|
claudio.at
|
}
|
118 |
|
|
context.getCounter("index init", e.getMessage()).increment(1);
|
119 |
|
|
log.info(String.format("failed to init solr client wait %dms", backoffTimeMs));
|
120 |
|
|
Thread.sleep(backoffTimeMs);
|
121 |
|
|
}
|
122 |
|
|
}
|
123 |
|
|
}
|
124 |
|
|
|
125 |
|
|
@Override
|
126 |
|
|
protected void map(final ImmutableBytesWritable key, final Result value, final Context context) throws IOException, InterruptedException {
|
127 |
|
|
|
128 |
|
|
SolrInputDocument doc = null;
|
129 |
|
|
|
130 |
|
|
final Map<byte[], byte[]> bMap = value.getFamilyMap(Bytes.toBytes(entityType));
|
131 |
|
|
|
132 |
|
|
if (MapUtils.isEmpty(bMap) || !bMap.containsKey(DedupUtils.BODY_B)) {
|
133 |
|
|
context.getCounter(entityType, "missing body");
|
134 |
|
|
return;
|
135 |
|
|
}
|
136 |
|
|
|
137 |
|
|
final Oaf oaf = Oaf.parseFrom(bMap.get(DedupUtils.BODY_B));
|
138 |
|
|
|
139 |
|
|
try {
|
140 |
36670
|
claudio.at
|
doc = getDocument(oaf);
|
141 |
36164
|
claudio.at
|
} catch (final Throwable e) {
|
142 |
42590
|
claudio.at
|
handleError(key, new JsonFormat().printToString(oaf), context, doc, e);
|
143 |
36670
|
claudio.at
|
return;
|
144 |
36164
|
claudio.at
|
}
|
145 |
|
|
|
146 |
36670
|
claudio.at
|
int retries = 0;
|
147 |
|
|
while (retries < MAX_RETRIES) {
|
148 |
36164
|
claudio.at
|
try {
|
149 |
|
|
addDocument(context, doc);
|
150 |
|
|
return;
|
151 |
|
|
} catch (final Throwable e) {
|
152 |
36670
|
claudio.at
|
retries++;
|
153 |
36164
|
claudio.at
|
context.getCounter("index feed", "retries").increment(1);
|
154 |
42590
|
claudio.at
|
handleError(key, new JsonFormat().printToString(oaf), context, doc, e);
|
155 |
36164
|
claudio.at
|
log.info(String.format("failed to feed documents, waiting %dms", backoffTimeMs));
|
156 |
|
|
Thread.sleep(backoffTimeMs);
|
157 |
|
|
}
|
158 |
|
|
}
|
159 |
36670
|
claudio.at
|
if (retries >= MAX_RETRIES)
|
160 |
|
|
throw new IOException("too many retries: " + retries);
|
161 |
36164
|
claudio.at
|
}
|
162 |
|
|
|
163 |
36670
|
claudio.at
|
private SolrInputDocument getDocument(final Oaf oaf) throws DocumentException {
|
164 |
37135
|
claudio.at
|
final SolrInputDocument document = mapper.map(oaf, version, dsId, actionset);
|
165 |
36670
|
claudio.at
|
document.addField("actionset", actionset);
|
166 |
|
|
return document;
|
167 |
|
|
}
|
168 |
|
|
|
169 |
36164
|
claudio.at
|
private void addDocument(final Context context, final SolrInputDocument doc) throws SolrServerException, IOException {
|
170 |
|
|
if (!doc.isEmpty()) {
|
171 |
|
|
|
172 |
|
|
buffer.add(doc);
|
173 |
|
|
if (buffer.size() >= bufferFlushThreshold) {
|
174 |
|
|
doAdd(buffer, context);
|
175 |
|
|
// Thread.sleep(100);
|
176 |
|
|
}
|
177 |
|
|
} else {
|
178 |
|
|
context.getCounter("index feed", "skipped records").increment(1);
|
179 |
|
|
}
|
180 |
|
|
}
|
181 |
|
|
|
182 |
|
|
private void doAdd(final List<SolrInputDocument> buffer, final Context context) throws SolrServerException, IOException {
|
183 |
|
|
if (!simulation) {
|
184 |
|
|
final long start = System.currentTimeMillis();
|
185 |
|
|
final UpdateResponse rsp = solrServer.add(buffer);
|
186 |
|
|
final long stop = System.currentTimeMillis() - start;
|
187 |
|
|
log.info("feed time for " + buffer.size() + " records : " + HumanTime.exactly(stop) + "\n");
|
188 |
|
|
|
189 |
|
|
final int status = rsp.getStatus();
|
190 |
|
|
context.getCounter("index feed", "status code: " + status).increment(buffer.size());
|
191 |
|
|
|
192 |
|
|
if (status != 0) throw new SolrServerException("bad status: " + status);
|
193 |
|
|
}
|
194 |
|
|
buffer.clear();
|
195 |
|
|
}
|
196 |
|
|
|
197 |
|
|
@Override
|
198 |
|
|
protected void cleanup(final Context context) throws IOException, InterruptedException {
|
199 |
|
|
super.cleanup(context);
|
200 |
|
|
try {
|
201 |
|
|
if (!buffer.isEmpty()) {
|
202 |
|
|
doAdd(buffer, context);
|
203 |
|
|
}
|
204 |
|
|
log.info("\nwaiting " + shutdownWaitTime + "ms before shutdown");
|
205 |
|
|
Thread.sleep(shutdownWaitTime);
|
206 |
49565
|
claudio.at
|
solrServer.shutdown();
|
207 |
36164
|
claudio.at
|
} catch (final SolrServerException e) {
|
208 |
|
|
System.err.println("couldn't shutdown server " + e.getMessage());
|
209 |
|
|
}
|
210 |
|
|
}
|
211 |
|
|
|
212 |
|
|
private void handleError(final ImmutableBytesWritable key, final String value, final Context context, final SolrInputDocument doc, final Throwable e)
|
213 |
|
|
throws IOException, InterruptedException {
|
214 |
|
|
context.getCounter("index feed", e.getClass().getName()).increment(1);
|
215 |
|
|
context.write(new Text(key.copyBytes()), printRottenRecord(context.getTaskAttemptID().toString(), value, doc));
|
216 |
36670
|
claudio.at
|
// e.printStackTrace(System.err);
|
217 |
36164
|
claudio.at
|
}
|
218 |
|
|
|
219 |
|
|
private Text printRottenRecord(final String taskid, final String value, final SolrInputDocument doc) {
|
220 |
|
|
return new Text("\n**********************************\n" + "task: " + taskid + "\n"
|
221 |
|
|
+ check("original", value.toString() + check("solrDoc", doc)));
|
222 |
|
|
}
|
223 |
|
|
|
224 |
|
|
private String check(final String label, final Object value) {
|
225 |
|
|
if ((value != null) && !value.toString().isEmpty()) return "\n " + label + ":\n" + value + "\n";
|
226 |
|
|
return "\n";
|
227 |
|
|
}
|
228 |
|
|
|
229 |
|
|
private void logConfiguration(final Configuration conf) {
|
230 |
|
|
log.info("job configutation #################");
|
231 |
|
|
for (final Entry<String, String> e : conf) {
|
232 |
|
|
log.info("'" + e.getKey() + "' : '" + e.getValue() + "'");
|
233 |
|
|
}
|
234 |
|
|
log.info("end of job configutation #################\n\n");
|
235 |
|
|
}
|
236 |
|
|
|
237 |
|
|
}
|