package eu.dnetlib.data.mapreduce.hbase.index;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map.Entry;
import java.util.zip.GZIPOutputStream;

import com.google.common.collect.Lists;
import eu.dnetlib.data.mapreduce.JobParams;
import eu.dnetlib.data.proto.TypeProtos.Type;
import eu.dnetlib.functionality.index.solr.feed.InputDocumentFactory;
import eu.dnetlib.functionality.index.solr.feed.ResultTransformer;
import eu.dnetlib.functionality.index.solr.feed.ResultTransformer.Mode;
import eu.dnetlib.functionality.index.solr.feed.StreamingInputDocumentFactory;
import eu.dnetlib.miscutils.datetime.HumanTime;
import eu.dnetlib.miscutils.functional.xml.ApplyXslt;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.lang.exception.ExceptionUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.response.SolrPingResponse;
import org.apache.solr.client.solrj.response.UpdateResponse;
import org.apache.solr.common.SolrInputDocument;
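
/**
 * Mapper that feeds index records into a SolrCloud collection. Each input value holds an
 * XML record that is transformed with an XSLT, parsed into a SolrInputDocument and sent to
 * Solr in buffered batches. Records that fail transformation or feeding are counted and
 * written back to the task output as "rotten" records for later inspection.
 */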
public class IndexFeedMapper extends Mapper<Text, Text, Text, Text> {

	private static final Log log = LogFactory.getLog(IndexFeedMapper.class); // NOPMD by marko on 11/24/08 5:02 PM

	public static final String DNET_RESULT = "dnetResult";

	private InputDocumentFactory documentFactory;

	private CloudSolrClient solrServer;

	private String version;

	private String dsId;

	private int shutdownWaitTime = 10000;

	private int bufferFlushThreshold = 100;

	private ApplyXslt dmfToRecord;

	private List<SolrInputDocument> buffer;

	private int backoffTimeMs = 5000;

	private boolean simulation = false;

	private final static int MAX_INIT_RETRIES = 10;

	private final static int MAX_FEED_RETRIES = 10;

	private boolean compress = false;
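
	/**
	 * Reads the feeding parameters from the job configuration and connects to the
	 * SolrCloud cluster, retrying with a fixed backoff up to MAX_INIT_RETRIES times.
	 */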
	@Override
	protected void setup(final Context context) throws IOException, InterruptedException {

		logConfiguration(context.getConfiguration());

		dsId = context.getConfiguration().get(JobParams.INDEX_DSID);
		shutdownWaitTime = Integer.parseInt(context.getConfiguration().get(JobParams.INDEX_SHUTDOWN_WAIT));
		bufferFlushThreshold = Integer.parseInt(context.getConfiguration().get(JobParams.INDEX_BUFFER_FLUSH_TRESHOLD));
		documentFactory = new StreamingInputDocumentFactory();
		version = InputDocumentFactory.getParsedDateField(context.getConfiguration().get(JobParams.INDEX_FEED_TIME));
		buffer = Lists.newArrayList();
		simulation = Boolean.parseBoolean(context.getConfiguration().get(JobParams.INDEX_FEED_SIMULATION_MODE));

		compress = context.getConfiguration().getBoolean(JobParams.INDEX_FEED_COMPRESS_RESULT, false);

		final String xslt = new String(Base64.decodeBase64(context.getConfiguration().get(JobParams.INDEX_XSLT)), StandardCharsets.UTF_8);

		log.info("got xslt: \n" + xslt);
		log.info("got version: " + version);
		log.info("simulation: " + simulation);
		log.info("buffer size: " + bufferFlushThreshold);

		dmfToRecord = new ApplyXslt(xslt);

		final String baseURL = context.getConfiguration().get(JobParams.INDEX_SOLR_URL);
		log.info("solr server baseURL: " + baseURL);

		final String collection = context.getConfiguration().get(JobParams.INDEX_SOLR_COLLECTION);
		log.info("solr server collection: " + collection);

		int count = 0;
		boolean connected = false;
		while (!connected && (count < MAX_INIT_RETRIES)) {
			try {
				count++;
				log.info("initializing solr server...");
				solrServer = new CloudSolrClient.Builder()
						.withZkHost(baseURL)
						.build();

				solrServer.connect();

				solrServer.setParallelUpdates(true);
				solrServer.setDefaultCollection(collection);

				final SolrPingResponse rsp = solrServer.ping();

				if (rsp.getStatus() != 0) throw new SolrServerException("bad init status: " + rsp.getStatus());

				connected = true;
			} catch (final Throwable e) {
				if (solrServer != null) {
					solrServer.close();
				}
				context.getCounter("index init", e.getMessage()).increment(1);
				log.error(String.format("failed to init solr client, waiting %dms, error:\n%s", backoffTimeMs, ExceptionUtils.getStackTrace(e)));

				Thread.sleep(backoffTimeMs);
			}
		}

		if (!connected) throw new IOException("reached max retries trying to connect to solr server: " + MAX_INIT_RETRIES);
	}
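
	/**
	 * Transforms the input XML record via XSLT, parses it into a SolrInputDocument
	 * (optionally gzip+base64 compressing the stored result payload) and feeds it to the
	 * buffer, retrying up to MAX_FEED_RETRIES times; unrecoverable records are counted
	 * and written out as "rotten" records.
	 */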
	@Override
	protected void map(final Text key, final Text value, final Context context) throws IOException, InterruptedException {

		String indexRecord = "";
		SolrInputDocument doc = null;

		try {
			indexRecord = dmfToRecord.evaluate(value.toString());

			if (compress) {
				doc = documentFactory.parseDocument(version, indexRecord, dsId, DNET_RESULT, new ResultTransformer(Mode.base64) {
					@Override
					public String apply(final String s) {
						return org.apache.solr.common.util.Base64.byteArrayToBase64(zip(s));
					}
				});
			} else {
				doc = documentFactory.parseDocument(version, indexRecord, dsId, DNET_RESULT);
			}

			if ((doc == null) || doc.isEmpty()) throw new EmptySolrDocumentException();

		} catch (final Throwable e) {
			context.getCounter("index feed", "skipped records").increment(1);
			handleError(key, value, context, indexRecord, doc, e);
			return;
		}
		int count = 0;
		while (count < MAX_FEED_RETRIES) {
			count++;
			try {
				addDocument(context, doc);
				return;
			} catch (final Throwable e) {
				context.getCounter("index feed", "retries").increment(1);
				handleError(key, value, context, indexRecord, doc, e);
				log.info(String.format("failed to feed documents, waiting %dms", backoffTimeMs));
				Thread.sleep(backoffTimeMs);
			}
		}
	}
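
	/**
	 * Gzip-compresses the UTF-8 bytes of the given string; callers base64-encode the
	 * result (see the compress branch in {@code map}).
	 */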
	public byte[] zip(final String s) {
		if ((s == null) || (s.length() == 0)) {
			throw new IllegalArgumentException("Cannot zip null or empty string");
		}

		try (ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream()) {
			try (GZIPOutputStream gzipOutputStream = new GZIPOutputStream(byteArrayOutputStream)) {
				gzipOutputStream.write(s.getBytes(StandardCharsets.UTF_8));
			}
			return byteArrayOutputStream.toByteArray();
		} catch (final IOException e) {
			throw new RuntimeException("Failed to zip content", e);
		}
	}
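
	/**
	 * Buffers the document and flushes the buffer to Solr once it reaches
	 * bufferFlushThreshold entries.
	 */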
	private void addDocument(final Context context, final SolrInputDocument doc) throws SolrServerException, IOException, EmptySolrDocumentException {
		buffer.add(doc);
		if (buffer.size() >= bufferFlushThreshold) {
			doAdd(buffer, context);
		}
	}
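
	/**
	 * Sends the buffered documents to Solr (unless in simulation mode), updates the feed
	 * counters by update status and entity type, and clears the buffer.
	 */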
	private void doAdd(final List<SolrInputDocument> buffer, final Context context) throws SolrServerException, IOException {
		if (!simulation) {
			final long start = System.currentTimeMillis();
			final UpdateResponse rsp = solrServer.add(buffer);
			final long stop = System.currentTimeMillis() - start;
			log.info("feed time for " + buffer.size() + " records : " + HumanTime.exactly(stop) + "\n");

			final int status = rsp.getStatus();

			context.getCounter("index feed", "status code: " + status).increment(buffer.size());

			if (status != 0) throw new SolrServerException("bad status: " + status);

			for (final SolrInputDocument doc : buffer) {
				context.getCounter("index entity", getEntityType(doc)).increment(1);
			}
		}
		buffer.clear();
	}
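
	/**
	 * Flushes any remaining buffered documents, waits shutdownWaitTime ms and closes the
	 * Solr client.
	 */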
	@Override
	protected void cleanup(final Context context) throws IOException, InterruptedException {
		super.cleanup(context);
		try {
			if (!buffer.isEmpty()) {
				doAdd(buffer, context);
			}
			log.info("\nwaiting " + shutdownWaitTime + "ms before shutdown");
			Thread.sleep(shutdownWaitTime);
			solrServer.close();
		} catch (final SolrServerException e) {
			log.error("couldn't shutdown server: " + e.getMessage());
		}
	}
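
	/**
	 * Counts the error by exception type and writes the offending record to the task
	 * output for later inspection.
	 */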
	private void handleError(final Text key, final Text value, final Context context, final String indexRecord, final SolrInputDocument doc, final Throwable e)
			throws IOException, InterruptedException {
		context.getCounter("index feed", e.getClass().getName()).increment(1);
		context.write(key, printRottenRecord(context.getTaskAttemptID().toString(), value, indexRecord, doc));
		// e.printStackTrace(System.err);
	}
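
	/**
	 * Formats a failed record (original value, transformed index record and Solr
	 * document, when available) for the task output.
	 */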
	private Text printRottenRecord(final String taskid, final Text value, final String indexRecord, final SolrInputDocument doc) {
		return new Text("\n**********************************\n" + "task: " + taskid + "\n"
				+ check("original", value.toString()) + check("indexRecord", indexRecord) + check("solrDoc", doc));
	}

	private String check(final String label, final Object value) {
		if ((value != null) && !value.toString().isEmpty()) return "\n " + label + ":\n" + value + "\n";
		return "\n";
	}
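
	/** Dumps the full job configuration to the task log. */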
	private void logConfiguration(final Configuration conf) {
		log.info("job configuration #################");
		for (final Entry<String, String> e : conf) {
			log.info("'" + e.getKey() + "' : '" + e.getValue() + "'");
		}
		log.info("end of job configuration #################\n\n");
	}
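
	/**
	 * Resolves the entity type of the document from its "oaftype" field; result entities
	 * are further qualified by their "resulttypeid".
	 */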
	private String getEntityType(final SolrInputDocument doc) {
		if (!doc.containsKey("oaftype")) return "unknown";

		final Type type = Type.valueOf(doc.getFieldValue("oaftype").toString());
		switch (type) {
		case result:
			if (!doc.containsKey("resulttypeid")) return "result";
			return doc.getFieldValue("resulttypeid").toString();
		default:
			return type.toString();
		}
	}
}