/modules/icm-iis-metadataextraction/trunk/src/main/java/eu/dnetlib/iis/metadataextraction/AbstractMetadataExtractorMapper.java - D-Net - D-Net project tracking tool

dnet40/modules/icm-iis-metadataextraction/trunk/src/main/java/eu/dnetlib/iis/metadataextraction/AbstractMetadataExtractorMapper.java @ 31757

       package eu.dnetlib.iis.metadataextraction;
       import java.io.IOException;
       import java.io.InputStream;
       import java.util.Arrays;
       import java.util.HashSet;
       import java.util.Set;
       import org.apache.avro.mapred.AvroKey;
       import org.apache.commons.lang.StringUtils;
       import org.apache.hadoop.io.NullWritable;
       import org.apache.hadoop.mapreduce.Mapper;
       import org.apache.log4j.Logger;
       import org.jdom.Document;
       import org.jdom.Element;
       import org.jdom.JDOMException;
       import org.jdom.output.Format;
       import org.jdom.output.XMLOutputter;
       import pl.edu.icm.cermine.ContentExtractor;
       import pl.edu.icm.cermine.exception.AnalysisException;
       import pl.edu.icm.cermine.exception.TransformationException;
       import com.itextpdf.text.exceptions.InvalidPdfException;
       import eu.dnetlib.iis.common.WorkflowRuntimeParameters;
       import eu.dnetlib.iis.core.javamapreduce.MultipleOutputs;
       import eu.dnetlib.iis.metadataextraction.schemas.DocumentText;
       import eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata;
       /**
        * Abstract class containing shared code of metadata extraction.
        * @author mhorst
+       *
        */
       public abstract class AbstractMetadataExtractorMapper<T> extends Mapper<AvroKey<T>, NullWritable, NullWritable, NullWritable> {
       	protected final Logger log = Logger.getLogger(AbstractMetadataExtractorMapper.class);
       	/**
       	 * Flag indicating {@link AnalysisException} should cause interruption.
       	 */
       	protected boolean analysisExceptionAsCritical = false;
       	/**
       	 * Flag indicating any other {@link Exception} should cause interruption.
       	 */
       	protected boolean otherExceptionAsCritical = false;
       	/**
       	 * Multiple outputs.
       	 */
       	protected MultipleOutputs mos = null;
       	/**
       	 * Document metadata named output.
       	 */
       	protected String namedOutputMeta;
       	/**
       	 * Document plaintext named output.
       	 */
       	protected String namedOutputPlaintext;
       	/**
       	 * Progress log interval.
       	 */
       	protected int progresLogInterval = 100;
       	/**
       	 * Current progress.
       	 */
       	protected int currentProgress = 0;
       	/**
       	 * Interval time.
       	 */
       	protected long intervalTime = 0;
       	/**
       	 * Maximum content size in MegaBytes.
       	 */
       	protected long maxFileSize = Long.MAX_VALUE;
       	/**
       	 * Set of object identifiers objects excluded from processing.
       	 */
       	protected Set<String> excludedIds;
       	/* (non-Javadoc)
       	 * @see org.apache.hadoop.mapreduce.Mapper#setup(org.apache.hadoop.mapreduce.Mapper.Context)
       	 */
       	@Override
       	protected void setup(Context context) throws IOException,
       			InterruptedException {
       		namedOutputMeta = context.getConfiguration().get("output.meta");
       		if (namedOutputMeta==null || namedOutputMeta.isEmpty()) {
       			throw new RuntimeException("no named output provided for metadata");
+      		}
       		namedOutputPlaintext = context.getConfiguration().get("output.plaintext");
       		if (namedOutputPlaintext==null || namedOutputPlaintext.isEmpty()) {
       			throw new RuntimeException("no named output provided for plaintext");
+      		}
       		String excludedIdsCSV = context.getConfiguration().get("excluded.ids");
       		if (excludedIdsCSV!=null && !excludedIdsCSV.trim().isEmpty()
       				&& !WorkflowRuntimeParameters.UNDEFINED_NONEMPTY_VALUE.equals(excludedIdsCSV)) {
       			log.warn("got excluded ids: " + excludedIdsCSV);
       			excludedIds = new HashSet<String>(Arrays.asList(
       					StringUtils.split(excludedIdsCSV.trim(), ',')));
       		} else {
       			log.warn("got no excluded ids");
+      		}
       //		handling maximum content size
       		String maxFileSizeMBStr = context.getConfiguration().get(
       				WorkflowRuntimeParameters.IMPORT_CONTENT_MAX_FILE_SIZE_MB);
       		if (maxFileSizeMBStr!=null && !maxFileSizeMBStr.trim().isEmpty()
       				&& !WorkflowRuntimeParameters.UNDEFINED_NONEMPTY_VALUE.equals(maxFileSizeMBStr)) {
       			this.maxFileSize = 1048576l * Integer.valueOf(maxFileSizeMBStr);
+      		}
       		mos = new MultipleOutputs(context);
       		currentProgress = 0;
       		intervalTime = System.currentTimeMillis();
+      	}
       	/**
       	 * Processes content input stream. Closes stream at the end.
       	 * @param documentId
       	 * @param contentStream
       	 * @param contentLength
       	 * @throws IOException
       	 * @throws InterruptedException
       	 */
       	protected void processStream(CharSequence documentId,
       			InputStream contentStream,
       			long contentLength) throws IOException, InterruptedException {
       		try {
       			currentProgress++;
       			if (currentProgress>0 && currentProgress%progresLogInterval==0) {
       //				FIXME switch back to debug when setting debug level on oozie
       					log.warn("metadata extaction progress: " + currentProgress + ", time taken to process " +
       							progresLogInterval + " elements: " +
       						((System.currentTimeMillis() - intervalTime)/1000) + " secs");
       				intervalTime = System.currentTimeMillis();
+      			}
       			if (excludedIds!=null && excludedIds.contains(documentId)) {
       				log.warn("skipping processing for excluded id " + documentId);
       			} else {
       //				handling maximum content size
       				if (contentLength>maxFileSize) {
       					log.warn("skipping processing for id " + documentId +
       							" due to max file size limit="+maxFileSize+" exceeded: " + contentLength);
       					try {
       //    					writing empty metadata
           					mos.write(namedOutputMeta, new AvroKey<ExtractedDocumentMetadata>(
       							NlmToDocumentWithBasicMetadataConverter.convertFull(
       									documentId.toString(), null)));
       //    					writing empty plaintext
           					mos.write(namedOutputPlaintext, new AvroKey<DocumentText>(
       								NlmToDocumentContentConverter.convert(
       										documentId.toString(), null)));
           					return;
           				} catch (TransformationException e2) {
                   			log.debug("closing multiple outputs...");
                   			mos.close();
                   			log.debug("multiple outputs closed");
                   			throw new RuntimeException(e2);
           				} catch (JDOMException e2) {
                   			log.debug("closing multiple outputs...");
                   			mos.close();
                   			log.debug("multiple outputs closed");
                   			throw new RuntimeException(e2);
                   		} finally {
                   			if (contentStream!=null) {
           						contentStream.close();
+          					}
+                  		}
+      				}
       //				TODO switch back to debug when setting debug level on oozie
       				log.warn("starting processing for id: " + documentId);
       				long startTime = System.currentTimeMillis();
       				ContentExtractor extractor = new ContentExtractor();
       				try {
                           extractor.uploadPDF(contentStream);
                           try {
                           	Element resultElem = extractor.getNLMContent();
                               Document doc = new Document(resultElem);
           					XMLOutputter outputter = new XMLOutputter(Format.getPrettyFormat());
           					log.debug("got NLM content: \n" + outputter.outputString(resultElem));
       						mos.write(namedOutputMeta, new AvroKey<ExtractedDocumentMetadata>(
       								NlmToDocumentWithBasicMetadataConverter.convertFull(
       										documentId.toString(), doc)));
                           } catch (JDOMException e) {
                   			log.debug("closing multiple outputs...");
                   			mos.close();
                   			log.debug("multiple outputs closed");
                   			throw new RuntimeException(e);
                   		} catch (TransformationException e) {
                   			log.debug("closing multiple outputs...");
                   			mos.close();
                   			log.debug("multiple outputs closed");
                   			throw new RuntimeException(e);
                   		} catch (AnalysisException e) {
                   			if (analysisExceptionAsCritical) {
                   				log.debug("closing multiple outputs...");
                   				mos.close();
                   				log.debug("multiple outputs closed");
                   				throw new RuntimeException(e);
                   			} else {
                   				if (e.getCause() instanceof InvalidPdfException) {
                   					log.error("Invalid PDF file", e);
                   				} else {
                   					log.error("got unexpected analysis exception, just logging", e);
+                  				}
                   				try {
       //            					writing empty result
                   					mos.write(namedOutputMeta, new AvroKey<ExtractedDocumentMetadata>(
           								NlmToDocumentWithBasicMetadataConverter.convertFull(
           										documentId.toString(), null)));
                   				} catch (TransformationException e2) {
                           			log.debug("closing multiple outputs...");
                           			mos.close();
                           			log.debug("multiple outputs closed");
                           			throw new RuntimeException(e2);
                   				} catch (JDOMException e2) {
                           			log.debug("closing multiple outputs...");
                           			mos.close();
                           			log.debug("multiple outputs closed");
                           			throw new RuntimeException(e2);
+                          		}
+                  			}
                   		} catch (Exception e) {
                   			if (otherExceptionAsCritical) {
                   				log.debug("closing multiple outputs...");
                   				mos.close();
                   				log.debug("multiple outputs closed");
                   				throw new RuntimeException(e);
                   			} else {
                  					log.error("got unexpected exception, just logging", e);
                  					try {
       //            					writing empty result
                   					mos.write(namedOutputMeta, new AvroKey<ExtractedDocumentMetadata>(
           								NlmToDocumentWithBasicMetadataConverter.convertFull(
           										documentId.toString(), null)));
                   				} catch (TransformationException e2) {
                           			log.debug("closing multiple outputs...");
                           			mos.close();
                           			log.debug("multiple outputs closed");
                           			throw new RuntimeException(e2);
                   				} catch (JDOMException e2) {
                           			log.debug("closing multiple outputs...");
                           			mos.close();
                           			log.debug("multiple outputs closed");
                           			throw new RuntimeException(e2);
+                          		}
+                  			}
+                  		}
       					try {
       						mos.write(namedOutputPlaintext, new AvroKey<DocumentText>(
       								NlmToDocumentContentConverter.convert(
       										documentId.toString(), extractor.getRawFullText())));
       					} catch (AnalysisException e) {
       						if (analysisExceptionAsCritical) {
                   				log.debug("closing multiple outputs...");
                   				mos.close();
                   				log.debug("multiple outputs closed");
                   				throw new RuntimeException(e);
                   			} else {
                   				if (e.getCause() instanceof InvalidPdfException) {
                   					log.error("Invalid PDF file when retrieving plaintext", e);
                   				} else {
                   					log.error("got unexpected analysis exception "
                   							+ "when retrieving plaintext, just logging", e);
+                  				}
       //        					writing empty result
               					mos.write(namedOutputPlaintext, new AvroKey<DocumentText>(
           								NlmToDocumentContentConverter.convert(
           										documentId.toString(), null)));
+                  			}
+      					}
       				} finally {
       					if (contentStream!=null) {
       						contentStream.close();
+      					}
+      				}
       //				TODO switch back to debug when setting debug level on oozie
       				log.warn("finished processing for id " + documentId + " in " +
       						((System.currentTimeMillis() - startTime)/1000) + " secs");
+      			}
       		} catch (AnalysisException e) {
       			log.debug("closing multiple outputs...");
       			mos.close();
       			log.debug("multiple outputs closed");
       			throw new RuntimeException(e);
+      		}
+      	}
       	/* (non-Javadoc)
       	 * @see org.apache.hadoop.mapreduce.Mapper#cleanup(org.apache.hadoop.mapreduce.Mapper.Context)
       	 */
       	@Override
           public void cleanup(Context context) throws IOException, InterruptedException {
       		log.debug("cleanup: closing multiple outputs...");
               mos.close();
               log.debug("cleanup: multiple outputs closed");
+          }
       	/**
       	 * Sets flag indicating {@link AnalysisException} should cause interruption.
       	 * @param analysisExceptionAsCritical
       	 */
       	public void setAnalysisExceptionAsCritical(boolean analysisExceptionAsCritical) {
       		this.analysisExceptionAsCritical = analysisExceptionAsCritical;
+      	}
       	/**
       	 * Sets flag indicating any other {@link Exception} should cause interruption.
       	 * @param otherExceptionAsCritical
       	 */
       	public void setOtherExceptionAsCritical(boolean otherExceptionAsCritical) {
       		this.otherExceptionAsCritical = otherExceptionAsCritical;
+      	}
       	public void setProgresLogInterval(int progresLogInterval) {
       		this.progresLogInterval = progresLogInterval;
+      	}
       //	public static void main(String[] args) throws Exception {
       ////		testing interruption
       ////		String fileLoc = "/home/azio/Downloads/cermine/hal/4f5cc34f137de4dc89766a9366ca66de.pdf";
       //		String fileLoc = "/home/azio/Downloads/cermine/4119";
       //		final BufferedInputStream bis = new BufferedInputStream(new FileInputStream(new File(fileLoc)));
       //		try {
       //			ExecutorService executor = Executors.newSingleThreadExecutor();
       //			Future<ContentExtractorResult> futureResult = executor.submit(new Callable<ContentExtractorResult>() {
       //				@Override
       //				public ContentExtractorResult call() throws Exception {
       //					ContentExtractor extractor = new ContentExtractor();
       //				    extractor.uploadPDF(bis);
       //					Element resultElem = extractor.getNLMContent();
       //					XMLOutputter outputter = new XMLOutputter(Format.getPrettyFormat());
       //					System.out.println("got NLM content: \n" + outputter.outputString(resultElem));
       //					return new ContentExtractorResult(null,
       //							DocumentText.newBuilder().setId("1234").setText("xxx").build());
       //				}
       //			});
       //			System.out.println("before get");
       //			try {
       //				ContentExtractorResult result = futureResult.get(5, TimeUnit.SECONDS);
       //				System.out.println("got result: " + result.text);
       //				System.out.println("after get, before shutdown");
       //			} catch(TimeoutException e) {
       //				System.out.println("timeout! before shutdown");
       //				executor.shutdown();
       //				System.out.println("after shutdown");
       //			}
       //		} finally {
       //			bis.close();
       //		}
       //	}
+      }

« Previous
1
2
3
…
8
Next »

(1-1/8)

Project

General

Profile

D-Net