Project

General

Profile

« Previous | Next » 

Revision 52461

Mapper class that parses XML records

View differences:

modules/dnet-mapreduce-jobs/branches/beta/src/main/java/eu/dnetlib/data/mapreduce/hbase/dataimport/FilterXmlRecordsMapper.java
1
package eu.dnetlib.data.mapreduce.hbase.dataimport;
2

  
3
import java.io.IOException;
4
import java.io.StringReader;
5
import javax.xml.transform.Transformer;
6
import javax.xml.transform.TransformerConfigurationException;
7
import javax.xml.transform.TransformerException;
8
import javax.xml.transform.TransformerFactory;
9

  
10
import org.apache.commons.logging.Log;
11
import org.apache.commons.logging.LogFactory;
12
import org.apache.hadoop.io.Text;
13
import org.apache.hadoop.mapreduce.Mapper;
14
import org.dom4j.Document;
15
import org.dom4j.DocumentException;
16
import org.dom4j.io.DocumentResult;
17
import org.dom4j.io.DocumentSource;
18
import org.dom4j.io.SAXReader;
19

  
20
public class ImportRecordsMapper extends Mapper<Text, Text, Text, Text> {
21

  
22
	private static final Log log = LogFactory.getLog(ImportRecordsMapper.class); // NOPMD by marko on 11/24/08 5:02 PM
23

  
24
	private Transformer transformer;
25

  
26
	private SAXReader saxReader;
27

  
28
	private final static String xslt =
29
			"<xsl:stylesheet version=\"1.0\" xmlns:xsl=\"http://www.w3.org/1999/XSL/Transform\">\n"
30
					+ "  <xsl:template match=\"@*|node()\">\n"
31
					+ "    <xsl:copy>\n"
32
					+ "      <xsl:apply-templates select=\"@*|node()\"/>\n"
33
					+ "    </xsl:copy>\n"
34
					+ "  </xsl:template>\n"
35
					+ "</xsl:stylesheet>";
36

  
37
	@Override
38
	protected void setup(final Context context) throws IOException, InterruptedException {
39
		super.setup(context);
40

  
41
		saxReader = new SAXReader();
42

  
43
		log.info("using xslt:\n" + xslt);
44
		try {
45
			transformer = TransformerFactory.newInstance().newTransformer(new DocumentSource((new SAXReader()).read(new StringReader(xslt))));
46
		} catch (TransformerConfigurationException | DocumentException e) {
47
			log.error(e);
48
			throw new RuntimeException(e);
49
		}
50

  
51
		log.info("using trasformer: '" + transformer.getClass().getName() + "'");
52
	}
53

  
54
	private Document transform(Document doc) throws TransformerException {
55
		final DocumentResult result = new DocumentResult();
56
		transformer.transform(new DocumentSource(doc), result);
57
		return result.getDocument();
58
	}
59

  
60
	@Override
61
	protected void map(final Text key, final Text value, final Context context) throws IOException, InterruptedException {
62
		try {
63

  
64
			final Document result = transform(saxReader.read(new StringReader(value.toString())));
65
			context.write(key, new Text(result.getDocument().asXML()));
66

  
67
		} catch (final Throwable e) {
68
			log.error("error mapping the following record on HBase: " + value.toString(), e);
69
			context.getCounter("error", e.getClass().getName()).increment(1);
70
			throw new RuntimeException(e);
71
		}
72
	}
73

  
74
}

Also available in: Unified diff