Revision 52461
Added by Claudio Atzori almost 6 years ago
modules/dnet-mapreduce-jobs/branches/beta/src/main/java/eu/dnetlib/data/mapreduce/hbase/dataimport/FilterXmlRecordsMapper.java | ||
---|---|---|
1 |
package eu.dnetlib.data.mapreduce.hbase.dataimport; |
|
2 |
|
|
3 |
import java.io.IOException; |
|
4 |
import java.io.StringReader; |
|
5 |
import javax.xml.transform.Transformer; |
|
6 |
import javax.xml.transform.TransformerConfigurationException; |
|
7 |
import javax.xml.transform.TransformerException; |
|
8 |
import javax.xml.transform.TransformerFactory; |
|
9 |
|
|
10 |
import org.apache.commons.logging.Log; |
|
11 |
import org.apache.commons.logging.LogFactory; |
|
12 |
import org.apache.hadoop.io.Text; |
|
13 |
import org.apache.hadoop.mapreduce.Mapper; |
|
14 |
import org.dom4j.Document; |
|
15 |
import org.dom4j.DocumentException; |
|
16 |
import org.dom4j.io.DocumentResult; |
|
17 |
import org.dom4j.io.DocumentSource; |
|
18 |
import org.dom4j.io.SAXReader; |
|
19 |
|
|
20 |
public class ImportRecordsMapper extends Mapper<Text, Text, Text, Text> { |
|
21 |
|
|
22 |
private static final Log log = LogFactory.getLog(ImportRecordsMapper.class); // NOPMD by marko on 11/24/08 5:02 PM |
|
23 |
|
|
24 |
private Transformer transformer; |
|
25 |
|
|
26 |
private SAXReader saxReader; |
|
27 |
|
|
28 |
private final static String xslt = |
|
29 |
"<xsl:stylesheet version=\"1.0\" xmlns:xsl=\"http://www.w3.org/1999/XSL/Transform\">\n" |
|
30 |
+ " <xsl:template match=\"@*|node()\">\n" |
|
31 |
+ " <xsl:copy>\n" |
|
32 |
+ " <xsl:apply-templates select=\"@*|node()\"/>\n" |
|
33 |
+ " </xsl:copy>\n" |
|
34 |
+ " </xsl:template>\n" |
|
35 |
+ "</xsl:stylesheet>"; |
|
36 |
|
|
37 |
@Override |
|
38 |
protected void setup(final Context context) throws IOException, InterruptedException { |
|
39 |
super.setup(context); |
|
40 |
|
|
41 |
saxReader = new SAXReader(); |
|
42 |
|
|
43 |
log.info("using xslt:\n" + xslt); |
|
44 |
try { |
|
45 |
transformer = TransformerFactory.newInstance().newTransformer(new DocumentSource((new SAXReader()).read(new StringReader(xslt)))); |
|
46 |
} catch (TransformerConfigurationException | DocumentException e) { |
|
47 |
log.error(e); |
|
48 |
throw new RuntimeException(e); |
|
49 |
} |
|
50 |
|
|
51 |
log.info("using trasformer: '" + transformer.getClass().getName() + "'"); |
|
52 |
} |
|
53 |
|
|
54 |
private Document transform(Document doc) throws TransformerException { |
|
55 |
final DocumentResult result = new DocumentResult(); |
|
56 |
transformer.transform(new DocumentSource(doc), result); |
|
57 |
return result.getDocument(); |
|
58 |
} |
|
59 |
|
|
60 |
@Override |
|
61 |
protected void map(final Text key, final Text value, final Context context) throws IOException, InterruptedException { |
|
62 |
try { |
|
63 |
|
|
64 |
final Document result = transform(saxReader.read(new StringReader(value.toString()))); |
|
65 |
context.write(key, new Text(result.getDocument().asXML())); |
|
66 |
|
|
67 |
} catch (final Throwable e) { |
|
68 |
log.error("error mapping the following record on HBase: " + value.toString(), e); |
|
69 |
context.getCounter("error", e.getClass().getName()).increment(1); |
|
70 |
throw new RuntimeException(e); |
|
71 |
} |
|
72 |
} |
|
73 |
|
|
74 |
} |
Also available in: Unified diff
Mapper class that parses XML records.