Project

General

Profile

1
package eu.dnetlib.data.mapreduce.hbase.dataimport;
2

    
3
import java.io.IOException;
4
import java.io.StringReader;
5
import java.util.regex.Matcher;
6
import java.util.regex.Pattern;
7
import javax.xml.transform.Transformer;
8
import javax.xml.transform.TransformerConfigurationException;
9
import javax.xml.transform.TransformerFactory;
10

    
11
import org.apache.commons.lang3.StringUtils;
12
import org.apache.commons.lang3.exception.ExceptionUtils;
13
import org.apache.commons.logging.Log;
14
import org.apache.commons.logging.LogFactory;
15
import org.apache.hadoop.io.Text;
16
import org.apache.hadoop.mapreduce.Mapper;
17
import org.dom4j.Document;
18
import org.dom4j.DocumentException;
19
import org.dom4j.io.DocumentResult;
20
import org.dom4j.io.DocumentSource;
21
import org.dom4j.io.SAXReader;
22

    
23
public class FilterXmlRecordsMapper extends Mapper<Text, Text, Text, Text> {
24

    
25
	private static final Log log = LogFactory.getLog(FilterXmlRecordsMapper.class); // NOPMD by marko on 11/24/08 5:02 PM
26

    
27
	private Transformer transformer;
28

    
29
	private SAXReader saxReader;
30

    
31
	private final static String xslt =
32
			"<xsl:stylesheet version=\"1.0\" xmlns:xsl=\"http://www.w3.org/1999/XSL/Transform\">\n"
33
					+ "  <xsl:template match=\"@*|node()\">\n"
34
					+ "    <xsl:copy>\n"
35
					+ "      <xsl:apply-templates select=\"@*|node()\"/>\n"
36
					+ "    </xsl:copy>\n"
37
					+ "  </xsl:template>\n"
38
					+ "</xsl:stylesheet>";
39

    
40
	@Override
41
	protected void setup(final Context context) throws IOException, InterruptedException {
42
		super.setup(context);
43

    
44
		saxReader = new SAXReader();
45

    
46
		log.info("using xslt:\n" + xslt);
47
		try {
48
			transformer = TransformerFactory.newInstance().newTransformer(new DocumentSource((new SAXReader()).read(new StringReader(xslt))));
49
		} catch (TransformerConfigurationException | DocumentException e) {
50
			log.error(e);
51
			throw new RuntimeException(e);
52
		}
53

    
54
		log.info("using trasformer: '" + transformer.getClass().getName() + "'");
55
	}
56

    
57
	@Override
58
	protected void map(final Text key, final Text value, final Context context) throws IOException, InterruptedException {
59
		try {
60
			final DocumentResult result = new DocumentResult();
61
			final Document document = saxReader.read(new StringReader(value.toString()));
62
			transformer.transform(new DocumentSource(document), result);
63

    
64
			context.write(key, new Text(result.getDocument().asXML()));
65

    
66
		} catch (final Throwable e) {
67
			//log.error("error parsing record\n" + value.toString(), e);
68

    
69
			context.getCounter("error", e.getClass().getName()).increment(1);
70

    
71
			final String c = getInvalidXmlChar(e);
72
			if (StringUtils.isNotBlank(c)) {
73
				context.getCounter("invalid char", c).increment(1);
74
			}
75
		}
76
	}
77

    
78
	public static String getInvalidXmlChar(final Throwable e) {
79
		final String error = ExceptionUtils.getRootCauseMessage(e);
80
		if (StringUtils.contains(error, "An invalid XML character")) {
81
			final Pattern p = Pattern.compile(".*\\(.*:\\s?(?<char>.*)\\).*");
82
			final Matcher m = p.matcher(error);
83
			if (m.matches()) {
84
				final String c = m.group("char");
85
				if (StringUtils.isNotBlank(c)) {
86
					return c;
87
				}
88
			}
89
		}
90
		return null;
91
	}
92

    
93
}
(6-6/15)