Project

General

Profile

1
package eu.dnetlib.data.mapreduce.hbase.dataimport;
2

    
3
import org.apache.commons.lang3.StringUtils;
4
import org.apache.commons.lang3.exception.ExceptionUtils;
5
import org.apache.commons.logging.Log;
6
import org.apache.commons.logging.LogFactory;
7
import org.apache.hadoop.io.Text;
8
import org.apache.hadoop.mapreduce.Mapper;
9
import org.dom4j.Document;
10
import org.dom4j.DocumentException;
11
import org.dom4j.io.DocumentResult;
12
import org.dom4j.io.DocumentSource;
13
import org.dom4j.io.SAXReader;
14

    
15
import javax.xml.transform.Transformer;
16
import javax.xml.transform.TransformerConfigurationException;
17
import javax.xml.transform.TransformerFactory;
18
import java.io.IOException;
19
import java.io.StringReader;
20
import java.util.regex.Matcher;
21
import java.util.regex.Pattern;
22

    
23
public class GetInvalidXmlRecordsMapper extends Mapper<Text, Text, Text, Text> {
24

    
25
	private static final Log log = LogFactory.getLog(GetInvalidXmlRecordsMapper.class); // NOPMD by marko on 11/24/08 5:02 PM
26
	public static final String DOI_REGEX = "(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\\\"&\\'])\\S)+)";
27

    
28
	private Transformer transformer;
29

    
30
	private SAXReader saxReader;
31

    
32
	private Text valueOut;
33

    
34
	private final static String xslt =
35
			"<xsl:stylesheet version=\"1.0\" xmlns:xsl=\"http://www.w3.org/1999/XSL/Transform\">\n"
36
					+ "  <xsl:template match=\"@*|node()\">\n"
37
					+ "    <xsl:copy>\n"
38
					+ "      <xsl:apply-templates select=\"@*|node()\"/>\n"
39
					+ "    </xsl:copy>\n"
40
					+ "  </xsl:template>\n"
41
					+ "</xsl:stylesheet>";
42

    
43
	@Override
44
	protected void setup(final Context context) throws IOException, InterruptedException {
45
		super.setup(context);
46

    
47
		valueOut = new Text();
48
		saxReader = new SAXReader();
49

    
50
		log.info("using xslt:\n" + xslt);
51
		try {
52
			transformer = TransformerFactory.newInstance().newTransformer(new DocumentSource((new SAXReader()).read(new StringReader(xslt))));
53
		} catch (TransformerConfigurationException | DocumentException e) {
54
			log.error(e);
55
			throw new RuntimeException(e);
56
		}
57

    
58
		log.info("using trasformer: '" + transformer.getClass().getName() + "'");
59
	}
60

    
61
	@Override
62
	protected void map(final Text key, final Text value, final Context context) throws IOException, InterruptedException {
63
		try {
64
			final DocumentResult result = new DocumentResult();
65
			final Document document = saxReader.read(new StringReader(value.toString()));
66
			transformer.transform(new DocumentSource(document), result);
67
			result.getDocument().asXML();
68

    
69
		} catch (final Throwable e) {
70
			//log.error("error parsing record\n" + value.toString(), e);
71
			context.getCounter("error", e.getClass().getName()).increment(1);
72
			final String c = getInvalidXmlChar(e);
73
			if (StringUtils.isNotBlank(c)) {
74
				context.getCounter("invalid char", c).increment(1);
75
			}
76
			String doi = getDoi(value.toString());
77
			if (StringUtils.isNotBlank(doi)) {
78
				context.getCounter("output", "doi").increment(1);
79
			}
80
			valueOut.set(value.toString());
81
			context.write(key, valueOut);
82
		}
83
	}
84

    
85
	public static String getInvalidXmlChar(final Throwable e) {
86
		final String error = ExceptionUtils.getRootCauseMessage(e);
87
		if (StringUtils.contains(error, "An invalid XML character")) {
88
			final Pattern p = Pattern.compile(".*\\(.*:\\s?(?<char>.*)\\).*");
89
			final Matcher m = p.matcher(error);
90
			if (m.matches()) {
91
				final String c = m.group("char");
92
				if (StringUtils.isNotBlank(c)) {
93
					return c;
94
				}
95
			}
96
		}
97
		return null;
98
	}
99

    
100
	public static String getDoi(final String url) {
101

    
102
		final Pattern pattern = Pattern.compile(DOI_REGEX);
103
		final Matcher matcher = pattern.matcher(url);
104

    
105
		if (matcher.find())
106
			return matcher.group(0);
107

    
108

    
109
		return null;
110

    
111
	}
112

    
113
}
(8-8/17)