Project

General

Profile

1
package eu.dnetlib.data.mapreduce.hbase.dataimport;
2

    
3
import org.apache.commons.lang3.StringUtils;
4
import org.apache.commons.lang3.exception.ExceptionUtils;
5
import org.apache.commons.logging.Log;
6
import org.apache.commons.logging.LogFactory;
7
import org.apache.hadoop.io.Text;
8
import org.apache.hadoop.mapreduce.Mapper;
9
import org.dom4j.Document;
10
import org.dom4j.DocumentException;
11
import org.dom4j.io.DocumentResult;
12
import org.dom4j.io.DocumentSource;
13
import org.dom4j.io.SAXReader;
14

    
15
import javax.xml.transform.Transformer;
16
import javax.xml.transform.TransformerConfigurationException;
17
import javax.xml.transform.TransformerFactory;
18
import java.io.IOException;
19
import java.io.StringReader;
20
import java.util.regex.Matcher;
21
import java.util.regex.Pattern;
22

    
23
public class GetInvalidXmlRecordsMapper extends Mapper<Text, Text, Text, Text> {
24

    
25
	private static final Log log = LogFactory.getLog(GetInvalidXmlRecordsMapper.class); // NOPMD by marko on 11/24/08 5:02 PM
26
	public static final String DOI_REGEX = "(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\\\"&\\'])\\S)+)";
27

    
28
	private Transformer transformer;
29

    
30
	private SAXReader saxReader;
31

    
32
	private Text valueOut;
33

    
34
	private final static String xslt =
35
			"<xsl:stylesheet version=\"1.0\" xmlns:xsl=\"http://www.w3.org/1999/XSL/Transform\">\n"
36
					+ "  <xsl:template match=\"@*|node()\">\n"
37
					+ "    <xsl:copy>\n"
38
					+ "      <xsl:apply-templates select=\"@*|node()\"/>\n"
39
					+ "    </xsl:copy>\n"
40
					+ "  </xsl:template>\n"
41
					+ "</xsl:stylesheet>";
42

    
43
	@Override
44
	protected void setup(final Context context) throws IOException, InterruptedException {
45
		super.setup(context);
46

    
47
		valueOut = new Text();
48
		saxReader = new SAXReader();
49

    
50
		log.info("using xslt:\n" + xslt);
51
		try {
52
			transformer = TransformerFactory.newInstance().newTransformer(new DocumentSource((new SAXReader()).read(new StringReader(xslt))));
53
		} catch (TransformerConfigurationException | DocumentException e) {
54
			log.error(e);
55
			throw new RuntimeException(e);
56
		}
57

    
58
		log.info("using trasformer: '" + transformer.getClass().getName() + "'");
59
	}
60

    
61
	@Override
62
	protected void map(final Text key, final Text value, final Context context) throws IOException, InterruptedException {
63
		try {
64
			final DocumentResult result = new DocumentResult();
65
			final Document document = saxReader.read(new StringReader(value.toString()));
66
			transformer.transform(new DocumentSource(document), result);
67

    
68
			result.getDocument().asXML();
69

    
70
		} catch (final Throwable e) {
71
			//log.error("error parsing record\n" + value.toString(), e);
72

    
73
			context.getCounter("error", e.getClass().getName()).increment(1);
74

    
75
			final String c = getInvalidXmlChar(e);
76
			if (StringUtils.isNotBlank(c)) {
77
				context.getCounter("invalid char", c).increment(1);
78
			}
79

    
80
			String doi = getDoi(value.toString());
81
			if (StringUtils.isNotBlank(doi)) {
82
				valueOut.set(doi);
83
				context.write(key, valueOut);
84
				context.getCounter("output", "doi").increment(1);
85
			}
86

    
87
		}
88
	}
89

    
90
	public static String getInvalidXmlChar(final Throwable e) {
91
		final String error = ExceptionUtils.getRootCauseMessage(e);
92
		if (StringUtils.contains(error, "An invalid XML character")) {
93
			final Pattern p = Pattern.compile(".*\\(.*:\\s?(?<char>.*)\\).*");
94
			final Matcher m = p.matcher(error);
95
			if (m.matches()) {
96
				final String c = m.group("char");
97
				if (StringUtils.isNotBlank(c)) {
98
					return c;
99
				}
100
			}
101
		}
102
		return null;
103
	}
104

    
105
	public static String getDoi(final String url) {
106

    
107
		final Pattern pattern = Pattern.compile(DOI_REGEX);
108
		final Matcher matcher = pattern.matcher(url);
109

    
110
		if (matcher.find())
111
			return matcher.group(0);
112

    
113

    
114
		return null;
115

    
116
	}
117

    
118
}
(7-7/15)