Revision 52469
Added by Claudio Atzori over 6 years ago
modules/dnet-mapreduce-jobs/branches/beta/src/main/java/eu/dnetlib/data/mapreduce/hbase/dataimport/FilterXmlRecordsMapper.java | ||
---|---|---|
2 | 2 |
|
3 | 3 |
import java.io.IOException; |
4 | 4 |
import java.io.StringReader; |
5 |
import java.util.regex.Matcher; |
|
6 |
import java.util.regex.Pattern; |
|
5 | 7 |
import javax.xml.transform.Transformer; |
6 | 8 |
import javax.xml.transform.TransformerConfigurationException; |
7 |
import javax.xml.transform.TransformerException; |
|
8 | 9 |
import javax.xml.transform.TransformerFactory; |
9 | 10 |
|
11 |
import org.apache.commons.lang3.StringUtils; |
|
10 | 12 |
import org.apache.commons.logging.Log; |
11 | 13 |
import org.apache.commons.logging.LogFactory; |
12 | 14 |
import org.apache.hadoop.io.Text; |
... | ... | |
51 | 53 |
log.info("using trasformer: '" + transformer.getClass().getName() + "'"); |
52 | 54 |
} |
53 | 55 |
|
54 |
private Document transform(Document doc) throws TransformerException { |
|
55 |
final DocumentResult result = new DocumentResult(); |
|
56 |
transformer.transform(new DocumentSource(doc), result); |
|
57 |
return result.getDocument(); |
|
58 |
} |
|
59 |
|
|
60 | 56 |
@Override |
61 | 57 |
protected void map(final Text key, final Text value, final Context context) throws IOException, InterruptedException { |
62 | 58 |
try { |
59 |
final DocumentResult result = new DocumentResult(); |
|
60 |
final Document document = saxReader.read(new StringReader(value.toString())); |
|
61 |
transformer.transform(new DocumentSource(document), result); |
|
63 | 62 |
|
64 |
final Document result = transform(saxReader.read(new StringReader(value.toString()))); |
|
65 | 63 |
context.write(key, new Text(result.getDocument().asXML())); |
66 | 64 |
|
67 | 65 |
} catch (final Throwable e) { |
68 |
log.error("error mapping the following record on HBase: " + value.toString(), e); |
|
66 |
//log.error("error parsing record\n" + value.toString(), e); |
|
67 |
|
|
69 | 68 |
context.getCounter("error", e.getClass().getName()).increment(1); |
70 |
throw new RuntimeException(e); |
|
69 |
|
|
70 |
if (e instanceof DocumentException) { |
|
71 |
if (StringUtils.contains(e.getMessage(), "An invalid XML character")) { |
|
72 |
final Pattern p = Pattern.compile("\\(.*:\\s?(?<char>.*)\\)"); |
|
73 |
final Matcher m = p.matcher(e.getMessage()); |
|
74 |
if (m.matches()) { |
|
75 |
final String c = m.group("char"); |
|
76 |
if (StringUtils.isNotBlank(c)) { |
|
77 |
context.getCounter("invalid char", c).increment(1); |
|
78 |
} |
|
79 |
} |
|
80 |
} |
|
81 |
} |
|
71 | 82 |
} |
72 | 83 |
} |
73 | 84 |
|
Also available in: Unified diff
Mapper class that parses XML records.