Project

General

Profile

« Previous | Next » 

Revision 52469

Mapper class that parses XML records

View differences:

modules/dnet-mapreduce-jobs/branches/beta/src/main/java/eu/dnetlib/data/mapreduce/hbase/dataimport/FilterXmlRecordsMapper.java
2 2

  
3 3
import java.io.IOException;
4 4
import java.io.StringReader;
5
import java.util.regex.Matcher;
6
import java.util.regex.Pattern;
5 7
import javax.xml.transform.Transformer;
6 8
import javax.xml.transform.TransformerConfigurationException;
7
import javax.xml.transform.TransformerException;
8 9
import javax.xml.transform.TransformerFactory;
9 10

  
11
import org.apache.commons.lang3.StringUtils;
10 12
import org.apache.commons.logging.Log;
11 13
import org.apache.commons.logging.LogFactory;
12 14
import org.apache.hadoop.io.Text;
......
51 53
		log.info("using trasformer: '" + transformer.getClass().getName() + "'");
52 54
	}
53 55

  
54
	private Document transform(Document doc) throws TransformerException {
55
		final DocumentResult result = new DocumentResult();
56
		transformer.transform(new DocumentSource(doc), result);
57
		return result.getDocument();
58
	}
59

  
60 56
	@Override
61 57
	protected void map(final Text key, final Text value, final Context context) throws IOException, InterruptedException {
62 58
		try {
59
			final DocumentResult result = new DocumentResult();
60
			final Document document = saxReader.read(new StringReader(value.toString()));
61
			transformer.transform(new DocumentSource(document), result);
63 62

  
64
			final Document result = transform(saxReader.read(new StringReader(value.toString())));
65 63
			context.write(key, new Text(result.getDocument().asXML()));
66 64

  
67 65
		} catch (final Throwable e) {
68
			log.error("error mapping the following record on HBase: " + value.toString(), e);
66
			//log.error("error parsing record\n" + value.toString(), e);
67

  
69 68
			context.getCounter("error", e.getClass().getName()).increment(1);
70
			throw new RuntimeException(e);
69

  
70
			if (e instanceof DocumentException) {
71
				if (StringUtils.contains(e.getMessage(), "An invalid XML character")) {
72
					final Pattern p = Pattern.compile("\\(.*:\\s?(?<char>.*)\\)");
73
					final Matcher m = p.matcher(e.getMessage());
74
					if (m.matches()) {
75
						final String c = m.group("char");
76
						if (StringUtils.isNotBlank(c)) {
77
							context.getCounter("invalid char", c).increment(1);
78
						}
79
					}
80
				}
81
			}
71 82
		}
72 83
	}
73 84

  

Also available in: Unified diff