Revision 53122
Added by Andreas Czerniak over 5 years ago
XmlCleaner.java | ||
---|---|---|
7 | 7 |
import java.util.regex.Pattern; |
8 | 8 |
|
9 | 9 |
/** |
10 |
* @author jochen |
|
10 |
* @author jochen, Andreas Czerniak
|
|
11 | 11 |
* |
12 | 12 |
*/ |
13 | 13 |
public class XmlCleaner { |
... | ... | |
16 | 16 |
*/ |
17 | 17 |
private static Pattern validCharacterEntityPattern = Pattern.compile("^&#x?\\d{2,4};"); //$NON-NLS-1$ |
18 | 18 |
// private static Pattern validCharacterEntityPattern = Pattern.compile("^&#?\\d{2,4};"); //$NON-NLS-1$ |
19 |
private static Pattern invalidControlCharPattern = Pattern.compile(""); |
|
19 |
|
|
20 |
// see https://www.w3.org/TR/REC-xml/#charsets , not only limited to &#x1A;
|
21 |
private static Pattern invalidControlCharPattern = Pattern.compile("&#x?1[0-9a-fA-F];"); |
|
22 |
|
|
20 | 23 |
/** |
21 | 24 |
* Pattern that negates the allowable XML 4 byte unicode characters. Valid |
22 | 25 |
* are: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | |
Also available in: Unified diff
Remove some characters that are not in the XML character range.
See https://www.w3.org/TR/REC-xml/#charsets