Project

General

Profile

« Previous | Next » 

Revision 53122

Added by Andreas Czerniak over 5 years ago

Remove some characters that are not in the XML character range.
See https://www.w3.org/TR/REC-xml/#charsets

View differences:

XmlCleaner.java
7 7
import java.util.regex.Pattern;
8 8

  
9 9
/**
10
 * @author jochen
10
 * @author jochen, Andreas Czerniak
11 11
 *
12 12
 */
13 13
public class XmlCleaner {
......
16 16
	 */
17 17
	private static Pattern validCharacterEntityPattern = Pattern.compile("^&#x?\\d{2,4};"); //$NON-NLS-1$
18 18
	//	    private static Pattern validCharacterEntityPattern = Pattern.compile("^&#?\\d{2,4};"); //$NON-NLS-1$
19
	private static Pattern invalidControlCharPattern = Pattern.compile("");
19
        
20
        // see https://www.w3.org/TR/REC-xml/#charsets , not only limited to 
21
	private static Pattern invalidControlCharPattern = Pattern.compile("&#x?1[0-9a-fA-F];");
22
        
20 23
	/**
21 24
	 * Pattern that negates the allowable XML 4 byte unicode characters. Valid
22 25
	 * are: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |

Also available in: Unified diff