Project

General

Profile

« Previous | Next » 

Revision 53122

Added by Andreas Czerniak over 5 years ago

Remove some characters that are not in the XML character range.
See https://www.w3.org/TR/REC-xml/#charsets

View differences:

modules/dnet-modular-collector-service/trunk/src/main/java/eu/dnetlib/data/collector/plugins/oai/engine/XmlCleaner.java
7 7
import java.util.regex.Pattern;
8 8

  
9 9
/**
10
 * @author jochen
10
 * @author jochen, Andreas Czerniak
11 11
 *
12 12
 */
13 13
public class XmlCleaner {
......
16 16
	 */
17 17
	private static Pattern validCharacterEntityPattern = Pattern.compile("^&#x?\\d{2,4};"); //$NON-NLS-1$
18 18
	//	    private static Pattern validCharacterEntityPattern = Pattern.compile("^&#?\\d{2,4};"); //$NON-NLS-1$
19
	private static Pattern invalidControlCharPattern = Pattern.compile("");
19
        
20
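        // See https://www.w3.org/TR/REC-xml/#charsets for the valid XML character range. NOTE(review): with the optional 'x', the pattern below also matches decimal references such as &#10; (LF) and &#13; (CR), which are valid XML characters - confirm this is intended.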
21
	private static Pattern invalidControlCharPattern = Pattern.compile("&#x?1[0-9a-fA-F];");
22
        
20 23
	/**
21 24
	 * Pattern that negates the allowable XML 4 byte unicode characters. Valid
22 25
	 * are: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
modules/dnet-modular-collector-service/trunk/pom.xml
10 10
	<groupId>eu.dnetlib</groupId>
11 11
	<artifactId>dnet-modular-collector-service</artifactId>
12 12
	<packaging>jar</packaging>
13
	<version>3.3.20-SNAPSHOT</version>
13
	<version>3.3.21-SNAPSHOT</version>
14 14
	<scm>
15 15
		<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet45/modules/dnet-modular-collector-service/trunk</developerConnection>
16 16
	</scm>
......
99 99
			<artifactId>vtd-xml</artifactId>
100 100
			<version>2.13.2</version>
101 101
		</dependency>
102

  
103

  
102
	 <dependency>
103
	  <groupId>org.apache.httpcomponents</groupId>
104
	  <artifactId>httpcore</artifactId>
105
	  <version>4.4.1</version>
106
	  <scope>test</scope>
107
	  <type>jar</type>
108
	 </dependency>
104 109
	</dependencies>
105 110
</project>

Also available in: Unified diff