Revision 53122
Added by Andreas Czerniak over 5 years ago
modules/dnet-modular-collector-service/trunk/src/main/java/eu/dnetlib/data/collector/plugins/oai/engine/XmlCleaner.java | ||
---|---|---|
7 | 7 |
import java.util.regex.Pattern; |
8 | 8 |
|
9 | 9 |
/** |
10 |
* @author jochen |
|
10 |
* @author jochen, Andreas Czerniak
|
|
11 | 11 |
* |
12 | 12 |
*/ |
13 | 13 |
public class XmlCleaner { |
... | ... | |
16 | 16 |
*/ |
17 | 17 |
private static Pattern validCharacterEntityPattern = Pattern.compile("^&#x?\\d{2,4};"); //$NON-NLS-1$ |
18 | 18 |
// private static Pattern validCharacterEntityPattern = Pattern.compile("^&#?\\d{2,4};"); //$NON-NLS-1$ |
19 |
private static Pattern invalidControlCharPattern = Pattern.compile("&#11;");
|
19 |
|
|
20 |
// see https://www.w3.org/TR/REC-xml/#charsets , not only limited to &#11; |
|
21 |
private static Pattern invalidControlCharPattern = Pattern.compile("&#x?1[0-9a-fA-F];"); |
|
22 |
|
|
20 | 23 |
/** |
21 | 24 |
* Pattern that negates the allowable XML 4 byte unicode characters. Valid |
22 | 25 |
* are: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | |
modules/dnet-modular-collector-service/trunk/pom.xml | ||
---|---|---|
10 | 10 |
<groupId>eu.dnetlib</groupId> |
11 | 11 |
<artifactId>dnet-modular-collector-service</artifactId> |
12 | 12 |
<packaging>jar</packaging> |
13 |
<version>3.3.20-SNAPSHOT</version>
|
|
13 |
<version>3.3.21-SNAPSHOT</version>
|
|
14 | 14 |
<scm> |
15 | 15 |
<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet45/modules/dnet-modular-collector-service/trunk</developerConnection> |
16 | 16 |
</scm> |
... | ... | |
99 | 99 |
<artifactId>vtd-xml</artifactId> |
100 | 100 |
<version>2.13.2</version> |
101 | 101 |
</dependency> |
102 |
|
|
103 |
|
|
102 |
<dependency> |
|
103 |
<groupId>org.apache.httpcomponents</groupId> |
|
104 |
<artifactId>httpcore</artifactId> |
|
105 |
<version>4.4.1</version> |
|
106 |
<scope>test</scope> |
|
107 |
<type>jar</type> |
|
108 |
</dependency> |
|
104 | 109 |
</dependencies> |
105 | 110 |
</project> |
Also available in: Unified diff
Remove some characters that are not in the XML character range.
See https://www.w3.org/TR/REC-xml/#charsets