1
|
package eu.dnetlib.data.collector.plugins.oai.engine;
|
2
|
|
3
|
import java.util.HashMap;
|
4
|
import java.util.HashSet;
|
5
|
import java.util.Map;
|
6
|
import java.util.Set;
|
7
|
import java.util.regex.Pattern;
|
8
|
|
9
|
import org.apache.commons.lang3.StringUtils;
|
10
|
|
11
|
/**
 * Cleans text so that it only contains characters and entity references that an
 * XML 1.0 parser accepts: illegal characters are dropped, HTML-only named
 * entities are replaced by the character they stand for, unknown entities are
 * removed and stray ampersands are escaped.
 *
 * @author jochen, Andreas Czerniak
 */
public class XmlCleaner {

    /**
     * Matches every code point that is NOT a legal XML 1.0 character. Legal are:
     * #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
     * (see https://www.w3.org/TR/REC-xml/#charsets). The supplementary range is
     * spelled as a range between two surrogate pairs, which the regex engine
     * reads as code points; unpaired surrogates therefore still count as illegal.
     */
    private static final Pattern INVALID_CHARACTER_PATTERN =
            Pattern.compile("[^\t\r\n\u0020-\uD7FF\uE000-\uFFFD\uD800\uDC00-\uDBFF\uDFFF]");

    /** Marker returned by {@link #parseNumericReference} when the text is not a numeric reference. */
    private static final int NOT_A_REFERENCE = -1;

    /**
     * HTML names of the Latin-1 characters U+00A0 to U+00FF, in code point order
     * (one row per 16 code points), so that entry {@code i} names U+00A0 + i.
     */
    private static final String[] LATIN1_ENTITY_NAMES = {
        "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar", "sect",
        "uml", "copy", "ordf", "laquo", "not", "shy", "reg", "macr",
        "deg", "plusmn", "sup2", "sup3", "acute", "micro", "para", "middot",
        "cedil", "sup1", "ordm", "raquo", "frac14", "frac12", "frac34", "iquest",
        "Agrave", "Aacute", "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil",
        "Egrave", "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", "Iuml",
        "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", "Ouml", "times",
        "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", "Yacute", "THORN", "szlig",
        "agrave", "aacute", "acirc", "atilde", "auml", "aring", "aelig", "ccedil",
        "egrave", "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", "iuml",
        "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", "ouml", "divide",
        "oslash", "ugrave", "uacute", "ucirc", "uuml", "yacute", "thorn", "yuml"
    };

    /** Entity references that are passed through untouched. */
    private static final Set<String> GOOD_ENTITIES = new HashSet<String>();

    /** Maps entity references that XML does not know to their replacement text. */
    private static final Map<String, String> BAD_ENTITIES = new HashMap<String, String>();

    static {
        // The five entities predefined by XML 1.0 (section 4.6).
        GOOD_ENTITIES.add("&quot;"); // quotation mark
        GOOD_ENTITIES.add("&amp;");  // ampersand
        GOOD_ENTITIES.add("&lt;");   // less-than sign
        GOOD_ENTITIES.add("&gt;");   // greater-than sign
        GOOD_ENTITIES.add("&apos;"); // apostrophe

        // Control entities ("illegal HTML character"): &#127; .. &#159; map to a blank.
        // NOTE(review): numeric references are resolved by code point in
        // cleanAllEntities and never reach handleEntity, so these entries are not
        // consulted (this was already the case before this rewrite). They are kept
        // so that nothing is lost; confirm whether they should be activated or removed.
        for (int codePoint = 127; codePoint <= 159; codePoint++) {
            BAD_ENTITIES.put("&#" + codePoint + ";", " ");
        }

        // Misc entities
        BAD_ENTITIES.put("&euro;", "\u20AC");  // euro
        BAD_ENTITIES.put("&lsquo;", "\u2018"); // left single quotation mark
        BAD_ENTITIES.put("&rsquo;", "\u2019"); // right single quotation mark

        // Latin 1 entities: &nbsp; (U+00A0) .. &yuml; (U+00FF)
        for (int i = 0; i < LATIN1_ENTITY_NAMES.length; i++) {
            BAD_ENTITIES.put("&" + LATIN1_ENTITY_NAMES[i] + ";", String.valueOf((char) (0x00A0 + i)));
        }
    }

    /**
     * Makes the entity references and characters of a string acceptable to an
     * XML 1.0 parser.
     * <ul>
     * <li>Characters outside the XML character range are removed.</li>
     * <li>Numeric references ({@code &#ddd;} or {@code &#xhhh;}) are kept when they
     * denote a legal XML character and removed otherwise.</li>
     * <li>The predefined entities {@code &amp;}, {@code &quot;}, {@code &lt;},
     * {@code &gt;} and {@code &apos;} are kept.</li>
     * <li>Known HTML entities are replaced by their unicode equivalent; any other
     * named entity is removed.</li>
     * <li>A bare {@code &} is replaced with {@code &amp;}.</li>
     * <li>Finally every {@code <<} becomes {@code &lt;&lt;} and every {@code >>}
     * becomes {@code &gt;&gt;}.</li>
     * </ul>
     *
     * @param broken
     *            the string to handle entities
     * @return the string with entities appropriately fixed up, or {@code null}
     *         when {@code broken} is {@code null}, empty or only whitespace
     */
    public static String cleanAllEntities(final String broken) {
        if (isBlank(broken)) {
            return null;
        }

        final String input = INVALID_CHARACTER_PATTERN.matcher(broken).replaceAll("");
        final int length = input.length();
        final StringBuilder out = new StringBuilder(length + 16);

        int pos = 0;
        while (pos < length) {
            final int amp = input.indexOf('&', pos);
            if (amp == -1) {
                // No more ampersands: copy the remainder verbatim.
                out.append(input, pos, length);
                break;
            }
            out.append(input, pos, amp);

            // Numeric character reference: keep it only if it denotes a legal character.
            final int codePoint = parseNumericReference(input, amp);
            if (codePoint != NOT_A_REFERENCE) {
                // A syntactically valid reference contains no ';' before its terminator.
                final int end = input.indexOf(';', amp) + 1;
                if (isValidXmlChar(codePoint)) {
                    out.append(input, amp, end);
                }
                pos = end;
                continue;
            }

            // Named entity: a run of letters/digits terminated by ';'.
            int end = amp + 1;
            while (end < length && Character.isLetterOrDigit(input.charAt(end))) {
                end++;
            }
            if (end < length && input.charAt(end) == ';') {
                out.append(handleEntity(input.substring(amp, end + 1)));
                pos = end + 1;
            } else {
                // The run ended without a closing ';' (or the string ended): the '&'
                // is a bare ampersand. Resume right after it so that the character
                // which ended the run - possibly another '&' - is examined as well.
                out.append("&amp;");
                pos = amp + 1;
            }
        }

        return out.toString().replace("<<", "&lt;&lt;").replace(">>", "&gt;&gt;");
    }

    /**
     * Replace entity with its unicode equivalent, if it is not a valid XML
     * entity. Otherwise strip it out.
     *
     * @param entity
     *            the entity to be replaced, including the leading {@code &} and
     *            the trailing {@code ;}
     * @return the substitution for the entity, either itself, the unicode
     *         equivalent or an empty string.
     */
    private static String handleEntity(final String entity) {
        if (GOOD_ENTITIES.contains(entity)) {
            return entity;
        }
        final String replacement = BAD_ENTITIES.get(entity);
        return replacement != null ? replacement : "";
    }

    /**
     * Reads the numeric character reference starting at {@code amp}, if there is one.
     *
     * @param text
     *            the text being scanned
     * @param amp
     *            index of an {@code &} within {@code text}
     * @return the referenced code point (a value above {@link Character#MAX_CODE_POINT}
     *         when the number is too large), or {@link #NOT_A_REFERENCE} when the text
     *         at {@code amp} is not of the form {@code &#ddd;} or {@code &#xhhh;}
     */
    private static int parseNumericReference(final String text, final int amp) {
        final int length = text.length();
        int index = amp + 2;
        if (index >= length || text.charAt(amp + 1) != '#') {
            return NOT_A_REFERENCE;
        }

        int radix = 10;
        if (text.charAt(index) == 'x') { // XML only allows a lower-case 'x'
            radix = 16;
            index++;
        }

        final int firstDigit = index;
        int codePoint = 0;
        while (index < length) {
            final int digit = asciiDigit(text.charAt(index), radix);
            if (digit < 0) {
                break;
            }
            // Saturate once the value is out of range so that long digit runs cannot overflow.
            if (codePoint <= Character.MAX_CODE_POINT) {
                codePoint = codePoint * radix + digit;
            }
            index++;
        }

        if (index == firstDigit || index >= length || text.charAt(index) != ';') {
            return NOT_A_REFERENCE;
        }
        return codePoint;
    }

    /** Returns the value of an ASCII digit in the given radix (10 or 16), or -1 if it is none. */
    private static int asciiDigit(final char c, final int radix) {
        if (c >= '0' && c <= '9') {
            return c - '0';
        }
        if (radix == 16) {
            if (c >= 'a' && c <= 'f') {
                return c - 'a' + 10;
            }
            if (c >= 'A' && c <= 'F') {
                return c - 'A' + 10;
            }
        }
        return -1;
    }

    /** Tells whether the code point is in the XML 1.0 {@code Char} production. */
    private static boolean isValidXmlChar(final int codePoint) {
        return codePoint == 0x9 || codePoint == 0xA || codePoint == 0xD
                || (codePoint >= 0x20 && codePoint <= 0xD7FF)
                || (codePoint >= 0xE000 && codePoint <= 0xFFFD)
                || (codePoint >= 0x10000 && codePoint <= 0x10FFFF);
    }

    /** Returns true when the text is null, empty or consists only of whitespace. */
    private static boolean isBlank(final String text) {
        if (text == null) {
            return true;
        }
        for (int i = 0; i < text.length(); i++) {
            if (!Character.isWhitespace(text.charAt(i))) {
                return false;
            }
        }
        return true;
    }
}
|