Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
XMLUtil |
|
| 5.833333333333333;5.833 |
1 | /** | |
2 | * Distribution License: | |
3 | * JSword is free software; you can redistribute it and/or modify it under | |
4 | * the terms of the GNU Lesser General Public License, version 2.1 or later | |
5 | * as published by the Free Software Foundation. This program is distributed | |
6 | * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even | |
7 | * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | |
8 | * See the GNU Lesser General Public License for more details. | |
9 | * | |
10 | * The License is available on the internet at: | |
11 | * http://www.gnu.org/copyleft/lgpl.html | |
12 | * or by writing to: | |
13 | * Free Software Foundation, Inc. | |
14 | * 59 Temple Place - Suite 330 | |
15 | * Boston, MA 02111-1307, USA | |
16 | * | |
17 | * © CrossWire Bible Society, 2005 - 2016 | |
18 | * | |
19 | */ | |
20 | package org.crosswire.common.xml; | |
21 | ||
22 | import java.io.IOException; | |
23 | import java.io.InputStream; | |
24 | import java.util.ArrayList; | |
25 | import java.util.Collections; | |
26 | import java.util.HashSet; | |
27 | import java.util.List; | |
28 | import java.util.Set; | |
29 | import java.util.regex.Matcher; | |
30 | import java.util.regex.Pattern; | |
31 | ||
32 | import org.crosswire.common.util.FileUtil; | |
33 | import org.crosswire.common.util.PropertyMap; | |
34 | import org.crosswire.common.util.ResourceUtil; | |
35 | import org.jdom2.Document; | |
36 | import org.jdom2.JDOMException; | |
37 | import org.jdom2.input.SAXBuilder; | |
38 | import org.jdom2.input.sax.XMLReaders; | |
39 | import org.slf4j.Logger; | |
40 | import org.slf4j.LoggerFactory; | |
41 | import org.xml.sax.Attributes; | |
42 | import org.xml.sax.ContentHandler; | |
43 | import org.xml.sax.SAXException; | |
44 | ||
45 | /** | |
46 | * Utilities for working with SAX XML parsing. | |
47 | * | |
48 | * @see gnu.lgpl.License The GNU Lesser General Public License for details. | |
49 | * @author Joe Walker | |
50 | * @author DM Smith | |
51 | */ | |
52 | public final class XMLUtil { | |
/**
 * Private constructor: this class only offers static helpers and is never
 * instantiated.
 */
private XMLUtil() {
    // intentionally empty
}
58 | ||
59 | /** | |
60 | * Get and load an XML file from the classpath and a few other places into a | |
61 | * JDOM Document object. | |
62 | * | |
63 | * @param subject | |
64 | * The name of the desired resource (without any extension) | |
65 | * @return The requested resource | |
66 | * @throws IOException | |
67 | * if there is a problem reading the file | |
68 | * @throws JDOMException | |
69 | * If the resource is not valid XML | |
70 | */ | |
71 | public static Document getDocument(String subject) throws JDOMException, IOException { | |
72 | 0 | String resource = subject + FileUtil.EXTENSION_XML; |
73 | 0 | InputStream in = ResourceUtil.getResourceAsStream(resource); |
74 | ||
75 | 0 | log.debug("Loading {}.xml from classpath: [OK]", subject); |
76 | // With JDom 1.x this passed true | |
77 | 0 | SAXBuilder builder = new SAXBuilder(XMLReaders.DTDVALIDATING); |
78 | 0 | return builder.build(in); |
79 | } | |
80 | ||
81 | /** | |
82 | * Serialize a SAXEventProvider into an XML String | |
83 | * | |
84 | * @param provider | |
85 | * The source of SAX events | |
86 | * @return a serialized string | |
87 | * @throws SAXException | |
88 | */ | |
89 | public static String writeToString(SAXEventProvider provider) throws SAXException { | |
90 | 0 | ContentHandler ser = new PrettySerializingContentHandler(); |
91 | 0 | provider.provideSAXEvents(ser); |
92 | 0 | return ser.toString(); |
93 | } | |
94 | ||
95 | /** | |
96 | * Get the full name of the attribute, including the namespace if any. | |
97 | * | |
98 | * @param attrs | |
99 | * the collection of attributes | |
100 | * @param index | |
101 | * the index of the desired attribute | |
102 | * @return the requested attribute | |
103 | */ | |
104 | public static String getAttributeName(Attributes attrs, int index) { | |
105 | 0 | String qName = attrs.getQName(index); |
106 | 0 | if (qName != null) { |
107 | 0 | return qName; |
108 | } | |
109 | 0 | return attrs.getLocalName(index); |
110 | } | |
111 | ||
112 | /** | |
113 | * Show the attributes of an element as debug | |
114 | * @param attrs | |
115 | */ | |
116 | public static void debugSAXAttributes(Attributes attrs) { | |
117 | 0 | for (int i = 0; i < attrs.getLength(); i++) { |
118 | 0 | log.debug("attr[{}]: {}={}", Integer.toString(i), attrs.getQName(i), attrs.getValue(i)); |
119 | } | |
120 | 0 | } |
121 | ||
122 | /** | |
123 | * Normalizes the given string | |
124 | * @param s | |
125 | * @return the escaped string | |
126 | */ | |
127 | public static String escape(String s) { | |
128 | 0 | if (s == null) { |
129 | 0 | return s; |
130 | } | |
131 | 0 | int len = s.length(); |
132 | 0 | StringBuilder str = new StringBuilder(len); |
133 | ||
134 | 0 | for (int i = 0; i < len; i++) { |
135 | 0 | char ch = s.charAt(i); |
136 | 0 | switch (ch) { |
137 | case '<': | |
138 | 0 | str.append("<"); |
139 | 0 | break; |
140 | ||
141 | case '>': | |
142 | 0 | str.append(">"); |
143 | 0 | break; |
144 | ||
145 | case '&': | |
146 | 0 | str.append("&"); |
147 | 0 | break; |
148 | ||
149 | case '"': | |
150 | 0 | str.append("""); |
151 | 0 | break; |
152 | ||
153 | default: | |
154 | 0 | str.append(ch); |
155 | } | |
156 | } | |
157 | ||
158 | 0 | return str.toString(); |
159 | } | |
160 | ||
161 | /** | |
162 | * For each entity in the input that is not allowed in XML, replace the | |
163 | * entity with its unicode equivalent or remove it. For each instance of a | |
164 | * bare &, replace it with &<br> | |
165 | * XML only allows 4 entities: &amp;, &quot;, &lt; and &gt;. | |
166 | * | |
167 | * @param broken | |
168 | * the string to handle entities | |
169 | * @return the string with entities appropriately fixed up | |
170 | */ | |
171 | public static String cleanAllEntities(String broken) { | |
172 | 0 | if (broken == null) { |
173 | 0 | return null; |
174 | } | |
175 | ||
176 | 0 | String working = broken; |
177 | 0 | int cleanfrom = 0; |
178 | ||
179 | while (true) { | |
180 | 0 | int amp = working.indexOf('&', cleanfrom); |
181 | ||
182 | // If there are no more amps then we are done | |
183 | 0 | if (amp == -1) { |
184 | 0 | break; |
185 | } | |
186 | ||
187 | // Skip references of the kind &#ddd; | |
188 | 0 | if (validCharacterEntityPattern.matcher(working.substring(amp)).find()) { |
189 | 0 | cleanfrom = working.indexOf(';', amp) + 1; |
190 | 0 | continue; |
191 | } | |
192 | ||
193 | 0 | int i = amp + 1; |
194 | while (true) { | |
195 | // if we are at the end of the string then just escape the '&'; | |
196 | 0 | if (i >= working.length()) { |
197 | // String entity = working.substring(amp); | |
198 | // String replace = guessEntity(entity); | |
199 | // DataPolice.report("replacing unterminated entity: '" + | |
200 | // entity + "' with: '" + replace + "'"); | |
201 | ||
202 | 0 | return working.substring(0, amp) + "&" + working.substring(amp + 1); |
203 | } | |
204 | ||
205 | // if we have come to a ; then we have an entity | |
206 | // If it is something that xml can't handle then replace it. | |
207 | 0 | char c = working.charAt(i); |
208 | 0 | if (c == ';') { |
209 | 0 | String entity = working.substring(amp, i + 1); |
210 | 0 | String replace = handleEntity(entity); |
211 | // log.warn("replacing entity: '{}' with: '{}'", entity, replace); | |
212 | ||
213 | 0 | working = working.substring(0, amp) + replace + working.substring(i + 1); |
214 | 0 | break; |
215 | } | |
216 | ||
217 | // Did we end an entity without finding a closing ; | |
218 | // Then treat it as an '&' that needs to be replaced with & | |
219 | 0 | if (!Character.isLetterOrDigit(c)) { |
220 | // String entity = working.substring(amp, i); | |
221 | // String replace = "&" + working.substring(amp + 1, i); | |
222 | // log.warn("replacing invalid entity: '{}' with: '{}': {}", entity, replace, broken); | |
223 | ||
224 | 0 | working = working.substring(0, amp) + "&" + working.substring(amp + 1); |
225 | 0 | amp = i + 4; // account for the 4 extra characters |
226 | 0 | break; |
227 | } | |
228 | ||
229 | 0 | i++; |
230 | 0 | } |
231 | ||
232 | 0 | cleanfrom = amp + 1; |
233 | 0 | } |
234 | ||
235 | 0 | return working; |
236 | } | |
237 | ||
238 | /** | |
239 | * Remove all invalid characters in the input, replacing them with a space. XML has stringent | |
240 | * requirements as to which characters are or are not allowed. The set of | |
241 | * allowable characters are:<br> | |
242 | * #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]<br> | |
243 | * Note: Java handles to \uFFFF | |
244 | * | |
245 | * @param broken | |
246 | * the string to be cleaned | |
247 | * @return the cleaned string | |
248 | */ | |
249 | public static String cleanAllCharacters(String broken) { | |
250 | 0 | return invalidCharacterPattern.matcher(broken).replaceAll(" "); |
251 | } | |
252 | ||
253 | /** | |
254 | * Strip all closing tags from the end of the XML fragment, and then | |
255 | * re-close all tags that are open at the end of the string. | |
256 | * | |
257 | * @param broken | |
258 | * the string to be cleaned. | |
259 | * @return cleaned string, or {@code null} if the string could not be | |
260 | * cleaned due to more broken XML | |
261 | */ | |
262 | public static String recloseTags(String broken) { | |
263 | 0 | String result = broken; |
264 | // remove closing tags from the end | |
265 | 0 | while (result.matches(".*</[a-zA-Z]+>[ \t\r\n]*")) { |
266 | 0 | result = result.substring(0, result.lastIndexOf('<')); |
267 | } | |
268 | // close tags again | |
269 | 0 | List<String> openTags = new ArrayList<String>(); |
270 | 0 | Matcher m = Pattern.compile("</?[a-zA-Z]+").matcher(result); |
271 | 0 | boolean lTagFound = false; |
272 | 0 | boolean lgTagFound = false; |
273 | 0 | while (m.find()) { |
274 | 0 | String match = m.group(); |
275 | 0 | if (match.startsWith("</")) { |
276 | 0 | if (openTags.size() == 0 && "</l".equals(match) && !lTagFound) { |
277 | 0 | return recloseTags("<l>" + broken); |
278 | } | |
279 | 0 | if (openTags.size() == 0 && "</lg".equals(match) && !lgTagFound) { |
280 | 0 | return recloseTags("<lg>" + broken); |
281 | } | |
282 | 0 | if (openTags.size() == 0) { |
283 | 0 | return null; |
284 | } | |
285 | 0 | String lastTag = openTags.remove(openTags.size() - 1); |
286 | 0 | if (!("</" + lastTag).equals(match)) { |
287 | 0 | return null; |
288 | } | |
289 | 0 | } else { |
290 | 0 | int closePos = result.indexOf('>', m.end()); |
291 | 0 | if (closePos == -1) { |
292 | 0 | return null; |
293 | } | |
294 | 0 | while (Character.isWhitespace(result.charAt(closePos - 1))) { |
295 | 0 | --closePos; |
296 | } | |
297 | 0 | if (result.charAt(closePos - 1) != '/') { |
298 | 0 | if ("<l".equals(match)) { |
299 | 0 | lTagFound = true; |
300 | } | |
301 | 0 | if ("<lg".equals(match)) { |
302 | 0 | lgTagFound = true; |
303 | } | |
304 | 0 | openTags.add(match.substring(1)); |
305 | } | |
306 | } | |
307 | 0 | } |
308 | 0 | Collections.reverse(openTags); |
309 | 0 | for (String openTag : openTags) { |
310 | 0 | result += "</" + openTag + ">"; |
311 | } | |
312 | 0 | return result; |
313 | } | |
314 | ||
315 | /** | |
316 | * Common HTML tags such as <br>,<hr> and <img> may be | |
317 | * left open causing XML parsing to fail. This method closes these tags. | |
318 | * | |
319 | * @param broken | |
320 | * the string to be cleaned | |
321 | * @return the cleaned string | |
322 | */ | |
323 | public static String closeEmptyTags(String broken) { | |
324 | 0 | if (broken == null) { |
325 | 0 | return null; |
326 | } | |
327 | ||
328 | 0 | return openHTMLTagPattern.matcher(broken).replaceAll("<$1$2/>"); |
329 | } | |
330 | ||
331 | /** | |
332 | * XML parse failed, so we can try getting rid of all the tags and having | |
333 | * another go. We define a tag to start at a < and end at the end of the | |
334 | * next word (where a word is what comes in between spaces) that does not | |
335 | * contain an = sign, or at a >, whichever is earlier. | |
336 | * @param broken | |
337 | * @return the string without any tags | |
338 | */ | |
339 | public static String cleanAllTags(String broken) { | |
340 | 0 | if (broken == null) { |
341 | 0 | return null; |
342 | } | |
343 | ||
344 | 0 | String working = broken; |
345 | ||
346 | allTags: while (true) { | |
347 | 0 | int lt = working.indexOf('<'); |
348 | ||
349 | // If there are no more amps then we are done | |
350 | 0 | if (lt == -1) { |
351 | 0 | break allTags; |
352 | } | |
353 | ||
354 | // loop to find the end of this tag | |
355 | 0 | int i = lt; |
356 | 0 | int startattr = -1; |
357 | ||
358 | singletag: while (true) { | |
359 | 0 | i++; |
360 | ||
361 | // the tag can't exist past the end of the string | |
362 | 0 | if (i >= working.length()) { |
363 | // go back one so we can safely chop | |
364 | 0 | i--; |
365 | 0 | break singletag; |
366 | } | |
367 | ||
368 | 0 | char c = working.charAt(i); |
369 | ||
370 | // normal end of tag | |
371 | 0 | if (c == '>') { |
372 | 0 | break singletag; |
373 | } | |
374 | ||
375 | // we declare end-of-tag if this 'word' is not an attribute | |
376 | 0 | if (c == ' ') { |
377 | 0 | if (startattr == -1) { |
378 | // NOTE(joe): should we skip over consecutive spaces? | |
379 | 0 | startattr = i; |
380 | } else { | |
381 | // so we've already had a space indicating start of | |
382 | // attribute, so this must be the beginning of the next | |
383 | // NOTE(joe): no - spaces can exist in attr values | |
384 | 0 | String value = working.substring(startattr, i); |
385 | 0 | if (value.indexOf('=') == -1) { |
386 | // this 'attribute' does not contain an equals so | |
387 | // we call it a word and end the parse | |
388 | 0 | break singletag; |
389 | } | |
390 | } | |
391 | } | |
392 | 0 | } |
393 | ||
394 | // So we have the end of the tag, delete it, but leave a space in it's place | |
395 | // DataPolice.report("discarding tag: " + working.substring(lt, i + 1)); | |
396 | 0 | working = working.substring(0, lt) + " " + working.substring(i + 1); |
397 | 0 | } |
398 | ||
399 | 0 | return working; |
400 | } | |
401 | ||
402 | /** | |
403 | * Replace entity with its unicode equivalent, if it is not a valid XML | |
404 | * entity. Otherwise strip it out. XML only allows 4 entities: &amp;, | |
405 | * &quot;, &lt; and &gt;. | |
406 | * | |
407 | * @param entity | |
408 | * the entity to be replaced | |
409 | * @return the substitution for the entity, either itself, the unicode | |
410 | * equivalent or an empty string. | |
411 | */ | |
412 | private static String handleEntity(String entity) { | |
413 | 0 | if (goodEntities.contains(entity)) { |
414 | 0 | return entity; |
415 | } | |
416 | ||
417 | 0 | String replace = badEntities.get(entity); |
418 | 0 | if (replace != null) { |
419 | 0 | return replace; |
420 | } | |
421 | ||
422 | // replace unknown entities with a space | |
423 | 0 | return " "; |
424 | } | |
425 | ||
426 | // Map entities to their unicode equivalent | |
427 | 0 | private static Set<String> goodEntities = new HashSet<String>(); |
428 | 0 | private static PropertyMap badEntities = new PropertyMap(); |
429 | static { | |
430 | // pre-defined XML entities | |
431 | 0 | goodEntities.add("""); // quotation mark |
432 | 0 | goodEntities.add("&"); // ampersand |
433 | 0 | goodEntities.add("<"); // less-than sign |
434 | 0 | goodEntities.add(">"); // greater-than sign |
435 | ||
436 | // misc entities | |
437 | 0 | badEntities.put("€", "\u20AC"); // euro |
438 | 0 | badEntities.put("‘", "\u2018"); // left single quotation mark |
439 | 0 | badEntities.put("’", "\u2019"); // right single quotation mark |
440 | ||
441 | // Latin 1 entities | |
442 | 0 | badEntities.put(" ", "\u00A0"); // no-break space |
443 | 0 | badEntities.put("¡", "\u00A1"); // inverted exclamation mark |
444 | 0 | badEntities.put("¢", "\u00A2"); // cent sign |
445 | 0 | badEntities.put("£", "\u00A3"); // pound sign |
446 | 0 | badEntities.put("¤", "\u00A4"); // currency sign |
447 | 0 | badEntities.put("¥", "\u00A5"); // yen sign |
448 | 0 | badEntities.put("¦", "\u00A6"); // broken vertical bar |
449 | 0 | badEntities.put("§", "\u00A7"); // section sign |
450 | 0 | badEntities.put("¨", "\u00A8"); // diaeresis |
451 | 0 | badEntities.put("©", "\u00A9"); // copyright sign |
452 | 0 | badEntities.put("ª", "\u00AA"); // feminine ordinal indicator |
453 | 0 | badEntities.put("«", "\u00AB"); // left-pointing double angle quotation mark |
454 | 0 | badEntities.put("¬", "\u00AC"); // not sign |
455 | 0 | badEntities.put("­", "\u00AD"); // soft hyphen |
456 | 0 | badEntities.put("®", "\u00AE"); // registered sign |
457 | 0 | badEntities.put("¯", "\u00AF"); // macron |
458 | 0 | badEntities.put("°", "\u00B0"); // degree sign |
459 | 0 | badEntities.put("±", "\u00B1"); // plus-minus sign |
460 | 0 | badEntities.put("²", "\u00B2"); // superscript two |
461 | 0 | badEntities.put("³", "\u00B3"); // superscript three |
462 | 0 | badEntities.put("´", "\u00B4"); // acute accent |
463 | 0 | badEntities.put("µ", "\u00B5"); // micro sign |
464 | 0 | badEntities.put("¶", "\u00B6"); // pilcrow sign |
465 | 0 | badEntities.put("·", "\u00B7"); // middle dot |
466 | 0 | badEntities.put("¸", "\u00B8"); // cedilla |
467 | 0 | badEntities.put("¹", "\u00B9"); // superscript one |
468 | 0 | badEntities.put("º", "\u00BA"); // masculine ordinal indicator |
469 | 0 | badEntities.put("»", "\u00BB"); // right-pointing double angle quotation mark |
470 | 0 | badEntities.put("¼", "\u00BC"); // vulgar fraction one quarter |
471 | 0 | badEntities.put("½", "\u00BD"); // vulgar fraction one half |
472 | 0 | badEntities.put("¾", "\u00BE"); // vulgar fraction three quarters |
473 | 0 | badEntities.put("¿", "\u00BF"); // inverted question mark |
474 | 0 | badEntities.put("À", "\u00C0"); // latin capital letter A with grave |
475 | 0 | badEntities.put("Á", "\u00C1"); // latin capital letter A with acute |
476 | 0 | badEntities.put("Â", "\u00C2"); // latin capital letter A with circumflex |
477 | 0 | badEntities.put("Ã", "\u00C3"); // latin capital letter A with tilde |
478 | 0 | badEntities.put("Ä", "\u00C4"); // latin capital letter A with diaeresis |
479 | 0 | badEntities.put("Å", "\u00C5"); // latin capital letter A with ring above |
480 | 0 | badEntities.put("Æ", "\u00C6"); // latin capital letter AE |
481 | 0 | badEntities.put("Ç", "\u00C7"); // latin capital letter C with cedilla |
482 | 0 | badEntities.put("È", "\u00C8"); // latin capital letter E with grave |
483 | 0 | badEntities.put("É", "\u00C9"); // latin capital letter E with acute |
484 | 0 | badEntities.put("Ê", "\u00CA"); // latin capital letter E with circumflex |
485 | 0 | badEntities.put("Ë", "\u00CB"); // latin capital letter E with diaeresis |
486 | 0 | badEntities.put("Ì", "\u00CC"); // latin capital letter I with grave |
487 | 0 | badEntities.put("Í", "\u00CD"); // latin capital letter I with acute |
488 | 0 | badEntities.put("Î", "\u00CE"); // latin capital letter I with circumflex |
489 | 0 | badEntities.put("Ï", "\u00CF"); // latin capital letter I with diaeresis |
490 | 0 | badEntities.put("Ð", "\u00D0"); // latin capital letter ETH |
491 | 0 | badEntities.put("Ñ", "\u00D1"); // latin capital letter N with tilde |
492 | 0 | badEntities.put("Ò", "\u00D2"); // latin capital letter O with grave |
493 | 0 | badEntities.put("Ó", "\u00D3"); // latin capital letter O with acute |
494 | 0 | badEntities.put("Ô", "\u00D4"); // latin capital letter O with circumflex |
495 | 0 | badEntities.put("Õ", "\u00D5"); // latin capital letter O with tilde |
496 | 0 | badEntities.put("Ö", "\u00D6"); // latin capital letter O with diaeresis |
497 | 0 | badEntities.put("×", "\u00D7"); // multiplication sign |
498 | 0 | badEntities.put("Ø", "\u00D8"); // latin capital letter O with stroke |
499 | 0 | badEntities.put("Ù", "\u00D9"); // latin capital letter U with grave |
500 | 0 | badEntities.put("Ú", "\u00DA"); // latin capital letter U with acute |
501 | 0 | badEntities.put("Û", "\u00DB"); // latin capital letter U with circumflex |
502 | 0 | badEntities.put("Ü", "\u00DC"); // latin capital letter U with diaeresis |
503 | 0 | badEntities.put("Ý", "\u00DD"); // latin capital letter Y with acute |
504 | 0 | badEntities.put("Þ", "\u00DE"); // latin capital letter THORN |
505 | 0 | badEntities.put("ß", "\u00DF"); // latin small letter sharp s |
506 | 0 | badEntities.put("à", "\u00E0"); // latin small letter a with grave |
507 | 0 | badEntities.put("á", "\u00E1"); // latin small letter a with acute |
508 | 0 | badEntities.put("â", "\u00E2"); // latin small letter a with circumflex |
509 | 0 | badEntities.put("ã", "\u00E3"); // latin small letter a with tilde |
510 | 0 | badEntities.put("ä", "\u00E4"); // latin small letter a with diaeresis |
511 | 0 | badEntities.put("å", "\u00E5"); // latin small letter a with ring above |
512 | 0 | badEntities.put("æ", "\u00E6"); // latin small letter ae |
513 | 0 | badEntities.put("ç", "\u00E7"); // latin small letter c with cedilla |
514 | 0 | badEntities.put("è", "\u00E8"); // latin small letter e with grave |
515 | 0 | badEntities.put("é", "\u00E9"); // latin small letter e with acute |
516 | 0 | badEntities.put("ê", "\u00EA"); // latin small letter e with circumflex |
517 | 0 | badEntities.put("ë", "\u00EB"); // latin small letter e with diaeresis |
518 | 0 | badEntities.put("ì", "\u00EC"); // latin small letter i with grave |
519 | 0 | badEntities.put("í", "\u00ED"); // latin small letter i with acute |
520 | 0 | badEntities.put("î", "\u00EE"); // latin small letter i with circumflex |
521 | 0 | badEntities.put("ï", "\u00EF"); // latin small letter i with diaeresis |
522 | 0 | badEntities.put("ð", "\u00F0"); // latin small letter eth |
523 | 0 | badEntities.put("ñ", "\u00F1"); // latin small letter n with tilde |
524 | 0 | badEntities.put("ò", "\u00F2"); // latin small letter o with grave |
525 | 0 | badEntities.put("ó", "\u00F3"); // latin small letter o with acute |
526 | 0 | badEntities.put("ô", "\u00F4"); // latin small letter o with circumflex |
527 | 0 | badEntities.put("õ", "\u00F5"); // latin small letter o with tilde |
528 | 0 | badEntities.put("ö", "\u00F6"); // latin small letter o with diaeresis |
529 | 0 | badEntities.put("÷", "\u00F7"); // division sign |
530 | 0 | badEntities.put("ø", "\u00F8"); // latin small letter o with stroke |
531 | 0 | badEntities.put("ù", "\u00F9"); // latin small letter u with grave |
532 | 0 | badEntities.put("ú", "\u00FA"); // latin small letter u with acute |
533 | 0 | badEntities.put("û", "\u00FB"); // latin small letter u with circumflex |
534 | 0 | badEntities.put("ü", "\u00FC"); // latin small letter u with diaeresis |
535 | 0 | badEntities.put("ý", "\u00FD"); // latin small letter y with acute |
536 | 0 | badEntities.put("þ", "\u00FE"); // latin small letter thorn |
537 | 0 | badEntities.put("ÿ", "\u00FF"); // latin small letter y with diaeresis |
538 | } | |
539 | ||
540 | /** | |
541 | * Pattern for numeric entities. | |
542 | */ | |
543 | 0 | private static Pattern validCharacterEntityPattern = Pattern.compile("^&#x?\\d{2,4};"); |
544 | ||
545 | /** | |
546 | * Pattern that negates the allowable XML 4 byte unicode characters. Valid | |
547 | * are: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | | |
548 | * [#x10000-#x10FFFF] | |
549 | */ | |
550 | 0 | private static Pattern invalidCharacterPattern = Pattern.compile("[^\t\r\n\u0020-\uD7FF\uE000-\uFFFD]"); |
551 | ||
552 | /** | |
553 | * Pattern that matches open <br>,<hr> and <img> tags. | |
554 | */ | |
555 | 0 | private static Pattern openHTMLTagPattern = Pattern.compile("<(img|hr|br)([^>]*)(?<!/)>"); |
556 | ||
557 | /** | |
558 | * The log stream | |
559 | */ | |
560 | 0 | private static final Logger log = LoggerFactory.getLogger(XMLUtil.class); |
561 | } |