| Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
| XMLUtil |
|
| 5.833333333333333;5.833 |
| 1 | /** | |
| 2 | * Distribution License: | |
| 3 | * JSword is free software; you can redistribute it and/or modify it under | |
| 4 | * the terms of the GNU Lesser General Public License, version 2.1 or later | |
| 5 | * as published by the Free Software Foundation. This program is distributed | |
| 6 | * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even | |
| 7 | * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | |
| 8 | * See the GNU Lesser General Public License for more details. | |
| 9 | * | |
| 10 | * The License is available on the internet at: | |
| 11 | * http://www.gnu.org/copyleft/lgpl.html | |
| 12 | * or by writing to: | |
| 13 | * Free Software Foundation, Inc. | |
| 14 | * 59 Temple Place - Suite 330 | |
| 15 | * Boston, MA 02111-1307, USA | |
| 16 | * | |
| 17 | * © CrossWire Bible Society, 2005 - 2016 | |
| 18 | * | |
| 19 | */ | |
| 20 | package org.crosswire.common.xml; | |
| 21 | ||
| 22 | import java.io.IOException; | |
| 23 | import java.io.InputStream; | |
| 24 | import java.util.ArrayList; | |
| 25 | import java.util.Collections; | |
| 26 | import java.util.HashSet; | |
| 27 | import java.util.List; | |
| 28 | import java.util.Set; | |
| 29 | import java.util.regex.Matcher; | |
| 30 | import java.util.regex.Pattern; | |
| 31 | ||
| 32 | import org.crosswire.common.util.FileUtil; | |
| 33 | import org.crosswire.common.util.PropertyMap; | |
| 34 | import org.crosswire.common.util.ResourceUtil; | |
| 35 | import org.jdom2.Document; | |
| 36 | import org.jdom2.JDOMException; | |
| 37 | import org.jdom2.input.SAXBuilder; | |
| 38 | import org.jdom2.input.sax.XMLReaders; | |
| 39 | import org.slf4j.Logger; | |
| 40 | import org.slf4j.LoggerFactory; | |
| 41 | import org.xml.sax.Attributes; | |
| 42 | import org.xml.sax.ContentHandler; | |
| 43 | import org.xml.sax.SAXException; | |
| 44 | ||
| 45 | /** | |
| 46 | * Utilities for working with SAX XML parsing. | |
| 47 | * | |
| 48 | * @see gnu.lgpl.License The GNU Lesser General Public License for details. | |
| 49 | * @author Joe Walker | |
| 50 | * @author DM Smith | |
| 51 | */ | |
| 52 | public final class XMLUtil { | |
/**
 * Prevent instantiation: every member of this class is static, so no
 * instance is ever needed.
 */
private XMLUtil() {
    // intentionally empty
}
| 58 | ||
| 59 | /** | |
| 60 | * Get and load an XML file from the classpath and a few other places into a | |
| 61 | * JDOM Document object. | |
| 62 | * | |
| 63 | * @param subject | |
| 64 | * The name of the desired resource (without any extension) | |
| 65 | * @return The requested resource | |
| 66 | * @throws IOException | |
| 67 | * if there is a problem reading the file | |
| 68 | * @throws JDOMException | |
| 69 | * If the resource is not valid XML | |
| 70 | */ | |
| 71 | public static Document getDocument(String subject) throws JDOMException, IOException { | |
| 72 | 0 | String resource = subject + FileUtil.EXTENSION_XML; |
| 73 | 0 | InputStream in = ResourceUtil.getResourceAsStream(resource); |
| 74 | ||
| 75 | 0 | log.debug("Loading {}.xml from classpath: [OK]", subject); |
| 76 | // With JDom 1.x this passed true | |
| 77 | 0 | SAXBuilder builder = new SAXBuilder(XMLReaders.DTDVALIDATING); |
| 78 | 0 | return builder.build(in); |
| 79 | } | |
| 80 | ||
| 81 | /** | |
| 82 | * Serialize a SAXEventProvider into an XML String | |
| 83 | * | |
| 84 | * @param provider | |
| 85 | * The source of SAX events | |
| 86 | * @return a serialized string | |
| 87 | * @throws SAXException | |
| 88 | */ | |
| 89 | public static String writeToString(SAXEventProvider provider) throws SAXException { | |
| 90 | 0 | ContentHandler ser = new PrettySerializingContentHandler(); |
| 91 | 0 | provider.provideSAXEvents(ser); |
| 92 | 0 | return ser.toString(); |
| 93 | } | |
| 94 | ||
| 95 | /** | |
| 96 | * Get the full name of the attribute, including the namespace if any. | |
| 97 | * | |
| 98 | * @param attrs | |
| 99 | * the collection of attributes | |
| 100 | * @param index | |
| 101 | * the index of the desired attribute | |
| 102 | * @return the requested attribute | |
| 103 | */ | |
| 104 | public static String getAttributeName(Attributes attrs, int index) { | |
| 105 | 0 | String qName = attrs.getQName(index); |
| 106 | 0 | if (qName != null) { |
| 107 | 0 | return qName; |
| 108 | } | |
| 109 | 0 | return attrs.getLocalName(index); |
| 110 | } | |
| 111 | ||
| 112 | /** | |
| 113 | * Show the attributes of an element as debug | |
| 114 | * @param attrs | |
| 115 | */ | |
| 116 | public static void debugSAXAttributes(Attributes attrs) { | |
| 117 | 0 | for (int i = 0; i < attrs.getLength(); i++) { |
| 118 | 0 | log.debug("attr[{}]: {}={}", Integer.toString(i), attrs.getQName(i), attrs.getValue(i)); |
| 119 | } | |
| 120 | 0 | } |
| 121 | ||
| 122 | /** | |
| 123 | * Normalizes the given string | |
| 124 | * @param s | |
| 125 | * @return the escaped string | |
| 126 | */ | |
| 127 | public static String escape(String s) { | |
| 128 | 0 | if (s == null) { |
| 129 | 0 | return s; |
| 130 | } | |
| 131 | 0 | int len = s.length(); |
| 132 | 0 | StringBuilder str = new StringBuilder(len); |
| 133 | ||
| 134 | 0 | for (int i = 0; i < len; i++) { |
| 135 | 0 | char ch = s.charAt(i); |
| 136 | 0 | switch (ch) { |
| 137 | case '<': | |
| 138 | 0 | str.append("<"); |
| 139 | 0 | break; |
| 140 | ||
| 141 | case '>': | |
| 142 | 0 | str.append(">"); |
| 143 | 0 | break; |
| 144 | ||
| 145 | case '&': | |
| 146 | 0 | str.append("&"); |
| 147 | 0 | break; |
| 148 | ||
| 149 | case '"': | |
| 150 | 0 | str.append("""); |
| 151 | 0 | break; |
| 152 | ||
| 153 | default: | |
| 154 | 0 | str.append(ch); |
| 155 | } | |
| 156 | } | |
| 157 | ||
| 158 | 0 | return str.toString(); |
| 159 | } | |
| 160 | ||
| 161 | /** | |
| 162 | * For each entity in the input that is not allowed in XML, replace the | |
| 163 | * entity with its unicode equivalent or remove it. For each instance of a | |
| 164 | * bare &, replace it with &<br> | |
| 165 | * XML only allows 4 entities: &amp;, &quot;, &lt; and &gt;. | |
| 166 | * | |
| 167 | * @param broken | |
| 168 | * the string to handle entities | |
| 169 | * @return the string with entities appropriately fixed up | |
| 170 | */ | |
| 171 | public static String cleanAllEntities(String broken) { | |
| 172 | 0 | if (broken == null) { |
| 173 | 0 | return null; |
| 174 | } | |
| 175 | ||
| 176 | 0 | String working = broken; |
| 177 | 0 | int cleanfrom = 0; |
| 178 | ||
| 179 | while (true) { | |
| 180 | 0 | int amp = working.indexOf('&', cleanfrom); |
| 181 | ||
| 182 | // If there are no more amps then we are done | |
| 183 | 0 | if (amp == -1) { |
| 184 | 0 | break; |
| 185 | } | |
| 186 | ||
| 187 | // Skip references of the kind &#ddd; | |
| 188 | 0 | if (validCharacterEntityPattern.matcher(working.substring(amp)).find()) { |
| 189 | 0 | cleanfrom = working.indexOf(';', amp) + 1; |
| 190 | 0 | continue; |
| 191 | } | |
| 192 | ||
| 193 | 0 | int i = amp + 1; |
| 194 | while (true) { | |
| 195 | // if we are at the end of the string then just escape the '&'; | |
| 196 | 0 | if (i >= working.length()) { |
| 197 | // String entity = working.substring(amp); | |
| 198 | // String replace = guessEntity(entity); | |
| 199 | // DataPolice.report("replacing unterminated entity: '" + | |
| 200 | // entity + "' with: '" + replace + "'"); | |
| 201 | ||
| 202 | 0 | return working.substring(0, amp) + "&" + working.substring(amp + 1); |
| 203 | } | |
| 204 | ||
| 205 | // if we have come to a ; then we have an entity | |
| 206 | // If it is something that xml can't handle then replace it. | |
| 207 | 0 | char c = working.charAt(i); |
| 208 | 0 | if (c == ';') { |
| 209 | 0 | String entity = working.substring(amp, i + 1); |
| 210 | 0 | String replace = handleEntity(entity); |
| 211 | // log.warn("replacing entity: '{}' with: '{}'", entity, replace); | |
| 212 | ||
| 213 | 0 | working = working.substring(0, amp) + replace + working.substring(i + 1); |
| 214 | 0 | break; |
| 215 | } | |
| 216 | ||
| 217 | // Did we end an entity without finding a closing ; | |
| 218 | // Then treat it as an '&' that needs to be replaced with & | |
| 219 | 0 | if (!Character.isLetterOrDigit(c)) { |
| 220 | // String entity = working.substring(amp, i); | |
| 221 | // String replace = "&" + working.substring(amp + 1, i); | |
| 222 | // log.warn("replacing invalid entity: '{}' with: '{}': {}", entity, replace, broken); | |
| 223 | ||
| 224 | 0 | working = working.substring(0, amp) + "&" + working.substring(amp + 1); |
| 225 | 0 | amp = i + 4; // account for the 4 extra characters |
| 226 | 0 | break; |
| 227 | } | |
| 228 | ||
| 229 | 0 | i++; |
| 230 | 0 | } |
| 231 | ||
| 232 | 0 | cleanfrom = amp + 1; |
| 233 | 0 | } |
| 234 | ||
| 235 | 0 | return working; |
| 236 | } | |
| 237 | ||
| 238 | /** | |
| 239 | * Remove all invalid characters in the input, replacing them with a space. XML has stringent | |
| 240 | * requirements as to which characters are or are not allowed. The set of | |
| 241 | * allowable characters are:<br> | |
| 242 | * #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]<br> | |
| 243 | * Note: Java handles to \uFFFF | |
| 244 | * | |
| 245 | * @param broken | |
| 246 | * the string to be cleaned | |
| 247 | * @return the cleaned string | |
| 248 | */ | |
| 249 | public static String cleanAllCharacters(String broken) { | |
| 250 | 0 | return invalidCharacterPattern.matcher(broken).replaceAll(" "); |
| 251 | } | |
| 252 | ||
| 253 | /** | |
| 254 | * Strip all closing tags from the end of the XML fragment, and then | |
| 255 | * re-close all tags that are open at the end of the string. | |
| 256 | * | |
| 257 | * @param broken | |
| 258 | * the string to be cleaned. | |
| 259 | * @return cleaned string, or {@code null} if the string could not be | |
| 260 | * cleaned due to more broken XML | |
| 261 | */ | |
| 262 | public static String recloseTags(String broken) { | |
| 263 | 0 | String result = broken; |
| 264 | // remove closing tags from the end | |
| 265 | 0 | while (result.matches(".*</[a-zA-Z]+>[ \t\r\n]*")) { |
| 266 | 0 | result = result.substring(0, result.lastIndexOf('<')); |
| 267 | } | |
| 268 | // close tags again | |
| 269 | 0 | List<String> openTags = new ArrayList<String>(); |
| 270 | 0 | Matcher m = Pattern.compile("</?[a-zA-Z]+").matcher(result); |
| 271 | 0 | boolean lTagFound = false; |
| 272 | 0 | boolean lgTagFound = false; |
| 273 | 0 | while (m.find()) { |
| 274 | 0 | String match = m.group(); |
| 275 | 0 | if (match.startsWith("</")) { |
| 276 | 0 | if (openTags.size() == 0 && "</l".equals(match) && !lTagFound) { |
| 277 | 0 | return recloseTags("<l>" + broken); |
| 278 | } | |
| 279 | 0 | if (openTags.size() == 0 && "</lg".equals(match) && !lgTagFound) { |
| 280 | 0 | return recloseTags("<lg>" + broken); |
| 281 | } | |
| 282 | 0 | if (openTags.size() == 0) { |
| 283 | 0 | return null; |
| 284 | } | |
| 285 | 0 | String lastTag = openTags.remove(openTags.size() - 1); |
| 286 | 0 | if (!("</" + lastTag).equals(match)) { |
| 287 | 0 | return null; |
| 288 | } | |
| 289 | 0 | } else { |
| 290 | 0 | int closePos = result.indexOf('>', m.end()); |
| 291 | 0 | if (closePos == -1) { |
| 292 | 0 | return null; |
| 293 | } | |
| 294 | 0 | while (Character.isWhitespace(result.charAt(closePos - 1))) { |
| 295 | 0 | --closePos; |
| 296 | } | |
| 297 | 0 | if (result.charAt(closePos - 1) != '/') { |
| 298 | 0 | if ("<l".equals(match)) { |
| 299 | 0 | lTagFound = true; |
| 300 | } | |
| 301 | 0 | if ("<lg".equals(match)) { |
| 302 | 0 | lgTagFound = true; |
| 303 | } | |
| 304 | 0 | openTags.add(match.substring(1)); |
| 305 | } | |
| 306 | } | |
| 307 | 0 | } |
| 308 | 0 | Collections.reverse(openTags); |
| 309 | 0 | for (String openTag : openTags) { |
| 310 | 0 | result += "</" + openTag + ">"; |
| 311 | } | |
| 312 | 0 | return result; |
| 313 | } | |
| 314 | ||
| 315 | /** | |
| 316 | * Common HTML tags such as <br>,<hr> and <img> may be | |
| 317 | * left open causing XML parsing to fail. This method closes these tags. | |
| 318 | * | |
| 319 | * @param broken | |
| 320 | * the string to be cleaned | |
| 321 | * @return the cleaned string | |
| 322 | */ | |
| 323 | public static String closeEmptyTags(String broken) { | |
| 324 | 0 | if (broken == null) { |
| 325 | 0 | return null; |
| 326 | } | |
| 327 | ||
| 328 | 0 | return openHTMLTagPattern.matcher(broken).replaceAll("<$1$2/>"); |
| 329 | } | |
| 330 | ||
| 331 | /** | |
| 332 | * XML parse failed, so we can try getting rid of all the tags and having | |
| 333 | * another go. We define a tag to start at a < and end at the end of the | |
| 334 | * next word (where a word is what comes in between spaces) that does not | |
| 335 | * contain an = sign, or at a >, whichever is earlier. | |
| 336 | * @param broken | |
| 337 | * @return the string without any tags | |
| 338 | */ | |
| 339 | public static String cleanAllTags(String broken) { | |
| 340 | 0 | if (broken == null) { |
| 341 | 0 | return null; |
| 342 | } | |
| 343 | ||
| 344 | 0 | String working = broken; |
| 345 | ||
| 346 | allTags: while (true) { | |
| 347 | 0 | int lt = working.indexOf('<'); |
| 348 | ||
| 349 | // If there are no more amps then we are done | |
| 350 | 0 | if (lt == -1) { |
| 351 | 0 | break allTags; |
| 352 | } | |
| 353 | ||
| 354 | // loop to find the end of this tag | |
| 355 | 0 | int i = lt; |
| 356 | 0 | int startattr = -1; |
| 357 | ||
| 358 | singletag: while (true) { | |
| 359 | 0 | i++; |
| 360 | ||
| 361 | // the tag can't exist past the end of the string | |
| 362 | 0 | if (i >= working.length()) { |
| 363 | // go back one so we can safely chop | |
| 364 | 0 | i--; |
| 365 | 0 | break singletag; |
| 366 | } | |
| 367 | ||
| 368 | 0 | char c = working.charAt(i); |
| 369 | ||
| 370 | // normal end of tag | |
| 371 | 0 | if (c == '>') { |
| 372 | 0 | break singletag; |
| 373 | } | |
| 374 | ||
| 375 | // we declare end-of-tag if this 'word' is not an attribute | |
| 376 | 0 | if (c == ' ') { |
| 377 | 0 | if (startattr == -1) { |
| 378 | // NOTE(joe): should we skip over consecutive spaces? | |
| 379 | 0 | startattr = i; |
| 380 | } else { | |
| 381 | // so we've already had a space indicating start of | |
| 382 | // attribute, so this must be the beginning of the next | |
| 383 | // NOTE(joe): no - spaces can exist in attr values | |
| 384 | 0 | String value = working.substring(startattr, i); |
| 385 | 0 | if (value.indexOf('=') == -1) { |
| 386 | // this 'attribute' does not contain an equals so | |
| 387 | // we call it a word and end the parse | |
| 388 | 0 | break singletag; |
| 389 | } | |
| 390 | } | |
| 391 | } | |
| 392 | 0 | } |
| 393 | ||
| 394 | // So we have the end of the tag, delete it, but leave a space in it's place | |
| 395 | // DataPolice.report("discarding tag: " + working.substring(lt, i + 1)); | |
| 396 | 0 | working = working.substring(0, lt) + " " + working.substring(i + 1); |
| 397 | 0 | } |
| 398 | ||
| 399 | 0 | return working; |
| 400 | } | |
| 401 | ||
| 402 | /** | |
| 403 | * Replace entity with its unicode equivalent, if it is not a valid XML | |
| 404 | * entity. Otherwise strip it out. XML only allows 4 entities: &amp;, | |
| 405 | * &quot;, &lt; and &gt;. | |
| 406 | * | |
| 407 | * @param entity | |
| 408 | * the entity to be replaced | |
| 409 | * @return the substitution for the entity, either itself, the unicode | |
| 410 | * equivalent or an empty string. | |
| 411 | */ | |
| 412 | private static String handleEntity(String entity) { | |
| 413 | 0 | if (goodEntities.contains(entity)) { |
| 414 | 0 | return entity; |
| 415 | } | |
| 416 | ||
| 417 | 0 | String replace = badEntities.get(entity); |
| 418 | 0 | if (replace != null) { |
| 419 | 0 | return replace; |
| 420 | } | |
| 421 | ||
| 422 | // replace unknown entities with a space | |
| 423 | 0 | return " "; |
| 424 | } | |
| 425 | ||
| 426 | // Map entities to their unicode equivalent | |
| 427 | 0 | private static Set<String> goodEntities = new HashSet<String>(); |
| 428 | 0 | private static PropertyMap badEntities = new PropertyMap(); |
| 429 | static { | |
| 430 | // pre-defined XML entities | |
| 431 | 0 | goodEntities.add("""); // quotation mark |
| 432 | 0 | goodEntities.add("&"); // ampersand |
| 433 | 0 | goodEntities.add("<"); // less-than sign |
| 434 | 0 | goodEntities.add(">"); // greater-than sign |
| 435 | ||
| 436 | // misc entities | |
| 437 | 0 | badEntities.put("€", "\u20AC"); // euro |
| 438 | 0 | badEntities.put("‘", "\u2018"); // left single quotation mark |
| 439 | 0 | badEntities.put("’", "\u2019"); // right single quotation mark |
| 440 | ||
| 441 | // Latin 1 entities | |
| 442 | 0 | badEntities.put(" ", "\u00A0"); // no-break space |
| 443 | 0 | badEntities.put("¡", "\u00A1"); // inverted exclamation mark |
| 444 | 0 | badEntities.put("¢", "\u00A2"); // cent sign |
| 445 | 0 | badEntities.put("£", "\u00A3"); // pound sign |
| 446 | 0 | badEntities.put("¤", "\u00A4"); // currency sign |
| 447 | 0 | badEntities.put("¥", "\u00A5"); // yen sign |
| 448 | 0 | badEntities.put("¦", "\u00A6"); // broken vertical bar |
| 449 | 0 | badEntities.put("§", "\u00A7"); // section sign |
| 450 | 0 | badEntities.put("¨", "\u00A8"); // diaeresis |
| 451 | 0 | badEntities.put("©", "\u00A9"); // copyright sign |
| 452 | 0 | badEntities.put("ª", "\u00AA"); // feminine ordinal indicator |
| 453 | 0 | badEntities.put("«", "\u00AB"); // left-pointing double angle quotation mark |
| 454 | 0 | badEntities.put("¬", "\u00AC"); // not sign |
| 455 | 0 | badEntities.put("­", "\u00AD"); // soft hyphen |
| 456 | 0 | badEntities.put("®", "\u00AE"); // registered sign |
| 457 | 0 | badEntities.put("¯", "\u00AF"); // macron |
| 458 | 0 | badEntities.put("°", "\u00B0"); // degree sign |
| 459 | 0 | badEntities.put("±", "\u00B1"); // plus-minus sign |
| 460 | 0 | badEntities.put("²", "\u00B2"); // superscript two |
| 461 | 0 | badEntities.put("³", "\u00B3"); // superscript three |
| 462 | 0 | badEntities.put("´", "\u00B4"); // acute accent |
| 463 | 0 | badEntities.put("µ", "\u00B5"); // micro sign |
| 464 | 0 | badEntities.put("¶", "\u00B6"); // pilcrow sign |
| 465 | 0 | badEntities.put("·", "\u00B7"); // middle dot |
| 466 | 0 | badEntities.put("¸", "\u00B8"); // cedilla |
| 467 | 0 | badEntities.put("¹", "\u00B9"); // superscript one |
| 468 | 0 | badEntities.put("º", "\u00BA"); // masculine ordinal indicator |
| 469 | 0 | badEntities.put("»", "\u00BB"); // right-pointing double angle quotation mark |
| 470 | 0 | badEntities.put("¼", "\u00BC"); // vulgar fraction one quarter |
| 471 | 0 | badEntities.put("½", "\u00BD"); // vulgar fraction one half |
| 472 | 0 | badEntities.put("¾", "\u00BE"); // vulgar fraction three quarters |
| 473 | 0 | badEntities.put("¿", "\u00BF"); // inverted question mark |
| 474 | 0 | badEntities.put("À", "\u00C0"); // latin capital letter A with grave |
| 475 | 0 | badEntities.put("Á", "\u00C1"); // latin capital letter A with acute |
| 476 | 0 | badEntities.put("Â", "\u00C2"); // latin capital letter A with circumflex |
| 477 | 0 | badEntities.put("Ã", "\u00C3"); // latin capital letter A with tilde |
| 478 | 0 | badEntities.put("Ä", "\u00C4"); // latin capital letter A with diaeresis |
| 479 | 0 | badEntities.put("Å", "\u00C5"); // latin capital letter A with ring above |
| 480 | 0 | badEntities.put("Æ", "\u00C6"); // latin capital letter AE |
| 481 | 0 | badEntities.put("Ç", "\u00C7"); // latin capital letter C with cedilla |
| 482 | 0 | badEntities.put("È", "\u00C8"); // latin capital letter E with grave |
| 483 | 0 | badEntities.put("É", "\u00C9"); // latin capital letter E with acute |
| 484 | 0 | badEntities.put("Ê", "\u00CA"); // latin capital letter E with circumflex |
| 485 | 0 | badEntities.put("Ë", "\u00CB"); // latin capital letter E with diaeresis |
| 486 | 0 | badEntities.put("Ì", "\u00CC"); // latin capital letter I with grave |
| 487 | 0 | badEntities.put("Í", "\u00CD"); // latin capital letter I with acute |
| 488 | 0 | badEntities.put("Î", "\u00CE"); // latin capital letter I with circumflex |
| 489 | 0 | badEntities.put("Ï", "\u00CF"); // latin capital letter I with diaeresis |
| 490 | 0 | badEntities.put("Ð", "\u00D0"); // latin capital letter ETH |
| 491 | 0 | badEntities.put("Ñ", "\u00D1"); // latin capital letter N with tilde |
| 492 | 0 | badEntities.put("Ò", "\u00D2"); // latin capital letter O with grave |
| 493 | 0 | badEntities.put("Ó", "\u00D3"); // latin capital letter O with acute |
| 494 | 0 | badEntities.put("Ô", "\u00D4"); // latin capital letter O with circumflex |
| 495 | 0 | badEntities.put("Õ", "\u00D5"); // latin capital letter O with tilde |
| 496 | 0 | badEntities.put("Ö", "\u00D6"); // latin capital letter O with diaeresis |
| 497 | 0 | badEntities.put("×", "\u00D7"); // multiplication sign |
| 498 | 0 | badEntities.put("Ø", "\u00D8"); // latin capital letter O with stroke |
| 499 | 0 | badEntities.put("Ù", "\u00D9"); // latin capital letter U with grave |
| 500 | 0 | badEntities.put("Ú", "\u00DA"); // latin capital letter U with acute |
| 501 | 0 | badEntities.put("Û", "\u00DB"); // latin capital letter U with circumflex |
| 502 | 0 | badEntities.put("Ü", "\u00DC"); // latin capital letter U with diaeresis |
| 503 | 0 | badEntities.put("Ý", "\u00DD"); // latin capital letter Y with acute |
| 504 | 0 | badEntities.put("Þ", "\u00DE"); // latin capital letter THORN |
| 505 | 0 | badEntities.put("ß", "\u00DF"); // latin small letter sharp s |
| 506 | 0 | badEntities.put("à", "\u00E0"); // latin small letter a with grave |
| 507 | 0 | badEntities.put("á", "\u00E1"); // latin small letter a with acute |
| 508 | 0 | badEntities.put("â", "\u00E2"); // latin small letter a with circumflex |
| 509 | 0 | badEntities.put("ã", "\u00E3"); // latin small letter a with tilde |
| 510 | 0 | badEntities.put("ä", "\u00E4"); // latin small letter a with diaeresis |
| 511 | 0 | badEntities.put("å", "\u00E5"); // latin small letter a with ring above |
| 512 | 0 | badEntities.put("æ", "\u00E6"); // latin small letter ae |
| 513 | 0 | badEntities.put("ç", "\u00E7"); // latin small letter c with cedilla |
| 514 | 0 | badEntities.put("è", "\u00E8"); // latin small letter e with grave |
| 515 | 0 | badEntities.put("é", "\u00E9"); // latin small letter e with acute |
| 516 | 0 | badEntities.put("ê", "\u00EA"); // latin small letter e with circumflex |
| 517 | 0 | badEntities.put("ë", "\u00EB"); // latin small letter e with diaeresis |
| 518 | 0 | badEntities.put("ì", "\u00EC"); // latin small letter i with grave |
| 519 | 0 | badEntities.put("í", "\u00ED"); // latin small letter i with acute |
| 520 | 0 | badEntities.put("î", "\u00EE"); // latin small letter i with circumflex |
| 521 | 0 | badEntities.put("ï", "\u00EF"); // latin small letter i with diaeresis |
| 522 | 0 | badEntities.put("ð", "\u00F0"); // latin small letter eth |
| 523 | 0 | badEntities.put("ñ", "\u00F1"); // latin small letter n with tilde |
| 524 | 0 | badEntities.put("ò", "\u00F2"); // latin small letter o with grave |
| 525 | 0 | badEntities.put("ó", "\u00F3"); // latin small letter o with acute |
| 526 | 0 | badEntities.put("ô", "\u00F4"); // latin small letter o with circumflex |
| 527 | 0 | badEntities.put("õ", "\u00F5"); // latin small letter o with tilde |
| 528 | 0 | badEntities.put("ö", "\u00F6"); // latin small letter o with diaeresis |
| 529 | 0 | badEntities.put("÷", "\u00F7"); // division sign |
| 530 | 0 | badEntities.put("ø", "\u00F8"); // latin small letter o with stroke |
| 531 | 0 | badEntities.put("ù", "\u00F9"); // latin small letter u with grave |
| 532 | 0 | badEntities.put("ú", "\u00FA"); // latin small letter u with acute |
| 533 | 0 | badEntities.put("û", "\u00FB"); // latin small letter u with circumflex |
| 534 | 0 | badEntities.put("ü", "\u00FC"); // latin small letter u with diaeresis |
| 535 | 0 | badEntities.put("ý", "\u00FD"); // latin small letter y with acute |
| 536 | 0 | badEntities.put("þ", "\u00FE"); // latin small letter thorn |
| 537 | 0 | badEntities.put("ÿ", "\u00FF"); // latin small letter y with diaeresis |
| 538 | } | |
| 539 | ||
| 540 | /** | |
| 541 | * Pattern for numeric entities. | |
| 542 | */ | |
| 543 | 0 | private static Pattern validCharacterEntityPattern = Pattern.compile("^&#x?\\d{2,4};"); |
| 544 | ||
| 545 | /** | |
| 546 | * Pattern that negates the allowable XML 4 byte unicode characters. Valid | |
| 547 | * are: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | | |
| 548 | * [#x10000-#x10FFFF] | |
| 549 | */ | |
| 550 | 0 | private static Pattern invalidCharacterPattern = Pattern.compile("[^\t\r\n\u0020-\uD7FF\uE000-\uFFFD]"); |
| 551 | ||
| 552 | /** | |
| 553 | * Pattern that matches open <br>,<hr> and <img> tags. | |
| 554 | */ | |
| 555 | 0 | private static Pattern openHTMLTagPattern = Pattern.compile("<(img|hr|br)([^>]*)(?<!/)>"); |
| 556 | ||
/**
 * The log stream: the SLF4J logger for this class, used by the static
 * helpers above for debug output.
 */
private static final Logger log = LoggerFactory.getLogger(XMLUtil.class);
| 561 | } |