Coverage Report - org.crosswire.common.xml.XMLUtil
 
Classes in this File Line Coverage Branch Coverage Complexity
XMLUtil
0%
0/242
0%
0/79
5.833
 
 1  
 /**
 2  
  * Distribution License:
 3  
  * JSword is free software; you can redistribute it and/or modify it under
 4  
  * the terms of the GNU Lesser General Public License, version 2.1 or later
 5  
  * as published by the Free Software Foundation. This program is distributed
 6  
  * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
 7  
  * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 8  
  * See the GNU Lesser General Public License for more details.
 9  
  *
 10  
  * The License is available on the internet at:
 11  
  *      http://www.gnu.org/copyleft/lgpl.html
 12  
  * or by writing to:
 13  
  *      Free Software Foundation, Inc.
 14  
  *      59 Temple Place - Suite 330
 15  
  *      Boston, MA 02111-1307, USA
 16  
  *
 17  
  * © CrossWire Bible Society, 2005 - 2016
 18  
  *
 19  
  */
 20  
 package org.crosswire.common.xml;
 21  
 
 22  
 import java.io.IOException;
 23  
 import java.io.InputStream;
 24  
 import java.util.ArrayList;
 25  
 import java.util.Collections;
 26  
 import java.util.HashSet;
 27  
 import java.util.List;
 28  
 import java.util.Set;
 29  
 import java.util.regex.Matcher;
 30  
 import java.util.regex.Pattern;
 31  
 
 32  
 import org.crosswire.common.util.FileUtil;
 33  
 import org.crosswire.common.util.PropertyMap;
 34  
 import org.crosswire.common.util.ResourceUtil;
 35  
 import org.jdom2.Document;
 36  
 import org.jdom2.JDOMException;
 37  
 import org.jdom2.input.SAXBuilder;
 38  
 import org.jdom2.input.sax.XMLReaders;
 39  
 import org.slf4j.Logger;
 40  
 import org.slf4j.LoggerFactory;
 41  
 import org.xml.sax.Attributes;
 42  
 import org.xml.sax.ContentHandler;
 43  
 import org.xml.sax.SAXException;
 44  
 
 45  
 /**
 46  
  * Utilities for working with SAX XML parsing.
 47  
  * 
 48  
  * @see gnu.lgpl.License The GNU Lesser General Public License for details.
 49  
  * @author Joe Walker
 50  
  * @author DM Smith
 51  
  */
 52  
 public final class XMLUtil {
 53  
     /**
 54  
      * Prevent instantiation
 55  
      */
 56  0
     private XMLUtil() {
 57  0
     }
 58  
 
 59  
     /**
 60  
      * Get and load an XML file from the classpath and a few other places into a
 61  
      * JDOM Document object.
 62  
      * 
 63  
      * @param subject
 64  
      *            The name of the desired resource (without any extension)
 65  
      * @return The requested resource
 66  
      * @throws IOException
 67  
      *             if there is a problem reading the file
 68  
      * @throws JDOMException
 69  
      *             If the resource is not valid XML
 70  
      */
 71  
     public static Document getDocument(String subject) throws JDOMException, IOException {
 72  0
         String resource = subject + FileUtil.EXTENSION_XML;
 73  0
         InputStream in = ResourceUtil.getResourceAsStream(resource);
 74  
 
 75  0
         log.debug("Loading {}.xml from classpath: [OK]", subject);
 76  
         // With JDom 1.x this passed true
 77  0
         SAXBuilder builder = new SAXBuilder(XMLReaders.DTDVALIDATING);
 78  0
         return builder.build(in);
 79  
     }
 80  
 
 81  
     /**
 82  
      * Serialize a SAXEventProvider into an XML String
 83  
      * 
 84  
      * @param provider
 85  
      *            The source of SAX events
 86  
      * @return a serialized string
 87  
      * @throws SAXException 
 88  
      */
 89  
     public static String writeToString(SAXEventProvider provider) throws SAXException {
 90  0
         ContentHandler ser = new PrettySerializingContentHandler();
 91  0
         provider.provideSAXEvents(ser);
 92  0
         return ser.toString();
 93  
     }
 94  
 
 95  
     /**
 96  
      * Get the full name of the attribute, including the namespace if any.
 97  
      * 
 98  
      * @param attrs
 99  
      *            the collection of attributes
 100  
      * @param index
 101  
      *            the index of the desired attribute
 102  
      * @return the requested attribute
 103  
      */
 104  
     public static String getAttributeName(Attributes attrs, int index) {
 105  0
         String qName = attrs.getQName(index);
 106  0
         if (qName != null) {
 107  0
             return qName;
 108  
         }
 109  0
         return attrs.getLocalName(index);
 110  
     }
 111  
 
 112  
     /**
 113  
      * Show the attributes of an element as debug
 114  
      * @param attrs 
 115  
      */
 116  
     public static void debugSAXAttributes(Attributes attrs) {
 117  0
         for (int i = 0; i < attrs.getLength(); i++) {
 118  0
             log.debug("attr[{}]: {}={}", Integer.toString(i), attrs.getQName(i), attrs.getValue(i));
 119  
         }
 120  0
     }
 121  
 
 122  
     /**
 123  
      * Normalizes the given string
 124  
      * @param s 
 125  
      * @return the escaped string
 126  
      */
 127  
     public static String escape(String s) {
 128  0
         if (s == null) {
 129  0
             return s;
 130  
         }
 131  0
         int len = s.length();
 132  0
         StringBuilder str = new StringBuilder(len);
 133  
 
 134  0
         for (int i = 0; i < len; i++) {
 135  0
             char ch = s.charAt(i);
 136  0
             switch (ch) {
 137  
             case '<':
 138  0
                 str.append("&lt;");
 139  0
                 break;
 140  
 
 141  
             case '>':
 142  0
                 str.append("&gt;");
 143  0
                 break;
 144  
 
 145  
             case '&':
 146  0
                 str.append("&amp;");
 147  0
                 break;
 148  
 
 149  
             case '"':
 150  0
                 str.append("&quot;");
 151  0
                 break;
 152  
 
 153  
             default:
 154  0
                 str.append(ch);
 155  
             }
 156  
         }
 157  
 
 158  0
         return str.toString();
 159  
     }
 160  
 
 161  
     /**
 162  
      * For each entity in the input that is not allowed in XML, replace the
 163  
      * entity with its unicode equivalent or remove it. For each instance of a
 164  
      * bare &, replace it with &amp;<br>
 165  
      * XML only allows 4 entities: &amp;amp;, &amp;quot;, &amp;lt; and &amp;gt;.
 166  
      * 
 167  
      * @param broken
 168  
      *            the string to handle entities
 169  
      * @return the string with entities appropriately fixed up
 170  
      */
 171  
     public static String cleanAllEntities(String broken) {
 172  0
         if (broken == null) {
 173  0
             return null;
 174  
         }
 175  
 
 176  0
         String working = broken;
 177  0
         int cleanfrom = 0;
 178  
 
 179  
         while (true) {
 180  0
             int amp = working.indexOf('&', cleanfrom);
 181  
 
 182  
             // If there are no more amps then we are done
 183  0
             if (amp == -1) {
 184  0
                 break;
 185  
             }
 186  
 
 187  
             // Skip references of the kind &#ddd;
 188  0
             if (validCharacterEntityPattern.matcher(working.substring(amp)).find()) {
 189  0
                 cleanfrom = working.indexOf(';', amp) + 1;
 190  0
                 continue;
 191  
             }
 192  
 
 193  0
             int i = amp + 1;
 194  
             while (true) {
 195  
                 // if we are at the end of the string then just escape the '&';
 196  0
                 if (i >= working.length()) {
 197  
                     // String entity = working.substring(amp);
 198  
                     // String replace = guessEntity(entity);
 199  
                     // DataPolice.report("replacing unterminated entity: '" +
 200  
                     // entity + "' with: '" + replace + "'");
 201  
 
 202  0
                     return working.substring(0, amp) + "&amp;" + working.substring(amp + 1);
 203  
                 }
 204  
 
 205  
                 // if we have come to a ; then we have an entity
 206  
                 // If it is something that xml can't handle then replace it.
 207  0
                 char c = working.charAt(i);
 208  0
                 if (c == ';') {
 209  0
                     String entity = working.substring(amp, i + 1);
 210  0
                     String replace = handleEntity(entity);
 211  
                     // log.warn("replacing entity: '{}' with: '{}'", entity, replace);
 212  
 
 213  0
                     working = working.substring(0, amp) + replace + working.substring(i + 1);
 214  0
                     break;
 215  
                 }
 216  
 
 217  
                 // Did we end an entity without finding a closing ;
 218  
                 // Then treat it as an '&' that needs to be replaced with &amp;
 219  0
                 if (!Character.isLetterOrDigit(c)) {
 220  
                     // String entity = working.substring(amp, i);
 221  
                     // String replace = "&amp;" + working.substring(amp + 1, i);
 222  
                     // log.warn("replacing invalid entity: '{}' with: '{}': {}", entity, replace, broken);
 223  
 
 224  0
                     working = working.substring(0, amp) + "&amp;" + working.substring(amp + 1);
 225  0
                     amp = i + 4; // account for the 4 extra characters
 226  0
                     break;
 227  
                 }
 228  
 
 229  0
                 i++;
 230  0
             }
 231  
 
 232  0
             cleanfrom = amp + 1;
 233  0
         }
 234  
 
 235  0
         return working;
 236  
     }
 237  
 
 238  
     /**
 239  
      * Remove all invalid characters in the input, replacing them with a space. XML has stringent
 240  
      * requirements as to which characters are or are not allowed. The set of
 241  
      * allowable characters are:<br>
 242  
      * #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]<br>
 243  
      * Note: Java handles to \uFFFF
 244  
      * 
 245  
      * @param broken
 246  
      *            the string to be cleaned
 247  
      * @return the cleaned string
 248  
      */
 249  
     public static String cleanAllCharacters(String broken) {
 250  0
         return invalidCharacterPattern.matcher(broken).replaceAll(" ");
 251  
     }
 252  
 
 253  
     /**
 254  
      * Strip all closing tags from the end of the XML fragment, and then
 255  
      * re-close all tags that are open at the end of the string.
 256  
      * 
 257  
      * @param broken
 258  
      *            the string to be cleaned.
 259  
      * @return cleaned string, or {@code null} if the string could not be
 260  
      *         cleaned due to more broken XML
 261  
      */
 262  
     public static String recloseTags(String broken) {
 263  0
         String result = broken;
 264  
         // remove closing tags from the end
 265  0
         while (result.matches(".*</[a-zA-Z]+>[ \t\r\n]*")) {
 266  0
             result = result.substring(0, result.lastIndexOf('<'));
 267  
         }
 268  
         // close tags again
 269  0
         List<String> openTags = new ArrayList<String>();
 270  0
         Matcher m = Pattern.compile("</?[a-zA-Z]+").matcher(result);
 271  0
         boolean lTagFound = false;
 272  0
         boolean lgTagFound = false;
 273  0
         while (m.find()) {
 274  0
             String match = m.group();
 275  0
             if (match.startsWith("</")) {
 276  0
                 if (openTags.size() == 0 && "</l".equals(match) && !lTagFound) {
 277  0
                     return recloseTags("<l>" + broken);
 278  
                 }
 279  0
                 if (openTags.size() == 0 && "</lg".equals(match) && !lgTagFound) {
 280  0
                     return recloseTags("<lg>" + broken);
 281  
                 }
 282  0
                 if (openTags.size() == 0) {
 283  0
                     return null;
 284  
                 }
 285  0
                 String lastTag = openTags.remove(openTags.size() - 1);
 286  0
                 if (!("</" + lastTag).equals(match)) {
 287  0
                     return null;
 288  
                 }
 289  0
             } else {
 290  0
                 int closePos = result.indexOf('>', m.end());
 291  0
                 if (closePos == -1) {
 292  0
                     return null;
 293  
                 }
 294  0
                 while (Character.isWhitespace(result.charAt(closePos - 1))) {
 295  0
                     --closePos;
 296  
                 }
 297  0
                 if (result.charAt(closePos - 1) != '/') {
 298  0
                     if ("<l".equals(match)) {
 299  0
                         lTagFound = true;
 300  
                     }
 301  0
                     if ("<lg".equals(match)) {
 302  0
                         lgTagFound = true;
 303  
                     }
 304  0
                     openTags.add(match.substring(1));
 305  
                 }
 306  
             }
 307  0
         }
 308  0
         Collections.reverse(openTags);
 309  0
         for (String openTag : openTags) {
 310  0
             result += "</" + openTag + ">";
 311  
         }
 312  0
         return result;
 313  
     }
 314  
 
 315  
     /**
 316  
      * Common HTML tags such as &lt;br&gt;,&lt;hr&gt; and &lt;img&gt; may be
 317  
      * left open causing XML parsing to fail. This method closes these tags.
 318  
      * 
 319  
      * @param broken
 320  
      *            the string to be cleaned
 321  
      * @return the cleaned string
 322  
      */
 323  
     public static String closeEmptyTags(String broken) {
 324  0
         if (broken == null) {
 325  0
             return null;
 326  
         }
 327  
 
 328  0
         return openHTMLTagPattern.matcher(broken).replaceAll("<$1$2/>");
 329  
     }
 330  
 
 331  
     /**
 332  
      * XML parse failed, so we can try getting rid of all the tags and having
 333  
      * another go. We define a tag to start at a &lt; and end at the end of the
 334  
      * next word (where a word is what comes in between spaces) that does not
 335  
      * contain an = sign, or at a >, whichever is earlier.
 336  
      * @param broken 
 337  
      * @return the string without any tags
 338  
      */
 339  
     public static String cleanAllTags(String broken) {
 340  0
         if (broken == null) {
 341  0
             return null;
 342  
         }
 343  
 
 344  0
         String working = broken;
 345  
 
 346  
         allTags: while (true) {
 347  0
             int lt = working.indexOf('<');
 348  
 
 349  
             // If there are no more amps then we are done
 350  0
             if (lt == -1) {
 351  0
                 break allTags;
 352  
             }
 353  
 
 354  
             // loop to find the end of this tag
 355  0
             int i = lt;
 356  0
             int startattr = -1;
 357  
 
 358  
             singletag: while (true) {
 359  0
                 i++;
 360  
 
 361  
                 // the tag can't exist past the end of the string
 362  0
                 if (i >= working.length()) {
 363  
                     // go back one so we can safely chop
 364  0
                     i--;
 365  0
                     break singletag;
 366  
                 }
 367  
 
 368  0
                 char c = working.charAt(i);
 369  
 
 370  
                 // normal end of tag
 371  0
                 if (c == '>') {
 372  0
                     break singletag;
 373  
                 }
 374  
 
 375  
                 // we declare end-of-tag if this 'word' is not an attribute
 376  0
                 if (c == ' ') {
 377  0
                     if (startattr == -1) {
 378  
                         // NOTE(joe): should we skip over consecutive spaces?
 379  0
                         startattr = i;
 380  
                     } else {
 381  
                         // so we've already had a space indicating start of
 382  
                         // attribute, so this must be the beginning of the next
 383  
                         // NOTE(joe): no - spaces can exist in attr values
 384  0
                         String value = working.substring(startattr, i);
 385  0
                         if (value.indexOf('=') == -1) {
 386  
                             // this 'attribute' does not contain an equals so
 387  
                             // we call it a word and end the parse
 388  0
                             break singletag;
 389  
                         }
 390  
                     }
 391  
                 }
 392  0
             }
 393  
 
 394  
             // So we have the end of the tag, delete it, but leave a space in it's place
 395  
             // DataPolice.report("discarding tag: " + working.substring(lt, i + 1));
 396  0
             working = working.substring(0, lt) + " " + working.substring(i + 1);
 397  0
         }
 398  
 
 399  0
         return working;
 400  
     }
 401  
 
 402  
     /**
 403  
      * Replace entity with its unicode equivalent, if it is not a valid XML
 404  
      * entity. Otherwise strip it out. XML only allows 4 entities: &amp;amp;,
 405  
      * &amp;quot;, &amp;lt; and &amp;gt;.
 406  
      * 
 407  
      * @param entity
 408  
      *            the entity to be replaced
 409  
      * @return the substitution for the entity, either itself, the unicode
 410  
      *         equivalent or an empty string.
 411  
      */
 412  
     private static String handleEntity(String entity) {
 413  0
         if (goodEntities.contains(entity)) {
 414  0
             return entity;
 415  
         }
 416  
 
 417  0
         String replace = badEntities.get(entity);
 418  0
         if (replace != null) {
 419  0
             return replace;
 420  
         }
 421  
 
 422  
         // replace unknown entities with a space
 423  0
         return " ";
 424  
     }
 425  
 
 426  
     // Map entities to their unicode equivalent
 427  0
     private static Set<String> goodEntities = new HashSet<String>();
 428  0
     private static PropertyMap badEntities = new PropertyMap();
 429  
     static {
 430  
         // pre-defined XML entities
 431  0
         goodEntities.add("&quot;"); // quotation mark
 432  0
         goodEntities.add("&amp;"); // ampersand
 433  0
         goodEntities.add("&lt;"); // less-than sign
 434  0
         goodEntities.add("&gt;"); // greater-than sign
 435  
 
 436  
         // misc entities
 437  0
         badEntities.put("&euro;", "\u20AC"); // euro
 438  0
         badEntities.put("&lsquo;", "\u2018"); // left single quotation mark
 439  0
         badEntities.put("&rsquo;", "\u2019"); // right single quotation mark
 440  
 
 441  
         // Latin 1 entities
 442  0
         badEntities.put("&nbsp;", "\u00A0"); // no-break space
 443  0
         badEntities.put("&iexcl;", "\u00A1"); // inverted exclamation mark
 444  0
         badEntities.put("&cent;", "\u00A2"); // cent sign
 445  0
         badEntities.put("&pound;", "\u00A3"); // pound sign
 446  0
         badEntities.put("&curren;", "\u00A4"); // currency sign
 447  0
         badEntities.put("&yen;", "\u00A5"); // yen sign
 448  0
         badEntities.put("&brvbar;", "\u00A6"); // broken vertical bar
 449  0
         badEntities.put("&sect;", "\u00A7"); // section sign
 450  0
         badEntities.put("&uml;", "\u00A8"); // diaeresis
 451  0
         badEntities.put("&copy;", "\u00A9"); // copyright sign
 452  0
         badEntities.put("&ordf;", "\u00AA"); // feminine ordinal indicator
 453  0
         badEntities.put("&laquo;", "\u00AB"); // left-pointing double angle quotation mark
 454  0
         badEntities.put("&not;", "\u00AC"); // not sign
 455  0
         badEntities.put("&shy;", "\u00AD"); // soft hyphen
 456  0
         badEntities.put("&reg;", "\u00AE"); // registered sign
 457  0
         badEntities.put("&macr;", "\u00AF"); // macron
 458  0
         badEntities.put("&deg;", "\u00B0"); // degree sign
 459  0
         badEntities.put("&plusmn;", "\u00B1"); // plus-minus sign
 460  0
         badEntities.put("&sup2;", "\u00B2"); // superscript two
 461  0
         badEntities.put("&sup3;", "\u00B3"); // superscript three
 462  0
         badEntities.put("&acute;", "\u00B4"); // acute accent
 463  0
         badEntities.put("&micro;", "\u00B5"); // micro sign
 464  0
         badEntities.put("&para;", "\u00B6"); // pilcrow sign
 465  0
         badEntities.put("&middot;", "\u00B7"); // middle dot
 466  0
         badEntities.put("&cedil;", "\u00B8"); // cedilla
 467  0
         badEntities.put("&sup1;", "\u00B9"); // superscript one
 468  0
         badEntities.put("&ordm;", "\u00BA"); // masculine ordinal indicator
 469  0
         badEntities.put("&raquo;", "\u00BB"); // right-pointing double angle quotation mark
 470  0
         badEntities.put("&frac14;", "\u00BC"); // vulgar fraction one quarter
 471  0
         badEntities.put("&frac12;", "\u00BD"); // vulgar fraction one half
 472  0
         badEntities.put("&frac34;", "\u00BE"); // vulgar fraction three quarters
 473  0
         badEntities.put("&iquest;", "\u00BF"); // inverted question mark
 474  0
         badEntities.put("&Agrave;", "\u00C0"); // latin capital letter A with grave
 475  0
         badEntities.put("&Aacute;", "\u00C1"); // latin capital letter A with acute
 476  0
         badEntities.put("&Acirc;", "\u00C2"); // latin capital letter A with circumflex
 477  0
         badEntities.put("&Atilde;", "\u00C3"); // latin capital letter A with tilde
 478  0
         badEntities.put("&Auml;", "\u00C4"); // latin capital letter A with diaeresis
 479  0
         badEntities.put("&Aring;", "\u00C5"); // latin capital letter A with ring above
 480  0
         badEntities.put("&AElig;", "\u00C6"); // latin capital letter AE
 481  0
         badEntities.put("&Ccedil;", "\u00C7"); // latin capital letter C with cedilla
 482  0
         badEntities.put("&Egrave;", "\u00C8"); // latin capital letter E with grave
 483  0
         badEntities.put("&Eacute;", "\u00C9"); // latin capital letter E with acute
 484  0
         badEntities.put("&Ecirc;", "\u00CA"); // latin capital letter E with circumflex
 485  0
         badEntities.put("&Euml;", "\u00CB"); // latin capital letter E with diaeresis
 486  0
         badEntities.put("&Igrave;", "\u00CC"); // latin capital letter I with grave
 487  0
         badEntities.put("&Iacute;", "\u00CD"); // latin capital letter I with acute
 488  0
         badEntities.put("&Icirc;", "\u00CE"); // latin capital letter I with circumflex
 489  0
         badEntities.put("&Iuml;", "\u00CF"); // latin capital letter I with diaeresis
 490  0
         badEntities.put("&ETH;", "\u00D0"); // latin capital letter ETH
 491  0
         badEntities.put("&Ntilde;", "\u00D1"); // latin capital letter N with tilde
 492  0
         badEntities.put("&Ograve;", "\u00D2"); // latin capital letter O with grave
 493  0
         badEntities.put("&Oacute;", "\u00D3"); // latin capital letter O with acute
 494  0
         badEntities.put("&Ocirc;", "\u00D4"); // latin capital letter O with circumflex
 495  0
         badEntities.put("&Otilde;", "\u00D5"); // latin capital letter O with tilde
 496  0
         badEntities.put("&Ouml;", "\u00D6"); // latin capital letter O with diaeresis
 497  0
         badEntities.put("&times;", "\u00D7"); // multiplication sign
 498  0
         badEntities.put("&Oslash;", "\u00D8"); // latin capital letter O with stroke
 499  0
         badEntities.put("&Ugrave;", "\u00D9"); // latin capital letter U with grave
 500  0
         badEntities.put("&Uacute;", "\u00DA"); // latin capital letter U with acute
 501  0
         badEntities.put("&Ucirc;", "\u00DB"); // latin capital letter U with circumflex
 502  0
         badEntities.put("&Uuml;", "\u00DC"); // latin capital letter U with diaeresis
 503  0
         badEntities.put("&Yacute;", "\u00DD"); // latin capital letter Y with acute
 504  0
         badEntities.put("&THORN;", "\u00DE"); // latin capital letter THORN
 505  0
         badEntities.put("&szlig;", "\u00DF"); // latin small letter sharp s
 506  0
         badEntities.put("&agrave;", "\u00E0"); // latin small letter a with grave
 507  0
         badEntities.put("&aacute;", "\u00E1"); // latin small letter a with acute
 508  0
         badEntities.put("&acirc;", "\u00E2"); // latin small letter a with circumflex
 509  0
         badEntities.put("&atilde;", "\u00E3"); // latin small letter a with tilde
 510  0
         badEntities.put("&auml;", "\u00E4"); // latin small letter a with diaeresis
 511  0
         badEntities.put("&aring;", "\u00E5"); // latin small letter a with ring above
 512  0
         badEntities.put("&aelig;", "\u00E6"); // latin small letter ae
 513  0
         badEntities.put("&ccedil;", "\u00E7"); // latin small letter c with cedilla
 514  0
         badEntities.put("&egrave;", "\u00E8"); // latin small letter e with grave
 515  0
         badEntities.put("&eacute;", "\u00E9"); // latin small letter e with acute
 516  0
         badEntities.put("&ecirc;", "\u00EA"); // latin small letter e with circumflex
 517  0
         badEntities.put("&euml;", "\u00EB"); // latin small letter e with diaeresis
 518  0
         badEntities.put("&igrave;", "\u00EC"); // latin small letter i with grave
 519  0
         badEntities.put("&iacute;", "\u00ED"); // latin small letter i with acute
 520  0
         badEntities.put("&icirc;", "\u00EE"); // latin small letter i with circumflex
 521  0
         badEntities.put("&iuml;", "\u00EF"); // latin small letter i with diaeresis
 522  0
         badEntities.put("&eth;", "\u00F0"); // latin small letter eth
 523  0
         badEntities.put("&ntilde;", "\u00F1"); // latin small letter n with tilde
 524  0
         badEntities.put("&ograve;", "\u00F2"); // latin small letter o with grave
 525  0
         badEntities.put("&oacute;", "\u00F3"); // latin small letter o with acute
 526  0
         badEntities.put("&ocirc;", "\u00F4"); // latin small letter o with circumflex
 527  0
         badEntities.put("&otilde;", "\u00F5"); // latin small letter o with tilde
 528  0
         badEntities.put("&ouml;", "\u00F6"); // latin small letter o with diaeresis
 529  0
         badEntities.put("&divide;", "\u00F7"); // division sign
 530  0
         badEntities.put("&oslash;", "\u00F8"); // latin small letter o with stroke
 531  0
         badEntities.put("&ugrave;", "\u00F9"); // latin small letter u with grave
 532  0
         badEntities.put("&uacute;", "\u00FA"); // latin small letter u with acute
 533  0
         badEntities.put("&ucirc;", "\u00FB"); // latin small letter u with circumflex
 534  0
         badEntities.put("&uuml;", "\u00FC"); // latin small letter u with diaeresis
 535  0
         badEntities.put("&yacute;", "\u00FD"); // latin small letter y with acute
 536  0
         badEntities.put("&thorn;", "\u00FE"); // latin small letter thorn
 537  0
         badEntities.put("&yuml;", "\u00FF"); // latin small letter y with diaeresis
 538  
     }
 539  
 
 540  
     /**
 541  
      * Pattern for numeric entities.
 542  
      */
 543  0
     private static Pattern validCharacterEntityPattern = Pattern.compile("^&#x?\\d{2,4};");
 544  
 
 545  
     /**
 546  
      * Pattern that negates the allowable XML 4 byte unicode characters. Valid
 547  
      * are: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
 548  
      * [#x10000-#x10FFFF]
 549  
      */
 550  0
     private static Pattern invalidCharacterPattern = Pattern.compile("[^\t\r\n\u0020-\uD7FF\uE000-\uFFFD]");
 551  
 
 552  
     /**
 553  
      * Pattern that matches open &lt;br&gt;,&lt;hr&gt; and &lt;img&gt; tags.
 554  
      */
 555  0
     private static Pattern openHTMLTagPattern = Pattern.compile("<(img|hr|br)([^>]*)(?<!/)>");
 556  
 
 557  
     /**
 558  
      * The log stream
 559  
      */
 560  0
     private static final Logger log = LoggerFactory.getLogger(XMLUtil.class);
 561  
 }