Coverage Report

Coverage Report - org.crosswire.common.xml.XMLUtil

Classes in this File

Line Coverage

Branch Coverage

Complexity

XMLUtil

0/242

0/79

5.833

 /**
  * Distribution License:
  * JSword is free software; you can redistribute it and/or modify it under
  * the terms of the GNU Lesser General Public License, version 2.1 or later
  * as published by the Free Software Foundation. This program is distributed
  * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
  * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  * See the GNU Lesser General Public License for more details.
  *
  * The License is available on the internet at:
  *      http://www.gnu.org/copyleft/lgpl.html
  * or by writing to:
  *      Free Software Foundation, Inc.
  *      59 Temple Place - Suite 330
  *      Boston, MA 02111-1307, USA
  *
  * © CrossWire Bible Society, 2005 - 2016
  *
  */
 package org.crosswire.common.xml;
 
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 import org.crosswire.common.util.FileUtil;
 import org.crosswire.common.util.PropertyMap;
 import org.crosswire.common.util.ResourceUtil;
 import org.jdom2.Document;
 import org.jdom2.JDOMException;
 import org.jdom2.input.SAXBuilder;
 import org.jdom2.input.sax.XMLReaders;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
 /**
  * Utilities for working with SAX XML parsing.
  * 
  * @see gnu.lgpl.License The GNU Lesser General Public License for details.
  * @author Joe Walker
  * @author DM Smith
  */
 public final class XMLUtil {
     /**
      * Prevent instantiation: every member of this class is static, so the
      * only constructor is private.
      */
     private XMLUtil() {
     }
 
     /**
      * Get and load an XML file from the classpath and a few other places into a
      * JDOM Document object.
      * 
      * @param subject
      *            The name of the desired resource (without any extension)
      * @return The requested resource
      * @throws IOException
      *             if there is a problem reading the file
      * @throws JDOMException
      *             If the resource is not valid XML
      */
     public static Document getDocument(String subject) throws JDOMException, IOException {
         String resource = subject + FileUtil.EXTENSION_XML;
         InputStream in = ResourceUtil.getResourceAsStream(resource);
 
         log.debug("Loading {}.xml from classpath: [OK]", subject);
         // With JDom 1.x this passed true
         SAXBuilder builder = new SAXBuilder(XMLReaders.DTDVALIDATING);
         return builder.build(in);
     }
 
     /**
      * Serialize a SAXEventProvider into an XML String
      * 
      * @param provider
      *            The source of SAX events
      * @return a serialized string
      * @throws SAXException 
      */
     public static String writeToString(SAXEventProvider provider) throws SAXException {
         ContentHandler ser = new PrettySerializingContentHandler();
         provider.provideSAXEvents(ser);
         return ser.toString();
     }
 
     /**
      * Get the full name of the attribute, including the namespace if any.
      * 
      * @param attrs
      *            the collection of attributes
      * @param index
      *            the index of the desired attribute
      * @return the requested attribute
      */
     public static String getAttributeName(Attributes attrs, int index) {
         String qName = attrs.getQName(index);
         if (qName != null) {
             return qName;
         }
         return attrs.getLocalName(index);
     }
 
     /**
      * Show the attributes of an element as debug
      * @param attrs 
      */
     public static void debugSAXAttributes(Attributes attrs) {
         for (int i = 0; i < attrs.getLength(); i++) {
             log.debug("attr[{}]: {}={}", Integer.toString(i), attrs.getQName(i), attrs.getValue(i));
         }
     }
 
     /**
      * Normalizes the given string
      * @param s 
      * @return the escaped string
      */
     public static String escape(String s) {
         if (s == null) {
             return s;
         }
         int len = s.length();
         StringBuilder str = new StringBuilder(len);
 
         for (int i = 0; i < len; i++) {
             char ch = s.charAt(i);
             switch (ch) {
             case '<':
                 str.append("&lt;");
                 break;
 
             case '>':
                 str.append("&gt;");
                 break;
 
             case '&':
                 str.append("&amp;");
                 break;
 
             case '"':
                 str.append("&quot;");
                 break;
 
             default:
                 str.append(ch);
             }
         }
 
         return str.toString();
     }
 
     /**
      * For each entity in the input that is not allowed in XML, replace the
      * entity with its unicode equivalent or remove it. For each instance of a
      * bare &, replace it with &amp;<br>
      * XML only allows 4 entities: &amp;amp;, &amp;quot;, &amp;lt; and &amp;gt;.
      * 
      * @param broken
      *            the string to handle entities
      * @return the string with entities appropriately fixed up
      */
     public static String cleanAllEntities(String broken) {
         if (broken == null) {
             return null;
         }
 
         String working = broken;
         int cleanfrom = 0;
 
         while (true) {
             int amp = working.indexOf('&', cleanfrom);
 
             // If there are no more amps then we are done
             if (amp == -1) {
                 break;
             }
 
             // Skip references of the kind &#ddd;
             if (validCharacterEntityPattern.matcher(working.substring(amp)).find()) {
                 cleanfrom = working.indexOf(';', amp) + 1;
                 continue;
             }
 
             int i = amp + 1;
             while (true) {
                 // if we are at the end of the string then just escape the '&';
                 if (i >= working.length()) {
                     // String entity = working.substring(amp);
                     // String replace = guessEntity(entity);
                     // DataPolice.report("replacing unterminated entity: '" +
                     // entity + "' with: '" + replace + "'");
 
                     return working.substring(0, amp) + "&amp;" + working.substring(amp + 1);
                 }
 
                 // if we have come to a ; then we have an entity
                 // If it is something that xml can't handle then replace it.
                 char c = working.charAt(i);
                 if (c == ';') {
                     String entity = working.substring(amp, i + 1);
                     String replace = handleEntity(entity);
                     // log.warn("replacing entity: '{}' with: '{}'", entity, replace);
 
                     working = working.substring(0, amp) + replace + working.substring(i + 1);
                     break;
                 }
 
                 // Did we end an entity without finding a closing ;
                 // Then treat it as an '&' that needs to be replaced with &amp;
                 if (!Character.isLetterOrDigit(c)) {
                     // String entity = working.substring(amp, i);
                     // String replace = "&amp;" + working.substring(amp + 1, i);
                     // log.warn("replacing invalid entity: '{}' with: '{}': {}", entity, replace, broken);
 
                     working = working.substring(0, amp) + "&amp;" + working.substring(amp + 1);
                     amp = i + 4; // account for the 4 extra characters
                     break;
                 }
 
                 i++;
             }
 
             cleanfrom = amp + 1;
         }
 
         return working;
     }
 
     /**
      * Remove all invalid characters in the input, replacing them with a space. XML has stringent
      * requirements as to which characters are or are not allowed. The set of
      * allowable characters are:<br>
      * #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]<br>
      * Note: Java handles to \uFFFF
      * 
      * @param broken
      *            the string to be cleaned
      * @return the cleaned string
      */
     public static String cleanAllCharacters(String broken) {
         return invalidCharacterPattern.matcher(broken).replaceAll(" ");
     }
 
     /**
      * Strip all closing tags from the end of the XML fragment, and then
      * re-close all tags that are open at the end of the string.
      * 
      * @param broken
      *            the string to be cleaned.
      * @return cleaned string, or {@code null} if the string could not be
      *         cleaned due to more broken XML
      */
     public static String recloseTags(String broken) {
         String result = broken;
         // remove closing tags from the end
         while (result.matches(".*</[a-zA-Z]+>[ \t\r\n]*")) {
             result = result.substring(0, result.lastIndexOf('<'));
         }
         // close tags again
         List<String> openTags = new ArrayList<String>();
         Matcher m = Pattern.compile("</?[a-zA-Z]+").matcher(result);
         boolean lTagFound = false;
         boolean lgTagFound = false;
         while (m.find()) {
             String match = m.group();
             if (match.startsWith("</")) {
                 if (openTags.size() == 0 && "</l".equals(match) && !lTagFound) {
                     return recloseTags("<l>" + broken);
                 }
                 if (openTags.size() == 0 && "</lg".equals(match) && !lgTagFound) {
                     return recloseTags("<lg>" + broken);
                 }
                 if (openTags.size() == 0) {
                     return null;
                 }
                 String lastTag = openTags.remove(openTags.size() - 1);
                 if (!("</" + lastTag).equals(match)) {
                     return null;
                 }
             } else {
                 int closePos = result.indexOf('>', m.end());
                 if (closePos == -1) {
                     return null;
                 }
                 while (Character.isWhitespace(result.charAt(closePos - 1))) {
                     --closePos;
                 }
                 if (result.charAt(closePos - 1) != '/') {
                     if ("<l".equals(match)) {
                         lTagFound = true;
                     }
                     if ("<lg".equals(match)) {
                         lgTagFound = true;
                     }
                     openTags.add(match.substring(1));
                 }
             }
         }
         Collections.reverse(openTags);
         for (String openTag : openTags) {
             result += "</" + openTag + ">";
         }
         return result;
     }
 
     /**
      * Common HTML tags such as &lt;br&gt;,&lt;hr&gt; and &lt;img&gt; may be
      * left open causing XML parsing to fail. This method closes these tags.
      * 
      * @param broken
      *            the string to be cleaned
      * @return the cleaned string
      */
     public static String closeEmptyTags(String broken) {
         if (broken == null) {
             return null;
         }
 
         return openHTMLTagPattern.matcher(broken).replaceAll("<$1$2/>");
     }
 
     /**
      * XML parse failed, so we can try getting rid of all the tags and having
      * another go. We define a tag to start at a &lt; and end at the end of the
      * next word (where a word is what comes in between spaces) that does not
      * contain an = sign, or at a >, whichever is earlier.
      * @param broken 
      * @return the string without any tags
      */
     public static String cleanAllTags(String broken) {
         if (broken == null) {
             return null;
         }
 
         String working = broken;
 
         allTags: while (true) {
             int lt = working.indexOf('<');
 
             // If there are no more amps then we are done
             if (lt == -1) {
                 break allTags;
             }
 
             // loop to find the end of this tag
             int i = lt;
             int startattr = -1;
 
             singletag: while (true) {
                 i++;
 
                 // the tag can't exist past the end of the string
                 if (i >= working.length()) {
                     // go back one so we can safely chop
                     i--;
                     break singletag;
                 }
 
                 char c = working.charAt(i);
 
                 // normal end of tag
                 if (c == '>') {
                     break singletag;
                 }
 
                 // we declare end-of-tag if this 'word' is not an attribute
                 if (c == ' ') {
                     if (startattr == -1) {
                         // NOTE(joe): should we skip over consecutive spaces?
                         startattr = i;
                     } else {
                         // so we've already had a space indicating start of
                         // attribute, so this must be the beginning of the next
                         // NOTE(joe): no - spaces can exist in attr values
                         String value = working.substring(startattr, i);
                         if (value.indexOf('=') == -1) {
                             // this 'attribute' does not contain an equals so
                             // we call it a word and end the parse
                             break singletag;
                         }
                     }
                 }
             }
 
             // So we have the end of the tag, delete it, but leave a space in it's place
             // DataPolice.report("discarding tag: " + working.substring(lt, i + 1));
             working = working.substring(0, lt) + " " + working.substring(i + 1);
         }
 
         return working;
     }
 
     /**
      * Replace entity with its unicode equivalent, if it is not a valid XML
      * entity. Otherwise strip it out. XML only allows 4 entities: &amp;amp;,
      * &amp;quot;, &amp;lt; and &amp;gt;.
      * 
      * @param entity
      *            the entity to be replaced
      * @return the substitution for the entity, either itself, the unicode
      *         equivalent or an empty string.
      */
     private static String handleEntity(String entity) {
         if (goodEntities.contains(entity)) {
             return entity;
         }
 
         String replace = badEntities.get(entity);
         if (replace != null) {
             return replace;
         }
 
         // replace unknown entities with a space
         return " ";
     }
 
     // Map entities to their unicode equivalent
     private static Set<String> goodEntities = new HashSet<String>();
     private static PropertyMap badEntities = new PropertyMap();
     static {
         // pre-defined XML entities
         goodEntities.add("&quot;"); // quotation mark
         goodEntities.add("&amp;"); // ampersand
         goodEntities.add("&lt;"); // less-than sign
         goodEntities.add("&gt;"); // greater-than sign
 
         // misc entities
         badEntities.put("&euro;", "\u20AC"); // euro
         badEntities.put("&lsquo;", "\u2018"); // left single quotation mark
         badEntities.put("&rsquo;", "\u2019"); // right single quotation mark
 
         // Latin 1 entities
         badEntities.put("&nbsp;", "\u00A0"); // no-break space
         badEntities.put("&iexcl;", "\u00A1"); // inverted exclamation mark
         badEntities.put("&cent;", "\u00A2"); // cent sign
         badEntities.put("&pound;", "\u00A3"); // pound sign
         badEntities.put("&curren;", "\u00A4"); // currency sign
         badEntities.put("&yen;", "\u00A5"); // yen sign
         badEntities.put("&brvbar;", "\u00A6"); // broken vertical bar
         badEntities.put("&sect;", "\u00A7"); // section sign
         badEntities.put("&uml;", "\u00A8"); // diaeresis
         badEntities.put("&copy;", "\u00A9"); // copyright sign
         badEntities.put("&ordf;", "\u00AA"); // feminine ordinal indicator
         badEntities.put("&laquo;", "\u00AB"); // left-pointing double angle quotation mark
         badEntities.put("&not;", "\u00AC"); // not sign
         badEntities.put("&shy;", "\u00AD"); // soft hyphen
         badEntities.put("&reg;", "\u00AE"); // registered sign
         badEntities.put("&macr;", "\u00AF"); // macron
         badEntities.put("&deg;", "\u00B0"); // degree sign
         badEntities.put("&plusmn;", "\u00B1"); // plus-minus sign
         badEntities.put("&sup2;", "\u00B2"); // superscript two
         badEntities.put("&sup3;", "\u00B3"); // superscript three
         badEntities.put("&acute;", "\u00B4"); // acute accent
         badEntities.put("&micro;", "\u00B5"); // micro sign
         badEntities.put("&para;", "\u00B6"); // pilcrow sign
         badEntities.put("&middot;", "\u00B7"); // middle dot
         badEntities.put("&cedil;", "\u00B8"); // cedilla
         badEntities.put("&sup1;", "\u00B9"); // superscript one
         badEntities.put("&ordm;", "\u00BA"); // masculine ordinal indicator
         badEntities.put("&raquo;", "\u00BB"); // right-pointing double angle quotation mark
         badEntities.put("&frac14;", "\u00BC"); // vulgar fraction one quarter
         badEntities.put("&frac12;", "\u00BD"); // vulgar fraction one half
         badEntities.put("&frac34;", "\u00BE"); // vulgar fraction three quarters
         badEntities.put("&iquest;", "\u00BF"); // inverted question mark
         badEntities.put("&Agrave;", "\u00C0"); // latin capital letter A with grave
         badEntities.put("&Aacute;", "\u00C1"); // latin capital letter A with acute
         badEntities.put("&Acirc;", "\u00C2"); // latin capital letter A with circumflex
         badEntities.put("&Atilde;", "\u00C3"); // latin capital letter A with tilde
         badEntities.put("&Auml;", "\u00C4"); // latin capital letter A with diaeresis
         badEntities.put("&Aring;", "\u00C5"); // latin capital letter A with ring above
         badEntities.put("&AElig;", "\u00C6"); // latin capital letter AE
         badEntities.put("&Ccedil;", "\u00C7"); // latin capital letter C with cedilla
         badEntities.put("&Egrave;", "\u00C8"); // latin capital letter E with grave
         badEntities.put("&Eacute;", "\u00C9"); // latin capital letter E with acute
         badEntities.put("&Ecirc;", "\u00CA"); // latin capital letter E with circumflex
         badEntities.put("&Euml;", "\u00CB"); // latin capital letter E with diaeresis
         badEntities.put("&Igrave;", "\u00CC"); // latin capital letter I with grave
         badEntities.put("&Iacute;", "\u00CD"); // latin capital letter I with acute
         badEntities.put("&Icirc;", "\u00CE"); // latin capital letter I with circumflex
         badEntities.put("&Iuml;", "\u00CF"); // latin capital letter I with diaeresis
         badEntities.put("&ETH;", "\u00D0"); // latin capital letter ETH
         badEntities.put("&Ntilde;", "\u00D1"); // latin capital letter N with tilde
         badEntities.put("&Ograve;", "\u00D2"); // latin capital letter O with grave
         badEntities.put("&Oacute;", "\u00D3"); // latin capital letter O with acute
         badEntities.put("&Ocirc;", "\u00D4"); // latin capital letter O with circumflex
         badEntities.put("&Otilde;", "\u00D5"); // latin capital letter O with tilde
         badEntities.put("&Ouml;", "\u00D6"); // latin capital letter O with diaeresis
         badEntities.put("&times;", "\u00D7"); // multiplication sign
         badEntities.put("&Oslash;", "\u00D8"); // latin capital letter O with stroke
         badEntities.put("&Ugrave;", "\u00D9"); // latin capital letter U with grave
         badEntities.put("&Uacute;", "\u00DA"); // latin capital letter U with acute
         badEntities.put("&Ucirc;", "\u00DB"); // latin capital letter U with circumflex
         badEntities.put("&Uuml;", "\u00DC"); // latin capital letter U with diaeresis
         badEntities.put("&Yacute;", "\u00DD"); // latin capital letter Y with acute
         badEntities.put("&THORN;", "\u00DE"); // latin capital letter THORN
         badEntities.put("&szlig;", "\u00DF"); // latin small letter sharp s
         badEntities.put("&agrave;", "\u00E0"); // latin small letter a with grave
         badEntities.put("&aacute;", "\u00E1"); // latin small letter a with acute
         badEntities.put("&acirc;", "\u00E2"); // latin small letter a with circumflex
         badEntities.put("&atilde;", "\u00E3"); // latin small letter a with tilde
         badEntities.put("&auml;", "\u00E4"); // latin small letter a with diaeresis
         badEntities.put("&aring;", "\u00E5"); // latin small letter a with ring above
         badEntities.put("&aelig;", "\u00E6"); // latin small letter ae
         badEntities.put("&ccedil;", "\u00E7"); // latin small letter c with cedilla
         badEntities.put("&egrave;", "\u00E8"); // latin small letter e with grave
         badEntities.put("&eacute;", "\u00E9"); // latin small letter e with acute
         badEntities.put("&ecirc;", "\u00EA"); // latin small letter e with circumflex
         badEntities.put("&euml;", "\u00EB"); // latin small letter e with diaeresis
         badEntities.put("&igrave;", "\u00EC"); // latin small letter i with grave
         badEntities.put("&iacute;", "\u00ED"); // latin small letter i with acute
         badEntities.put("&icirc;", "\u00EE"); // latin small letter i with circumflex
         badEntities.put("&iuml;", "\u00EF"); // latin small letter i with diaeresis
         badEntities.put("&eth;", "\u00F0"); // latin small letter eth
         badEntities.put("&ntilde;", "\u00F1"); // latin small letter n with tilde
         badEntities.put("&ograve;", "\u00F2"); // latin small letter o with grave
         badEntities.put("&oacute;", "\u00F3"); // latin small letter o with acute
         badEntities.put("&ocirc;", "\u00F4"); // latin small letter o with circumflex
         badEntities.put("&otilde;", "\u00F5"); // latin small letter o with tilde
         badEntities.put("&ouml;", "\u00F6"); // latin small letter o with diaeresis
         badEntities.put("&divide;", "\u00F7"); // division sign
         badEntities.put("&oslash;", "\u00F8"); // latin small letter o with stroke
         badEntities.put("&ugrave;", "\u00F9"); // latin small letter u with grave
         badEntities.put("&uacute;", "\u00FA"); // latin small letter u with acute
         badEntities.put("&ucirc;", "\u00FB"); // latin small letter u with circumflex
         badEntities.put("&uuml;", "\u00FC"); // latin small letter u with diaeresis
         badEntities.put("&yacute;", "\u00FD"); // latin small letter y with acute
         badEntities.put("&thorn;", "\u00FE"); // latin small letter thorn
         badEntities.put("&yuml;", "\u00FF"); // latin small letter y with diaeresis
     }
 
     /**
      * Pattern for numeric entities.
      */
     private static Pattern validCharacterEntityPattern = Pattern.compile("^&#x?\\d{2,4};");
 
     /**
      * Pattern that negates the allowable XML 4 byte unicode characters. Valid
      * are: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
      * [#x10000-#x10FFFF]
      */
     private static Pattern invalidCharacterPattern = Pattern.compile("[^\t\r\n\u0020-\uD7FF\uE000-\uFFFD]");
 
     /**
      * Pattern that matches open &lt;br&gt;,&lt;hr&gt; and &lt;img&gt; tags.
      */
     private static Pattern openHTMLTagPattern = Pattern.compile("<(img|hr|br)([^>]*)(?<!/)>");
 
     /**
      * The log stream
      */
     private static final Logger log = LoggerFactory.getLogger(XMLUtil.class);
 }

1		/**
2		* Distribution License:
3		* JSword is free software; you can redistribute it and/or modify it under
4		* the terms of the GNU Lesser General Public License, version 2.1 or later
5		* as published by the Free Software Foundation. This program is distributed
6		* in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
7		* the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
8		* See the GNU Lesser General Public License for more details.
9		*
10		* The License is available on the internet at:
11		* http://www.gnu.org/copyleft/lgpl.html
12		* or by writing to:
13		* Free Software Foundation, Inc.
14		* 59 Temple Place - Suite 330
15		* Boston, MA 02111-1307, USA
16		*
17		* © CrossWire Bible Society, 2005 - 2016
18		*
19		*/
20		package org.crosswire.common.xml;
21
22		import java.io.IOException;
23		import java.io.InputStream;
24		import java.util.ArrayList;
25		import java.util.Collections;
26		import java.util.HashSet;
27		import java.util.List;
28		import java.util.Set;
29		import java.util.regex.Matcher;
30		import java.util.regex.Pattern;
31
32		import org.crosswire.common.util.FileUtil;
33		import org.crosswire.common.util.PropertyMap;
34		import org.crosswire.common.util.ResourceUtil;
35		import org.jdom2.Document;
36		import org.jdom2.JDOMException;
37		import org.jdom2.input.SAXBuilder;
38		import org.jdom2.input.sax.XMLReaders;
39		import org.slf4j.Logger;
40		import org.slf4j.LoggerFactory;
41		import org.xml.sax.Attributes;
42		import org.xml.sax.ContentHandler;
43		import org.xml.sax.SAXException;
44
45		/**
46		* Utilities for working with SAX XML parsing.
47		*
48		* @see gnu.lgpl.License The GNU Lesser General Public License for details.
49		* @author Joe Walker
50		* @author DM Smith
51		*/
52		public final class XMLUtil {
53		/**
54		* Prevent instantiation
55		*/
56	0	private XMLUtil() {
57	0	}
58
59		/**
60		* Get and load an XML file from the classpath and a few other places into a
61		* JDOM Document object.
62		*
63		* @param subject
64		* The name of the desired resource (without any extension)
65		* @return The requested resource
66		* @throws IOException
67		* if there is a problem reading the file
68		* @throws JDOMException
69		* If the resource is not valid XML
70		*/
71		public static Document getDocument(String subject) throws JDOMException, IOException {
72	0	String resource = subject + FileUtil.EXTENSION_XML;
73	0	InputStream in = ResourceUtil.getResourceAsStream(resource);
74
75	0	log.debug("Loading {}.xml from classpath: [OK]", subject);
76		// With JDom 1.x this passed true
77	0	SAXBuilder builder = new SAXBuilder(XMLReaders.DTDVALIDATING);
78	0	return builder.build(in);
79		}
80
81		/**
82		* Serialize a SAXEventProvider into an XML String
83		*
84		* @param provider
85		* The source of SAX events
86		* @return a serialized string
87		* @throws SAXException
88		*/
89		public static String writeToString(SAXEventProvider provider) throws SAXException {
90	0	ContentHandler ser = new PrettySerializingContentHandler();
91	0	provider.provideSAXEvents(ser);
92	0	return ser.toString();
93		}
94
95		/**
96		* Get the full name of the attribute, including the namespace if any.
97		*
98		* @param attrs
99		* the collection of attributes
100		* @param index
101		* the index of the desired attribute
102		* @return the requested attribute
103		*/
104		public static String getAttributeName(Attributes attrs, int index) {
105	0	String qName = attrs.getQName(index);
106	0	if (qName != null) {
107	0	return qName;
108		}
109	0	return attrs.getLocalName(index);
110		}
111
112		/**
113		* Show the attributes of an element as debug
114		* @param attrs
115		*/
116		public static void debugSAXAttributes(Attributes attrs) {
117	0	for (int i = 0; i < attrs.getLength(); i++) {
118	0	log.debug("attr[{}]: {}={}", Integer.toString(i), attrs.getQName(i), attrs.getValue(i));
119		}
120	0	}
121
122		/**
123		* Normalizes the given string
124		* @param s
125		* @return the escaped string
126		*/
127		public static String escape(String s) {
128	0	if (s == null) {
129	0	return s;
130		}
131	0	int len = s.length();
132	0	StringBuilder str = new StringBuilder(len);
133
134	0	for (int i = 0; i < len; i++) {
135	0	char ch = s.charAt(i);
136	0	switch (ch) {
137		case '<':
138	0	str.append("&lt;");
139	0	break;
140
141		case '>':
142	0	str.append("&gt;");
143	0	break;
144
145		case '&':
146	0	str.append("&amp;");
147	0	break;
148
149		case '"':
150	0	str.append("&quot;");
151	0	break;
152
153		default:
154	0	str.append(ch);
155		}
156		}
157
158	0	return str.toString();
159		}
160
161		/**
162		* For each entity in the input that is not allowed in XML, replace the
163		* entity with its unicode equivalent or remove it. For each instance of a
164		* bare &, replace it with &amp;<br>
165		* XML only allows 4 entities: &amp;, &quot;, &lt; and &gt;.
166		*
167		* @param broken
168		* the string to handle entities
169		* @return the string with entities appropriately fixed up
170		*/
171		public static String cleanAllEntities(String broken) {
172	0	if (broken == null) {
173	0	return null;
174		}
175
176	0	String working = broken;
177	0	int cleanfrom = 0;
178
179		while (true) {
180	0	int amp = working.indexOf('&', cleanfrom);
181
182		// If there are no more amps then we are done
183	0	if (amp == -1) {
184	0	break;
185		}
186
187		// Skip references of the kind &#ddd;
188	0	if (validCharacterEntityPattern.matcher(working.substring(amp)).find()) {
189	0	cleanfrom = working.indexOf(';', amp) + 1;
190	0	continue;
191		}
192
193	0	int i = amp + 1;
194		while (true) {
195		// if we are at the end of the string then just escape the '&';
196	0	if (i >= working.length()) {
197		// String entity = working.substring(amp);
198		// String replace = guessEntity(entity);
199		// DataPolice.report("replacing unterminated entity: '" +
200		// entity + "' with: '" + replace + "'");
201
202	0	return working.substring(0, amp) + "&amp;" + working.substring(amp + 1);
203		}
204
205		// if we have come to a ; then we have an entity
206		// If it is something that xml can't handle then replace it.
207	0	char c = working.charAt(i);
208	0	if (c == ';') {
209	0	String entity = working.substring(amp, i + 1);
210	0	String replace = handleEntity(entity);
211		// log.warn("replacing entity: '{}' with: '{}'", entity, replace);
212
213	0	working = working.substring(0, amp) + replace + working.substring(i + 1);
214	0	break;
215		}
216
217		// Did we end an entity without finding a closing ;
218		// Then treat it as an '&' that needs to be replaced with &
219	0	if (!Character.isLetterOrDigit(c)) {
220		// String entity = working.substring(amp, i);
221		// String replace = "&" + working.substring(amp + 1, i);
222		// log.warn("replacing invalid entity: '{}' with: '{}': {}", entity, replace, broken);
223
224	0	working = working.substring(0, amp) + "&amp;" + working.substring(amp + 1);
225	0	amp = i + 4; // account for the 4 extra characters
226	0	break;
227		}
228
229	0	i++;
230	0	}
231
232	0	cleanfrom = amp + 1;
233	0	}
234
235	0	return working;
236		}
237
238		/**
239		* Remove all invalid characters in the input, replacing them with a space. XML has stringent
240		* requirements as to which characters are or are not allowed. The set of
241		* allowable characters are:<br>
242		* #x9 \| #xA \| #xD \| [#x20-#xD7FF] \| [#xE000-#xFFFD] \| [#x10000-#x10FFFF]<br>
243		* Note: Java handles to \uFFFF
244		*
245		* @param broken
246		* the string to be cleaned
247		* @return the cleaned string
248		*/
249		public static String cleanAllCharacters(String broken) {
250	0	return invalidCharacterPattern.matcher(broken).replaceAll(" ");
251		}
252
253		/**
254		* Strip all closing tags from the end of the XML fragment, and then
255		* re-close all tags that are open at the end of the string.
256		*
257		* @param broken
258		* the string to be cleaned.
259		* @return cleaned string, or {@code null} if the string could not be
260		* cleaned due to more broken XML
261		*/
262		public static String recloseTags(String broken) {
263	0	String result = broken;
264		// remove closing tags from the end
265	0	while (result.matches(".</[a-zA-Z]+>[ \t\r\n]")) {
266	0	result = result.substring(0, result.lastIndexOf('<'));
267		}
268		// close tags again
269	0	List<String> openTags = new ArrayList<String>();
270	0	Matcher m = Pattern.compile("</?[a-zA-Z]+").matcher(result);
271	0	boolean lTagFound = false;
272	0	boolean lgTagFound = false;
273	0	while (m.find()) {
274	0	String match = m.group();
275	0	if (match.startsWith("</")) {
276	0	if (openTags.size() == 0 && "</l".equals(match) && !lTagFound) {
277	0	return recloseTags("<l>" + broken);
278		}
279	0	if (openTags.size() == 0 && "</lg".equals(match) && !lgTagFound) {
280	0	return recloseTags("<lg>" + broken);
281		}
282	0	if (openTags.size() == 0) {
283	0	return null;
284		}
285	0	String lastTag = openTags.remove(openTags.size() - 1);
286	0	if (!("</" + lastTag).equals(match)) {
287	0	return null;
288		}
289	0	} else {
290	0	int closePos = result.indexOf('>', m.end());
291	0	if (closePos == -1) {
292	0	return null;
293		}
294	0	while (Character.isWhitespace(result.charAt(closePos - 1))) {
295	0	--closePos;
296		}
297	0	if (result.charAt(closePos - 1) != '/') {
298	0	if ("<l".equals(match)) {
299	0	lTagFound = true;
300		}
301	0	if ("<lg".equals(match)) {
302	0	lgTagFound = true;
303		}
304	0	openTags.add(match.substring(1));
305		}
306		}
307	0	}
308	0	Collections.reverse(openTags);
309	0	for (String openTag : openTags) {
310	0	result += "</" + openTag + ">";
311		}
312	0	return result;
313		}
314
315		/**
316		* Common HTML tags such as <br>,<hr> and <img> may be
317		* left open causing XML parsing to fail. This method closes these tags.
318		*
319		* @param broken
320		* the string to be cleaned
321		* @return the cleaned string
322		*/
323		public static String closeEmptyTags(String broken) {
324	0	if (broken == null) {
325	0	return null;
326		}
327
328	0	return openHTMLTagPattern.matcher(broken).replaceAll("<$1$2/>");
329		}
330
331		/**
332		* XML parse failed, so we can try getting rid of all the tags and having
333		* another go. We define a tag to start at a < and end at the end of the
334		* next word (where a word is what comes in between spaces) that does not
335		* contain an = sign, or at a >, whichever is earlier.
336		* @param broken
337		* @return the string without any tags
338		*/
339		public static String cleanAllTags(String broken) {
340	0	if (broken == null) {
341	0	return null;
342		}
343
344	0	String working = broken;
345
346		allTags: while (true) {
347	0	int lt = working.indexOf('<');
348
349		// If there are no more amps then we are done
350	0	if (lt == -1) {
351	0	break allTags;
352		}
353
354		// loop to find the end of this tag
355	0	int i = lt;
356	0	int startattr = -1;
357
358		singletag: while (true) {
359	0	i++;
360
361		// the tag can't exist past the end of the string
362	0	if (i >= working.length()) {
363		// go back one so we can safely chop
364	0	i--;
365	0	break singletag;
366		}
367
368	0	char c = working.charAt(i);
369
370		// normal end of tag
371	0	if (c == '>') {
372	0	break singletag;
373		}
374
375		// we declare end-of-tag if this 'word' is not an attribute
376	0	if (c == ' ') {
377	0	if (startattr == -1) {
378		// NOTE(joe): should we skip over consecutive spaces?
379	0	startattr = i;
380		} else {
381		// so we've already had a space indicating start of
382		// attribute, so this must be the beginning of the next
383		// NOTE(joe): no - spaces can exist in attr values
384	0	String value = working.substring(startattr, i);
385	0	if (value.indexOf('=') == -1) {
386		// this 'attribute' does not contain an equals so
387		// we call it a word and end the parse
388	0	break singletag;
389		}
390		}
391		}
392	0	}
393
394		// So we have the end of the tag, delete it, but leave a space in it's place
395		// DataPolice.report("discarding tag: " + working.substring(lt, i + 1));
396	0	working = working.substring(0, lt) + " " + working.substring(i + 1);
397	0	}
398
399	0	return working;
400		}
401
402		/**
403		* Replace entity with its unicode equivalent, if it is not a valid XML
404		* entity. Otherwise strip it out. XML only allows 4 entities: &amp;,
405		* &quot;, &lt; and &gt;.
406		*
407		* @param entity
408		* the entity to be replaced
409		* @return the substitution for the entity, either itself, the unicode
410		* equivalent or an empty string.
411		*/
412		private static String handleEntity(String entity) {
413	0	if (goodEntities.contains(entity)) {
414	0	return entity;
415		}
416
417	0	String replace = badEntities.get(entity);
418	0	if (replace != null) {
419	0	return replace;
420		}
421
422		// replace unknown entities with a space
423	0	return " ";
424		}
425
426		// Map entities to their unicode equivalent
427	0	private static Set<String> goodEntities = new HashSet<String>();
428	0	private static PropertyMap badEntities = new PropertyMap();
429		static {
430		// pre-defined XML entities
431	0	goodEntities.add("""); // quotation mark
432	0	goodEntities.add("&"); // ampersand
433	0	goodEntities.add("<"); // less-than sign
434	0	goodEntities.add(">"); // greater-than sign
435
436		// misc entities
437	0	badEntities.put("€", "\u20AC"); // euro
438	0	badEntities.put("‘", "\u2018"); // left single quotation mark
439	0	badEntities.put("’", "\u2019"); // right single quotation mark
440
441		// Latin 1 entities
442	0	badEntities.put(" ", "\u00A0"); // no-break space
443	0	badEntities.put("¡", "\u00A1"); // inverted exclamation mark
444	0	badEntities.put("¢", "\u00A2"); // cent sign
445	0	badEntities.put("£", "\u00A3"); // pound sign
446	0	badEntities.put("¤", "\u00A4"); // currency sign
447	0	badEntities.put("¥", "\u00A5"); // yen sign
448	0	badEntities.put("¦", "\u00A6"); // broken vertical bar
449	0	badEntities.put("§", "\u00A7"); // section sign
450	0	badEntities.put("¨", "\u00A8"); // diaeresis
451	0	badEntities.put("©", "\u00A9"); // copyright sign
452	0	badEntities.put("ª", "\u00AA"); // feminine ordinal indicator
453	0	badEntities.put("«", "\u00AB"); // left-pointing double angle quotation mark
454	0	badEntities.put("¬", "\u00AC"); // not sign
455	0	badEntities.put("", "\u00AD"); // soft hyphen
456	0	badEntities.put("®", "\u00AE"); // registered sign
457	0	badEntities.put("¯", "\u00AF"); // macron
458	0	badEntities.put("°", "\u00B0"); // degree sign
459	0	badEntities.put("±", "\u00B1"); // plus-minus sign
460	0	badEntities.put("²", "\u00B2"); // superscript two
461	0	badEntities.put("³", "\u00B3"); // superscript three
462	0	badEntities.put("´", "\u00B4"); // acute accent
463	0	badEntities.put("µ", "\u00B5"); // micro sign
464	0	badEntities.put("¶", "\u00B6"); // pilcrow sign
465	0	badEntities.put("·", "\u00B7"); // middle dot
466	0	badEntities.put("¸", "\u00B8"); // cedilla
467	0	badEntities.put("¹", "\u00B9"); // superscript one
468	0	badEntities.put("º", "\u00BA"); // masculine ordinal indicator
469	0	badEntities.put("»", "\u00BB"); // right-pointing double angle quotation mark
470	0	badEntities.put("¼", "\u00BC"); // vulgar fraction one quarter
471	0	badEntities.put("½", "\u00BD"); // vulgar fraction one half
472	0	badEntities.put("¾", "\u00BE"); // vulgar fraction three quarters
473	0	badEntities.put("¿", "\u00BF"); // inverted question mark
474	0	badEntities.put("À", "\u00C0"); // latin capital letter A with grave
475	0	badEntities.put("Á", "\u00C1"); // latin capital letter A with acute
476	0	badEntities.put("Â", "\u00C2"); // latin capital letter A with circumflex
477	0	badEntities.put("Ã", "\u00C3"); // latin capital letter A with tilde
478	0	badEntities.put("Ä", "\u00C4"); // latin capital letter A with diaeresis
479	0	badEntities.put("Å", "\u00C5"); // latin capital letter A with ring above
480	0	badEntities.put("Æ", "\u00C6"); // latin capital letter AE
481	0	badEntities.put("Ç", "\u00C7"); // latin capital letter C with cedilla
482	0	badEntities.put("È", "\u00C8"); // latin capital letter E with grave
483	0	badEntities.put("É", "\u00C9"); // latin capital letter E with acute
484	0	badEntities.put("Ê", "\u00CA"); // latin capital letter E with circumflex
485	0	badEntities.put("Ë", "\u00CB"); // latin capital letter E with diaeresis
486	0	badEntities.put("Ì", "\u00CC"); // latin capital letter I with grave
487	0	badEntities.put("Í", "\u00CD"); // latin capital letter I with acute
488	0	badEntities.put("Î", "\u00CE"); // latin capital letter I with circumflex
489	0	badEntities.put("Ï", "\u00CF"); // latin capital letter I with diaeresis
490	0	badEntities.put("Ð", "\u00D0"); // latin capital letter ETH
491	0	badEntities.put("Ñ", "\u00D1"); // latin capital letter N with tilde
492	0	badEntities.put("Ò", "\u00D2"); // latin capital letter O with grave
493	0	badEntities.put("Ó", "\u00D3"); // latin capital letter O with acute
494	0	badEntities.put("Ô", "\u00D4"); // latin capital letter O with circumflex
495	0	badEntities.put("Õ", "\u00D5"); // latin capital letter O with tilde
496	0	badEntities.put("Ö", "\u00D6"); // latin capital letter O with diaeresis
497	0	badEntities.put("×", "\u00D7"); // multiplication sign
498	0	badEntities.put("Ø", "\u00D8"); // latin capital letter O with stroke
499	0	badEntities.put("Ù", "\u00D9"); // latin capital letter U with grave
500	0	badEntities.put("Ú", "\u00DA"); // latin capital letter U with acute
501	0	badEntities.put("Û", "\u00DB"); // latin capital letter U with circumflex
502	0	badEntities.put("Ü", "\u00DC"); // latin capital letter U with diaeresis
503	0	badEntities.put("Ý", "\u00DD"); // latin capital letter Y with acute
504	0	badEntities.put("Þ", "\u00DE"); // latin capital letter THORN
505	0	badEntities.put("ß", "\u00DF"); // latin small letter sharp s
506	0	badEntities.put("à", "\u00E0"); // latin small letter a with grave
507	0	badEntities.put("á", "\u00E1"); // latin small letter a with acute
508	0	badEntities.put("â", "\u00E2"); // latin small letter a with circumflex
509	0	badEntities.put("ã", "\u00E3"); // latin small letter a with tilde
510	0	badEntities.put("ä", "\u00E4"); // latin small letter a with diaeresis
511	0	badEntities.put("å", "\u00E5"); // latin small letter a with ring above
512	0	badEntities.put("æ", "\u00E6"); // latin small letter ae
513	0	badEntities.put("ç", "\u00E7"); // latin small letter c with cedilla
514	0	badEntities.put("è", "\u00E8"); // latin small letter e with grave
515	0	badEntities.put("é", "\u00E9"); // latin small letter e with acute
516	0	badEntities.put("ê", "\u00EA"); // latin small letter e with circumflex
517	0	badEntities.put("ë", "\u00EB"); // latin small letter e with diaeresis
518	0	badEntities.put("ì", "\u00EC"); // latin small letter i with grave
519	0	badEntities.put("í", "\u00ED"); // latin small letter i with acute
520	0	badEntities.put("î", "\u00EE"); // latin small letter i with circumflex
521	0	badEntities.put("ï", "\u00EF"); // latin small letter i with diaeresis
522	0	badEntities.put("ð", "\u00F0"); // latin small letter eth
523	0	badEntities.put("ñ", "\u00F1"); // latin small letter n with tilde
524	0	badEntities.put("ò", "\u00F2"); // latin small letter o with grave
525	0	badEntities.put("ó", "\u00F3"); // latin small letter o with acute
526	0	badEntities.put("ô", "\u00F4"); // latin small letter o with circumflex
527	0	badEntities.put("õ", "\u00F5"); // latin small letter o with tilde
528	0	badEntities.put("ö", "\u00F6"); // latin small letter o with diaeresis
529	0	badEntities.put("÷", "\u00F7"); // division sign
530	0	badEntities.put("ø", "\u00F8"); // latin small letter o with stroke
531	0	badEntities.put("ù", "\u00F9"); // latin small letter u with grave
532	0	badEntities.put("ú", "\u00FA"); // latin small letter u with acute
533	0	badEntities.put("û", "\u00FB"); // latin small letter u with circumflex
534	0	badEntities.put("ü", "\u00FC"); // latin small letter u with diaeresis
535	0	badEntities.put("ý", "\u00FD"); // latin small letter y with acute
536	0	badEntities.put("þ", "\u00FE"); // latin small letter thorn
537	0	badEntities.put("ÿ", "\u00FF"); // latin small letter y with diaeresis
538		}
539
540		/**
541		* Pattern for numeric entities.
542		*/
543	0	private static Pattern validCharacterEntityPattern = Pattern.compile("^&#x?\\d{2,4};");
544
545		/**
546		* Pattern that negates the allowable XML 4 byte unicode characters. Valid
547		* are: #x9 \| #xA \| #xD \| [#x20-#xD7FF] \| [#xE000-#xFFFD] \|
548		* [#x10000-#x10FFFF]
549		*/
550	0	private static Pattern invalidCharacterPattern = Pattern.compile("[^\t\r\n\u0020-\uD7FF\uE000-\uFFFD]");
551
552		/**
553		* Pattern that matches open <br>,<hr> and <img> tags.
554		*/
555	0	private static Pattern openHTMLTagPattern = Pattern.compile("<(img\|hr\|br)([^>]*)(?<!/)>");
556
557		/**
558		* The log stream
559		*/
560	0	private static final Logger log = LoggerFactory.getLogger(XMLUtil.class);
561		}