| Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
| OSISFilter |
|
| 7.25;7.25 |
| 1 | /** | |
| 2 | * Distribution License: | |
| 3 | * JSword is free software; you can redistribute it and/or modify it under | |
| 4 | * the terms of the GNU Lesser General Public License, version 2.1 or later | |
| 5 | * as published by the Free Software Foundation. This program is distributed | |
| 6 | * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even | |
| 7 | * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | |
| 8 | * See the GNU Lesser General Public License for more details. | |
| 9 | * | |
| 10 | * The License is available on the internet at: | |
| 11 | * http://www.gnu.org/copyleft/lgpl.html | |
| 12 | * or by writing to: | |
| 13 | * Free Software Foundation, Inc. | |
| 14 | * 59 Temple Place - Suite 330 | |
| 15 | * Boston, MA 02111-1307, USA | |
| 16 | * | |
| 17 | * © CrossWire Bible Society, 2005 - 2016 | |
| 18 | * | |
| 19 | */ | |
| 20 | package org.crosswire.jsword.book.filter.osis; | |
| 21 | ||
| 22 | import java.io.IOException; | |
| 23 | import java.io.StringReader; | |
| 24 | import java.util.Arrays; | |
| 25 | import java.util.List; | |
| 26 | import java.util.concurrent.ArrayBlockingQueue; | |
| 27 | import java.util.concurrent.BlockingQueue; | |
| 28 | import java.util.regex.Pattern; | |
| 29 | ||
| 30 | import org.crosswire.common.xml.XMLUtil; | |
| 31 | import org.crosswire.jsword.book.Book; | |
| 32 | import org.crosswire.jsword.book.DataPolice; | |
| 33 | import org.crosswire.jsword.book.OSISUtil; | |
| 34 | import org.crosswire.jsword.book.filter.SourceFilter; | |
| 35 | import org.crosswire.jsword.passage.Key; | |
| 36 | import org.jdom2.Content; | |
| 37 | import org.jdom2.Document; | |
| 38 | import org.jdom2.Element; | |
| 39 | import org.jdom2.JDOMException; | |
| 40 | import org.jdom2.input.SAXBuilder; | |
| 41 | import org.xml.sax.InputSource; | |
| 42 | ||
| 43 | /** | |
| 44 | * Filter that parses a fragment of OSIS XML text into a list of JDOM content. | |
| 45 | * | |
| 46 | * @see gnu.lgpl.License The GNU Lesser General Public License for details. | |
| 47 | * @author Joe Walker | |
| 48 | */ | |
| 49 | 0 | public class OSISFilter implements SourceFilter { |
| 50 | ||
| 51 | /* (non-Javadoc) | |
| 52 | * @see org.crosswire.jsword.book.filter.SourceFilter#toOSIS(org.crosswire.jsword.book.Book, org.crosswire.jsword.passage.Key, java.lang.String) | |
| 53 | */ | |
| 54 | public List<Content> toOSIS(Book book, Key key, String plain) { | |
| 55 | 0 | Element ele = null; |
| 56 | 0 | Exception ex = null; |
| 57 | 0 | String clean = plain; |
| 58 | ||
| 59 | // The following converts bare <div>, </div>, </chapter>, <speech> and </speech> tags to their milestoned versions. | |
| 60 | // Current versions of osis2mod do this already. | |
| 61 | // Note: the patterns are literal, so an element that has attributes is not matched. | |
| 62 | 0 | clean = DIV_START.matcher(clean).replaceAll("<div sID=\"xyz\"/>"); |
| 63 | 0 | clean = DIV_END.matcher(clean).replaceAll("<div eID=\"xyz\"/>"); |
| 64 | 0 | clean = CHAPTER_END.matcher(clean).replaceAll("<chapter eID=\"xyz\"/>"); |
| 65 | 0 | clean = SPEECH_START.matcher(clean).replaceAll("<speech sID=\"xyz\"/>"); |
| 66 | 0 | clean = SPEECH_END.matcher(clean).replaceAll("<speech eID=\"xyz\"/>"); |
| 67 | ||
| 68 | // FIXME(dms): this is a major HACK handling a problem with a badly | |
| 69 | // encoded module. | |
| 70 | /* if (book.getInitials().startsWith("NET") && plain.endsWith("</div>")) { | |
| 71 | clean = clean.substring(0, plain.length() - 6); | |
| 72 | if (clean.matches(".*</div> <chapter eID=\"[A-Za-z0-9.]+\"/>")) { | |
| 73 | clean = clean.substring(0, clean.lastIndexOf("</div> <chapter")); | |
| 74 | } | |
| 75 | } else if (book.getInitials().equals("Kekchi") && plain.endsWith("</div> <lb type=\"x-begin-paragraph\"/>")) { | |
| 76 | clean = clean.substring(0, clean.length() - 37); | |
| 77 | } else if (book.getInitials().equals("VietLCCMN")) { | |
| 78 | int startPos = clean.indexOf("<div>"), endPos = clean.indexOf("</div>"); | |
| 79 | if (endPos != -1 && (startPos == -1 || startPos > endPos)) { | |
| 80 | if (clean.startsWith("<l ") || (clean.startsWith("<title ") && clean.contains("</title><l "))) | |
| 81 | clean = "<lg>"+clean; | |
| 82 | clean = "<div><div><div><div>"+clean; | |
| 83 | } | |
| 84 | } else */ | |
| 85 | 0 | if ("MapM".equals(book.getInitials())) { |
| 86 | 0 | for (String tag : Arrays.asList("cell", "row", "table")) { |
| 87 | 0 | int startPos = clean.indexOf("<" + tag + ">"); |
| 88 | 0 | int endPos = clean.indexOf("</" + tag + ">"); |
| 89 | 0 | if (endPos != -1 && (startPos == -1 || startPos > endPos)) { |
| 90 | 0 | clean = "<" + tag + ">" + clean; |
| 91 | } | |
| 92 | 0 | } |
| 93 | } | |
| 94 | ||
| 95 | try { | |
| 96 | 0 | ele = parse(clean); |
| 97 | 0 | } catch (JDOMException e) { |
| 98 | 0 | ex = e; |
| 99 | 0 | } catch (IOException e) { |
| 100 | 0 | ex = e; |
| 101 | 0 | } |
| 102 | ||
| 103 | 0 | if (ele == null) { |
| 104 | // There should be no bad entities in OSIS. | |
| 105 | 0 | String cleanedEntities = XMLUtil.cleanAllEntities(clean); |
| 106 | 0 | if (cleanedEntities != null && !cleanedEntities.equals(clean)) { |
| 107 | 0 | clean = cleanedEntities; |
| 108 | try { | |
| 109 | 0 | ele = parse(clean); |
| 110 | 0 | ex = null; |
| 111 | 0 | } catch (JDOMException e) { |
| 112 | 0 | ex = e; |
| 113 | 0 | } catch (IOException e) { |
| 114 | 0 | ex = e; |
| 115 | 0 | } |
| 116 | } | |
| 117 | } | |
| 118 | ||
| 119 | 0 | if (ele == null) { |
| 120 | 0 | String reclosed = XMLUtil.recloseTags(clean); |
| 121 | 0 | if (reclosed != null && !reclosed.equals(clean)) { |
| 122 | 0 | clean = reclosed; |
| 123 | try { | |
| 124 | 0 | ele = parse(clean); |
| 125 | 0 | ex = null; |
| 126 | 0 | } catch (JDOMException e) { |
| 127 | 0 | ex = e; |
| 128 | 0 | } catch (IOException e) { |
| 129 | 0 | ex = e; |
| 130 | 0 | } |
| 131 | } | |
| 132 | } | |
| 133 | ||
| 134 | 0 | if (ex != null) { |
| 135 | 0 | DataPolice.report(book, key, "Parse failed: " + ex.getMessage() + "\non: " + clean); |
| 136 | 0 | ele = cleanTags(book, key, clean); |
| 137 | } | |
| 138 | ||
| 139 | 0 | if (ele == null) { |
| 140 | 0 | ele = OSISUtil.factory().createP(); |
| 141 | } | |
| 142 | ||
| 143 | 0 | return ele.removeContent(); |
| 144 | } | |
| 145 | ||
| 146 | @Override | |
| 147 | public OSISFilter clone() { | |
| 148 | 0 | OSISFilter clone = null; |
| 149 | try { | |
| 150 | 0 | clone = (OSISFilter) super.clone(); |
| 151 | 0 | } catch (CloneNotSupportedException e) { |
| 152 | 0 | assert false : e; |
| 153 | 0 | } |
| 154 | 0 | return clone; |
| 155 | } | |
| 156 | ||
| 157 | private Element cleanTags(Book book, Key key, String plain) { | |
| 158 | // Fall back to stripping out everything that looks like an XML tag and parsing what is left | |
| 159 | 0 | String shawn = XMLUtil.cleanAllTags(plain); |
| 160 | 0 | Exception ex = null; |
| 161 | try { | |
| 162 | 0 | return parse(shawn); |
| 163 | 0 | } catch (JDOMException e) { |
| 164 | 0 | ex = e; |
| 165 | 0 | } catch (IOException e) { |
| 166 | 0 | ex = e; |
| 167 | 0 | } |
| 168 | ||
| 169 | 0 | DataPolice.report(book, key, "Parse failed: " + ex.getMessage() + "\non: " + shawn); |
| 170 | ||
| 171 | 0 | return null; |
| 172 | } | |
| 173 | ||
| 174 | /** | |
| 175 | * Parse the text, wrapped in a temporary root element, and return that root element. | |
| 176 | * This is a separate method because an invalid string may need more than one attempt at parsing. | |
| 177 | */ | |
| 178 | private Element parse(String plain) throws JDOMException, IOException { | |
| 179 | 0 | SAXBuilder builder = saxBuilders.poll(); |
| 180 | 0 | if (builder == null) { |
| 181 | // No re-usable SAXBuilder is available in the queue, so create a new one; it is offered to the queue after a successful parse | |
| 182 | 0 | builder = new SAXBuilder(); |
| 183 | // With JDom 1.x it was important to set Fast Reconfigure on re-usable SAXBuilders | |
| 184 | // This is the default with 2.x and this method does nothing | |
| 185 | // builder.setFastReconfigure(true); | |
| 186 | } | |
| 187 | ||
| 188 | // create a root element to house our document fragment | |
| 189 | 0 | StringReader in = null; |
| 190 | Element div; | |
| 191 | try { | |
| 192 | // Need to contain it in something that we remove when returning it to the user. | |
| 193 | 0 | in = new StringReader("<xxx>" + plain + "</xxx>"); |
| 194 | 0 | InputSource is = new InputSource(in); |
| 195 | 0 | Document doc = builder.build(is); |
| 196 | 0 | div = doc.getRootElement(); |
| 197 | } finally { | |
| 198 | 0 | if (in != null) { |
| 199 | 0 | in.close(); |
| 200 | } | |
| 201 | } | |
| 202 | ||
| 203 | // Return the builder to the queue (or add a newly created one). The return value of offer() is ignored: if the queue is full the builder is simply not kept for re-use | |
| 204 | 0 | saxBuilders.offer(builder); |
| 205 | ||
| 206 | 0 | return div; |
| 207 | } | |
| 208 | ||
| 209 | // Holds up to 32 re-usable SAXBuilders; this does not bound the number of callers, since parse() creates a new builder whenever the queue is empty | |
| 210 | 0 | private BlockingQueue<SAXBuilder> saxBuilders = new ArrayBlockingQueue<SAXBuilder>(32); |
| 211 | ||
| 212 | /** | |
| 213 | * Pattern to find the start of a div. Used to convert to a milestoned version. | |
| 214 | */ | |
| 215 | 0 | private static final Pattern DIV_START = Pattern.compile("<div>", Pattern.LITERAL); |
| 216 | /** | |
| 217 | * Pattern to find the end of a div. Used to convert to a milestoned version. | |
| 218 | */ | |
| 219 | 0 | private static final Pattern DIV_END = Pattern.compile("</div>", Pattern.LITERAL); |
| 220 | /** | |
| 221 | * Pattern to find the end of a chapter. Used to convert to a milestoned version. | |
| 222 | */ | |
| 223 | 0 | private static final Pattern CHAPTER_END = Pattern.compile("</chapter>", Pattern.LITERAL); |
| 224 | /** | |
| 225 | * Pattern to find the start of a speech. Used to convert to a milestoned version. | |
| 226 | */ | |
| 227 | 0 | private static final Pattern SPEECH_START = Pattern.compile("<speech>", Pattern.LITERAL); |
| 228 | /** | |
| 229 | * Pattern to find the end of a speech. Used to convert to a milestoned version. | |
| 230 | */ | |
| 231 | 0 | private static final Pattern SPEECH_END = Pattern.compile("</speech>", Pattern.LITERAL); |
| 232 | } |