Coverage Report - org.crosswire.jsword.book.filter.osis.OSISFilter
 
Classes in this File Line Coverage Branch Coverage Complexity
OSISFilter
0%
0/84
0%
0/32
7.25
 
 1  
 /**
 2  
  * Distribution License:
 3  
  * JSword is free software; you can redistribute it and/or modify it under
 4  
  * the terms of the GNU Lesser General Public License, version 2.1 or later
 5  
  * as published by the Free Software Foundation. This program is distributed
 6  
  * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
 7  
  * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 8  
  * See the GNU Lesser General Public License for more details.
 9  
  *
 10  
  * The License is available on the internet at:
 11  
  *      http://www.gnu.org/copyleft/lgpl.html
 12  
  * or by writing to:
 13  
  *      Free Software Foundation, Inc.
 14  
  *      59 Temple Place - Suite 330
 15  
  *      Boston, MA 02111-1307, USA
 16  
  *
 17  
  * © CrossWire Bible Society, 2005 - 2016
 18  
  *
 19  
  */
 20  
 package org.crosswire.jsword.book.filter.osis;
 21  
 
 22  
 import java.io.IOException;
 23  
 import java.io.StringReader;
 24  
 import java.util.Arrays;
 25  
 import java.util.List;
 26  
 import java.util.concurrent.ArrayBlockingQueue;
 27  
 import java.util.concurrent.BlockingQueue;
 28  
 import java.util.regex.Pattern;
 29  
 
 30  
 import org.crosswire.common.xml.XMLUtil;
 31  
 import org.crosswire.jsword.book.Book;
 32  
 import org.crosswire.jsword.book.DataPolice;
 33  
 import org.crosswire.jsword.book.OSISUtil;
 34  
 import org.crosswire.jsword.book.filter.SourceFilter;
 35  
 import org.crosswire.jsword.passage.Key;
 36  
 import org.jdom2.Content;
 37  
 import org.jdom2.Document;
 38  
 import org.jdom2.Element;
 39  
 import org.jdom2.JDOMException;
 40  
 import org.jdom2.input.SAXBuilder;
 41  
 import org.xml.sax.InputSource;
 42  
 
 43  
 /**
 44  
  * Filter that parses a string of OSIS XML into a list of JDOM OSIS content.
 45  
  * 
 46  
  * @see gnu.lgpl.License The GNU Lesser General Public License for details.
 47  
  * @author Joe Walker
 48  
  */
 49  0
 public class OSISFilter implements SourceFilter {
 50  
 
 51  
     /* (non-Javadoc)
 52  
      * @see org.crosswire.jsword.book.filter.Filter#toOSIS(org.crosswire.jsword.book.Book, org.crosswire.jsword.passage.Key, java.lang.String)
 53  
      */
 54  
     public List<Content> toOSIS(Book book, Key key, String plain) {
 55  0
         Element ele = null;
 56  0
         Exception ex = null;
 57  0
         String clean = plain;
 58  
 
 59  
         // The following converts simple <div> and </div> to their milestoned versions.
 60  
         // Current versions of osis2mod do this already
 61  
         // Note: if the div element has attributes, it is not seen.
 62  0
         clean = DIV_START.matcher(clean).replaceAll("<div sID=\"xyz\"/>");
 63  0
         clean = DIV_END.matcher(clean).replaceAll("<div eID=\"xyz\"/>");
 64  0
         clean = CHAPTER_END.matcher(clean).replaceAll("<chapter eID=\"xyz\"/>");
 65  0
         clean = SPEECH_START.matcher(clean).replaceAll("<speech sID=\"xyz\"/>");
 66  0
         clean = SPEECH_END.matcher(clean).replaceAll("<speech eID=\"xyz\"/>");
 67  
 
 68  
         // FIXME(dms): this is a major HACK handling a problem with a badly
 69  
         // encoded module.
 70  
         /* if (book.getInitials().startsWith("NET") && plain.endsWith("</div>")) {
 71  
             clean = clean.substring(0, plain.length() - 6);
 72  
             if (clean.matches(".*</div> <chapter eID=\"[A-Za-z0-9.]+\"/>")) {
 73  
                 clean = clean.substring(0, clean.lastIndexOf("</div> <chapter"));
 74  
             }
 75  
         } else if (book.getInitials().equals("Kekchi") && plain.endsWith("</div> <lb type=\"x-begin-paragraph\"/>")) {
 76  
             clean = clean.substring(0, clean.length() - 37);
 77  
         } else if (book.getInitials().equals("VietLCCMN")) {
 78  
             int startPos = clean.indexOf("<div>"), endPos = clean.indexOf("</div>");
 79  
             if (endPos != -1 && (startPos == -1 || startPos > endPos)) {
 80  
                 if (clean.startsWith("<l ") || (clean.startsWith("<title ") && clean.contains("</title><l ")))
 81  
                     clean = "<lg>"+clean;
 82  
                 clean = "<div><div><div><div>"+clean;
 83  
             }
 84  
         } else */
 85  0
         if ("MapM".equals(book.getInitials())) {
 86  0
             for (String tag : Arrays.asList("cell", "row", "table")) {
 87  0
                 int startPos = clean.indexOf("<" + tag + ">");
 88  0
                 int endPos = clean.indexOf("</" + tag + ">");
 89  0
                 if (endPos != -1 && (startPos == -1 || startPos > endPos)) {
 90  0
                     clean = "<" + tag + ">" + clean;
 91  
                 }
 92  0
             }
 93  
         }
 94  
 
 95  
         try {
 96  0
             ele = parse(clean);
 97  0
         } catch (JDOMException e) {
 98  0
             ex = e;
 99  0
         } catch (IOException e) {
 100  0
             ex = e;
 101  0
         }
 102  
 
 103  0
         if (ele == null) {
 104  
             // There should be no bad entities in OSIS.
 105  0
             String cleanedEntities = XMLUtil.cleanAllEntities(clean);
 106  0
             if (cleanedEntities != null && !cleanedEntities.equals(clean)) {
 107  0
                 clean = cleanedEntities;
 108  
                 try {
 109  0
                     ele = parse(clean);
 110  0
                     ex = null;
 111  0
                 } catch (JDOMException e) {
 112  0
                     ex = e;
 113  0
                 } catch (IOException e) {
 114  0
                     ex = e;
 115  0
                 }
 116  
             }
 117  
         }
 118  
 
 119  0
         if (ele == null) {
 120  0
             String reclosed = XMLUtil.recloseTags(clean);
 121  0
             if (reclosed != null && !reclosed.equals(clean)) {
 122  0
                 clean = reclosed;
 123  
                 try {
 124  0
                     ele = parse(clean);
 125  0
                     ex = null;
 126  0
                 } catch (JDOMException e) {
 127  0
                     ex = e;
 128  0
                 } catch (IOException e) {
 129  0
                     ex = e;
 130  0
                 }
 131  
             }
 132  
         }
 133  
 
 134  0
         if (ex != null) {
 135  0
             DataPolice.report(book, key, "Parse failed: " + ex.getMessage() + "\non: " + clean);
 136  0
             ele = cleanTags(book, key, clean);
 137  
         }
 138  
 
 139  0
         if (ele == null) {
 140  0
             ele = OSISUtil.factory().createP();
 141  
         }
 142  
 
 143  0
         return ele.removeContent();
 144  
     }
 145  
 
 146  
     @Override
 147  
     public OSISFilter clone() {
 148  0
         OSISFilter clone = null;
 149  
         try {
 150  0
             clone = (OSISFilter) super.clone();
 151  0
         } catch (CloneNotSupportedException e) {
 152  0
             assert false : e;
 153  0
         }
 154  0
         return clone;
 155  
     }
 156  
 
 157  
     private Element cleanTags(Book book, Key key, String plain) {
 158  
         // So just try to strip out all XML looking things
 159  0
         String shawn = XMLUtil.cleanAllTags(plain);
 160  0
         Exception ex = null;
 161  
         try {
 162  0
             return parse(shawn);
 163  0
         } catch (JDOMException e) {
 164  0
             ex = e;
 165  0
         } catch (IOException e) {
 166  0
             ex = e;
 167  0
         }
 168  
 
 169  0
         DataPolice.report(book, key, "Parse failed: " + ex.getMessage() + "\non: " + shawn);
 170  
 
 171  0
         return null;
 172  
     }
 173  
 
 174  
     /**
 175  
      * If the string is invalid then we might want to have more than one crack
 176  
      * at parsing it
 177  
      */
 178  
     private Element parse(String plain) throws JDOMException, IOException {
 179  0
         SAXBuilder builder = saxBuilders.poll();
 180  0
         if (builder == null) {
 181  
             //then we have no sax builders available, so let's create a new one and store
 182  0
             builder = new SAXBuilder();
 183  
             // With JDom 1.x it was important to set Fast Reconfigure on re-usable SAXBuilders
 184  
             // This is the default with 2.x and this method does nothing
 185  
             // builder.setFastReconfigure(true);
 186  
         }
 187  
 
 188  
         // create a root element to house our document fragment
 189  0
         StringReader in = null;
 190  
         Element div;
 191  
         try {
 192  
             // Need to contain it in something that we remove when returning it to the user.
 193  0
             in = new StringReader("<xxx>" + plain + "</xxx>");
 194  0
             InputSource is = new InputSource(in);
 195  0
             Document doc = builder.build(is);
 196  0
             div = doc.getRootElement();
 197  
         } finally {
 198  0
             if (in != null) {
 199  0
                 in.close();
 200  
             }
 201  
         }
 202  
 
 203  
         //return builder to queue, or offer a new one. Ignore return value as we don't care whether the builder is going to be re-used
 204  0
         saxBuilders.offer(builder);
 205  
 
 206  0
         return div;
 207  
     }
 208  
 
 209  
     // space for 32 re-usable sax builders, but doesn't bound the number available to the callers
 210  0
     private BlockingQueue<SAXBuilder> saxBuilders = new ArrayBlockingQueue<SAXBuilder>(32);
 211  
 
 212  
     /**
 213  
      * Pattern to find the start of a div. Used to convert to a milestoned version.
 214  
      */
 215  0
     private static final Pattern DIV_START = Pattern.compile("<div>", Pattern.LITERAL);
 216  
     /**
 217  
      * Pattern to find the end of a div. Used to convert to a milestoned version.
 218  
      */
 219  0
     private static final Pattern DIV_END = Pattern.compile("</div>", Pattern.LITERAL);
 220  
     /**
 221  
      * Pattern to find the end of a chapter. Used to convert to a milestoned version.
 222  
      */
 223  0
     private static final Pattern CHAPTER_END = Pattern.compile("</chapter>", Pattern.LITERAL);
 224  
     /**
 225  
      * Pattern to find the start of a speech. Used to convert to a milestoned version.
 226  
      */
 227  0
     private static final Pattern SPEECH_START = Pattern.compile("<speech>", Pattern.LITERAL);
 228  
     /**
 229  
      * Pattern to find the end of a speech. Used to convert to a milestoned version.
 230  
      */
 231  0
     private static final Pattern SPEECH_END = Pattern.compile("</speech>", Pattern.LITERAL);
 232  
 }