Coverage Report - org.crosswire.common.diff.Bitap
 
Classes in this File Line Coverage Branch Coverage Complexity
Bitap
0%
0/78
0%
0/34
2.5
 
 1  
 /**
 2  
  * Distribution License:
 3  
  * JSword is free software; you can redistribute it and/or modify it under
 4  
  * the terms of the GNU Lesser General Public License, version 2.1 or later
 5  
  * as published by the Free Software Foundation. This program is distributed
 6  
  * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
 7  
  * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 8  
  * See the GNU Lesser General Public License for more details.
 9  
  *
 10  
  * The License is available on the internet at:
 11  
  *      http://www.gnu.org/copyleft/lgpl.html
 12  
  * or by writing to:
 13  
  *      Free Software Foundation, Inc.
 14  
  *      59 Temple Place - Suite 330
 15  
  *      Boston, MA 02111-1307, USA
 16  
  *
 17  
  * © CrossWire Bible Society, 2007 - 2016
 18  
  *
 19  
  */
 20  
 package org.crosswire.common.diff;
 21  
 
 22  
 import java.util.HashMap;
 23  
 import java.util.Map;
 24  
 
 25  
 /**
 26  
  * An implementation of the Bitap algorithm for finding a "fuzzy" location of a
 27  
  * match.
 28  
  * 
 29  
  * Based on the LGPL Diff_Match_Patch v1.5 javascript of Neil Fraser, Copyright (C) 2006<br>
 30  
  * <a href="http://neil.fraser.name/software/diff_match_patch/">http://neil.fraser.name/software/diff_match_patch/</a>
 31  
  * 
 32  
  * @see gnu.lgpl.License The GNU Lesser General Public License for details.
 33  
  * @author DM Smith
 34  
  */
 35  0
 public class Bitap implements Locator {
 36  
     /**
 37  
      * Locate the best instance of 'pattern' in 'text' near 'loc'.
 38  
      * 
 39  
      * @param text
 40  
      *            The text to search
 41  
      * @param pattern
 42  
      *            The pattern to search for
 43  
      * @param loc
 44  
      *            The location to search around
 45  
      */
 46  0
     public Bitap(String text, String pattern, int loc) {
 47  0
         this.text = text;
 48  0
         this.pattern = pattern;
 49  0
         this.loc = loc;
 50  0
         alphabet = new HashMap<Character, Integer>();
 51  0
     }
 52  
 
 53  
     /*
 54  
      * (non-Javadoc)
 55  
      * 
 56  
      * @see org.crosswire.common.diff.Locator#maxPatternLength()
 57  
      */
 58  
     public int maxPatternLength() {
 59  0
         return MAXBITS;
 60  
     }
 61  
 
 62  
     /*
 63  
      * (non-Javadoc)
 64  
      * 
 65  
      * @see org.crosswire.common.diff.Locator#locate()
 66  
      */
 67  
     public int locate() {
 68  
         // Initialize the alphabet.
 69  0
         alphabet();
 70  
 
 71  
         // Coerce the text length between reasonable maximums and minimums.
 72  0
         scoreTextLength = Math.max(text.length(), Bitap.minLength);
 73  0
         scoreTextLength = Math.min(scoreTextLength, Bitap.maxLength);
 74  
 
 75  
         // Highest score beyond which we give up.
 76  0
         double scoreThreshold = Bitap.threshold;
 77  
 
 78  
         // Is there a nearby exact match? (speedup)
 79  0
         int bestLoc = text.indexOf(pattern, loc);
 80  0
         if (bestLoc != -1) {
 81  0
             scoreThreshold = Math.min(bitapScore(0, bestLoc), scoreThreshold);
 82  
         }
 83  
 
 84  
         // What about in the other direction? (speedup)
 85  0
         bestLoc = text.lastIndexOf(pattern, loc + pattern.length());
 86  0
         if (bestLoc != -1) {
 87  0
             scoreThreshold = Math.min(bitapScore(0, bestLoc), scoreThreshold);
 88  
         }
 89  
 
 90  
         // Initialize the bit arrays.
 91  0
         int matchmask = (int) Math.pow(2, pattern.length() - 1);
 92  
 
 93  0
         bestLoc = -1;
 94  
 
 95  
         int binMin;
 96  
         int binMid;
 97  0
         int binMax = Math.max(loc + loc, text.length());
 98  0
         int[] lastrd = new int[0];
 99  0
         for (int d = 0; d < pattern.length(); d++) {
 100  
             // Scan for the best match; each iteration allows for one more
 101  
             // error.
 102  0
             int[] rd = new int[text.length()];
 103  
 
 104  
             // Run a binary search to determine how far from 'loc' we can stray
 105  
             // at this error level.
 106  0
             binMin = loc;
 107  0
             binMid = binMax;
 108  0
             while (binMin < binMid) {
 109  0
                 if (bitapScore(d, binMid) < scoreThreshold) {
 110  0
                     binMin = binMid;
 111  
                 } else {
 112  0
                     binMax = binMid;
 113  
                 }
 114  0
                 binMid = (binMax - binMin) / 2 + binMin;
 115  
             }
 116  
 
 117  0
             binMax = binMid; // Use the result from this iteration as the
 118  
             // maximum for the next.
 119  0
             int start = Math.max(0, loc - (binMid - loc) - 1);
 120  0
             int finish = Math.min(text.length() - 1, pattern.length() + binMid);
 121  
 
 122  0
             if (text.charAt(finish) == pattern.charAt(pattern.length() - 1)) {
 123  0
                 rd[finish] = (int) Math.pow(2, d + 1) - 1;
 124  
             } else {
 125  0
                 rd[finish] = (int) Math.pow(2, d) - 1;
 126  
             }
 127  
 
 128  0
             for (int j = finish - 1; j >= start; j--) {
 129  0
                 Character curChar = Character.valueOf(text.charAt(j));
 130  0
                 int mask = alphabet.containsKey(curChar) ? alphabet.get(curChar).intValue() : 0;
 131  0
                 if (d == 0) { // First pass: exact match.
 132  0
                     rd[j] = ((rd[j + 1] << 1) | 1) & mask;
 133  
                 } else { // Subsequent passes: fuzzy match.
 134  0
                     rd[j] = ((rd[j + 1] << 1) | 1) & mask | ((lastrd[j + 1] << 1) | 1) | ((lastrd[j] << 1) | 1) | lastrd[j + 1];
 135  
                 }
 136  
 
 137  0
                 if ((rd[j] & matchmask) != 0) {
 138  0
                     double score = bitapScore(d, j);
 139  
                     // This match will almost certainly be better than any
 140  
                     // existing match. But check anyway.
 141  0
                     if (score <= scoreThreshold) {
 142  
                         // Told you so.
 143  0
                         scoreThreshold = score;
 144  0
                         bestLoc = j;
 145  0
                         if (j > loc) {
 146  
                             // When passing loc, don't exceed our current
 147  
                             // distance from loc.
 148  0
                             start = Math.max(0, loc - (j - loc));
 149  
                         } else {
 150  
                             // Already passed loc, downhill from here on in.
 151  
                             break;
 152  
                         }
 153  
                     }
 154  
                 }
 155  
             }
 156  
 
 157  0
             if (bitapScore(d + 1, loc) > scoreThreshold) // No hope for a
 158  
             // (better) match at
 159  
             // greater error
 160  
             // levels.
 161  
             {
 162  
                 // No hope for a (better) match at greater error levels.
 163  0
                 break;
 164  
             }
 165  
 
 166  0
             lastrd = rd;
 167  
         }
 168  
 
 169  0
         return bestLoc;
 170  
     }
 171  
 
 172  
     protected Map<Character, Integer> getAlphabet() {
 173  0
         return alphabet;
 174  
     }
 175  
 
 176  
     /**
 177  
      * Compute and return the score for a match with e errors and x location.
 178  
      * 
 179  
      * @param e
 180  
      *            Number of errors in match
 181  
      * @param x
 182  
      *            Location of match
 183  
      * @return Overall score for match
 184  
      */
 185  
     private double bitapScore(int e, int x) {
 186  
         // Compute and return the score for a match with e errors and x
 187  
         // location.
 188  0
         int d = Math.abs(loc - x);
 189  0
         return (e / (float) pattern.length() / Bitap.balance) + (d / (float) scoreTextLength / (1.0 - Bitap.balance));
 190  
     }
 191  
 
 192  
     // Initialize the alphabet for the Bitap algorithm.
 193  
     protected void alphabet() {
 194  0
         int len = pattern.length();
 195  
 
 196  0
         assert len <= Bitap.MAXBITS : "Pattern too long for this application.";
 197  
 
 198  0
         for (int i = 0; i < len; i++) {
 199  0
             Character c = Character.valueOf(pattern.charAt(i));
 200  0
             Integer value = alphabet.get(c);
 201  0
             int mask = value == null ? 0 : value.intValue();
 202  0
             mask |= (int) Math.pow(2, len - i - 1);
 203  0
             alphabet.put(c, Integer.valueOf(mask));
 204  
         }
 205  0
     }
 206  
 
 207  
     public static void setBalance(float newBalance) {
 208  0
         balance = newBalance;
 209  0
     }
 210  
 
 211  
     public static void setThreshold(float newThreshold) {
 212  0
         threshold = newThreshold;
 213  0
     }
 214  
 
 215  
     public static void setMinLength(int newMinLength) {
 216  0
         minLength = newMinLength;
 217  0
     }
 218  
 
 219  
     public static void setMaxLength(int newMaxLength) {
 220  0
         maxLength = newMaxLength;
 221  0
     }
 222  
 
 223  
     /**
 224  
      * The maximum number of bits in an int. Change this to 64 if long is used
 225  
      * by alphabet.
 226  
      */
 227  
     private static final int MAXBITS = 32;
 228  
 
 229  
     /**
 230  
      * Tweak the relative importance (0.0 = accuracy, 1.0 = proximity)
 231  
      */
 232  
     private static final float BALANCE = 0.5f;
 233  0
     private static float balance = BALANCE;
 234  
 
 235  
     /**
 236  
      * At what point is no match declared (0.0 = perfection, 1.0 = very loose)
 237  
      */
 238  
     private static final float THRESHOLD = 0.5f;
 239  0
     private static float threshold = THRESHOLD;
 240  
 
 241  
     /**
 242  
      * The min and max cutoffs used when computing text lengths.
 243  
      */
 244  
     private static final int MINLENGTH = 100;
 245  0
     private static int minLength = MINLENGTH;
 246  
     private static final int MAXLENGTH = 1000;
 247  0
     private static int maxLength = MAXLENGTH;
 248  
 
 249  
     /**
 250  
      * The text to search.
 251  
      */
 252  
     private String text;
 253  
 
 254  
     /**
 255  
      * The pattern to find in the text.
 256  
      */
 257  
     private String pattern;
 258  
 
 259  
     /**
 260  
      * The location in text to focus the search.
 261  
      */
 262  
     private int loc;
 263  
 
 264  
     /**
 265  
      * The length of the string constrained between MINLENGHT and MAXLENGTH
 266  
      */
 267  
     private int scoreTextLength;
 268  
 
 269  
     /**
 270  
      * Alphabet is the compiled representation of the pattern.
 271  
      */
 272  
     private Map<Character, Integer> alphabet;
 273  
 }