| Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
| Bitap |
|
| 2.5;2.5 |
| 1 | /** | |
| 2 | * Distribution License: | |
| 3 | * JSword is free software; you can redistribute it and/or modify it under | |
| 4 | * the terms of the GNU Lesser General Public License, version 2.1 or later | |
| 5 | * as published by the Free Software Foundation. This program is distributed | |
| 6 | * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even | |
| 7 | * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | |
| 8 | * See the GNU Lesser General Public License for more details. | |
| 9 | * | |
| 10 | * The License is available on the internet at: | |
| 11 | * http://www.gnu.org/copyleft/lgpl.html | |
| 12 | * or by writing to: | |
| 13 | * Free Software Foundation, Inc. | |
| 14 | * 59 Temple Place - Suite 330 | |
| 15 | * Boston, MA 02111-1307, USA | |
| 16 | * | |
| 17 | * © CrossWire Bible Society, 2007 - 2016 | |
| 18 | * | |
| 19 | */ | |
| 20 | package org.crosswire.common.diff; | |
| 21 | ||
| 22 | import java.util.HashMap; | |
| 23 | import java.util.Map; | |
| 24 | ||
| 25 | /** | |
| 26 | * An implementation of the Bitap algorithm for finding a "fuzzy" location of a | |
| 27 | * match. | |
| 28 | * | |
| 29 | * Based on the LGPL Diff_Match_Patch v1.5 javascript of Neil Fraser, Copyright (C) 2006<br> | |
| 30 | * <a href="http://neil.fraser.name/software/diff_match_patch/">http://neil.fraser.name/software/diff_match_patch/</a> | |
| 31 | * | |
| 32 | * @see gnu.lgpl.License The GNU Lesser General Public License for details. | |
| 33 | * @author DM Smith | |
| 34 | */ | |
| 35 | 0 | public class Bitap implements Locator { |
| 36 | /** | |
| 37 | * Locate the best instance of 'pattern' in 'text' near 'loc'. | |
| 38 | * | |
| 39 | * @param text | |
| 40 | * The text to search | |
| 41 | * @param pattern | |
| 42 | * The pattern to search for | |
| 43 | * @param loc | |
| 44 | * The location to search around | |
| 45 | */ | |
| 46 | 0 | public Bitap(String text, String pattern, int loc) { |
| 47 | 0 | this.text = text; |
| 48 | 0 | this.pattern = pattern; |
| 49 | 0 | this.loc = loc; |
| 50 | 0 | alphabet = new HashMap<Character, Integer>(); |
| 51 | 0 | } |
| 52 | ||
| 53 | /* | |
| 54 | * (non-Javadoc) | |
| 55 | * | |
| 56 | * @see org.crosswire.common.diff.Locator#maxPatternLength() | |
| 57 | */ | |
| 58 | public int maxPatternLength() { | |
| 59 | 0 | return MAXBITS; |
| 60 | } | |
| 61 | ||
| 62 | /* | |
| 63 | * (non-Javadoc) | |
| 64 | * | |
| 65 | * @see org.crosswire.common.diff.Locator#locate() | |
| 66 | */ | |
| 67 | public int locate() { | |
| 68 | // Initialize the alphabet. | |
| 69 | 0 | alphabet(); |
| 70 | ||
| 71 | // Coerce the text length between reasonable maximums and minimums. | |
| 72 | 0 | scoreTextLength = Math.max(text.length(), Bitap.minLength); |
| 73 | 0 | scoreTextLength = Math.min(scoreTextLength, Bitap.maxLength); |
| 74 | ||
| 75 | // Highest score beyond which we give up. | |
| 76 | 0 | double scoreThreshold = Bitap.threshold; |
| 77 | ||
| 78 | // Is there a nearby exact match? (speedup) | |
| 79 | 0 | int bestLoc = text.indexOf(pattern, loc); |
| 80 | 0 | if (bestLoc != -1) { |
| 81 | 0 | scoreThreshold = Math.min(bitapScore(0, bestLoc), scoreThreshold); |
| 82 | } | |
| 83 | ||
| 84 | // What about in the other direction? (speedup) | |
| 85 | 0 | bestLoc = text.lastIndexOf(pattern, loc + pattern.length()); |
| 86 | 0 | if (bestLoc != -1) { |
| 87 | 0 | scoreThreshold = Math.min(bitapScore(0, bestLoc), scoreThreshold); |
| 88 | } | |
| 89 | ||
| 90 | // Initialize the bit arrays. | |
| 91 | 0 | int matchmask = (int) Math.pow(2, pattern.length() - 1); |
| 92 | ||
| 93 | 0 | bestLoc = -1; |
| 94 | ||
| 95 | int binMin; | |
| 96 | int binMid; | |
| 97 | 0 | int binMax = Math.max(loc + loc, text.length()); |
| 98 | 0 | int[] lastrd = new int[0]; |
| 99 | 0 | for (int d = 0; d < pattern.length(); d++) { |
| 100 | // Scan for the best match; each iteration allows for one more | |
| 101 | // error. | |
| 102 | 0 | int[] rd = new int[text.length()]; |
| 103 | ||
| 104 | // Run a binary search to determine how far from 'loc' we can stray | |
| 105 | // at this error level. | |
| 106 | 0 | binMin = loc; |
| 107 | 0 | binMid = binMax; |
| 108 | 0 | while (binMin < binMid) { |
| 109 | 0 | if (bitapScore(d, binMid) < scoreThreshold) { |
| 110 | 0 | binMin = binMid; |
| 111 | } else { | |
| 112 | 0 | binMax = binMid; |
| 113 | } | |
| 114 | 0 | binMid = (binMax - binMin) / 2 + binMin; |
| 115 | } | |
| 116 | ||
| 117 | 0 | binMax = binMid; // Use the result from this iteration as the |
| 118 | // maximum for the next. | |
| 119 | 0 | int start = Math.max(0, loc - (binMid - loc) - 1); |
| 120 | 0 | int finish = Math.min(text.length() - 1, pattern.length() + binMid); |
| 121 | ||
| 122 | 0 | if (text.charAt(finish) == pattern.charAt(pattern.length() - 1)) { |
| 123 | 0 | rd[finish] = (int) Math.pow(2, d + 1) - 1; |
| 124 | } else { | |
| 125 | 0 | rd[finish] = (int) Math.pow(2, d) - 1; |
| 126 | } | |
| 127 | ||
| 128 | 0 | for (int j = finish - 1; j >= start; j--) { |
| 129 | 0 | Character curChar = Character.valueOf(text.charAt(j)); |
| 130 | 0 | int mask = alphabet.containsKey(curChar) ? alphabet.get(curChar).intValue() : 0; |
| 131 | 0 | if (d == 0) { // First pass: exact match. |
| 132 | 0 | rd[j] = ((rd[j + 1] << 1) | 1) & mask; |
| 133 | } else { // Subsequent passes: fuzzy match. | |
| 134 | 0 | rd[j] = ((rd[j + 1] << 1) | 1) & mask | ((lastrd[j + 1] << 1) | 1) | ((lastrd[j] << 1) | 1) | lastrd[j + 1]; |
| 135 | } | |
| 136 | ||
| 137 | 0 | if ((rd[j] & matchmask) != 0) { |
| 138 | 0 | double score = bitapScore(d, j); |
| 139 | // This match will almost certainly be better than any | |
| 140 | // existing match. But check anyway. | |
| 141 | 0 | if (score <= scoreThreshold) { |
| 142 | // Told you so. | |
| 143 | 0 | scoreThreshold = score; |
| 144 | 0 | bestLoc = j; |
| 145 | 0 | if (j > loc) { |
| 146 | // When passing loc, don't exceed our current | |
| 147 | // distance from loc. | |
| 148 | 0 | start = Math.max(0, loc - (j - loc)); |
| 149 | } else { | |
| 150 | // Already passed loc, downhill from here on in. | |
| 151 | break; | |
| 152 | } | |
| 153 | } | |
| 154 | } | |
| 155 | } | |
| 156 | ||
| 157 | 0 | if (bitapScore(d + 1, loc) > scoreThreshold) // No hope for a |
| 158 | // (better) match at | |
| 159 | // greater error | |
| 160 | // levels. | |
| 161 | { | |
| 162 | // No hope for a (better) match at greater error levels. | |
| 163 | 0 | break; |
| 164 | } | |
| 165 | ||
| 166 | 0 | lastrd = rd; |
| 167 | } | |
| 168 | ||
| 169 | 0 | return bestLoc; |
| 170 | } | |
| 171 | ||
| 172 | protected Map<Character, Integer> getAlphabet() { | |
| 173 | 0 | return alphabet; |
| 174 | } | |
| 175 | ||
| 176 | /** | |
| 177 | * Compute and return the score for a match with e errors and x location. | |
| 178 | * | |
| 179 | * @param e | |
| 180 | * Number of errors in match | |
| 181 | * @param x | |
| 182 | * Location of match | |
| 183 | * @return Overall score for match | |
| 184 | */ | |
| 185 | private double bitapScore(int e, int x) { | |
| 186 | // Compute and return the score for a match with e errors and x | |
| 187 | // location. | |
| 188 | 0 | int d = Math.abs(loc - x); |
| 189 | 0 | return (e / (float) pattern.length() / Bitap.balance) + (d / (float) scoreTextLength / (1.0 - Bitap.balance)); |
| 190 | } | |
| 191 | ||
| 192 | // Initialize the alphabet for the Bitap algorithm. | |
| 193 | protected void alphabet() { | |
| 194 | 0 | int len = pattern.length(); |
| 195 | ||
| 196 | 0 | assert len <= Bitap.MAXBITS : "Pattern too long for this application."; |
| 197 | ||
| 198 | 0 | for (int i = 0; i < len; i++) { |
| 199 | 0 | Character c = Character.valueOf(pattern.charAt(i)); |
| 200 | 0 | Integer value = alphabet.get(c); |
| 201 | 0 | int mask = value == null ? 0 : value.intValue(); |
| 202 | 0 | mask |= (int) Math.pow(2, len - i - 1); |
| 203 | 0 | alphabet.put(c, Integer.valueOf(mask)); |
| 204 | } | |
| 205 | 0 | } |
| 206 | ||
| 207 | public static void setBalance(float newBalance) { | |
| 208 | 0 | balance = newBalance; |
| 209 | 0 | } |
| 210 | ||
| 211 | public static void setThreshold(float newThreshold) { | |
| 212 | 0 | threshold = newThreshold; |
| 213 | 0 | } |
| 214 | ||
| 215 | public static void setMinLength(int newMinLength) { | |
| 216 | 0 | minLength = newMinLength; |
| 217 | 0 | } |
| 218 | ||
| 219 | public static void setMaxLength(int newMaxLength) { | |
| 220 | 0 | maxLength = newMaxLength; |
| 221 | 0 | } |
| 222 | ||
| 223 | /** | |
| 224 | * The maximum number of bits in an int. Change this to 64 if long is used | |
| 225 | * by alphabet. | |
| 226 | */ | |
| 227 | private static final int MAXBITS = 32; | |
| 228 | ||
| 229 | /** | |
| 230 | * Tweak the relative importance (0.0 = accuracy, 1.0 = proximity) | |
| 231 | */ | |
| 232 | private static final float BALANCE = 0.5f; | |
| 233 | 0 | private static float balance = BALANCE; |
| 234 | ||
| 235 | /** | |
| 236 | * At what point is no match declared (0.0 = perfection, 1.0 = very loose) | |
| 237 | */ | |
| 238 | private static final float THRESHOLD = 0.5f; | |
| 239 | 0 | private static float threshold = THRESHOLD; |
| 240 | ||
| 241 | /** | |
| 242 | * The min and max cutoffs used when computing text lengths. | |
| 243 | */ | |
| 244 | private static final int MINLENGTH = 100; | |
| 245 | 0 | private static int minLength = MINLENGTH; |
| 246 | private static final int MAXLENGTH = 1000; | |
| 247 | 0 | private static int maxLength = MAXLENGTH; |
| 248 | ||
| 249 | /** | |
| 250 | * The text to search. | |
| 251 | */ | |
| 252 | private String text; | |
| 253 | ||
| 254 | /** | |
| 255 | * The pattern to find in the text. | |
| 256 | */ | |
| 257 | private String pattern; | |
| 258 | ||
| 259 | /** | |
| 260 | * The location in text to focus the search. | |
| 261 | */ | |
| 262 | private int loc; | |
| 263 | ||
| 264 | /** | |
| 265 | * The length of the string constrained between MINLENGHT and MAXLENGTH | |
| 266 | */ | |
| 267 | private int scoreTextLength; | |
| 268 | ||
| 269 | /** | |
| 270 | * Alphabet is the compiled representation of the pattern. | |
| 271 | */ | |
| 272 | private Map<Character, Integer> alphabet; | |
| 273 | } |