Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
Bitap |
|
| 2.5;2.5 |
1 | /** | |
2 | * Distribution License: | |
3 | * JSword is free software; you can redistribute it and/or modify it under | |
4 | * the terms of the GNU Lesser General Public License, version 2.1 or later | |
5 | * as published by the Free Software Foundation. This program is distributed | |
6 | * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even | |
7 | * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | |
8 | * See the GNU Lesser General Public License for more details. | |
9 | * | |
10 | * The License is available on the internet at: | |
11 | * http://www.gnu.org/copyleft/lgpl.html | |
12 | * or by writing to: | |
13 | * Free Software Foundation, Inc. | |
14 | * 59 Temple Place - Suite 330 | |
15 | * Boston, MA 02111-1307, USA | |
16 | * | |
17 | * © CrossWire Bible Society, 2007 - 2016 | |
18 | * | |
19 | */ | |
20 | package org.crosswire.common.diff; | |
21 | ||
22 | import java.util.HashMap; | |
23 | import java.util.Map; | |
24 | ||
25 | /** | |
26 | * An implementation of the Bitap algorithm for finding a "fuzzy" location of a | |
27 | * match. | |
28 | * | |
29 | * Based on the LGPL Diff_Match_Patch v1.5 javascript of Neil Fraser, Copyright (C) 2006<br> | |
30 | * <a href="http://neil.fraser.name/software/diff_match_patch/">http://neil.fraser.name/software/diff_match_patch/</a> | |
31 | * | |
32 | * @see gnu.lgpl.License The GNU Lesser General Public License for details. | |
33 | * @author DM Smith | |
34 | */ | |
35 | 0 | public class Bitap implements Locator { |
36 | /** | |
37 | * Locate the best instance of 'pattern' in 'text' near 'loc'. | |
38 | * | |
39 | * @param text | |
40 | * The text to search | |
41 | * @param pattern | |
42 | * The pattern to search for | |
43 | * @param loc | |
44 | * The location to search around | |
45 | */ | |
46 | 0 | public Bitap(String text, String pattern, int loc) { |
47 | 0 | this.text = text; |
48 | 0 | this.pattern = pattern; |
49 | 0 | this.loc = loc; |
50 | 0 | alphabet = new HashMap<Character, Integer>(); |
51 | 0 | } |
52 | ||
53 | /* | |
54 | * (non-Javadoc) | |
55 | * | |
56 | * @see org.crosswire.common.diff.Locator#maxPatternLength() | |
57 | */ | |
58 | public int maxPatternLength() { | |
59 | 0 | return MAXBITS; |
60 | } | |
61 | ||
62 | /* | |
63 | * (non-Javadoc) | |
64 | * | |
65 | * @see org.crosswire.common.diff.Locator#locate() | |
66 | */ | |
67 | public int locate() { | |
68 | // Initialize the alphabet. | |
69 | 0 | alphabet(); |
70 | ||
71 | // Coerce the text length between reasonable maximums and minimums. | |
72 | 0 | scoreTextLength = Math.max(text.length(), Bitap.minLength); |
73 | 0 | scoreTextLength = Math.min(scoreTextLength, Bitap.maxLength); |
74 | ||
75 | // Highest score beyond which we give up. | |
76 | 0 | double scoreThreshold = Bitap.threshold; |
77 | ||
78 | // Is there a nearby exact match? (speedup) | |
79 | 0 | int bestLoc = text.indexOf(pattern, loc); |
80 | 0 | if (bestLoc != -1) { |
81 | 0 | scoreThreshold = Math.min(bitapScore(0, bestLoc), scoreThreshold); |
82 | } | |
83 | ||
84 | // What about in the other direction? (speedup) | |
85 | 0 | bestLoc = text.lastIndexOf(pattern, loc + pattern.length()); |
86 | 0 | if (bestLoc != -1) { |
87 | 0 | scoreThreshold = Math.min(bitapScore(0, bestLoc), scoreThreshold); |
88 | } | |
89 | ||
90 | // Initialize the bit arrays. | |
91 | 0 | int matchmask = (int) Math.pow(2, pattern.length() - 1); |
92 | ||
93 | 0 | bestLoc = -1; |
94 | ||
95 | int binMin; | |
96 | int binMid; | |
97 | 0 | int binMax = Math.max(loc + loc, text.length()); |
98 | 0 | int[] lastrd = new int[0]; |
99 | 0 | for (int d = 0; d < pattern.length(); d++) { |
100 | // Scan for the best match; each iteration allows for one more | |
101 | // error. | |
102 | 0 | int[] rd = new int[text.length()]; |
103 | ||
104 | // Run a binary search to determine how far from 'loc' we can stray | |
105 | // at this error level. | |
106 | 0 | binMin = loc; |
107 | 0 | binMid = binMax; |
108 | 0 | while (binMin < binMid) { |
109 | 0 | if (bitapScore(d, binMid) < scoreThreshold) { |
110 | 0 | binMin = binMid; |
111 | } else { | |
112 | 0 | binMax = binMid; |
113 | } | |
114 | 0 | binMid = (binMax - binMin) / 2 + binMin; |
115 | } | |
116 | ||
117 | 0 | binMax = binMid; // Use the result from this iteration as the |
118 | // maximum for the next. | |
119 | 0 | int start = Math.max(0, loc - (binMid - loc) - 1); |
120 | 0 | int finish = Math.min(text.length() - 1, pattern.length() + binMid); |
121 | ||
122 | 0 | if (text.charAt(finish) == pattern.charAt(pattern.length() - 1)) { |
123 | 0 | rd[finish] = (int) Math.pow(2, d + 1) - 1; |
124 | } else { | |
125 | 0 | rd[finish] = (int) Math.pow(2, d) - 1; |
126 | } | |
127 | ||
128 | 0 | for (int j = finish - 1; j >= start; j--) { |
129 | 0 | Character curChar = Character.valueOf(text.charAt(j)); |
130 | 0 | int mask = alphabet.containsKey(curChar) ? alphabet.get(curChar).intValue() : 0; |
131 | 0 | if (d == 0) { // First pass: exact match. |
132 | 0 | rd[j] = ((rd[j + 1] << 1) | 1) & mask; |
133 | } else { // Subsequent passes: fuzzy match. | |
134 | 0 | rd[j] = ((rd[j + 1] << 1) | 1) & mask | ((lastrd[j + 1] << 1) | 1) | ((lastrd[j] << 1) | 1) | lastrd[j + 1]; |
135 | } | |
136 | ||
137 | 0 | if ((rd[j] & matchmask) != 0) { |
138 | 0 | double score = bitapScore(d, j); |
139 | // This match will almost certainly be better than any | |
140 | // existing match. But check anyway. | |
141 | 0 | if (score <= scoreThreshold) { |
142 | // Told you so. | |
143 | 0 | scoreThreshold = score; |
144 | 0 | bestLoc = j; |
145 | 0 | if (j > loc) { |
146 | // When passing loc, don't exceed our current | |
147 | // distance from loc. | |
148 | 0 | start = Math.max(0, loc - (j - loc)); |
149 | } else { | |
150 | // Already passed loc, downhill from here on in. | |
151 | break; | |
152 | } | |
153 | } | |
154 | } | |
155 | } | |
156 | ||
157 | 0 | if (bitapScore(d + 1, loc) > scoreThreshold) // No hope for a |
158 | // (better) match at | |
159 | // greater error | |
160 | // levels. | |
161 | { | |
162 | // No hope for a (better) match at greater error levels. | |
163 | 0 | break; |
164 | } | |
165 | ||
166 | 0 | lastrd = rd; |
167 | } | |
168 | ||
169 | 0 | return bestLoc; |
170 | } | |
171 | ||
172 | protected Map<Character, Integer> getAlphabet() { | |
173 | 0 | return alphabet; |
174 | } | |
175 | ||
176 | /** | |
177 | * Compute and return the score for a match with e errors and x location. | |
178 | * | |
179 | * @param e | |
180 | * Number of errors in match | |
181 | * @param x | |
182 | * Location of match | |
183 | * @return Overall score for match | |
184 | */ | |
185 | private double bitapScore(int e, int x) { | |
186 | // Compute and return the score for a match with e errors and x | |
187 | // location. | |
188 | 0 | int d = Math.abs(loc - x); |
189 | 0 | return (e / (float) pattern.length() / Bitap.balance) + (d / (float) scoreTextLength / (1.0 - Bitap.balance)); |
190 | } | |
191 | ||
192 | // Initialize the alphabet for the Bitap algorithm. | |
193 | protected void alphabet() { | |
194 | 0 | int len = pattern.length(); |
195 | ||
196 | 0 | assert len <= Bitap.MAXBITS : "Pattern too long for this application."; |
197 | ||
198 | 0 | for (int i = 0; i < len; i++) { |
199 | 0 | Character c = Character.valueOf(pattern.charAt(i)); |
200 | 0 | Integer value = alphabet.get(c); |
201 | 0 | int mask = value == null ? 0 : value.intValue(); |
202 | 0 | mask |= (int) Math.pow(2, len - i - 1); |
203 | 0 | alphabet.put(c, Integer.valueOf(mask)); |
204 | } | |
205 | 0 | } |
206 | ||
207 | public static void setBalance(float newBalance) { | |
208 | 0 | balance = newBalance; |
209 | 0 | } |
210 | ||
211 | public static void setThreshold(float newThreshold) { | |
212 | 0 | threshold = newThreshold; |
213 | 0 | } |
214 | ||
215 | public static void setMinLength(int newMinLength) { | |
216 | 0 | minLength = newMinLength; |
217 | 0 | } |
218 | ||
219 | public static void setMaxLength(int newMaxLength) { | |
220 | 0 | maxLength = newMaxLength; |
221 | 0 | } |
222 | ||
223 | /** | |
224 | * The maximum number of bits in an int. Change this to 64 if long is used | |
225 | * by alphabet. | |
226 | */ | |
227 | private static final int MAXBITS = 32; | |
228 | ||
229 | /** | |
230 | * Tweak the relative importance (0.0 = accuracy, 1.0 = proximity) | |
231 | */ | |
232 | private static final float BALANCE = 0.5f; | |
233 | 0 | private static float balance = BALANCE; |
234 | ||
235 | /** | |
236 | * At what point is no match declared (0.0 = perfection, 1.0 = very loose) | |
237 | */ | |
238 | private static final float THRESHOLD = 0.5f; | |
239 | 0 | private static float threshold = THRESHOLD; |
240 | ||
241 | /** | |
242 | * The min and max cutoffs used when computing text lengths. | |
243 | */ | |
244 | private static final int MINLENGTH = 100; | |
245 | 0 | private static int minLength = MINLENGTH; |
246 | private static final int MAXLENGTH = 1000; | |
247 | 0 | private static int maxLength = MAXLENGTH; |
248 | ||
249 | /** | |
250 | * The text to search. | |
251 | */ | |
252 | private String text; | |
253 | ||
254 | /** | |
255 | * The pattern to find in the text. | |
256 | */ | |
257 | private String pattern; | |
258 | ||
259 | /** | |
260 | * The location in text to focus the search. | |
261 | */ | |
262 | private int loc; | |
263 | ||
264 | /** | |
265 | * The length of the string constrained between MINLENGHT and MAXLENGTH | |
266 | */ | |
267 | private int scoreTextLength; | |
268 | ||
269 | /** | |
270 | * Alphabet is the compiled representation of the pattern. | |
271 | */ | |
272 | private Map<Character, Integer> alphabet; | |
273 | } |