| Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
| Distance |
|
| 6.5;6.5 |
| 1 | /** | |
| 2 | * Distribution License: | |
| 3 | * JSword is free software; you can redistribute it and/or modify it under | |
| 4 | * the terms of the GNU Lesser General Public License, version 2.1 or later | |
| 5 | * as published by the Free Software Foundation. This program is distributed | |
| 6 | * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even | |
| 7 | * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | |
| 8 | * See the GNU Lesser General Public License for more details. | |
| 9 | * | |
| 10 | * The License is available on the internet at: | |
| 11 | * http://www.gnu.org/copyleft/lgpl.html | |
| 12 | * or by writing to: | |
| 13 | * Free Software Foundation, Inc. | |
| 14 | * 59 Temple Place - Suite 330 | |
| 15 | * Boston, MA 02111-1307, USA | |
| 16 | * | |
| 17 | * © CrossWire Bible Society, 2007 - 2016 | |
| 18 | * | |
| 19 | */ | |
| 20 | package org.crosswire.common.diff; | |
| 21 | ||
| 22 | /** | |
| 23 | * Compute the distance between 2 strings. The larger the number the greater the | |
| 24 | * distance. | |
| 25 | * | |
| 26 | * @see gnu.lgpl.License The GNU Lesser General Public License for details. | |
| 27 | * @author DM Smith | |
| 28 | */ | |
| 29 | public final class Distance { | |
| 30 | /** | |
| 31 | * Prevent instantiation. | |
| 32 | */ | |
| 33 | 0 | private Distance() { |
| 34 | 0 | } |
| 35 | ||
| 36 | /** | |
| 37 | * Compute the LevenshteinDistance between two strings. See <a | |
| 38 | * href="http://www.merriampark.com/ldjava.htm" | |
| 39 | * >www.merriampark.com/ldjava.htm</a> for original implementation. | |
| 40 | * | |
| 41 | * @param source | |
| 42 | * the baseline text | |
| 43 | * @param target | |
| 44 | * the changed text | |
| 45 | * @return the distance | |
| 46 | */ | |
| 47 | public static int getLevenshteinDistance(String source, String target) { | |
| 48 | 0 | if (source == null || target == null) { |
| 49 | 0 | throw new IllegalArgumentException("Strings must not be null"); |
| 50 | } | |
| 51 | ||
| 52 | /* | |
| 53 | * The difference between this impl. and the previous is that, rather | |
| 54 | * than creating and retaining a matrix of size s.length()+1 by | |
| 55 | * t.length()+1, we maintain two single-dimensional arrays of length | |
| 56 | * s.length()+1. The first, d, is the 'current working' distance array | |
| 57 | * that maintains the newest distance cost counts as we iterate through | |
| 58 | * the characters of String s. Each time we increment the index of | |
| 59 | * String t we are comparing, d is copied to p, the second int[]. Doing | |
| 60 | * so allows us to retain the previous cost counts as required by the | |
| 61 | * algorithm (taking the minimum of the cost count to the left, up one, | |
| 62 | * and diagonally up and to the left of the current cost count being | |
| 63 | * calculated). (Note that the arrays aren't really copied anymore, just | |
| 64 | * switched...this is clearly much better than cloning an array or doing | |
| 65 | * a System.arraycopy() each time through the outer loop.) | |
| 66 | * | |
| 67 | * Effectively, the difference between the two implementations is this | |
| 68 | * one does not cause an out of memory condition when calculating the LD | |
| 69 | * over two very large strings. | |
| 70 | */ | |
| 71 | ||
| 72 | 0 | int sourceLength = source.length(); // length of source |
| 73 | 0 | int targetLength = target.length(); // length of target |
| 74 | ||
| 75 | 0 | if (sourceLength == 0) { |
| 76 | 0 | return targetLength; |
| 77 | 0 | } else if (targetLength == 0) { |
| 78 | 0 | return sourceLength; |
| 79 | } | |
| 80 | ||
| 81 | 0 | int[] prevDist = new int[sourceLength + 1]; // 'previous' cost array, |
| 82 | // horizontally | |
| 83 | 0 | int[] dist = new int[sourceLength + 1]; // cost array, horizontally |
| 84 | int[] swap; // placeholder to assist in swapping prevDist and dist | |
| 85 | ||
| 86 | // indexes into strings source and target | |
| 87 | int i; // iterates through source | |
| 88 | int j; // iterates through target | |
| 89 | ||
| 90 | char targetJ; // jth character of t | |
| 91 | ||
| 92 | int cost; | |
| 93 | ||
| 94 | 0 | for (i = 0; i <= sourceLength; i++) { |
| 95 | 0 | prevDist[i] = i; |
| 96 | } | |
| 97 | ||
| 98 | 0 | for (j = 1; j <= targetLength; j++) { |
| 99 | 0 | targetJ = target.charAt(j - 1); |
| 100 | 0 | dist[0] = j; |
| 101 | ||
| 102 | 0 | for (i = 1; i <= sourceLength; i++) { |
| 103 | 0 | cost = source.charAt(i - 1) == targetJ ? 0 : 1; |
| 104 | // minimum of cell to the left + 1, to the top + 1, diagonally | |
| 105 | // left and up +cost | |
| 106 | 0 | dist[i] = Math.min(Math.min(dist[i - 1] + 1, prevDist[i] + 1), prevDist[i - 1] + cost); |
| 107 | } | |
| 108 | ||
| 109 | // copy current distance counts to 'previous row' distance counts | |
| 110 | 0 | swap = prevDist; |
| 111 | 0 | prevDist = dist; |
| 112 | 0 | dist = swap; |
| 113 | } | |
| 114 | ||
| 115 | // our last action in the above loop was to switch d and p, so p now | |
| 116 | // actually has the most recent cost counts | |
| 117 | 0 | return prevDist[sourceLength]; |
| 118 | } | |
| 119 | } |