| Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
| SentenceUtil |
|
| 3.6666666666666665;3.667 |
| 1 | /** | |
| 2 | * Distribution License: | |
| 3 | * JSword is free software; you can redistribute it and/or modify it under | |
| 4 | * the terms of the GNU Lesser General Public License, version 2.1 or later | |
| 5 | * as published by the Free Software Foundation. This program is distributed | |
| 6 | * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even | |
| 7 | * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | |
| 8 | * See the GNU Lesser General Public License for more details. | |
| 9 | * | |
| 10 | * The License is available on the internet at: | |
| 11 | * http://www.gnu.org/copyleft/lgpl.html | |
| 12 | * or by writing to: | |
| 13 | * Free Software Foundation, Inc. | |
| 14 | * 59 Temple Place - Suite 330 | |
| 15 | * Boston, MA 02111-1307, USA | |
| 16 | * | |
| 17 | * © CrossWire Bible Society, 2005 - 2016 | |
| 18 | * | |
| 19 | */ | |
| 20 | package org.crosswire.jsword.book; | |
| 21 | ||
| 22 | import java.util.ArrayList; | |
| 23 | import java.util.List; | |
| 24 | import java.util.Locale; | |
| 25 | ||
| 26 | import org.crosswire.common.util.StringUtil; | |
| 27 | ||
| 28 | /** | |
| 29 | * The SentenceUtil class provide utility functions for the various Books. | |
| 30 | * | |
| 31 | * It is not designed to be used outside of the book package, so using it | |
| 32 | * outside of these bounds is at your own risk. | |
| 33 | * | |
| 34 | * @see gnu.lgpl.License The GNU Lesser General Public License for details. | |
| 35 | * @author Joe Walker | |
| 36 | */ | |
| 37 | public final class SentenceUtil { | |
| 38 | /** | |
| 39 | * Ensure we can not be instantiated | |
| 40 | */ | |
| 41 | 0 | private SentenceUtil() { |
| 42 | 0 | } |
| 43 | ||
| 44 | /** | |
| 45 | * Take a string and tokenize it using " " and "--" as delimiters into an | |
| 46 | * Array of Strings. There is a question mark over what to do with initial | |
| 47 | * spaces. This algorithm discards them, I'm not sure if this is the right | |
| 48 | * thing to do. | |
| 49 | * | |
| 50 | * @param sentence | |
| 51 | * The string to parse. | |
| 52 | * @return The string array | |
| 53 | */ | |
| 54 | public static String[] tokenize(String sentence) { | |
| 55 | 0 | List<String> tokens = new ArrayList<String>(); |
| 56 | ||
| 57 | 0 | int pos = 0; |
| 58 | String temp; | |
| 59 | 0 | boolean alive = true; |
| 60 | ||
| 61 | 0 | while (alive) { |
| 62 | // Find the next space and double dash | |
| 63 | 0 | int nextSpace = sentence.indexOf(' ', pos); |
| 64 | 0 | int nextDDash = sentence.indexOf("--", pos); |
| 65 | ||
| 66 | // If there is a space just after the ddash then ignore the ddash | |
| 67 | 0 | if (nextSpace == nextDDash + 2) { |
| 68 | 0 | nextDDash = -1; |
| 69 | } | |
| 70 | ||
| 71 | // If there is a ddash just after the space then ignore the space | |
| 72 | 0 | if (nextDDash == nextSpace + 1) { |
| 73 | 0 | nextSpace = -1; |
| 74 | } | |
| 75 | ||
| 76 | // if there are no more tokens then just add in what we've got. | |
| 77 | 0 | if (nextSpace == -1 && nextDDash == -1) { |
| 78 | 0 | temp = sentence.substring(pos); |
| 79 | 0 | alive = false; |
| 80 | 0 | } else if ((nextSpace != -1 && nextSpace < nextDDash) || (nextDDash == -1)) { |
| 81 | // Space is next if it is not -1 and it is less than ddash | |
| 82 | // The next separator is a space | |
| 83 | 0 | temp = sentence.substring(pos, nextSpace) + ' '; |
| 84 | 0 | pos = nextSpace + 1; |
| 85 | } else { | |
| 86 | // The next separator is a ddash | |
| 87 | 0 | temp = sentence.substring(pos, nextDDash) + "--"; |
| 88 | 0 | pos = nextDDash + 2; |
| 89 | } | |
| 90 | ||
| 91 | 0 | if (temp != null && !"".equals(temp.trim())) { |
| 92 | 0 | tokens.add(temp); |
| 93 | } | |
| 94 | 0 | } |
| 95 | ||
| 96 | // Create a String[] | |
| 97 | 0 | String[] retcode = new String[tokens.size()]; |
| 98 | 0 | int i = 0; |
| 99 | 0 | for (String token : tokens) { |
| 100 | 0 | retcode[i++] = token; |
| 101 | } | |
| 102 | ||
| 103 | 0 | return retcode; |
| 104 | } | |
| 105 | ||
| 106 | /** | |
| 107 | * From a sentence get a list of words (in original order) without any | |
| 108 | * punctuation, and all in lower case. | |
| 109 | * | |
| 110 | * @param words | |
| 111 | * Words with punctuation | |
| 112 | * @return Words without punctuation | |
| 113 | */ | |
| 114 | public static String[] stripPunctuation(String... words) { | |
| 115 | 0 | String[] retcode = new String[words.length]; |
| 116 | ||
| 117 | // Remove the punctuation from the ends of the words. | |
| 118 | 0 | for (int i = 0; i < words.length; i++) { |
| 119 | 0 | retcode[i] = stripPunctuationWord(words[i]); |
| 120 | } | |
| 121 | ||
| 122 | 0 | return retcode; |
| 123 | } | |
| 124 | ||
| 125 | /** | |
| 126 | * From a sentence get a list of words (in original order) without any | |
| 127 | * punctuation, and all in lower case. | |
| 128 | * | |
| 129 | * @param words | |
| 130 | * Words with punctuation | |
| 131 | * @return Punctuation without words | |
| 132 | */ | |
| 133 | public static String[] stripWords(String... words) { | |
| 134 | 0 | if (words.length == 0) { |
| 135 | 0 | return new String[0]; |
| 136 | } | |
| 137 | ||
| 138 | 0 | String[] retcode = new String[words.length + 1]; |
| 139 | ||
| 140 | // The first bit of punctuation is what comes in front of the first word | |
| 141 | 0 | int first = firstLetter(words[0]); |
| 142 | 0 | if (first == 0) { |
| 143 | 0 | retcode[0] = ""; |
| 144 | } else { | |
| 145 | 0 | retcode[0] = words[0].substring(0, first); |
| 146 | } | |
| 147 | ||
| 148 | // The rest of the words | |
| 149 | 0 | for (int i = 1; i < words.length; i++) { |
| 150 | 0 | retcode[i] = stripWords(words[i - 1], words[i]); |
| 151 | } | |
| 152 | ||
| 153 | // The last bit of punctuation is what comes at the end of the last word | |
| 154 | 0 | int last = lastLetter(words[words.length - 1]); |
| 155 | 0 | if (last == words[words.length - 1].length()) { |
| 156 | 0 | retcode[words.length] = ""; |
| 157 | } else { | |
| 158 | 0 | retcode[words.length] = words[words.length - 1].substring(last + 1); |
| 159 | } | |
| 160 | ||
| 161 | 0 | return retcode; |
| 162 | } | |
| 163 | ||
| 164 | /** | |
| 165 | * Remove the punctuation from the ends of the word. The special case is | |
| 166 | * that if the first word ends "--" and the last word has no punctuation at | |
| 167 | * the beginning, then the answer is "--" and not "-- ". We miss out the | |
| 168 | * space because "--" is a special separator. | |
| 169 | * | |
| 170 | * @param first | |
| 171 | * The word to grab the punctuation from the end of | |
| 172 | * @param last | |
| 173 | * The word to grab the punctuation from the start of | |
| 174 | * @return The end of the first, a space, and the end of the first | |
| 175 | */ | |
| 176 | public static String stripWords(String first, String last) { | |
| 177 | 0 | String init1 = first.substring(lastLetter(first) + 1); |
| 178 | 0 | String init2 = last.substring(0, firstLetter(last)); |
| 179 | ||
| 180 | 0 | return init1 + init2; |
| 181 | } | |
| 182 | ||
| 183 | /** | |
| 184 | * From a sentence get a list of words (in original order) without any | |
| 185 | * punctuation, and all in lower case. | |
| 186 | * | |
| 187 | * @param aSentence | |
| 188 | * The string to parse. | |
| 189 | * @return The words split up as an array | |
| 190 | */ | |
| 191 | public static String[] getWords(String aSentence) { | |
| 192 | 0 | String sentence = aSentence; |
| 193 | // First there are some things we regard as word delimiters even if | |
| 194 | // they are not near space. Note that "-" should not be in this list | |
| 195 | // because words like abel-beth-maiacha contain them. | |
| 196 | 0 | sentence = sentence.replaceAll("--", " "); |
| 197 | 0 | sentence = sentence.replace('.', ' '); |
| 198 | 0 | sentence = sentence.replace('!', ' '); |
| 199 | 0 | sentence = sentence.replace('?', ' '); |
| 200 | 0 | sentence = sentence.replace(':', ' '); |
| 201 | 0 | sentence = sentence.replace(';', ' '); |
| 202 | 0 | sentence = sentence.replace('"', ' '); |
| 203 | 0 | sentence = sentence.replace('\'', ' '); |
| 204 | 0 | sentence = sentence.replace('(', ' '); |
| 205 | 0 | sentence = sentence.replace(')', ' '); |
| 206 | ||
| 207 | 0 | String[] words = StringUtil.split(sentence, " "); |
| 208 | 0 | String[] retcode = new String[words.length]; |
| 209 | ||
| 210 | // Remove the punctuation from the ends of the words. | |
| 211 | 0 | for (int i = 0; i < words.length; i++) { |
| 212 | 0 | retcode[i] = stripPunctuationWord(words[i]).toLowerCase(Locale.ENGLISH); |
| 213 | } | |
| 214 | ||
| 215 | 0 | return retcode; |
| 216 | } | |
| 217 | ||
| 218 | /** | |
| 219 | * Remove the punctuation from the ends of the word | |
| 220 | * | |
| 221 | * @param word | |
| 222 | * Word with punctuation | |
| 223 | * @return Word without punctuation | |
| 224 | */ | |
| 225 | public static String stripPunctuationWord(String word) { | |
| 226 | 0 | int first = firstLetter(word); |
| 227 | 0 | int last = lastLetter(word) + 1; |
| 228 | ||
| 229 | 0 | if (first > last) { |
| 230 | 0 | return word; |
| 231 | } | |
| 232 | ||
| 233 | 0 | return word.substring(first, last); |
| 234 | } | |
| 235 | ||
| 236 | /** | |
| 237 | * Where is the first letter in this word | |
| 238 | * | |
| 239 | * @param word | |
| 240 | * The word to search for letters | |
| 241 | * @return The offset of the first letter | |
| 242 | */ | |
| 243 | public static int firstLetter(String word) { | |
| 244 | int first; | |
| 245 | ||
| 246 | 0 | for (first = 0; first < word.length(); first++) { |
| 247 | 0 | char c = word.charAt(first); |
| 248 | 0 | if (Character.isLetterOrDigit(c)) { |
| 249 | 0 | break; |
| 250 | } | |
| 251 | } | |
| 252 | ||
| 253 | 0 | return first; |
| 254 | } | |
| 255 | ||
| 256 | /** | |
| 257 | * Where is the last letter in this word | |
| 258 | * | |
| 259 | * @param word | |
| 260 | * The word to search for letters | |
| 261 | * @return The offset of the last letter | |
| 262 | */ | |
| 263 | public static int lastLetter(String word) { | |
| 264 | int last; | |
| 265 | ||
| 266 | 0 | for (last = word.length() - 1; last >= 0; last--) { |
| 267 | 0 | char c = word.charAt(last); |
| 268 | 0 | if (Character.isLetterOrDigit(c)) { |
| 269 | 0 | break; |
| 270 | } | |
| 271 | } | |
| 272 | ||
| 273 | 0 | return last; |
| 274 | } | |
| 275 | } |