Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
SwordUtil |
|
| 2.7222222222222223;2.722 |
1 | /** | |
2 | * Distribution License: | |
3 | * JSword is free software; you can redistribute it and/or modify it under | |
4 | * the terms of the GNU Lesser General Public License, version 2.1 or later | |
5 | * as published by the Free Software Foundation. This program is distributed | |
6 | * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even | |
7 | * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | |
8 | * See the GNU Lesser General Public License for more details. | |
9 | * | |
10 | * The License is available on the internet at: | |
11 | * http://www.gnu.org/copyleft/lgpl.html | |
12 | * or by writing to: | |
13 | * Free Software Foundation, Inc. | |
14 | * 59 Temple Place - Suite 330 | |
15 | * Boston, MA 02111-1307, USA | |
16 | * | |
17 | * © CrossWire Bible Society, 2005 - 2016 | |
18 | * | |
19 | */ | |
20 | package org.crosswire.jsword.book.sword; | |
21 | ||
22 | import java.io.IOException; | |
23 | import java.io.RandomAccessFile; | |
24 | import java.io.UnsupportedEncodingException; | |
25 | import java.net.URI; | |
26 | ||
27 | import org.crosswire.common.util.NetUtil; | |
28 | import org.crosswire.jsword.JSOtherMsg; | |
29 | import org.crosswire.jsword.book.BookException; | |
30 | import org.crosswire.jsword.book.BookMetaData; | |
31 | import org.slf4j.Logger; | |
32 | import org.slf4j.LoggerFactory; | |
33 | ||
34 | /** | |
35 | * Various utilities used by different Sword classes. | |
36 | * | |
37 | * @see gnu.lgpl.License The GNU Lesser General Public License for details. | |
38 | * @author Joe Walker | |
39 | */ | |
40 | public final class SwordUtil { | |
/**
 * Utility class offering static helpers only; it is never instantiated.
 */
private SwordUtil() {
    // intentionally empty
}
46 | ||
47 | /** | |
48 | * Read a RandomAccessFile | |
49 | * | |
50 | * @param raf | |
51 | * The file to read | |
52 | * @param offset | |
53 | * The start of the record to read | |
54 | * @param theSize | |
55 | * The number of bytes to read | |
56 | * @return the read data | |
57 | * @throws IOException | |
58 | * on error | |
59 | */ | |
60 | protected static byte[] readRAF(RandomAccessFile raf, long offset, int theSize) throws IOException { | |
61 | 0 | raf.seek(offset); |
62 | 0 | return readNextRAF(raf, theSize); |
63 | } | |
64 | ||
65 | /** | |
66 | * Read a RandomAccessFile from the current location in the file. | |
67 | * | |
68 | * @param raf | |
69 | * The file to read | |
70 | * @param theSize | |
71 | * The number of bytes to read | |
72 | * @return the read data | |
73 | * @throws IOException | |
74 | * on error | |
75 | */ | |
76 | protected static byte[] readNextRAF(RandomAccessFile raf, int theSize) throws IOException { | |
77 | 0 | long offset = raf.getFilePointer(); |
78 | 0 | int size = theSize; |
79 | 0 | long rafSize = raf.length(); |
80 | ||
81 | // It is common to have an entry that points to nothing. | |
82 | // That is the equivalent of an empty string. | |
83 | 0 | if (size == 0) { |
84 | 0 | return new byte[0]; |
85 | } | |
86 | ||
87 | 0 | if (size < 0) { |
88 | 0 | log.error("Nothing to read at offset = {} returning empty because negative size={}", Long.toString(offset), Integer.toString(size)); |
89 | 0 | return new byte[0]; |
90 | } | |
91 | ||
92 | 0 | if (offset >= rafSize) { |
93 | 0 | log.error("Attempt to read beyond end. offset={} size={} but raf.length={}", Long.toString(offset), Integer.toString(size), Long.toString(rafSize)); |
94 | 0 | return new byte[0]; |
95 | } | |
96 | ||
97 | 0 | if (offset + size > raf.length()) { |
98 | 0 | log.error("Need to reduce size to avoid EOFException. offset={} size={} but raf.length={}", Long.toString(offset), Integer.toString(size), Long.toString(rafSize)); |
99 | 0 | size = (int) (raf.length() - offset); |
100 | } | |
101 | ||
102 | 0 | byte[] read = new byte[size]; |
103 | 0 | raf.readFully(read); |
104 | ||
105 | 0 | return read; |
106 | } | |
107 | ||
108 | /** | |
109 | * Writes "data" to a RandomAccessFile at the "offset" position | |
110 | * | |
111 | * @param raf | |
112 | * RandomAccessFile | |
113 | * @param offset | |
114 | * offset to write at | |
115 | * @param data | |
116 | * data to write | |
117 | * @throws IOException | |
118 | * on error | |
119 | */ | |
120 | protected static void writeRAF(RandomAccessFile raf, long offset, byte[] data) throws IOException { | |
121 | 0 | raf.seek(offset); |
122 | 0 | writeNextRAF(raf, data); |
123 | 0 | } |
124 | ||
125 | protected static void writeNextRAF(RandomAccessFile raf, byte[] data) throws IOException { | |
126 | 0 | if (data == null) { |
127 | 0 | return; |
128 | } | |
129 | 0 | raf.write(data); |
130 | 0 | } |
131 | ||
132 | /** | |
133 | * Read a RandomAccessFile until a particular byte is seen | |
134 | * | |
135 | * @param raf | |
136 | * The file to read | |
137 | * @param offset | |
138 | * The start of the record to read | |
139 | * @param stopByte | |
140 | * The point at which to stop reading | |
141 | * @return the read data | |
142 | * @throws IOException | |
143 | * on error | |
144 | */ | |
145 | protected static byte[] readUntilRAF(RandomAccessFile raf, int offset, byte stopByte) throws IOException { | |
146 | 0 | raf.seek(offset); |
147 | 0 | return readUntilRAF(raf, stopByte); |
148 | } | |
149 | ||
150 | /** | |
151 | * Read a RandomAccessFile until a particular byte is seen | |
152 | * | |
153 | * @param raf | |
154 | * The file to read | |
155 | * @param stopByte | |
156 | * The point at which to stop reading | |
157 | * @return the read data | |
158 | * @throws IOException | |
159 | * on error | |
160 | */ | |
161 | protected static byte[] readUntilRAF(RandomAccessFile raf, byte stopByte) throws IOException { | |
162 | // The strategy used here is to read the file twice. | |
163 | // Once to determine how much to read and then getting the actual data. | |
164 | // It may be more efficient to incrementally build up a byte buffer. | |
165 | // Note: that growing a static array by 1 byte at a time is O(n**2) | |
166 | // This is negligible when the n is small, but prohibitive otherwise. | |
167 | 0 | long offset = raf.getFilePointer(); |
168 | 0 | int size = 0; |
169 | ||
170 | 0 | int nextByte = -1; |
171 | do { | |
172 | 0 | nextByte = raf.read(); |
173 | ||
174 | 0 | size++; |
175 | 0 | } while (nextByte != -1 && nextByte != stopByte); |
176 | ||
177 | // Note: we allow for nextByte == -1 to be included in size | |
178 | // so that readRAF will report EOF errors | |
179 | 0 | return readRAF(raf, offset, size); |
180 | } | |
181 | ||
182 | /** | |
183 | * Decode little endian data from a byte array. This assumes that the high | |
184 | * order bit is not set as this is used solely for an offset in a file in | |
185 | * bytes. For a practical limit, 2**31 is way bigger than any document that | |
186 | * we can have. | |
187 | * | |
188 | * @param data | |
189 | * the byte[] from which to read 4 bytes | |
190 | * @param offset | |
191 | * the offset into the array | |
192 | * @return The decoded data | |
193 | */ | |
194 | public static int decodeLittleEndian32(byte[] data, int offset) { | |
195 | // Convert from a byte to an int, but prevent sign extension. | |
196 | // So -16 becomes 240 | |
197 | 0 | int byte1 = data[0 + offset] & 0xFF; |
198 | 0 | int byte2 = (data[1 + offset] & 0xFF) << 8; |
199 | 0 | int byte3 = (data[2 + offset] & 0xFF) << 16; |
200 | 0 | int byte4 = (data[3 + offset] & 0xFF) << 24; |
201 | ||
202 | 0 | return byte4 | byte3 | byte2 | byte1; |
203 | } | |
204 | ||
205 | /** | |
206 | * Encode little endian data from a byte array. This assumes that the number | |
207 | * fits in a Java integer. That is, the range of an unsigned C integer is | |
208 | * greater than a signed Java integer. For a practical limit, 2**31 is way | |
209 | * bigger than any document that we can have. If this ever doesn't work, use | |
210 | * a long for the number. | |
211 | * | |
212 | * @param val | |
213 | * the number to encode into little endian | |
214 | * @param data | |
215 | * the byte[] from which to write 4 bytes | |
216 | * @param offset | |
217 | * the offset into the array | |
218 | */ | |
219 | protected static void encodeLittleEndian32(int val, byte[] data, int offset) { | |
220 | 0 | data[0 + offset] = (byte) (val & 0xFF); |
221 | 0 | data[1 + offset] = (byte) ((val >> 8) & 0xFF); |
222 | 0 | data[2 + offset] = (byte) ((val >> 16) & 0xFF); |
223 | 0 | data[3 + offset] = (byte) ((val >> 24) & 0xFF); |
224 | 0 | } |
225 | ||
226 | /** | |
227 | * Decode little endian data from a byte array | |
228 | * | |
229 | * @param data | |
230 | * the byte[] from which to read 2 bytes | |
231 | * @param offset | |
232 | * the offset into the array | |
233 | * @return The decoded data | |
234 | */ | |
235 | protected static int decodeLittleEndian16(byte[] data, int offset) { | |
236 | // Convert from a byte to an int, but prevent sign extension. | |
237 | // So -16 becomes 240 | |
238 | 0 | int byte1 = data[0 + offset] & 0xFF; |
239 | 0 | int byte2 = (data[1 + offset] & 0xFF) << 8; |
240 | ||
241 | 0 | return byte2 | byte1; |
242 | } | |
243 | ||
244 | /** | |
245 | * Encode a 16-bit little endian from an integer. It is assumed that the | |
246 | * integer's lower 16 bits are the only that are set. | |
247 | * | |
248 | * @param data | |
249 | * the byte[] from which to write 2 bytes | |
250 | * @param offset | |
251 | * the offset into the array | |
252 | */ | |
253 | protected static void encodeLittleEndian16(int val, byte[] data, int offset) { | |
254 | 0 | data[0 + offset] = (byte) (val & 0xFF); |
255 | 0 | data[1 + offset] = (byte) ((val >> 8) & 0xFF); |
256 | 0 | } |
257 | ||
258 | /** | |
259 | * Find a byte of data in an array | |
260 | * | |
261 | * @param data | |
262 | * The array to search | |
263 | * @param sought | |
264 | * The data to search for | |
265 | * @return The index of the found position or -1 if not found | |
266 | */ | |
267 | protected static int findByte(byte[] data, byte sought) { | |
268 | 0 | return findByte(data, 0, sought); |
269 | } | |
270 | ||
271 | /** | |
272 | * Find a byte of data in an array | |
273 | * | |
274 | * @param data | |
275 | * The array to search | |
276 | * @param offset | |
277 | * The position in the array to begin looking | |
278 | * @param sought | |
279 | * The data to search for | |
280 | * @return The index of the found position or -1 if not found | |
281 | */ | |
282 | protected static int findByte(byte[] data, int offset, byte sought) { | |
283 | 0 | for (int i = offset; i < data.length; i++) { |
284 | 0 | if (data[i] == sought) { |
285 | 0 | return i; |
286 | } | |
287 | } | |
288 | ||
289 | 0 | return -1; |
290 | } | |
291 | ||
292 | /** | |
293 | * Transform a byte array into a string given the encoding. If the encoding | |
294 | * is bad then it just does it as a string. | |
295 | * Note: this may modify data. Don't use it to examine data. | |
296 | * | |
297 | * @param key the key | |
298 | * @param data | |
299 | * The byte array to be converted | |
300 | * @param charset | |
301 | * The encoding of the byte array | |
302 | * @return a string that is UTF-8 internally | |
303 | */ | |
304 | public static String decode(String key, byte[] data, String charset) { | |
305 | 0 | return decode(key, data, 0, data.length, charset); |
306 | } | |
307 | ||
308 | /** | |
309 | * Transform a portion of a byte array into a string given the encoding. If | |
310 | * the encoding is bad then it just does it as a string. | |
311 | * Note: this may modify data. Don't use it to examine data. | |
312 | * | |
313 | * @param key the key | |
314 | * @param data | |
315 | * The byte array to be converted | |
316 | * @param length | |
317 | * The number of bytes to use. | |
318 | * @param charset | |
319 | * The encoding of the byte array | |
320 | * @return a string that is UTF-8 internally | |
321 | */ | |
322 | public static String decode(String key, byte[] data, int length, String charset) { | |
323 | 0 | return decode(key, data, 0, length, charset); |
324 | } | |
325 | ||
326 | /** | |
327 | * Transform a portion of a byte array starting at an offset into a string | |
328 | * given the encoding. If the encoding is bad then it just does it as a | |
329 | * string. Note: this may modify data. Don't use it to examine data. | |
330 | * | |
331 | * @param key the key | |
332 | * @param data | |
333 | * The byte array to be converted | |
334 | * @param offset | |
335 | * The starting position in the byte array | |
336 | * @param length | |
337 | * The number of bytes to use. | |
338 | * @param charset | |
339 | * The encoding of the byte array | |
340 | * @return a string that is UTF-8 internally | |
341 | */ | |
342 | public static String decode(String key, byte[] data, int offset, int length, String charset) { | |
343 | 0 | if ("WINDOWS-1252".equals(charset)) { |
344 | 0 | clean1252(key, data, offset, length); |
345 | } | |
346 | 0 | String txt = ""; |
347 | try { | |
348 | 0 | if (offset + length <= data.length) { |
349 | 0 | txt = new String(data, offset, length, charset); |
350 | } | |
351 | 0 | } catch (UnsupportedEncodingException ex) { |
352 | // It is impossible! In case, use system default... | |
353 | 0 | log.error("{}: Encoding {} not supported.", key, charset, ex); |
354 | 0 | txt = new String(data, offset, length); |
355 | 0 | } |
356 | ||
357 | 0 | return txt; |
358 | } | |
359 | ||
360 | /** | |
361 | * Remove rogue characters in the source. These are characters that are not | |
362 | * valid in cp1252 aka WINDOWS-1252 and in UTF-8 or are non-printing control | |
363 | * characters in the range of 0-32. | |
364 | */ | |
365 | private static void clean1252(String key, byte[] data, int offset, int length) { | |
366 | 0 | int end = offset + length; |
367 | // make sure it doesn't go off the end | |
368 | 0 | if (end > data.length) { |
369 | 0 | end = data.length; |
370 | } | |
371 | 0 | for (int i = offset; i < end; i++) { |
372 | // between 0-32 only allow whitespace: \t, \n, \r, ' ' | |
373 | // characters 0x81, 0x8D, 0x8F, 0x90 and 0x9D are undefined in | |
374 | // cp1252 | |
375 | 0 | int c = data[i] & 0xFF; |
376 | 0 | if ((c >= 0x00 && c < 0x20 && c != 0x09 && c != 0x0A && c != 0x0D) || (c == 0x81 || c == 0x8D || c == 0x8F || c == 0x90 || c == 0x9D)) { |
377 | 0 | data[i] = 0x20; |
378 | 0 | log.error("{} has bad character 0x{} at position {} in input.", key, Integer.toString(c, 16), Integer.toString(i)); |
379 | } | |
380 | } | |
381 | 0 | } |
382 | ||
383 | /** | |
384 | * Returns where the book should be located | |
385 | * @param bookMetaData meta information about the book | |
386 | * @return the URI locating the resource | |
387 | * @throws BookException thrown if an issue is encountered, e.g. missing data files. | |
388 | */ | |
389 | public static URI getExpandedDataPath(BookMetaData bookMetaData) throws BookException { | |
390 | 0 | URI loc = NetUtil.lengthenURI(bookMetaData.getLibrary(), bookMetaData.getProperty(SwordBookMetaData.KEY_DATA_PATH)); |
391 | ||
392 | 0 | if (loc == null) { |
393 | // FIXME(DMS): missing parameter | |
394 | 0 | throw new BookException(JSOtherMsg.lookupText("Missing data files for old and new testaments in {0}.")); |
395 | } | |
396 | ||
397 | 0 | return loc; |
398 | } | |
399 | ||
400 | /** | |
401 | * The log stream | |
402 | */ | |
403 | 0 | private static final Logger log = LoggerFactory.getLogger(SwordUtil.class); |
404 | ||
405 | } |