1 | |
|
2 | |
|
3 | |
|
4 | |
|
5 | |
|
6 | |
|
7 | |
|
8 | |
|
9 | |
|
10 | |
|
11 | |
|
12 | |
|
13 | |
|
14 | |
|
15 | |
|
16 | |
|
17 | |
|
18 | |
|
19 | |
|
20 | |
package org.crosswire.jsword.index.lucene; |
21 | |
|
22 | |
import java.io.Closeable; |
23 | |
import java.io.File; |
24 | |
import java.io.IOException; |
25 | |
import java.net.URI; |
26 | |
import java.util.ArrayList; |
27 | |
import java.util.List; |
28 | |
|
29 | |
import org.apache.lucene.analysis.Analyzer; |
30 | |
import org.apache.lucene.document.Document; |
31 | |
import org.apache.lucene.document.Field; |
32 | |
import org.apache.lucene.index.IndexWriter; |
33 | |
import org.apache.lucene.queryParser.ParseException; |
34 | |
import org.apache.lucene.queryParser.QueryParser; |
35 | |
import org.apache.lucene.search.IndexSearcher; |
36 | |
import org.apache.lucene.search.Query; |
37 | |
import org.apache.lucene.search.ScoreDoc; |
38 | |
import org.apache.lucene.search.Searcher; |
39 | |
import org.apache.lucene.search.TopScoreDocCollector; |
40 | |
import org.apache.lucene.store.Directory; |
41 | |
import org.apache.lucene.store.FSDirectory; |
42 | |
import org.apache.lucene.util.Version; |
43 | |
import org.crosswire.common.progress.JobManager; |
44 | |
import org.crosswire.common.progress.Progress; |
45 | |
import org.crosswire.common.util.FileUtil; |
46 | |
import org.crosswire.common.util.IOUtil; |
47 | |
import org.crosswire.common.util.NetUtil; |
48 | |
import org.crosswire.common.util.Reporter; |
49 | |
import org.crosswire.jsword.JSMsg; |
50 | |
import org.crosswire.jsword.book.Book; |
51 | |
import org.crosswire.jsword.book.BookData; |
52 | |
import org.crosswire.jsword.book.BookException; |
53 | |
import org.crosswire.jsword.book.FeatureType; |
54 | |
import org.crosswire.jsword.book.OSISUtil; |
55 | |
import org.crosswire.jsword.index.AbstractIndex; |
56 | |
import org.crosswire.jsword.index.IndexPolicy; |
57 | |
import org.crosswire.jsword.index.IndexStatus; |
58 | |
import org.crosswire.jsword.index.lucene.analysis.LuceneAnalyzer; |
59 | |
import org.crosswire.jsword.index.search.SearchModifier; |
60 | |
import org.crosswire.jsword.passage.AbstractPassage; |
61 | |
import org.crosswire.jsword.passage.Key; |
62 | |
import org.crosswire.jsword.passage.NoSuchKeyException; |
63 | |
import org.crosswire.jsword.passage.NoSuchVerseException; |
64 | |
import org.crosswire.jsword.passage.PassageTally; |
65 | |
import org.crosswire.jsword.passage.Verse; |
66 | |
import org.crosswire.jsword.passage.VerseFactory; |
67 | |
import org.crosswire.jsword.versification.Versification; |
68 | |
import org.crosswire.jsword.versification.system.Versifications; |
69 | |
import org.jdom2.Element; |
70 | |
import org.slf4j.Logger; |
71 | |
import org.slf4j.LoggerFactory; |
72 | |
|
73 | |
|
74 | |
|
75 | |
|
76 | |
|
77 | |
|
78 | |
|
79 | |
public class LuceneIndex extends AbstractIndex implements Closeable { |
80 | |
|
81 | |
|
82 | |
|
83 | |
|
84 | |
|
85 | |
|
86 | |
|
87 | |
public static final String FIELD_KEY = "key"; |
88 | |
|
89 | |
|
90 | |
|
91 | |
|
92 | |
public static final String FIELD_BODY = "content"; |
93 | |
|
94 | |
|
95 | |
|
96 | |
|
97 | |
public static final String FIELD_STRONG = "strong"; |
98 | |
|
99 | |
|
100 | |
|
101 | |
|
102 | |
public static final String FIELD_HEADING = "heading"; |
103 | |
|
104 | |
|
105 | |
|
106 | |
|
107 | |
public static final String FIELD_XREF = "xref"; |
108 | |
|
109 | |
|
110 | |
|
111 | |
|
112 | |
public static final String FIELD_NOTE = "note"; |
113 | |
|
114 | |
|
115 | |
|
116 | |
|
117 | |
public static final String FIELD_MORPHOLOGY = "morph"; |
118 | |
|
119 | |
|
120 | |
|
121 | |
|
122 | |
public static final String FIELD_INTRO = "intro"; |
123 | |
|
124 | |
|
125 | |
|
126 | |
|
127 | |
|
128 | |
private static final int WORK_ESTIMATE = 98; |
129 | |
|
130 | |
|
131 | |
|
132 | |
|
133 | |
|
134 | |
|
135 | |
|
136 | |
|
137 | |
|
/**
 * Binds this index to a book and a storage location, then opens a Lucene
 * directory and searcher on that location.
 *
 * @param book the book this index belongs to
 * @param storage the location of the index files
 * @throws BookException if the storage location cannot be turned into a canonical file path
 */
public LuceneIndex(Book book, URI storage) throws BookException {
    this.book = book;

    try {
        File indexDir = NetUtil.getAsFile(storage);
        this.path = indexDir.getCanonicalPath();
    } catch (IOException ioe) {
        throw new BookException(JSMsg.gettext("Failed to initialize Lucene search engine."), ioe);
    }

    // A failure to open is logged, not thrown, by initDirectoryAndSearcher().
    initDirectoryAndSearcher();
}
149 | |
|
150 | |
|
151 | |
|
152 | |
|
153 | |
|
154 | |
|
155 | |
|
156 | |
|
157 | |
|
158 | |
|
159 | 0 | public LuceneIndex(Book book, URI storage, IndexPolicy policy) throws BookException { |
160 | |
|
161 | 0 | this.book = book; |
162 | 0 | File finalPath = null; |
163 | |
try { |
164 | 0 | finalPath = NetUtil.getAsFile(storage); |
165 | 0 | this.path = finalPath.getCanonicalPath(); |
166 | 0 | } catch (IOException ex) { |
167 | |
|
168 | 0 | throw new BookException(JSMsg.gettext("Failed to initialize Lucene search engine."), ex); |
169 | 0 | } |
170 | |
|
171 | |
|
172 | 0 | String jobName = JSMsg.gettext("Creating index. Processing {0}", book.getInitials()); |
173 | 0 | Progress job = JobManager.createJob(String.format(Progress.CREATE_INDEX, book.getInitials()), jobName, Thread.currentThread()); |
174 | 0 | job.beginJob(jobName); |
175 | |
|
176 | 0 | IndexStatus finalStatus = IndexStatus.UNDONE; |
177 | |
|
178 | 0 | List<Key> errors = new ArrayList<Key>(); |
179 | |
|
180 | 0 | File tempPath = new File(path + '.' + IndexStatus.CREATING.toString()); |
181 | |
|
182 | |
|
183 | |
|
184 | 0 | if (tempPath.exists()) { |
185 | 0 | FileUtil.delete(tempPath); |
186 | |
} |
187 | |
|
188 | |
try { |
189 | |
|
190 | 0 | Analyzer analyzer = new LuceneAnalyzer(book); |
191 | |
|
192 | |
|
193 | |
|
194 | 0 | Object mutex = policy.isSerial() ? CREATING : book.getBookMetaData(); |
195 | 0 | synchronized (mutex) { |
196 | |
|
197 | 0 | book.setIndexStatus(IndexStatus.CREATING); |
198 | |
|
199 | 0 | IndexWriter writer = null; |
200 | |
try { |
201 | |
|
202 | 0 | final Directory destination = FSDirectory.open(new File(tempPath.getCanonicalPath())); |
203 | 0 | writer = new IndexWriter(destination, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED); |
204 | 0 | writer.setRAMBufferSizeMB(policy.getRAMBufferSize()); |
205 | |
|
206 | 0 | generateSearchIndexImpl(job, errors, writer, book.getGlobalKeyList(), 0, policy); |
207 | |
|
208 | |
} finally { |
209 | 0 | if (writer != null) { |
210 | 0 | writer.close(); |
211 | |
} |
212 | |
} |
213 | |
|
214 | 0 | job.setCancelable(false); |
215 | 0 | if (!job.isFinished()) { |
216 | 0 | if (!tempPath.renameTo(finalPath)) { |
217 | |
|
218 | 0 | throw new BookException(JSMsg.gettext("Installation failed.")); |
219 | |
} |
220 | |
} |
221 | |
|
222 | 0 | if (finalPath.exists()) { |
223 | 0 | finalStatus = IndexStatus.DONE; |
224 | |
} |
225 | |
|
226 | 0 | if (!errors.isEmpty()) { |
227 | 0 | StringBuilder buf = new StringBuilder(); |
228 | 0 | for (Key error : errors) { |
229 | 0 | buf.append(error); |
230 | 0 | buf.append('\n'); |
231 | |
} |
232 | |
|
233 | |
|
234 | 0 | Reporter.informUser(this, JSMsg.gettext("The following verses have errors and could not be indexed\n{0}", buf)); |
235 | |
} |
236 | 0 | initDirectoryAndSearcher(); |
237 | 0 | } |
238 | 0 | } catch (IOException ex) { |
239 | 0 | job.cancel(); |
240 | |
|
241 | 0 | throw new BookException(JSMsg.gettext("Failed to initialize Lucene search engine."), ex); |
242 | |
} finally { |
243 | 0 | book.setIndexStatus(finalStatus); |
244 | 0 | job.done(); |
245 | |
|
246 | 0 | if (tempPath.exists()) { |
247 | 0 | FileUtil.delete(tempPath); |
248 | |
} |
249 | |
} |
250 | 0 | } |
251 | |
|
252 | |
|
253 | |
|
254 | |
|
255 | |
private void initDirectoryAndSearcher() { |
256 | |
try { |
257 | 0 | directory = FSDirectory.open(new File(path)); |
258 | 0 | searcher = new IndexSearcher(directory, true); |
259 | 0 | } catch (IOException ex) { |
260 | 0 | log.warn("second load failure", ex); |
261 | 0 | } |
262 | 0 | } |
263 | |
|
264 | |
|
265 | |
|
266 | |
|
267 | |
public Key find(String search) throws BookException { |
268 | 0 | String v11nName = book.getBookMetaData().getProperty("Versification").toString(); |
269 | 0 | Versification v11n = Versifications.instance().getVersification(v11nName); |
270 | |
|
271 | 0 | SearchModifier modifier = getSearchModifier(); |
272 | 0 | Key results = null; |
273 | |
|
274 | 0 | if (search != null) { |
275 | 0 | Throwable theCause = null; |
276 | |
try { |
277 | 0 | Analyzer analyzer = new LuceneAnalyzer(book); |
278 | |
|
279 | 0 | QueryParser parser = new QueryParser(Version.LUCENE_29, LuceneIndex.FIELD_BODY, analyzer); |
280 | 0 | parser.setAllowLeadingWildcard(true); |
281 | 0 | Query query = parser.parse(search); |
282 | 0 | log.info("ParsedQuery- {}", query.toString()); |
283 | |
|
284 | |
|
285 | 0 | if (modifier != null && modifier.isRanked()) { |
286 | 0 | PassageTally tally = new PassageTally(v11n); |
287 | 0 | tally.raiseEventSuppresion(); |
288 | 0 | tally.raiseNormalizeProtection(); |
289 | 0 | results = tally; |
290 | |
|
291 | 0 | TopScoreDocCollector collector = TopScoreDocCollector.create(modifier.getMaxResults(), false); |
292 | 0 | searcher.search(query, collector); |
293 | 0 | tally.setTotal(collector.getTotalHits()); |
294 | 0 | ScoreDoc[] hits = collector.topDocs().scoreDocs; |
295 | 0 | for (int i = 0; i < hits.length; i++) { |
296 | 0 | int docId = hits[i].doc; |
297 | 0 | Document doc = searcher.doc(docId); |
298 | 0 | Key key = VerseFactory.fromString(v11n, doc.get(LuceneIndex.FIELD_KEY)); |
299 | |
|
300 | |
|
301 | 0 | int score = (int) (hits[i].score * 100 + 1); |
302 | 0 | tally.add(key, score); |
303 | |
} |
304 | 0 | tally.lowerNormalizeProtection(); |
305 | 0 | tally.lowerEventSuppressionAndTest(); |
306 | 0 | } else { |
307 | 0 | results = book.createEmptyKeyList(); |
308 | |
|
309 | |
|
310 | 0 | AbstractPassage passage = null; |
311 | 0 | if (results instanceof AbstractPassage) { |
312 | 0 | passage = (AbstractPassage) results; |
313 | 0 | passage.raiseEventSuppresion(); |
314 | 0 | passage.raiseNormalizeProtection(); |
315 | |
} |
316 | 0 | searcher.search(query, new VerseCollector(v11n, searcher, results)); |
317 | 0 | if (passage != null) { |
318 | 0 | passage.lowerNormalizeProtection(); |
319 | 0 | passage.lowerEventSuppressionAndTest(); |
320 | |
} |
321 | |
} |
322 | 0 | } catch (IOException e) { |
323 | |
|
324 | 0 | Throwable cause = e.getCause(); |
325 | 0 | theCause = cause instanceof NoSuchVerseException ? cause : e; |
326 | 0 | } catch (NoSuchVerseException e) { |
327 | 0 | theCause = e; |
328 | 0 | } catch (ParseException e) { |
329 | 0 | theCause = e; |
330 | 0 | } |
331 | |
|
332 | 0 | if (theCause != null) { |
333 | |
|
334 | 0 | throw new BookException(JSMsg.gettext("Search failed."), theCause); |
335 | |
} |
336 | |
} |
337 | |
|
338 | 0 | if (results == null) { |
339 | 0 | if (modifier != null && modifier.isRanked()) { |
340 | 0 | results = new PassageTally(v11n); |
341 | |
} else { |
342 | 0 | results = book.createEmptyKeyList(); |
343 | |
} |
344 | |
} |
345 | 0 | return results; |
346 | |
} |
347 | |
|
348 | |
|
349 | |
|
350 | |
|
351 | |
public Key getKey(String name) throws NoSuchKeyException { |
352 | 0 | return book.getKey(name); |
353 | |
} |
354 | |
|
355 | |
|
356 | |
|
357 | |
|
358 | |
public final void close() { |
359 | 0 | IOUtil.close(searcher); |
360 | 0 | searcher = null; |
361 | 0 | IOUtil.close(directory); |
362 | 0 | directory = null; |
363 | 0 | } |
364 | |
|
365 | |
|
366 | |
|
367 | |
|
368 | |
|
369 | |
private void generateSearchIndexImpl(Progress job, List<Key> errors, IndexWriter writer, Key key, int count, IndexPolicy policy) throws BookException, IOException { |
370 | 0 | String v11nName = null; |
371 | 0 | if (book.getBookMetaData().getProperty("Versification") != null) { |
372 | 0 | v11nName = book.getBookMetaData().getProperty("Versification").toString(); |
373 | |
} |
374 | 0 | Versification v11n = Versifications.instance().getVersification(v11nName); |
375 | 0 | boolean includeStrongs = book.getBookMetaData().hasFeature(FeatureType.STRONGS_NUMBERS) && policy.isStrongsIndexed(); |
376 | 0 | boolean includeXrefs = book.getBookMetaData().hasFeature(FeatureType.SCRIPTURE_REFERENCES) && policy.isXrefIndexed(); |
377 | 0 | boolean includeNotes = book.getBookMetaData().hasFeature(FeatureType.FOOTNOTES) && policy.isNoteIndexed(); |
378 | 0 | boolean includeHeadings = book.getBookMetaData().hasFeature(FeatureType.HEADINGS) && policy.isTitleIndexed(); |
379 | 0 | boolean includeMorphology = book.getBookMetaData().hasFeature(FeatureType.MORPHOLOGY) && policy.isMorphIndexed(); |
380 | |
|
381 | 0 | String oldRootName = ""; |
382 | 0 | int percent = 0; |
383 | 0 | String rootName = ""; |
384 | 0 | BookData data = null; |
385 | 0 | Element osis = null; |
386 | |
|
387 | |
|
388 | 0 | Document doc = new Document(); |
389 | 0 | Field keyField = new Field(FIELD_KEY, "", Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.NO); |
390 | 0 | Field bodyField = new Field(FIELD_BODY, "", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.NO); |
391 | 0 | Field introField = new Field(FIELD_INTRO, "", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.NO); |
392 | 0 | Field strongField = new Field(FIELD_STRONG, "", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES); |
393 | 0 | Field xrefField = new Field(FIELD_XREF, "", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.NO); |
394 | 0 | Field noteField = new Field(FIELD_NOTE, "", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.NO); |
395 | 0 | Field headingField = new Field(FIELD_HEADING, "", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.NO); |
396 | 0 | Field morphologyField = new Field(FIELD_MORPHOLOGY , "", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.NO); |
397 | |
|
398 | 0 | int size = key.getCardinality(); |
399 | 0 | int subCount = count; |
400 | 0 | log.debug("Number of keys = {}", Integer.toString(size)); |
401 | 0 | for (Key subkey : key) { |
402 | |
|
403 | |
|
404 | |
|
405 | 0 | if (subkey.canHaveChildren()) { |
406 | 0 | generateSearchIndexImpl(job, errors, writer, subkey, subCount, policy); |
407 | 0 | continue; |
408 | |
} |
409 | |
|
410 | 0 | data = new BookData(book, subkey); |
411 | 0 | osis = null; |
412 | |
|
413 | |
try { |
414 | 0 | osis = data.getOsisFragment(false); |
415 | 0 | } catch (BookException e) { |
416 | 0 | errors.add(subkey); |
417 | 0 | continue; |
418 | 0 | } |
419 | |
|
420 | |
|
421 | 0 | doc.getFields().clear(); |
422 | |
|
423 | |
|
424 | |
|
425 | 0 | keyField.setValue(subkey.getOsisRef()); |
426 | 0 | doc.add(keyField); |
427 | |
|
428 | 0 | if (subkey instanceof Verse && ((Verse) subkey).getVerse() == 0) { |
429 | 0 | addField(doc, introField, OSISUtil.getCanonicalText(osis)); |
430 | |
} else { |
431 | 0 | addField(doc, bodyField, OSISUtil.getCanonicalText(osis)); |
432 | |
} |
433 | |
|
434 | 0 | if (includeStrongs) { |
435 | 0 | addField(doc, strongField, OSISUtil.getStrongsNumbers(osis)); |
436 | |
} |
437 | |
|
438 | 0 | if (includeXrefs) { |
439 | |
|
440 | 0 | addField(doc, xrefField, OSISUtil.getReferences(this.book, subkey, v11n, osis)); |
441 | |
} |
442 | |
|
443 | 0 | if (includeNotes) { |
444 | 0 | addField(doc, noteField, OSISUtil.getNotes(osis)); |
445 | |
} |
446 | |
|
447 | 0 | if (includeHeadings) { |
448 | 0 | String heading = OSISUtil.getHeadings(osis); |
449 | 0 | addField(doc, headingField, heading); |
450 | |
} |
451 | |
|
452 | 0 | if (includeMorphology) { |
453 | 0 | addField(doc, morphologyField, OSISUtil.getMorphologiesWithStrong(osis)); |
454 | |
} |
455 | |
|
456 | |
|
457 | 0 | if (doc.getFields().size() > 1) { |
458 | 0 | writer.addDocument(doc); |
459 | |
} |
460 | |
|
461 | |
|
462 | 0 | rootName = subkey.getRootName(); |
463 | 0 | if (!rootName.equals(oldRootName)) { |
464 | 0 | oldRootName = rootName; |
465 | |
|
466 | |
|
467 | 0 | job.setSectionName(rootName); |
468 | |
} |
469 | |
|
470 | 0 | subCount++; |
471 | 0 | int oldPercent = percent; |
472 | 0 | percent = WORK_ESTIMATE * subCount / size; |
473 | |
|
474 | |
|
475 | 0 | if (oldPercent != percent) { |
476 | 0 | job.setWork(percent); |
477 | |
} |
478 | |
|
479 | |
|
480 | 0 | Thread.yield(); |
481 | 0 | if (Thread.currentThread().isInterrupted()) { |
482 | 0 | break; |
483 | |
} |
484 | 0 | } |
485 | 0 | } |
486 | |
|
487 | |
|
488 | |
|
489 | |
|
490 | |
|
491 | |
|
492 | |
|
493 | |
|
494 | |
|
495 | |
private void addField(Document doc, Field field, String text) { |
496 | 0 | if (text != null && text.length() > 0) { |
497 | 0 | field.setValue(text); |
498 | 0 | doc.add(field); |
499 | |
} |
500 | 0 | } |
501 | |
|
502 | |
|
503 | |
|
504 | |
|
505 | |
|
506 | |
|
507 | |
|
508 | |
|
509 | |
|
510 | |
public Searcher getSearcher() { |
511 | 0 | return searcher; |
512 | |
} |
513 | |
|
514 | |
|
515 | |
|
516 | |
|
517 | |
private Book book; |
518 | |
|
519 | |
|
520 | |
|
521 | |
|
522 | |
private String path; |
523 | |
|
524 | |
|
525 | |
|
526 | |
|
527 | |
private Directory directory; |
528 | |
|
529 | |
|
530 | |
|
531 | |
|
532 | |
private Searcher searcher; |
533 | |
|
534 | |
|
535 | |
|
536 | |
|
537 | |
|
538 | 0 | private static final Object CREATING = new Object(); |
539 | |
|
540 | |
|
541 | |
|
542 | |
|
543 | 0 | private static final Logger log = LoggerFactory.getLogger(LuceneIndex.class); |
544 | |
} |