| Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
| PersianLuceneAnalyzer |
|
| 2.6666666666666665;2.667 |
| 1 | /** | |
| 2 | * Distribution License: | |
| 3 | * JSword is free software; you can redistribute it and/or modify it under | |
| 4 | * the terms of the GNU Lesser General Public License, version 2.1 or later | |
| 5 | * as published by the Free Software Foundation. This program is distributed | |
| 6 | * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even | |
| 7 | * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | |
| 8 | * See the GNU Lesser General Public License for more details. | |
| 9 | * | |
| 10 | * The License is available on the internet at: | |
| 11 | * http://www.gnu.org/copyleft/lgpl.html | |
| 12 | * or by writing to: | |
| 13 | * Free Software Foundation, Inc. | |
| 14 | * 59 Temple Place - Suite 330 | |
| 15 | * Boston, MA 02111-1307, USA | |
| 16 | * | |
| 17 | * © CrossWire Bible Society, 2009 - 2016 | |
| 18 | * | |
| 19 | */ | |
| 20 | package org.crosswire.jsword.index.lucene.analysis; | |
| 21 | ||
| 22 | import java.io.IOException; | |
| 23 | import java.io.Reader; | |
| 24 | ||
| 25 | import org.apache.lucene.analysis.LowerCaseFilter; | |
| 26 | import org.apache.lucene.analysis.StopFilter; | |
| 27 | import org.apache.lucene.analysis.TokenStream; | |
| 28 | import org.apache.lucene.analysis.ar.ArabicLetterTokenizer; | |
| 29 | import org.apache.lucene.analysis.ar.ArabicNormalizationFilter; | |
| 30 | import org.apache.lucene.analysis.fa.PersianAnalyzer; | |
| 31 | import org.apache.lucene.analysis.fa.PersianNormalizationFilter; | |
| 32 | import org.apache.lucene.util.Version; | |
| 33 | ||
| 34 | /** | |
| 35 | * An Analyzer whose {@link TokenStream} is built from a | |
| 36 | * {@link ArabicLetterTokenizer} filtered with {@link LowerCaseFilter}, | |
| 37 | * {@link ArabicNormalizationFilter}, {@link PersianNormalizationFilter} and | |
| 38 | * Persian {@link StopFilter} (optional) | |
| 39 | * | |
| 40 | * @see gnu.lgpl.License The GNU Lesser General Public License for details. | |
| 41 | * @author DM Smith | |
| 42 | */ | |
| 43 | public class PersianLuceneAnalyzer extends AbstractBookAnalyzer { | |
| 44 | 0 | public PersianLuceneAnalyzer() { |
| 45 | 0 | stopSet = PersianAnalyzer.getDefaultStopSet(); |
| 46 | 0 | } |
| 47 | ||
| 48 | /* | |
| 49 | * (non-Javadoc) | |
| 50 | * | |
| 51 | * @see org.apache.lucene.analysis.Analyzer#tokenStream(java.lang.String, | |
| 52 | * java.io.Reader) | |
| 53 | */ | |
| 54 | @Override | |
| 55 | public final TokenStream tokenStream(String fieldName, Reader reader) { | |
| 56 | 0 | TokenStream result = new ArabicLetterTokenizer(reader); |
| 57 | 0 | result = new LowerCaseFilter(result); |
| 58 | 0 | result = new ArabicNormalizationFilter(result); |
| 59 | /* additional persian-specific normalization */ | |
| 60 | 0 | result = new PersianNormalizationFilter(result); |
| 61 | /* | |
| 62 | * the order here is important: the stop set is normalized with the | |
| 63 | * above! | |
| 64 | */ | |
| 65 | 0 | if (doStopWords && stopSet != null) { |
| 66 | 0 | result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion), result, stopSet); |
| 67 | } | |
| 68 | ||
| 69 | 0 | return result; |
| 70 | } | |
| 71 | ||
| 72 | /** | |
| 73 | * Returns a (possibly reused) {@link TokenStream} which tokenizes all the | |
| 74 | * text in the provided {@link Reader}. | |
| 75 | * | |
| 76 | * @return A {@link TokenStream} built from a {@link ArabicLetterTokenizer} | |
| 77 | * filtered with {@link LowerCaseFilter}, | |
| 78 | * {@link ArabicNormalizationFilter}, | |
| 79 | * {@link PersianNormalizationFilter} and Persian Stop words | |
| 80 | */ | |
| 81 | @Override | |
| 82 | public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { | |
| 83 | 0 | SavedStreams streams = (SavedStreams) getPreviousTokenStream(); |
| 84 | 0 | if (streams == null) { |
| 85 | 0 | streams = new SavedStreams(new ArabicLetterTokenizer(reader)); |
| 86 | 0 | streams.setResult(new LowerCaseFilter(streams.getResult())); |
| 87 | 0 | streams.setResult(new ArabicNormalizationFilter(streams.getResult())); |
| 88 | /* additional persian-specific normalization */ | |
| 89 | 0 | streams.setResult(new PersianNormalizationFilter(streams.getResult())); |
| 90 | /* | |
| 91 | * the order here is important: the stop set is normalized with the | |
| 92 | * above! | |
| 93 | */ | |
| 94 | 0 | if (doStopWords && stopSet != null) { |
| 95 | 0 | streams.setResult(new StopFilter(false, streams.getResult(), stopSet)); |
| 96 | } | |
| 97 | 0 | setPreviousTokenStream(streams); |
| 98 | } else { | |
| 99 | 0 | streams.getSource().reset(reader); |
| 100 | } | |
| 101 | 0 | return streams.getResult(); |
| 102 | } | |
| 103 | 0 | private final Version matchVersion = Version.LUCENE_29; |
| 104 | } |