Coverage Report - org.crosswire.jsword.index.lucene.analysis.ConfigurableSnowballAnalyzer
 
Classes in this File Line Coverage Branch Coverage Complexity
ConfigurableSnowballAnalyzer
0%
0/50
0%
0/22
3.4
 
 1  
 /**
 2  
  * Distribution License:
 3  
  * JSword is free software; you can redistribute it and/or modify it under
 4  
  * the terms of the GNU Lesser General Public License, version 2.1 or later
 5  
  * as published by the Free Software Foundation. This program is distributed
 6  
  * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
 7  
  * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 8  
  * See the GNU Lesser General Public License for more details.
 9  
  *
 10  
  * The License is available on the internet at:
 11  
  *      http://www.gnu.org/copyleft/lgpl.html
 12  
  * or by writing to:
 13  
  *      Free Software Foundation, Inc.
 14  
  *      59 Temple Place - Suite 330
 15  
  *      Boston, MA 02111-1307, USA
 16  
  *
 17  
  * © CrossWire Bible Society, 2007 - 2016
 18  
  *
 19  
  */
 20  
 package org.crosswire.jsword.index.lucene.analysis;
 21  
 
 22  
 import java.io.IOException;
 23  
 import java.io.Reader;
 24  
 import java.util.HashMap;
 25  
 import java.util.Map;
 26  
 import java.util.Set;
 27  
 
 28  
 import org.apache.lucene.analysis.LowerCaseTokenizer;
 29  
 import org.apache.lucene.analysis.StopAnalyzer;
 30  
 import org.apache.lucene.analysis.StopFilter;
 31  
 import org.apache.lucene.analysis.TokenStream;
 32  
 import org.apache.lucene.analysis.de.GermanAnalyzer;
 33  
 import org.apache.lucene.analysis.fr.FrenchAnalyzer;
 34  
 import org.apache.lucene.analysis.nl.DutchAnalyzer;
 35  
 import org.apache.lucene.analysis.snowball.SnowballFilter;
 36  
 import org.apache.lucene.util.Version;
 37  
 import org.crosswire.jsword.book.Book;
 38  
 
 39  
 /**
 40  
  * An Analyzer whose {@link TokenStream} is built from a
 41  
  * {@link LowerCaseTokenizer} filtered with {@link SnowballFilter} (optional)
 42  
  * and {@link StopFilter} (optional) Default behavior: Stemming is done, Stop
 43  
  * words not removed A snowball stemmer is configured according to the language
 44  
  * of the Book. Currently it takes following stemmer names (available stemmers
 45  
  * in lucene snowball package net.sf.snowball.ext)
 46  
  * 
 47  
  * <pre>
 48  
  *     Danish
 49  
  *     Dutch
 50  
  *     English
 51  
  *     Finnish
 52  
  *     French
 53  
  *     German2
 54  
  *     German
 55  
  *     Italian
 56  
  *     Kp
 57  
  *     Lovins
 58  
  *     Norwegian
 59  
  *     Porter
 60  
  *     Portuguese
 61  
  *     Russian
 62  
  *     Spanish
 63  
  *     Swedish
 64  
  * </pre>
 65  
  * 
 66  
  * This list is expected to expand, as and when Snowball project support more
 67  
  * languages
 68  
  * 
 69  
  * @see gnu.lgpl.License The GNU Lesser General Public License for details.
 70  
  * @author sijo cherian
 71  
  */
 72  
 public class ConfigurableSnowballAnalyzer extends AbstractBookAnalyzer {
 73  0
     public ConfigurableSnowballAnalyzer() {
 74  0
     }
 75  
 
 76  
     /**
 77  
      * Filters {@link LowerCaseTokenizer} with {@link StopFilter} if enabled and
 78  
      * {@link SnowballFilter}.
 79  
      */
 80  
     @Override
 81  
     public final TokenStream tokenStream(String fieldName, Reader reader) {
 82  0
         TokenStream result = new LowerCaseTokenizer(reader);
 83  0
         if (doStopWords && stopSet != null) {
 84  0
             result = new StopFilter(false, result, stopSet);
 85  
         }
 86  
 
 87  
         // Configure Snowball filter based on language/stemmerName
 88  0
         if (doStemming) {
 89  0
             result = new SnowballFilter(result, stemmerName);
 90  
         }
 91  
 
 92  0
         return result;
 93  
     }
 94  
 
 95  
     /* (non-Javadoc)
 96  
      * @see org.apache.lucene.analysis.Analyzer#reusableTokenStream(java.lang.String, java.io.Reader)
 97  
      */
 98  
     @Override
 99  
     public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
 100  0
         SavedStreams streams = (SavedStreams) getPreviousTokenStream();
 101  0
         if (streams == null) {
 102  0
             streams = new SavedStreams(new LowerCaseTokenizer(reader));
 103  0
             if (doStopWords && stopSet != null) {
 104  0
                 streams.setResult(new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion), streams.getResult(), stopSet));
 105  
             }
 106  
 
 107  0
             if (doStemming) {
 108  0
                 streams.setResult(new SnowballFilter(streams.getResult(), stemmerName));
 109  
             }
 110  
 
 111  0
             setPreviousTokenStream(streams);
 112  
         } else {
 113  0
             streams.getSource().reset(reader);
 114  
         }
 115  0
         return streams.getResult();
 116  
     }
 117  
 
 118  
     @Override
 119  
     public void setBook(Book newBook) {
 120  0
         book = newBook;
 121  0
         stemmerName = null;
 122  0
         if (book != null) {
 123  
             // stemmer name are same as language name, in most cases
 124  0
             pickStemmer(book.getLanguage().getCode());
 125  
         }
 126  0
     }
 127  
 
 128  
     /**
 129  
      * Given the name of a stemmer, use that one.
 130  
      * 
 131  
      * @param languageCode
 132  
      */
 133  
     public void pickStemmer(String languageCode) {
 134  0
         if (languageCode != null) {
 135  
             // Check for allowed stemmers
 136  0
             if (languageCodeToStemmerLanguageNameMap.containsKey(languageCode)) {
 137  0
                 stemmerName = languageCodeToStemmerLanguageNameMap.get(languageCode);
 138  
             } else {
 139  0
                 throw new IllegalArgumentException("SnowballAnalyzer configured for unavailable stemmer " + stemmerName);
 140  
             }
 141  
 
 142  
             // Initialize the default stop words
 143  0
             if (defaultStopWordMap.containsKey(languageCode)) {
 144  0
                 stopSet = defaultStopWordMap.get(languageCode);
 145  
             }
 146  
         }
 147  0
     }
 148  
 
 149  
     /**
 150  
      * The name of the stemmer to use.
 151  
      */
 152  
     private String stemmerName;
 153  
 
 154  0
     private static Map<String, String> languageCodeToStemmerLanguageNameMap = new HashMap<String, String>();
 155  
     static {
 156  0
         languageCodeToStemmerLanguageNameMap.put("da", "Danish");
 157  0
         languageCodeToStemmerLanguageNameMap.put("nl", "Dutch");
 158  0
         languageCodeToStemmerLanguageNameMap.put("en", "English");
 159  0
         languageCodeToStemmerLanguageNameMap.put("fi", "Finnish");
 160  0
         languageCodeToStemmerLanguageNameMap.put("fr", "French");
 161  0
         languageCodeToStemmerLanguageNameMap.put("de", "German");
 162  0
         languageCodeToStemmerLanguageNameMap.put("it", "Italian");
 163  0
         languageCodeToStemmerLanguageNameMap.put("no", "Norwegian");
 164  0
         languageCodeToStemmerLanguageNameMap.put("pt", "Portuguese");
 165  0
         languageCodeToStemmerLanguageNameMap.put("ru", "Russian");
 166  0
         languageCodeToStemmerLanguageNameMap.put("es", "Spanish");
 167  0
         languageCodeToStemmerLanguageNameMap.put("sv", "Swedish");
 168  
     }
 169  
 
 170  
     // Maps StemmerName > String array of standard stop words
 171  0
     private static HashMap<String, Set<?>> defaultStopWordMap = new HashMap<String, Set<?>>();
 172  
     static {
 173  0
         defaultStopWordMap.put("fr", FrenchAnalyzer.getDefaultStopSet());
 174  0
         defaultStopWordMap.put("de", GermanAnalyzer.getDefaultStopSet());
 175  0
         defaultStopWordMap.put("nl", DutchAnalyzer.getDefaultStopSet());
 176  0
         defaultStopWordMap.put("en", StopAnalyzer.ENGLISH_STOP_WORDS_SET);
 177  0
     }
 178  
 
 179  0
     private final Version matchVersion = Version.LUCENE_29;
 180  
 }