ANN: acts_as_ferret

It’s so cool!
I am just looking for a CJK solution.
Here is the “JavaCC code for the Nutch lexical analyzer.”
It is included in the Nutch source code, so could anyone port it to Ferret?

/**
 * Copyright 2005 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/** JavaCC code for the Nutch lexical analyzer. */

// Generator options for this grammar.
options {
// The parser is used as an instance: parseQuery() creates a new NutchAnalysis per call.
STATIC = false;
// Input is supplied through a character-stream object (FastCharStream in this file).
USER_CHAR_STREAM = true;
OPTIMIZE_TOKEN_MANAGER = true;
// The token definitions in this grammar use \u ranges well beyond ASCII.
UNICODE_INPUT = true;
// Left disabled; presumably enables token-manager tracing when uncommented -- confirm.
//DEBUG_TOKEN_MANAGER = true;
}

PARSER_BEGIN(NutchAnalysis)

package org.apache.nutch.analysis;

import org.apache.nutch.searcher.Query;
import org.apache.nutch.searcher.QueryFilters;
import org.apache.nutch.searcher.Query.Clause;

import org.apache.lucene.analysis.StopFilter;

import java.io.;
import java.util.
;

/** The JavaCC-generated Nutch lexical analyzer and query parser. */
public class NutchAnalysis {

private static final String[] STOP_WORDS = {
“a”, “and”, “are”, “as”, “at”, “be”, “but”, “by”,
“for”, “if”, “in”, “into”, “is”, “it”,
“no”, “not”, “of”, “on”, “or”, “s”, “such”,
“t”, “that”, “the”, “their”, “then”, “there”, “these”,
“they”, “this”, “to”, “was”, “will”, “with”
};

private static final Set STOP_SET =
StopFilter.makeStopSet(STOP_WORDS);

private String queryString;

/** True iff word is a stop word. Stop words are only removed from
queries.

  • Every word is indexed. */
    public static boolean isStopWord(String word) {
    return STOP_SET.contains(word);
    }

/** Construct a query parser for the text in a reader. */
public static Query parseQuery(String queryString) throws IOException
{
NutchAnalysis parser =
new NutchAnalysis(new FastCharStream(new
StringReader(queryString)));
parser.queryString = queryString;
return parser.parse();
}

/** For debugging. */
public static void main(String[] args) throws Exception {
BufferedReader in = new BufferedReader(new
InputStreamReader(System.in));
while (true) {
System.out.print("Query: ");
String line = in.readLine();
System.out.println(parseQuery(line));
}
}

}

PARSER_END(NutchAnalysis)

// Extra member declarations for the token manager class (NutchAnalysisTokenManager).
TOKEN_MGR_DECLS : {

/** Constructs a token manager for the provided Reader by wrapping it
 * in a FastCharStream and delegating to the stream-based constructor. */
public NutchAnalysisTokenManager(Reader reader) {
this(new FastCharStream(reader));
}

}

TOKEN : {                                         // token regular expressions

  // basic word -- lowercase it
  <WORD: ((<LETTER>|<DIGIT>|<WORD_PUNCT>)+ | <IRREGULAR_WORD>)>
  { matchedToken.image = matchedToken.image.toLowerCase(); }

  // special handling for acronyms: U.S.A., I.B.M., etc: dots are removed
| <ACRONYM: <LETTER> "." (<LETTER> ".")+ >
  {                                               // remove dots
    for (int i = 0; i < image.length(); i++) {
      if (image.charAt(i) == '.') {
        // Deleting shifts the next char into slot i, so step back one
        // position to examine it on the next iteration.
        image.deleteCharAt(i--);
      }
    }
    matchedToken.image = image.toString().toLowerCase();
  }

  // chinese, japanese and korean characters: one token per character
| <SIGRAM: <CJK> >

  // irregular words
| <#IRREGULAR_WORD: (<C_PLUS_PLUS>|<C_SHARP>)>
| <#C_PLUS_PLUS: ("C"|"c") "++" >
| <#C_SHARP: ("C"|"c") "#" >

  // query syntax characters
| <PLUS: "+" >
| <MINUS: "-" >
| <QUOTE: "\"" >
| <COLON: ":" >
| <SLASH: "/" >
| <DOT: "." >
| <ATSIGN: "@" >
| <APOSTROPHE: "'" >

| <WHITE: ~[] >                                   // treat unrecognized chars
                                                  // as whitespace

  // primitive, non-token patterns

| <#WORD_PUNCT: ("_"|"&")>                        // allowed anywhere in words

| < #LETTER:                                      // alphabets
    [
      "\u0041"-"\u005a",
      "\u0061"-"\u007a",
      "\u00c0"-"\u00d6",
      "\u00d8"-"\u00f6",
      "\u00f8"-"\u00ff",
      "\u0100"-"\u1fff"
    ]
  >

| <#CJK:                                          // non-alphabets
    [
      "\u3040"-"\u318f",
      "\u3300"-"\u337f",
      "\u3400"-"\u3d2d",
      "\u4e00"-"\u9fff",
      "\uf900"-"\ufaff"
    ]
  >

| < #DIGIT:                                       // unicode digits
    [
      "\u0030"-"\u0039",
      "\u0660"-"\u0669",
      "\u06f0"-"\u06f9",
      "\u0966"-"\u096f",
      "\u09e6"-"\u09ef",
      "\u0a66"-"\u0a6f",
      "\u0ae6"-"\u0aef",
      "\u0b66"-"\u0b6f",
      "\u0be7"-"\u0bef",
      "\u0c66"-"\u0c6f",
      "\u0ce6"-"\u0cef",
      "\u0d66"-"\u0d6f",
      "\u0e50"-"\u0e59",
      "\u0ed0"-"\u0ed9",
      "\u1040"-"\u1049"
    ]
  >

}

/** Parse a query.  Each clause is an optional +/- operator, an optional
 * "field:" prefix, and then either a quoted phrase or a compound term.
 * Returns the Query holding one required or prohibited phrase per clause. */
Query parse() :
{
  Query query = new Query();
  ArrayList terms;
  Token token;
  String field;
  boolean stop;
  boolean prohibited;
}
{
  nonOpOrTerm()                                   // skip noise
  (
    { stop=true; prohibited=false; field = Clause.DEFAULT_FIELD; }

                                                  // optional + or - operator
    ( <PLUS> {stop=false;} | (<MINUS> { stop=false;prohibited=true; } ))?

                                                  // optional field spec.
    ( LOOKAHEAD(<WORD><COLON>(phrase(field)|compound(field)))
      token=<WORD> <COLON> { field = token.image; } )?

    ( terms=phrase(field) {stop=false;} |         // quoted terms or
      terms=compound(field))                      // single or compound term

    nonOpOrTerm()                                 // skip noise

    {
      String[] array = (String[])terms.toArray(new String[terms.size()]);

      // The field test is a deliberate identity comparison: it holds only
      // when no explicit field spec replaced the default constant above.
      if (stop
          && field == Clause.DEFAULT_FIELD
          && terms.size()==1
          && isStopWord(array[0])) {
        // ignore stop words only when single, unadorned terms in default field
      } else {
        if (prohibited)
          query.addProhibitedPhrase(array, field);
        else
          query.addRequiredPhrase(array, field);
      }
    }
  )*

  { return query; }

}

/** Parse an explicitly quoted phrase query.  Note that this may return a
 * single term, a trivial phrase.  The phrase ends at the closing quote or at
 * end of input.  For raw fields the result is the single untokenized
 * substring of the query text between the quotes. */
ArrayList phrase(String field) :
{
  int start;
  int end;
  ArrayList result = new ArrayList();
  String term;
}
{
  <QUOTE>

  { start = token.endColumn; }                    // just past the opening quote

  (nonTerm())*                                    // skip noise
  ( term = term() { result.add(term); }           // parse a term
    (nonTerm())*)*                                // skip noise

  { end = token.endColumn; }                      // end of last token in phrase

  (<QUOTE>|<EOF>)

  {
    if (QueryFilters.isRawField(field)) {
      result.clear();
      result.add(queryString.substring(start, end));
    }
    return result;
  }

}

/** Parse a compound term that is interpreted as an implicit phrase query.
 * Compounds are a sequence of terms separated by infix characters.  Note that
 * this may return a single term, a trivial compound.  For raw fields the
 * result is replaced by a single substring of the original query text. */
    ArrayList compound(String field) :
    {
    int start;
    ArrayList result = new ArrayList();
    String term;
    }
    {
    // Remember the end column of the token consumed before this compound.
    { start = token.endColumn; }

term = term() { result.add(term); }
// Continue only when the infix characters are followed by another term,
// so trailing infix characters are not consumed here.
( LOOKAHEAD( (infix())+ term() )
(infix())+
term = term() { result.add(term); })*

{
// Raw fields bypass tokenization: discard the parsed terms and keep the
// query text spanning the compound as one entry.
if (QueryFilters.isRawField(field)) {
result.clear();
result.add(queryString.substring(start, token.endColumn));
}
return result;
}

}

/** Parse a single term: a word, an acronym, or a single CJK character.
 * Returns the token's image. */
String term() :
{
  Token token;
}
{
  ( token=<WORD> | token=<ACRONYM> | token=<SIGRAM>)

  { return token.image; }
}

/** Parse anything but a term or a quote: whitespace or an infix character. */
void nonTerm() :
{}
{
  <WHITE> | infix()
}

/** Parse a non-term (see nonTerm()) or the end of input. */
void nonTermOrEOF() :
{}
{
  nonTerm() | <EOF>
}

/** Parse anything but a term or an operator (plus or minus or quote).
 * A plus or minus is skipped here only when it is followed by a non-term or
 * end of input, i.e. when it is not acting as an operator on a term. */
void nonOpOrTerm() :
{}
{
  (LOOKAHEAD(2) (<WHITE> | nonOpInfix() | ((<PLUS>|<MINUS>) nonTermOrEOF())))*
}

/** Characters which can be used to form compound terms: plus, minus, or any
 * of the non-operator infix characters. */
void infix() :
{}
{
  <PLUS> | <MINUS> | nonOpInfix()
}

/** Parse infix characters except plus and minus: colon, slash, dot, at-sign
 * or apostrophe. */
void nonOpInfix() :
{}
{
  <COLON>|<SLASH>|<DOT>|<ATSIGN>|<APOSTROPHE>
}

Hi David,

The problem is that I need that query in order to use the paginator, i.e. I
need the hits before I do the actual search with the limit and offset, and
since that query also translates into model objects, it hits the
database when it doesn't actually need to. But I agree, my solution is
not really that nice either.

Albert