ANN: acts_as_ferret

weibel · December 16, 2005, 6:15am

It’s so cool!
I am just looking for a CJK solution.
Here is the “JavaCC code for the Nutch lexical analyzer”,
included in the Nutch source code. Could anyone port it to Ferret?

/**
 * Copyright 2005 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/** JavaCC code for the Nutch lexical analyzer. */

// Generator options for the NutchAnalysis grammar.
options {
// Generate instance (non-static) members: parseQuery() builds a new
// NutchAnalysis object for every query it parses.
STATIC = false;
// The character stream is supplied by the caller; this grammar passes
// FastCharStream instances to the parser and token manager constructors.
USER_CHAR_STREAM = true;
OPTIMIZE_TOKEN_MANAGER = true;
// The token definitions use \u escapes far beyond the ASCII range.
UNICODE_INPUT = true;
// Left disabled; enable to trace the token manager while debugging.
//DEBUG_TOKEN_MANAGER = true;
}

PARSER_BEGIN(NutchAnalysis)

package org.apache.nutch.analysis;

import org.apache.nutch.searcher.Query;
import org.apache.nutch.searcher.QueryFilters;
import org.apache.nutch.searcher.Query.Clause;

import org.apache.lucene.analysis.StopFilter;

import java.io.*;
import java.util.*;

/** The JavaCC-generated Nutch lexical analyzer and query parser. */
public class NutchAnalysis {

private static final String[] STOP_WORDS = {
“a”, “and”, “are”, “as”, “at”, “be”, “but”, “by”,
“for”, “if”, “in”, “into”, “is”, “it”,
“no”, “not”, “of”, “on”, “or”, “s”, “such”,
“t”, “that”, “the”, “their”, “then”, “there”, “these”,
“they”, “this”, “to”, “was”, “will”, “with”
};

private static final Set STOP_SET =
StopFilter.makeStopSet(STOP_WORDS);

private String queryString;

/** True iff word is a stop word. Stop words are only removed from
queries.

Every word is indexed. */
public static boolean isStopWord(String word) {
return STOP_SET.contains(word);
}

/** Construct a query parser for the text in a reader. */
public static Query parseQuery(String queryString) throws IOException
{
NutchAnalysis parser =
new NutchAnalysis(new FastCharStream(new
StringReader(queryString)));
parser.queryString = queryString;
return parser.parse();
}

/** For debugging. */
public static void main(String[] args) throws Exception {
BufferedReader in = new BufferedReader(new
InputStreamReader(System.in));
while (true) {
System.out.print("Query: ");
String line = in.readLine();
System.out.println(parseQuery(line));
}
}

}

PARSER_END(NutchAnalysis)

// Extra declarations for the NutchAnalysisTokenManager class.
TOKEN_MGR_DECLS : {

/** Constructs a token manager for the provided Reader, wrapping it in a
 * FastCharStream before delegating to the stream-based constructor. */
public NutchAnalysisTokenManager(Reader reader) {
this(new FastCharStream(reader));
}

}

TOKEN : {                                         // token regular expressions

  // basic word -- lowercase it
  <WORD: ((<LETTER>|<DIGIT>|<WORD_PUNCT>)+ | <IRREGULAR_WORD>)>
  { matchedToken.image = matchedToken.image.toLowerCase(); }

  // special handling for acronyms: U.S.A., I.B.M., etc: dots are removed
| <ACRONYM: <LETTER> "." (<LETTER> ".")+ >
  {                                               // remove dots
    for (int i = 0; i < image.length(); i++) {
      if (image.charAt(i) == '.') {
        // step back after deleting so the character that shifted into
        // position i is examined on the next iteration
        image.deleteCharAt(i--);
      }
    }
    matchedToken.image = image.toString().toLowerCase();
  }

  // chinese, japanese and korean characters: each one is its own token
| <SIGRAM: <CJK> >

  // NOTE(review): IRREGULAR_WORD is referenced by WORD, and the query
  // productions need plus/minus/quote/colon and infix tokens, but their
  // definitions were missing from this copy of the grammar.  The
  // definitions from here down to APOSTROPHE are reconstructed; verify
  // them against the upstream grammar.

  // irregular words
| <#IRREGULAR_WORD: (<C_PLUS_PLUS>|<C_SHARP>)>
| <#C_PLUS_PLUS: ("C"|"c") "++" >
| <#C_SHARP: ("C"|"c") "#" >

  // query syntax characters (must precede WHITE so they win the
  // equal-length match against the catch-all)
| <PLUS: "+" >
| <MINUS: "-" >
| <QUOTE: "\"" >
| <COLON: ":" >
| <SLASH: "/" >
| <DOT: "." >
| <ATSIGN: "@" >
| <APOSTROPHE: "'" >

| <WHITE: ~[] >                                   // treat unrecognized chars
                                                  // as whitespace

  // primitive, non-token patterns

| <#WORD_PUNCT: ("_"|"&")>                        // allowed anywhere in words

| <#LETTER:                                       // alphabets
    [
      "\u0041"-"\u005a",
      "\u0061"-"\u007a",
      "\u00c0"-"\u00d6",
      "\u00d8"-"\u00f6",
      "\u00f8"-"\u00ff",
      "\u0100"-"\u1fff"
    ]
  >

| <#CJK:                                          // non-alphabets
    [
      "\u3040"-"\u318f",
      "\u3300"-"\u337f",
      "\u3400"-"\u3d2d",
      "\u4e00"-"\u9fff",
      "\uf900"-"\ufaff"
    ]
  >

| <#DIGIT:                                        // unicode digits
    [
      "\u0030"-"\u0039",
      "\u0660"-"\u0669",
      "\u06f0"-"\u06f9",
      "\u0966"-"\u096f",
      "\u09e6"-"\u09ef",
      "\u0a66"-"\u0a6f",
      "\u0ae6"-"\u0aef",
      "\u0b66"-"\u0b6f",
      "\u0be7"-"\u0bef",
      "\u0c66"-"\u0c6f",
      "\u0ce6"-"\u0cef",
      "\u0d66"-"\u0d6f",
      "\u0e50"-"\u0e59",
      "\u0ed0"-"\u0ed9",
      "\u1040"-"\u1049"
    ]
  >

}

/** Parse a query.  Each clause is an optional + or - operator, an optional
 * "field:" prefix, and then either a quoted phrase or a compound term.
 * Every clause is added to the returned Query as a required or prohibited
 * phrase, except lone stop words in the default field, which are dropped.
 * NOTE(review): the PLUS, MINUS, WORD and COLON token references were
 * missing from this copy of the grammar and have been restored; confirm
 * against the upstream grammar. */
Query parse() :
{
  Query query = new Query();
  ArrayList terms;
  Token token;
  String field;
  boolean stop;
  boolean prohibited;
}
{
  nonOpOrTerm()                                   // skip noise
  (
    { stop=true; prohibited=false; field = Clause.DEFAULT_FIELD; }

                                                  // optional + or - operator
    ( <PLUS> {stop=false;} | (<MINUS> { stop=false;prohibited=true; } ))?

                                                  // optional field spec.
    ( LOOKAHEAD(<WORD><COLON>(phrase(field)|compound(field)))
      token=<WORD> <COLON> { field = token.image; } )?

    ( terms=phrase(field) {stop=false;} |         // quoted terms or
      terms=compound(field))                      // single or compound term

    nonOpOrTerm()                                 // skip noise

    {
      String[] array = (String[])terms.toArray(new String[terms.size()]);

      // Reference comparison: field still holds the very constant assigned
      // at the top of the clause unless a field spec replaced it.
      if (stop
          && field == Clause.DEFAULT_FIELD
          && terms.size()==1
          && isStopWord(array[0])) {
        // ignore stop words only when single, unadorned terms in default field
      } else {
        if (prohibited)
          query.addProhibitedPhrase(array, field);
        else
          query.addRequiredPhrase(array, field);
      }
    }
  )*

  { return query; }

}

/** Parse an explicitly quoted phrase query.  Note that this may return a
 * single term, a trivial phrase.  The phrase ends at the closing quote or,
 * if the quote is never closed, at the end of input.  For raw fields the
 * result is the single untokenized substring of the query between the
 * quotes' column positions.
 * NOTE(review): the QUOTE and EOF token references and the repetition
 * operators were missing from this copy of the grammar and have been
 * restored; confirm against the upstream grammar. */
ArrayList phrase(String field) :
{
  int start;
  int end;
  ArrayList result = new ArrayList();
  String term;
}
{
  <QUOTE>

  { start = token.endColumn; }                    // column just after the quote

  (nonTerm())*                                    // skip noise
  ( term = term() { result.add(term); }           // parse a term
    (nonTerm())*)*                                // skip noise

  { end = token.endColumn; }                      // end of last consumed token

  (<QUOTE>|<EOF>)

  {
    if (QueryFilters.isRawField(field)) {
      // raw fields keep the original text instead of the parsed terms
      result.clear();
      result.add(queryString.substring(start, end));
    }
    return result;
  }

}

/** Parse a compound term, which is treated as an implicit phrase query.
 * A compound is a run of terms joined by one or more infix characters; a
 * lone term is accepted as well and yields a one-element (trivial)
 * compound.  For raw fields the collected terms are replaced by the
 * matching slice of the original query string. */
ArrayList compound(String field) :
{
  int begin;
  ArrayList parts = new ArrayList();
  String word;
}
{
  { begin = token.endColumn; }                    // end of the previous token

  word = term() { parts.add(word); }              // first term
  ( LOOKAHEAD( (infix())+ term() )                // only continue if a term follows
    (infix())+
    word = term() { parts.add(word); })*

  {
    if (!QueryFilters.isRawField(field)) {
      return parts;
    }
    // raw field: hand back the untokenized text instead of the parsed terms
    parts.clear();
    parts.add(queryString.substring(begin, token.endColumn));
    return parts;
  }

}

/** Parse a single term: a WORD, ACRONYM or SIGRAM token.
 * Returns the token's image. */
String term() :
{
  Token token;
}
{
  ( token=<WORD> | token=<ACRONYM> | token=<SIGRAM>)

  { return token.image; }
}

/** Parse anything but a term or a quote: a WHITE token or an infix
 * character. */
void nonTerm() :
{}
{
  <WHITE> | infix()
}

/** Parse a non-term (see nonTerm()) or the end of input. */
void nonTermOrEOF() :
{}
{
  nonTerm() | <EOF>
}

/** Parse anything but a term or an operator (plus or minus or quote).
 * A plus or minus is consumed here only when a non-term or the end of
 * input follows it, which is why two tokens of lookahead are requested.
 * NOTE(review): the WHITE, PLUS and MINUS token references and the
 * trailing repetition were missing from this copy of the grammar and have
 * been restored; confirm against the upstream grammar. */
void nonOpOrTerm() :
{}
{
  (LOOKAHEAD(2) (<WHITE> | nonOpInfix() | ((<PLUS>|<MINUS>) nonTermOrEOF())))*
}

/** Characters which can be used to form compound terms: plus, minus, or
 * any of the characters accepted by nonOpInfix(). */
void infix() :
{}
{
  <PLUS> | <MINUS> | nonOpInfix()
}

/** Parse infix characters except plus and minus.
 * NOTE(review): the five token names were missing from this copy of the
 * grammar and are reconstructed; each must be defined in the TOKEN
 * section.  Confirm against the upstream grammar. */
void nonOpInfix() :
{}
{
  <COLON>|<SLASH>|<DOT>|<ATSIGN>|<APOSTROPHE>
}

weibel · December 15, 2005, 8:19pm

Hi David,

The problem is that I need that query in order to use the paginator, i.e. I
need the hit count before I do the actual search with the limit and offset.
Since that query also translates into model objects, it hits the
database when it doesn’t actually need to. But I agree, my solution is
not really that nice either.

Albert

ANN: acts_as_ferret

It’s so cool! I am just looking for a CJK solution. Here is the “JavaCC code for the Nutch lexical analyzer”, included in the Nutch source code. Could anyone port it to Ferret?

It’s so cool!
I am just looking for a CJK solution.
Here is the “JavaCC code for the Nutch lexical analyzer”,
included in the Nutch source code. Could anyone port it to Ferret?