Lucene query: TermQuery doesn't work but QueryParser works - lucene

# Search the Lucene index at self.fileLucene for an exact term in the "parte"
# field and print one page of the matching documents.
#   startPagination -- offset of the first hit to return (passed to topDocs)
#   recPerPage      -- number of hits to return (passed to topDocs)
def searchGiustizia(self,startPagination,recPerPage):
indexPath = File(self.fileLucene).toPath()
directory = FSDirectory.open(indexPath)
searcher = IndexSearcher(DirectoryReader.open(directory))
# TermQuery looks up the literal term as given; it is not passed through an
# analyzer.
# NOTE(review): if "parte" was indexed with StandardAnalyzer (indexing code is
# not shown here), the index presumably holds lowercased single-word terms such
# as "telesi" and "riccardo", so this two-word uppercase term cannot match --
# confirm against the indexing code.
paQuery5 = TermQuery(Term("parte","TELESI RICCARDO"))
# Created but not used anywhere in this function.
analyzer = StandardAnalyzer()
print ("\n------------------------------------------------------")
start = datetime.now()
# First pass: count every hit so the scoring collector can be sized to fit.
collector = TotalHitCountCollector()
searcher.search(paQuery5, collector)
print("found: ",collector.getTotalHits())
# NOTE(review): when the count is 0 this requests a zero-sized collector;
# check how the Lucene version in use handles that.
scoreCollector = TopScoreDocCollector.create(collector.getTotalHits(), 0)
# Second pass: collect the scored hits, then slice out the requested page.
searcher.search(paQuery5, scoreCollector)
scoreDocs = scoreCollector.topDocs(startPagination, recPerPage).scoreDocs
duration = datetime.now() - start
print("paginata ",str(duration))
# Load each hit's stored document and print four of its stored fields.
for scoreDoc in scoreDocs:
doc = searcher.doc(scoreDoc.doc)
print (doc.get("cf_giudice")," ",
doc.get("codiceoggetto")," ",
doc.get("parte")," ",
doc.get("distretto"))
print ("\n------------------------------------------------------")
The function tries to search for "TELESI RICCARDO" in the Lucene index.
This search function returns zero results. Why?
I have TELESI RICCARDO indexed in the parte field in Lucene. If I try the same query using QueryParser, it works, like this:
# Search the Lucene index at self.fileLucene by parsing a query string for the
# "parte" field and print one page of the matching documents.
#   startPagination -- offset of the first hit to return (passed to topDocs)
#   recPerPage      -- number of hits to return (passed to topDocs)
def searchGiustizia(self,startPagination,recPerPage):
indexPath = File(self.fileLucene).toPath()
directory = FSDirectory.open(indexPath)
searcher = IndexSearcher(DirectoryReader.open(directory))
analyzer = StandardAnalyzer()
# The parser runs the query text through `analyzer` before building the query;
# with AND as the default operator every parsed term is required.
queryParser = QueryParser("parte", analyzer)
queryParser.setDefaultOperator(QueryParser.Operator.AND);
paQuery4 = queryParser.parse("TELESI RICCARDO");
print ("\n------------------------------------------------------")
start = datetime.now()
# First pass: count every hit so the scoring collector can be sized to fit.
collector = TotalHitCountCollector()
searcher.search(paQuery4, collector)
print("trovati: ",collector.getTotalHits())
scoreCollector = TopScoreDocCollector.create(collector.getTotalHits(), 0)
# Second pass: collect the scored hits, then slice out the requested page.
searcher.search(paQuery4, scoreCollector)
scoreDocs = scoreCollector.topDocs(startPagination, recPerPage).scoreDocs
duration = datetime.now() - start
print("paginata ",str(duration))
# Load each hit's stored document and print four of its stored fields.
for scoreDoc in scoreDocs:
doc = searcher.doc(scoreDoc.doc)
print (doc.get("cf_giudice")," ",
doc.get("codiceoggetto")," ",
doc.get("parte")," ",
doc.get("distretto"))
print ("\n------------------------------------------------------")
This QueryParser version works perfectly, but the TermQuery version doesn't.

Related

What is the right way to get term positions in a Lucene document?

The example in this question and some others I've seen on the web use postings method of a TermVector to get terms positions. Copy paste from the example in the linked question:
// Walk the term vector of one document/field and visit every position of
// every term in it.
IndexReader ir = obtainIndexReader();
Terms tv = ir.getTermVector( doc, field );
TermsEnum terms = tv.iterator();
// Passed back into postings() on each iteration so the enum can be reused.
PostingsEnum p = null;
while( terms.next() != null ) {
// PostingsEnum.ALL requests every available per-position feature.
p = terms.postings( p, PostingsEnum.ALL );
while( p.nextDoc() != PostingsEnum.NO_MORE_DOCS ) {
// freq() is how many times nextPosition() may be called for this term.
int freq = p.freq();
for( int i = 0; i < freq; i++ ) {
int pos = p.nextPosition(); // Always returns -1!!!
BytesRef data = p.getPayload();
doStuff( freq, pos, data ); // Fails miserably, of course.
}
}
}
This code works for me but what drives me mad is that the Terms type is where the position information is kept. All the documentation I've seen keep saying that term vectors keep position data. However, there are no methods on this type to get that information!
Older versions of Lucene apparently had a method but as of at least version 6.5.1 of Lucene, that is not the case.
Instead I'm supposed to use postings method and traverse the documents but I already know which document I want to work on!
The API documentation does not say anything about postings returning only the current document (the one the term vector belongs to) but when I run it, I only get the current doc.
Is this the correct and only way to get position data from term vectors? Why such an unintuitive API? Is there a document that explains why the previous approach changed in favour of this?
Don't know about "right or wrong" but for version 6.6.3 this seems to work.
/**
 * Indexes one document whose "tags" field stores term vectors (with positions
 * and offsets), finds it with a TermQuery, and prints every position of every
 * term in that field's term vector.
 *
 * @throws Exception if indexing, searching, or reading the term vector fails
 */
private void run() throws Exception {
    // try-with-resources releases the directory, writer and reader even when
    // one of the steps below throws.
    try (Directory directory = new RAMDirectory()) {
        // Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES
        FieldType type = new FieldType();
        type.setStoreTermVectors(true);
        type.setStoreTermVectorPositions(true);
        type.setStoreTermVectorOffsets(true);
        type.setIndexOptions(IndexOptions.DOCS);

        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(new StandardAnalyzer());
        try (IndexWriter writer = new IndexWriter(directory, indexWriterConfig)) {
            Document doc = new Document();
            doc.add(new Field("tags", "foo bar and then some", type));
            writer.addDocument(doc);
        }

        try (DirectoryReader reader = DirectoryReader.open(directory)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            Query q = new TermQuery(new Term("tags", "bar"));
            TopDocs results = searcher.search(q, 1);
            for (ScoreDoc scoreDoc : results.scoreDocs) {
                Fields termVs = reader.getTermVectors(scoreDoc.doc);
                Terms f = (termVs == null) ? null : termVs.terms("tags");
                if (f == null) {
                    // No term vector was stored for this document/field.
                    continue;
                }
                TermsEnum te = f.iterator();
                PostingsEnum docsAndPosEnum = null;
                BytesRef bytesRef;
                // for each term (iterator next) in this field (field)
                while ((bytesRef = te.next()) != null) {
                    docsAndPosEnum = te.postings(docsAndPosEnum, PostingsEnum.ALL);
                    // iterate over the docs (should only be one); checked
                    // explicitly because Java asserts are disabled by default.
                    if (docsAndPosEnum.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
                        continue;
                    }
                    final int fr = docsAndPosEnum.freq();
                    // A term occurring fr times has fr positions; read them all
                    // rather than only the first one.
                    for (int i = 0; i < fr; i++) {
                        final int p = docsAndPosEnum.nextPosition();
                        final int o = docsAndPosEnum.startOffset();
                        System.out.println("p="+ p + ", o=" + o + ", l=" + bytesRef.length + ", f=" + fr + ", s=" + bytesRef.utf8ToString());
                    }
                }
            }
        }
    }
}

Apache Lucene fuzzy search for multi-worded phrases

I have the following Apache Lucene 7 application:
// Build an in-memory index holding a single document whose "content" field is
// read from document.txt and analyzed with StandardAnalyzer.
StandardAnalyzer standardAnalyzer = new StandardAnalyzer();
Directory directory = new RAMDirectory();
IndexWriterConfig config = new IndexWriterConfig(standardAnalyzer);
IndexWriter writer = new IndexWriter(directory, config);
Document document = new Document();
document.add(new TextField("content", new FileReader("document.txt")));
writer.addDocument(document);
writer.close();
// Search the index with a fuzzy query on one term, allowing up to 2 edits.
IndexReader reader = DirectoryReader.open(directory);
IndexSearcher searcher = new IndexSearcher(reader);
Query fuzzyQuery = new FuzzyQuery(new Term("content", "Company"), 2);
TopDocs results = searcher.search(fuzzyQuery, 5);
System.out.println("Hits: " + results.totalHits);
System.out.println("Max score:" + results.getMaxScore());
when I use it with :
new FuzzyQuery(new Term("content", "Company"), 2);
the application works fine and returns the following result:
Hits: 1
Max score:0.35161147
but when I try to search with multi term query, for example:
new FuzzyQuery(new Term("content", "Company name"), 2);
it returns the following result:
Hits: 0
Max score:NaN
Anyway, the phrase Company name exists in the source document.txt file.
How to properly use FuzzyQuery in this case in order to be able to do the fuzzy search for multi-word phrases.
UPDATED
Based on the provided solution I have tested it on the following text information:
Company name: BlueCross BlueShield Customer Service
1-800-521-2227
of Texas Preauth-Medical 1-800-441-9188
Preauth-MH/CD 1-800-528-7264
Blue Card Access 1-800-810-2583
For the following query:
SpanQuery[] clauses = new SpanQuery[2];
clauses[0] = new SpanMultiTermQueryWrapper<FuzzyQuery>(new FuzzyQuery(new Term("content", "BlueCross"), 2));
clauses[1] = new SpanMultiTermQueryWrapper<FuzzyQuery>(new FuzzyQuery(new Term("content", "BlueShield"), 2));
SpanNearQuery query = new SpanNearQuery(clauses, 0, true);
the search works fine:
Hits: 1
Max score:0.5753642
but when I try to corrupt a little bit the search query(for example from BlueCross to BlueCros)
SpanQuery[] clauses = new SpanQuery[2];
clauses[0] = new SpanMultiTermQueryWrapper<FuzzyQuery>(new FuzzyQuery(new Term("content", "BlueCros"), 2));
clauses[1] = new SpanMultiTermQueryWrapper<FuzzyQuery>(new FuzzyQuery(new Term("content", "BlueShield"), 2));
SpanNearQuery query = new SpanNearQuery(clauses, 0, true);
it stops working and returns:
Hits: 0
Max score:NaN
The problem here is the following: you're using TextField, which is a tokenized field. E.g. your text "Company name is working on something" would effectively be split on spaces (and other delimiters). So, even if you have the text Company name, during indexing it will become Company, name, is, etc.
In this case a query on the single term "Company name" won't be able to find what you're looking for. The trick that is going to help you looks like this:
// One fuzzy clause per word; SpanNearQuery with slop 0 and inOrder=true then
// requires the matched words to be adjacent and in this order.
SpanQuery[] clauses = new SpanQuery[2];
clauses[0] = new SpanMultiTermQueryWrapper<FuzzyQuery>(new FuzzyQuery(new Term("content", "some"), 2));
clauses[1] = new SpanMultiTermQueryWrapper<FuzzyQuery>(new FuzzyQuery(new Term("content", "text"), 2));
SpanNearQuery query = new SpanNearQuery(clauses, 0, true);
However, I wouldn't recommend this approach much, especially if your load is going to be big and you're planning on searching for company names that are 10 terms long. One should be aware that those queries are potentially heavy to execute.
The next problem, with BlueCros, is the following. In your code the IndexWriter is configured with StandardAnalyzer, which is applied to the TextField. This means it effectively lowercases the terms, so BlueCross in the content field becomes bluecross.
The fuzzy (edit) distance between BlueCros and bluecross is 3 (two case changes plus one missing character), which exceeds the maximum of 2; that's the reason you do not have a match.
A simple proposal would be to convert the term in the query to lowercase, by doing something like .toLowerCase()
In general, one should prefer to use the same analyzers at query time as well (e.g. during construction of the query)
For Lucene.Net it can be like this.
// Fuzzy multi-word search with Lucene.Net: each word of the keyword is
// replaced by the term FuzzyTermEnum is positioned on for it (when there is
// one), and the resulting terms are searched as a phrase.
private string _IndexPath = @"Your Index Path";
private Directory _Directory;
private Searcher _IndexSearcher;
private MultiPhraseQuery _MultiPhraseQuery;
_Directory = FSDirectory.Open(_IndexPath);
IndexReader indexReader = IndexReader.Open(_Directory, true);
string field = "Name"; // Your field name
string keyword = "big red fox"; // your search term
float fuzzy = 0.7f; // between 0-1
using (_IndexSearcher = new IndexSearcher(indexReader))
{
    // "big red fox" to [big,red,fox]
    var keywordSplit = keyword.Split();
    _MultiPhraseQuery = new MultiPhraseQuery();
    FuzzyTermEnum[] _FuzzyTermEnum = new FuzzyTermEnum[keywordSplit.Length];
    Term[] _Term = new Term[keywordSplit.Length];
    for (int i = 0; i < keywordSplit.Length; i++)
    {
        _FuzzyTermEnum[i] = new FuzzyTermEnum(indexReader, new Term(field, keywordSplit[i]), fuzzy);
        _Term[i] = _FuzzyTermEnum[i].Term;
        // Fall back to the literal word when the enum has no current term.
        _MultiPhraseQuery.Add(_Term[i] ?? new Term(field, keywordSplit[i]));
    }
    var results = _IndexSearcher.Search(_MultiPhraseQuery, indexReader.MaxDoc);
    foreach (var loopDoc in results.ScoreDocs.OrderByDescending(s => s.Score))
    {
        //YourCode Here
    }
}

Lucene.net highlight searched term in the text

I am using Lucene.net to search a given document. The requirement is that once the search is done, it should highlight the searched term in the document. I have seen examples which return the best fragments, but what I need is to highlight the term in the main content.
// Parse searchText into a query, index htmlContent into a RAM directory,
// search it, and run the highlighter once per hit.
using (StandardAnalyzer standardAnalyzer = new StandardAnalyzer(Version.LUCENE_30, stopWords))
{
QueryParser parser = new QueryParser(Version.LUCENE_30, "Content", standardAnalyzer);
parser.AllowLeadingWildcard = true;
Query qry = parser.Parse(searchText);
Directory indexDir = CreateRAMDirectory(htmlContent);
IndexReader reader = IndexReader.Open(indexDir, true);
IndexSearcher searcher = new IndexSearcher(reader);
searcher.SetDefaultFieldSortScoring(true, true);
// Matches are wrapped in a bold, yellow-background <span>.
IFormatter formatter = new SimpleHTMLFormatter("<span style=\"font-weight:bold; background-color:yellow;\">", "</span>");
SimpleFragmenter fragmenter = new SimpleFragmenter(1000);
QueryScorer scorer = null;
scorer = new QueryScorer(qry);
ScoreDoc[] hits = searcher.Search(qry, null, 10000, Sort.RELEVANCE).ScoreDocs;
Highlighter highlighter = new Highlighter(formatter, scorer);
highlighter.TextFragmenter = fragmenter;
foreach (var result in hits)
{
int docId = result.Doc;
float score = result.Score;
Document doc = searcher.Doc(docId);
// NOTE(review): the token stream and the text handed to the highlighter are
// both built from searchText; `doc` and `score` are not used, and
// highlighterData is not used after being assigned. Presumably the stored
// document content was intended here -- confirm.
Lucene.Net.Analysis.TokenStream stream = standardAnalyzer.TokenStream("Content", new IO.StringReader(searchText));
String highlighterData = highlighter.GetBestFragments(stream, searchText, 1000, "");
}
}
I am a newbie to Lucene.net; how can I get the entire document with the searched term highlighted, rather than fragments?
The fragmenter governs how large the chunks of text returned are. To use the entire field contents, just use NullFragmenter, instead of SimpleFragmenter.
Fragmenter fragmenter = new NullFragmenter();
.....
highlighter.TextFragmenter = fragmenter;
I had the same issue: even with the NullFragmenter, it only returned roughly 51 kB of text.
By analyzing the objects, I found out that there is another property on the highlighter which sets the maximum number of characters of the document that will be analyzed. Set this value to the length of your string, and then the whole document will be processed.
highlighter.TextFragmenter = new NullFragmenter();
highlighter.MaxDocCharsToAnalyze = text.Length;

How to use Lucene IndexReader to read index in version 4.4?

Just for the sake of learning, I've created an index from one file and wanted to search it. I am using Lucene version 4.4. I know that the indexing part is correct.
tempFileName is the name of file which contains tokens and this file has the following words :
"odd plus odd is even ## even plus even is even ## odd plus even is odd ##"
However when I provide a query it returns nothing. I can't see what would be the problem. Any help is greatly appreciated.
Indexing part :
/**
 * Creates (or opens) the index at {@code indexPath} with a WhitespaceAnalyzer
 * and adds the documents produced by {@code indexDocs}.
 *
 * @param indexPath file-system directory that holds the index
 * @throws IOException if the index cannot be opened or written
 */
public void startIndexingDocument(String indexPath) throws IOException {
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_44);
    SimpleFSDirectory directory = new SimpleFSDirectory(new File(indexPath));
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_44,
            analyzer);
    IndexWriter writer = new IndexWriter(directory, config);
    try {
        indexDocs(writer);
    } finally {
        // Close the writer even if indexing fails so it does not stay open
        // on the index directory.
        writer.close();
    }
}
/**
 * Adds one document to {@code w} whose {@code fieldName} field holds the
 * first line of the file named by {@code tempFileName}.
 *
 * @param w writer that receives the document
 * @throws IOException if the file cannot be read or the document cannot be added
 */
private void indexDocs(IndexWriter w) throws IOException {
    BufferedReader br = new BufferedReader(new FileReader(tempFileName));
    try {
        Document doc = new Document();
        // StringField indexes the whole line as one single token.
        Field field = new StringField(fieldName, br.readLine(),
                Field.Store.YES);
        doc.add(field);
        w.addDocument(doc);
    } finally {
        // The reader was previously never closed.
        br.close();
    }
}
Searching part :
/**
 * Parses the query "odd" against {@code fieldName} with a WhitespaceAnalyzer,
 * runs it against the index stored at {@code indexPath}, and prints up to ten
 * hits.
 *
 * @param indexPath file-system directory that holds the index
 * @throws IOException if the index cannot be opened or read
 * @throws ParseException if the query text cannot be parsed
 */
public void readFromIndex(String indexPath) throws IOException,
        ParseException {
    Analyzer anal = new WhitespaceAnalyzer(Version.LUCENE_44);
    QueryParser parser = new QueryParser(Version.LUCENE_44, fieldName, anal);
    Query query = parser.parse("odd");
    IndexReader reader = IndexReader.open(NIOFSDirectory.open(new File(
            indexPath)));
    try {
        IndexSearcher searcher = new IndexSearcher(reader);
        TopScoreDocCollector collector = TopScoreDocCollector.create(10, true);
        searcher.search(query, collector);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;
        // display
        System.out.println("fieldName =" + fieldName);
        System.out.println("Found : " + hits.length + " hits.");
        for (int i = 0; i < hits.length; i++) {
            int docId = hits[i].doc;
            Document d = searcher.doc(docId);
            System.out.println((i + 1) + ". " + d.get(fieldName));
        }
    } finally {
        // Release the index files even if the search or a document load throws.
        reader.close();
    }
}
The problem is that you are using a StringField. StringField indexes the entire input as a single token. Good for atomic strings, like keywords, identifiers, stuff like that. Not good for full text searching.
Use a TextField.
A StringField has a single token. So, I tried to test this with simple code.
For example, @yns: suppose you have a crawler file whose contents are a single string.
ex) file name: data03.scd, contents: parktaeha
You try to search with the query string "parktaeha".
You get the search result!
field name : acet, queryString parktaeha
======== start search!! ========== q=acet:parktaeha Found 1 hits. result array length :1 search result=> parktaeha
======== end search!! ==========
Look at the code below. This is test code.
while((target = in.readLine()) != null){
System.out.println("target:"+target);
doc.add(new TextField("acet",target ,Field.Store.YES)); // use TextField
// TEST : doc.add(new StringField("acet", target.toString(),Field.Store.YES));
}
ref url

Lucene: the same query parsed from String and build via Query API doesn't yield same results

I have the following code:
/**
 * Indexes a fixed set of titles into INDEX_DIR, runs the query built by
 * {@code getQueryFromAPI("mer")}, and prints each hit's title together with
 * its scoring explanation.
 *
 * @param args unused
 * @throws Throwable if indexing, query construction, or searching fails
 */
public static void main(String[] args) throws Throwable {
    String[] texts = new String[]{
            "starts_with k mer",
            "starts_with mer",
            "starts_with bleue est mer",
            "starts_with mer est bleue",
            "starts_with mer bla1 bla2 bla3 bla4 bla5",
            "starts_with bleue est la mer",
            "starts_with la mer est bleue",
            "starts_with la mer"
    };
    // Empty stop-word set: no words are removed during analysis.
    Set<String> stopWords = new HashSet<String>();
    StandardAnalyzer stdAn = new StandardAnalyzer(Version.LUCENE_36, stopWords);
    Directory fsDir = FSDirectory.open(INDEX_DIR);
    try {
        //write:
        IndexWriterConfig iwConf = new IndexWriterConfig(Version.LUCENE_36,stdAn);
        iwConf.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        IndexWriter indexWriter = new IndexWriter(fsDir,iwConf);
        try {
            for(String text:texts) {
                Document document = new Document();
                document.add(new Field("title",text,Store.YES,Index.ANALYZED));
                indexWriter.addDocument(document);
            }
            indexWriter.commit();
        } finally {
            // The writer was previously left open.
            indexWriter.close();
        }
        //read
        IndexReader indexReader = IndexReader.open(fsDir);
        try {
            IndexSearcher indexSearcher = new IndexSearcher(indexReader);
            //get query:
            //Query query = getQueryFromString("mer");
            Query query = getQueryFromAPI("mer");
            //explain
            System.out.println("======== Query: "+query+"\n");
            TopDocs hits = indexSearcher.search(query, 10);
            for (ScoreDoc scoreDoc : hits.scoreDocs) {
                Document doc = indexSearcher.doc(scoreDoc.doc);
                System.out.println(">>> "+doc.get("title"));
                System.out.println("Explain:");
                System.out.println(indexSearcher.explain(query, scoreDoc.doc));
            }
        } finally {
            indexReader.close();
        }
    } finally {
        fsDir.close();
    }
}
/**
 * Builds the query by concatenating a Lucene query string and parsing it:
 * the search string in parentheses, followed by the quoted phrase
 * "STARTS_WITH" plus the search string.
 *
 * @param searchString user input to search for in the "title" field
 * @return the parsed query
 * @throws Throwable if the query string cannot be parsed
 */
private static Query getQueryFromString(String searchString) throws Throwable {
    Set<String> noStopWords = new HashSet<String>();
    StandardAnalyzer titleAnalyzer = new StandardAnalyzer(Version.LUCENE_36, noStopWords);
    QueryParser titleParser = new QueryParser(Version.LUCENE_36, "title", titleAnalyzer);
    String queryText = "(" + searchString + ") \"STARTS_WITH " + searchString + "\"";
    return titleParser.parse(queryText);
}
// Builds a BooleanQuery with two optional (SHOULD) clauses: the parsed search
// string, and a PhraseQuery meant to match "starts_with <searchString>".
private static Query getQueryFromAPI(String searchString) throws Throwable {
Set<String> stopWords = new HashSet<String>();
Query searchStringTermsMatchTitle = new QueryParser(Version.LUCENE_36, "title", new StandardAnalyzer(Version.LUCENE_36, stopWords)).parse(searchString);
PhraseQuery titleStartsWithSearchString = new PhraseQuery();
// The whole text "starts_with <searchString>" is added as ONE term here, so
// the phrase consists of a single term that contains a space.
titleStartsWithSearchString.add(new Term("title","STARTS_WITH".toLowerCase()+" "+searchString));
// `true` is the disableCoord constructor argument.
BooleanQuery query = new BooleanQuery(true);
BooleanClause matchClause = new BooleanClause(searchStringTermsMatchTitle, Occur.SHOULD);
query.add(matchClause);
BooleanClause startsWithClause = new BooleanClause(titleStartsWithSearchString, Occur.SHOULD);
query.add(startsWithClause);
return query;
}
Basically I'm indexing some strings, and then I have two methods for creating a Lucene Query from user input, one that simply builds the corresponding Lucene query String "manually" (via string concatenation) and another that uses Lucene's API for building queries. They seem to be building the same query, as the debug output of the query shows the exact same query string, but the search results are not the same:
running the query built via String concatenation yields (for argument "mer"):
title:mer title:"starts_with mer"
and indeed in this case, when I search with it, I get the documents that match the title:"starts_with mer" part first. Here's the explain output for the first result:
>>> starts_with mer
Explain:
1.2329358 = (MATCH) sum of:
0.24658716 = (MATCH) weight(title:mer in 1), product of:
0.4472136 = queryWeight(title:mer), product of:
0.882217 = idf(docFreq=8, maxDocs=8)
0.50692016 = queryNorm
0.55138564 = (MATCH) fieldWeight(title:mer in 1), product of:
1.0 = tf(termFreq(title:mer)=1)
0.882217 = idf(docFreq=8, maxDocs=8)
0.625 = fieldNorm(field=title, doc=1)
0.9863486 = (MATCH) weight(title:"starts_with mer" in 1), product of:
0.8944272 = queryWeight(title:"starts_with mer"), product of:
1.764434 = idf(title: starts_with=8 mer=8)
0.50692016 = queryNorm
1.1027713 = fieldWeight(title:"starts_with mer" in 1), product of:
1.0 = tf(phraseFreq=1.0)
1.764434 = idf(title: starts_with=8 mer=8)
0.625 = fieldNorm(field=title, doc=1)
running the query built via Lucene query helper tools yields an apparently identical query:
title:mer title:"starts_with mer"
but this time the results are not the same, since in fact the title:"starts_with mer" part is not matched. Here's an explain of the first result:
>>> starts_with mer
Explain:
0.15185544 = (MATCH) sum of:
0.15185544 = (MATCH) weight(title:mer in 1), product of:
0.27540696 = queryWeight(title:mer), product of:
0.882217 = idf(docFreq=8, maxDocs=8)
0.312176 = queryNorm
0.55138564 = (MATCH) fieldWeight(title:mer in 1), product of:
1.0 = tf(termFreq(title:mer)=1)
0.882217 = idf(docFreq=8, maxDocs=8)
0.625 = fieldNorm(field=title, doc=1)
My question is: why don't I get the same results? I'd really like to be able to use the Query helper tools here, especially since there's the BooleanQuery(disableCoord) option which I'd like to use and I really don't know how to express directly in a Lucene query string. (Yes, my example passes "true" there; I've also tried with "false", same result.)
===UPDATE
femtoRgon's answer is great: the problem was that I was adding the whole search string as a single term, instead of first splitting it into terms and then adding each one to the query.
The answer femtoRgon gives works fine if the input string consists of one term: in this case, separately adding the "STARTS_WITH" text as one term, and then adding the search string as a second term, works.
However, if the user inputs something that would be tokenized into more than one term, you have to first split it into terms (preferably using the same analyzers and/or tokenizers that you used when indexing, to get consistent results) and then add each term to the query.
What I ended up doing is writing a function that splits the query string into terms, using the same analyzer that I used for indexing:
/**
 * Splits {@code text} into terms by running it through the analyzer returned
 * by {@code getAnalyzer()} for the FIELD_NAME_TITLE field.
 *
 * @param text the raw text to tokenize
 * @return the terms produced by the analyzer, in stream order
 * @throws Throwable if analysis fails
 */
private static List<String> getTerms(String text) throws Throwable {
    List<String> terms = new ArrayList<String>();
    Analyzer analyzer = getAnalyzer();
    try {
        StringReader textReader = new StringReader(text);
        try {
            TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME_TITLE, textReader);
            try {
                CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
                tokenStream.reset();
                while (tokenStream.incrementToken()) {
                    terms.add(charTermAttribute.toString());
                }
                // Signals end-of-stream, as the TokenStream consumer workflow
                // asks for before close().
                tokenStream.end();
            } finally {
                tokenStream.close();
            }
        } finally {
            textReader.close();
        }
    } finally {
        // Everything is closed in finally blocks so a failure during analysis
        // does not leak the stream, the reader, or the analyzer.
        analyzer.close();
    }
    return terms;
}
Then I first add the "STARTS_WITH" as one term, and then each of the elements in the list as a separate term:
// Build the phrase term by term: first the lowercased "starts_with" marker,
// then each term that getTerms() produces from the search string.
PhraseQuery titleStartsWithSearchString = new PhraseQuery();
titleStartsWithSearchString.add(new Term("title","STARTS_WITH".toLowerCase()));
for(String term:getTerms(searchString)) {
titleStartsWithSearchString.add(new Term("title",term));
}
I believe the problem you are running into is that you are adding the entire phrase to your PhraseQuery as a single term. In the index, and in the query parsed by the QueryParser, this will be split into terms "starts_with" and "mer", which must be found consecutively. However, in the query you have constructed, you have a single term in your PhraseQuery instead, the term "starts_with mer", which doesn't exist as a single term in the index.
You should be able to change the bit where you are constructing the PhraseQuery to:
// Add "starts_with" and the search string as two separate phrase terms.
PhraseQuery titleStartsWithSearchString = new PhraseQuery();
titleStartsWithSearchString.add(new Term("title","STARTS_WITH".toLowerCase()));
titleStartsWithSearchString.add(new Term("title",searchString));