Lucene: how to get the score of a document - lucene

I want to output the score of documents. The code I write for this is that:
// Open the index and search the "title" field for "MacOS",
// then print each hit's relevance score.
IndexReader reader = IndexReader.open(FSDirectory.open(indexDir));
IndexSearcher searcher = new IndexSearcher(reader);
Analyzer analyzer = new IKAnalyzer();
QueryParser parser = new QueryParser(Version.LUCENE_31, "title",
analyzer);
Query q = null;
q = parser.parse("MacOS");
// Take the top 10 matches.
TopDocs docs = searcher.search(q, 10);
ScoreDoc[] hits = docs.scoreDocs;
for(int i=0;i<hits.length;++i){
// NOTE(review): this prints NaN — scoring has not been enabled on the
// searcher (see setDefaultFieldSortScoring in the answers below).
System.out.println(hits[i].score);
}
but the output is NaN. I want to know how to get the score of the document.

In addition to daulet's answer, you have to enable scoring in the IndexSearcher:
...
searcher.setDefaultFieldSortScoring(true, true);
...
I think that's what you meant, remy, but this way it should be clearer :)

// Open the index and search the "title" field for "MacOS", printing each
// hit's doc number and relevance score.
IndexReader reader = IndexReader.open(FSDirectory.open(indexDir));
IndexSearcher searcher = new IndexSearcher(reader);
// Without this, ScoreDoc.score stays NaN when sorting is involved.
searcher.setDefaultFieldSortScoring(true, true);
Analyzer analyzer = new IKAnalyzer();
QueryParser parser = new QueryParser(Version.LUCENE_31, "title", analyzer);
Query q = parser.parse("MacOS");
TopDocs docs = searcher.search(q, 10);
// search() already returns a TopDocs — read scoreDocs directly
// (the original "docs.topDocs().scoreDocs" does not compile).
ScoreDoc[] filterScoreDosArray = docs.scoreDocs;
for (int i = 0; i < filterScoreDosArray.length; ++i) {
    int docId = filterScoreDosArray[i].doc;
    // Use the searcher declared above ("is" was undefined in the original).
    Document d = searcher.doc(docId);
    System.out.println((i + 1) + ". " + d.get("docno")+" Score: "+ filterScoreDosArray[i].score);
}
try this.

To print the score, I had to call setDefaultFieldSortScoring(true, true).

Related

Highlighter in lucene.net not working for wildcard and fuzzy search

The highlighter using Lucene.Net (3.0.3) is not working for the code below. If I search for the word "dealing", the highlighter shows results, but if I search with the wildcard "deal*" there is no highlighting.
// Click handler: indexes one sample document into the on-disk index, then
// searches it and shows the result (with highlighting) in Label1.
protected void btnIndex_Click(object sender, EventArgs e)
{
    // FIX: a verbatim string literal uses '@'; the original '#' is not valid C#.
    string indexPath = @"D:\temp\LuceneIndex1";
    Lucene.Net.Store.Directory directory = FSDirectory.Open(indexPath);
    Analyzer analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
    IndexWriter writer = new IndexWriter(directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED);

    // Remember how many documents already exist so the new doc gets a fresh id.
    IndexReader red = IndexReader.Open(directory, true);
    int totDocs = red.MaxDoc;
    red.Close();

    // Add one document to the index.
    string text = String.Empty;
    text = "One thing that may be of interest, is that if you are dealing with vast quantites of data you may want to create static Field fields and reuse them rather than creating new one each time you rebuild the index. Obviously for this demo the Lucene index is only created once per application run, but in a production application you may build the index every 5 mins or something like that, in which case I would recommend reusing the Field objects by making static fields that get re-used.";
    int txts = totDocs;
    AddTextToIndex(txts++, text, writer);
    writer.Optimize();
    writer.Dispose();

    // Set up the searcher and run the query from the text box.
    IndexSearcher searcher = new IndexSearcher(directory);
    QueryParser parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, "postBody", analyzer);
    text = txtSearchData.Text;
    Label1.Text = Search(text, searcher, parser, analyzer);

    // Clean up everything.
    searcher.Close();
    directory.Close();
}
// Builds a document with an "id" and a "postBody" field and adds it to the
// given writer. Both fields are stored, analyzed, and carry term vectors
// with positions/offsets (required by FastVectorHighlighter).
private static void AddTextToIndex(int txts, string text, IndexWriter writer)
{
    var document = new Document();
    var idField = new Field("id", txts.ToString(), Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    var bodyField = new Field("postBody", text, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    document.Add(idField);
    document.Add(bodyField);
    writer.AddDocument(document);
}
// Searches the "postBody" field for the given text and returns the last
// hit's score/fields plus the highlighted fragments for it.
// NOTE(review): the caller-supplied searcher/parser perform the search; the
// directory is reopened here only to rewrite the query (expanding
// wildcard/fuzzy terms) before highlighting.
private string Search(string text, IndexSearcher searcher, QueryParser parser, Analyzer analyzer)
{
    // FIX: verbatim string literal uses '@'; the original '#' is not valid C#.
    string indexPath = @"D:\temp\LuceneIndex1";
    Lucene.Net.Store.Directory directory = FSDirectory.Open(indexPath);
    string result = "";
    string snip = "";

    // Fuzzy clause kept for the FastVectorHighlighter FieldQuery below.
    var booleanQuery = new BooleanQuery();
    var fuzzyQuery = new FuzzyQuery(new Term("postBody", text), 0.7f, 3);
    booleanQuery.Add(new BooleanClause(fuzzyQuery, Occur.SHOULD));

    FastVectorHighlighter highlighter = getHighlighter();
    parser.AllowLeadingWildcard = true;
    Query query = parser.Parse(text);
    BooleanQuery.MaxClauseCount = 10;

    // Rewrite once so MultiTermQueries (e.g. "deal*") expand into concrete
    // terms the highlighter can match. FIX: the original rewrote twice,
    // discarded the second result, and leaked both IndexReaders.
    using (IndexReader rewriteReader = IndexReader.Open(directory, true))
    {
        query = query.Rewrite(rewriteReader);
    }

    FieldQuery fieldQuery = highlighter.GetFieldQuery(booleanQuery);

    TopScoreDocCollector collector = TopScoreDocCollector.Create(100, true);
    searcher.Search(query, collector);
    ScoreDoc[] hits = collector.TopDocs().ScoreDocs;
    Console.WriteLine("Found {0} results", hits.Length);

    for (int i = 0; i < hits.Length; i++)
    {
        int docId = hits[i].Doc;
        float score = hits[i].Score;
        Lucene.Net.Documents.Document doc = searcher.Doc(docId);
        // NOTE: result is overwritten each iteration, so only the last hit
        // is reported — behavior preserved from the original.
        result = "Score: " + score.ToString() +
                 " Field: " + doc.Get("id") +
                 " Field2: " + doc.Get("postBody");
        string text1 = doc.Get("postBody");
        string[] hight = getFragmentsWithHighlightedTerms(analyzer, query, "postBody", text1, 5, 100, directory);
        // FIX: the fragments were computed and discarded, so snip was always
        // empty and no highlight ever appeared; join them into the output.
        snip = string.Join(" ... ", hight);
    }
    return result + " :::: " + snip;
}
// Creates a FastVectorHighlighter configured with a simple fragment-list
// builder and a score-ordered fragments builder using the library's default
// colored pre/post tags.
private FastVectorHighlighter getHighlighter()
{
    return new FastVectorHighlighter(
        true,
        true,
        new SimpleFragListBuilder(),
        new ScoreOrderFragmentsBuilder(
            BaseFragmentsBuilder.COLORED_PRE_TAGS,
            BaseFragmentsBuilder.COLORED_POST_TAGS));
}
// Runs the classic Highlighter over the raw field text and returns up to
// 10 highlighted fragments. The query is rewritten against the index first
// so multi-term queries (wildcard/fuzzy/prefix) expand into terms the
// scorer can match.
private static String[] getFragmentsWithHighlightedTerms(Analyzer analyzer, Query query, string fieldName, string fieldContents, int fragmentSize, int maxsize, Lucene.Net.Store.Directory directory)
{
    TokenStream tokenStream = TokenSources.GetTokenStream(fieldName, fieldContents, analyzer);
    Query rewritten = query.Rewrite(IndexReader.Open(directory, true));
    QueryScorer queryScorer = new QueryScorer(rewritten, fieldName);
    queryScorer.IsExpandMultiTermQuery = true;
    Highlighter fragmentHighlighter = new Highlighter(queryScorer)
    {
        TextFragmenter = new SimpleSpanFragmenter(queryScorer, fragmentSize),
        MaxDocCharsToAnalyze = maxsize
    };
    return fragmentHighlighter.GetBestFragments(tokenStream, fieldContents, 10);
}

Same document being returned twice when querying from Lucene Index

I'm new to Lucene. I have some news texts which I am indexing using the fields below:
// Fields added per news article; every field is stored and analyzed
// (TextField.TYPE_STORED).
doc.add(new Field("url", article.getUrl(), TextField.TYPE_STORED));
doc.add(new Field("source", article.getSource(), TextField.TYPE_STORED));
doc.add(new Field("title", article.getArticleTitle(), TextField.TYPE_STORED));
doc.add(new Field("content", article.getArticleContent(), TextField.TYPE_STORED));
// Date is encoded at day resolution so it sorts/ranges as a string.
doc.add(new Field("date", DateTools.dateToString(article.getArticleDate(), DateTools.Resolution.DAY), TextField.TYPE_STORED));
doc.add(new Field("type", article.getType().getName(), TextField.TYPE_STORED));
When I query based on these fields, in some instances the same document is returned twice. I query the index using the following code:
// Build a conjunctive (all-MUST) query from a date range, source, type and
// optional title/content keyword sub-queries, then collect the top hits.
StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);
String contentQueryString = buildQuery(contentKeywords);
Query contentKeywordsQuery = null;
if (!StringUtils.isEmpty(contentQueryString)) {
contentKeywordsQuery = new QueryParser(Version.LUCENE_40, "content", analyzer).parse(contentQueryString);
}
Query titleKeywordsQuery = null;
String titleQueryString = buildQuery(titleKeywords);
if (!StringUtils.isEmpty(titleQueryString)) {
titleKeywordsQuery = new QueryParser(Version.LUCENE_40, "title", analyzer).parse(titleQueryString);
}
// Inclusive day-resolution range on the "date" field.
String sFrom = DateTools.dateToString(from, DateTools.Resolution.DAY);
String sTo = DateTools.dateToString(to, DateTools.Resolution.DAY);
Term lowerTerm = new Term("date", sFrom);
Term upperTerm = new Term("date", sTo);
Query dateQuery = new TermRangeQuery("date", lowerTerm.bytes(), upperTerm.bytes(), true, true);
Term sourceTerm = new Term("source", source);
Query sourceQuery = new TermQuery(sourceTerm);
Term typeTerm = new Term("type", type);
Query typeQuery = new TermQuery(typeTerm);
BooleanQuery q = new BooleanQuery();
q.add(dateQuery, BooleanClause.Occur.MUST);
q.add(sourceQuery, BooleanClause.Occur.MUST);
q.add(typeQuery, BooleanClause.Occur.MUST);
if (null != titleKeywordsQuery) {
q.add(titleKeywordsQuery, BooleanClause.Occur.MUST);
}
if (null != contentKeywordsQuery) {
q.add(contentKeywordsQuery, BooleanClause.Occur.MUST);
}
Directory index = new SimpleFSDirectory(new File("resources/lucene_index"));
int hitsPerPage = 5;
IndexReader reader = DirectoryReader.open(index);
IndexSearcher searcher = new IndexSearcher(reader);
TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);
searcher.search(q, collector);
ScoreDoc[] hits = collector.topDocs().scoreDocs;
for (int i = 0; i < hits.length; ++i) {
int docId = hits[i].doc;
Document d = searcher.doc(docId);
// NOTE(review): a collector cannot return the same docId twice, so
// "duplicate" results most likely mean the index contains the same
// article under two distinct docIds — verify with Luke.
QueryResult r = new QueryResult(d.get("url"), d.get("title"), d.get("date"), hits[i].score);
results.add(r);
}
This line in particular is giving two hits for the same document:
ScoreDoc[] hits = collector.topDocs().scoreDocs;
There shouldn't be any duplicate documents in the index, I've checked it out.

Ensuring the directory is open when using lucene

I'm trying to search an index I've created:
// Open (or create) the index at C:/MyIndex and build one document.
File index = new File("C:/MyIndex");
Directory indexDir = FSDirectory.open(index);
StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, analyzer);
IndexWriter writer = new IndexWriter(indexDir, config);
Document doc = new Document();
doc.add(new Field("My Data", Integer.toString(Id) , Field.Store.YES, Field.Index.NO));
// NOTE(review): the document is never added to the index — there is no
// writer.addDocument(doc) — and the writer is never committed/closed
// before the directory is closed. This is why searches find nothing.
indexDir.close();
Using Luke (the lucene index viewer) I can verify that the index exists and the data I enter is correct. My problem is how to check that the index is open (currently any searches of this index result in no matches):
// Open the index for searching and run the user's query.
File indexDir = new File("C:/CustomerInnovation");
Directory directory = FSDirectory.open(indexDir);
IndexReader reader = IndexReader.open(directory);
IndexSearcher searcher = new IndexSearcher(reader);
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
// NOTE(review): the default field passed here is " " (a single space),
// which is unlikely to be a real field name — confirm the intended field.
QueryParser parser = new QueryParser(Version.LUCENE_36, " ", new StandardAnalyzer(Version.LUCENE_36));
Query query = parser.parse(searchQuery);
log.debug("searchQuery: " + searchQuery);
log.debug("query: " + query.toString());
int hits = 100;
int hitsPerPage = 10;
TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);
searcher.search(query, collector);
int returned = collector.topDocs().totalHits;
log.debug("returned: " + returned);
ScoreDoc[] numHits = collector.topDocs().scoreDocs;
List<Document> results = new ArrayList<Document>();
for (int i = 0; i < numHits.length; i++) {
int docId = numHits[i].doc;
Document d = searcher.doc(docId);
results.add(d);
log.debug(d.get("customername"));
}
log.debug("Found: " + numHits.length);
How do I check that the index has been opened and ready to search? I should mention that these bits of code are in separate classes.
To check if index exists at a specified directory use indexExists method.
IndexReader.indexExists(directory)

In Lucene, why do my boosted and unboosted documents get the same score?

At index time I am boosting certain document in this way:
// Index-time boost applied only to documents matching the condition.
if (myCondition)
{
// NOTE(review): boosts are norm-encoded into a single byte, so 1.2f rounds
// back to 1.0f — see the accepted answer; use e.g. 1.25f instead.
document.SetBoost(1.2f);
}
But at search time documents with all the exact same qualities but some passing and some failing myCondition all end up having the same score.
And here is the search code:
// Search: must have a photo ("y") and must not be authored by a blogger.
BooleanQuery booleanQuery = new BooleanQuery();
booleanQuery.Add(new TermQuery(new Term(FieldNames.HAS_PHOTO, "y")), BooleanClause.Occur.MUST);
booleanQuery.Add(new TermQuery(new Term(FieldNames.AUTHOR_TYPE, AuthorTypes.BLOGGER)), BooleanClause.Occur.MUST_NOT);
// Take the top 10 hits.
indexSearcher.Search(booleanQuery, 10);
Can you tell me what I need to do to get the documents that were boosted to get a higher score?
Many Thanks!
Lucene encodes boosts on a single byte (although a float is generally encoded on four bytes) using the SmallFloat#floatToByte315 method. As a consequence, there can be a big loss in precision when converting back the byte to a float.
In your case SmallFloat.byte315ToFloat(SmallFloat.floatToByte315(1.2f)) returns 1f because 1f and 1.2f are too close to each other. Try using a bigger boost so that your documents get different scores. (For exemple 1.25, SmallFloat.byte315ToFloat(SmallFloat.floatToByte315(1.25f)) gives 1.25f.)
Here is the requested test program that was too long to post in a comment.
// Demo: indexes 10 documents with alternating boosts (1.5 vs 0.1) and
// prints each hit's score explanation so the boost effect is visible.
class Program
{
    static void Main(string[] args)
    {
        RAMDirectory dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer());
        const string FIELD = "name";
        for (int i = 0; i < 10; i++)
        {
            StringBuilder notes = new StringBuilder();
            notes.AppendLine("This is a note 123 - " + i);
            string text = notes.ToString();
            Document doc = new Document();
            var field = new Field(FIELD, text, Field.Store.YES, Field.Index.NOT_ANALYZED);
            // Even docs get a high boost, odd docs a low one; 1.5 and 0.1 are
            // far enough apart to survive Lucene's one-byte norm encoding.
            if (i % 2 == 0)
            {
                field.SetBoost(1.5f);
                doc.SetBoost(1.5f);
            }
            else
            {
                field.SetBoost(0.1f);
                doc.SetBoost(0.1f);
            }
            doc.Add(field);
            writer.AddDocument(doc);
        }
        writer.Commit();
        string TERM = "T";
        IndexSearcher searcher = new IndexSearcher(dir);
        Query query = new PrefixQuery(new Term(FIELD, TERM));
        var hits = searcher.Search(query);
        int count = hits.Length();
        Console.WriteLine("Hits - {0}", count);
        for (int i = 0; i < count; i++)
        {
            var doc = hits.Doc(i);
            Console.WriteLine(doc.ToString());
            // FIX: Explain() takes a document id, not the hit's position in
            // the Hits list — map the hit index to its doc id via hits.Id(i).
            var explain = searcher.Explain(query, hits.Id(i));
            Console.WriteLine(explain.ToString());
        }
    }
}

Exact search with Lucene.Net

I have already seen a few similar questions, but I still don't have an answer. I think I have a simple problem.
In sentence
In this text, only Meta Files are important, and Test Generation.
Anything else is irrelevant
I want to index only Meta Files and Test Generation. That means that I need exact match.
Could someone please explain me how to achieve this?
And here is the code:
// Index a single text fragment, then score each ontology term label by
// how many hits its individual words get against that fragment.
Analyzer analyzer = new StandardAnalyzer();
Lucene.Net.Store.Directory directory = new RAMDirectory();
// FIX: C# is case-sensitive — the type is IndexWriter, not indexWriter.
IndexWriter iwriter = new IndexWriter(directory, analyzer, true);
iwriter.SetMaxFieldLength(10000);
Document doc = new Document();
doc.Add(new Field("textFragment", text, Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.YES));
iwriter.AddDocument(doc);
iwriter.Close();
IndexSearcher isearcher = new IndexSearcher(directory);
QueryParser parser = new QueryParser("textFragment", analyzer);
foreach (DictionaryEntry de in OntologyLayer.OntologyLayer.HashTable)
{
    // Words already scored for this entry, so a word shared by several
    // labels is not counted twice.
    List<string> buffer = new List<string>();
    double weight = 0;
    List<OntologyLayer.Term> list = (List<OntologyLayer.Term>)de.Value;
    foreach (OntologyLayer.Term t in list)
    {
        Hits hits = null;
        string label = t.Label;
        string[] words = label.Split(' ');
        int numOfWords = words.Length;
        // Each word contributes 1/numOfWords so multi-word labels are not
        // favored over single-word ones.
        double wordWeight = 1 / (double)numOfWords;
        double localWeight = 0;
        foreach (string a in words)
        {
            try
            {
                if (!buffer.Contains(a))
                {
                    Lucene.Net.Search.Query query = parser.Parse(a);
                    hits = isearcher.Search(query);
                    if (hits != null && hits.Length() > 0)
                    {
                        localWeight = localWeight + t.Weight * wordWeight * hits.Length();
                    }
                    buffer.Add(a);
                }
            }
            catch (Exception ex)
            {
                // NOTE(review): parse failures on odd label tokens are
                // deliberately ignored (best effort); consider logging ex
                // rather than swallowing it silently.
            }
        }
        weight = weight + localWeight;
    }
    sbWeight.AppendLine(weight.ToString());
    if (weight > 0)
    {
        string objectURI = (string)de.Key;
        conceptList.Add(objectURI);
    }
}
Take a look at Stupid Lucene Tricks: Exact Match, Starts With, Ends With.