I'm trying to search an index I've created:
File index = new File("C:/MyIndex");
Directory indexDir = FSDirectory.open(index);
StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, analyzer);
IndexWriter writer = new IndexWriter(indexDir, config);
Document doc = new Document();
doc.add(new Field("My Data", Integer.toString(Id) , Field.Store.YES, Field.Index.NO));
indexDir.close();
Using Luke (the Lucene index viewer) I can verify that the index exists and that the data I entered is correct. My problem is how to check that the index is open (currently any search against this index returns no matches):
File indexDir = new File("C:/CustomerInnovation");
Directory directory = FSDirectory.open(indexDir);
IndexReader reader = IndexReader.open(directory);
IndexSearcher searcher = new IndexSearcher(reader);
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
QueryParser parser = new QueryParser(Version.LUCENE_36, " ", new StandardAnalyzer(Version.LUCENE_36));
Query query = parser.parse(searchQuery);
log.debug("searchQuery: " + searchQuery);
log.debug("query: " + query.toString());
int hits = 100;
int hitsPerPage = 10;
TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);
searcher.search(query, collector);
int returned = collector.topDocs().totalHits;
log.debug("returned: " + returned);
ScoreDoc[] numHits = collector.topDocs().scoreDocs;
List<Document> results = new ArrayList<Document>();
for (int i = 0; i < numHits.length; i++) {
int docId = numHits[i].doc;
Document d = searcher.doc(docId);
results.add(d);
log.debug(d.get("customername"));
}
log.debug("Found: " + numHits.length);
How do I check that the index has been opened and is ready to search? I should mention that these bits of code are in separate classes.
To check whether an index exists in a specified directory, use the indexExists method:
IndexReader.indexExists(directory)
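For example, a minimal sketch of how that check can be wired in before searching (assuming the same Lucene 3.6 classes and the C:/MyIndex path from the question):
File indexDir = new File("C:/MyIndex");
Directory directory = FSDirectory.open(indexDir);
// true only once a commit (segments file) has been written to the directory
if (IndexReader.indexExists(directory)) {
    IndexReader reader = IndexReader.open(directory);
    IndexSearcher searcher = new IndexSearcher(reader);
    // ... run the query here ...
    searcher.close();
    reader.close();
} else {
    // no committed index in this directory yet
}
directory.close();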
The highlighter in Lucene.Net 3.0.3 is not working for the code below. If I search for the word "dealing" the highlighter works, but if I search with a wildcard such as "deal*" there is no highlighting.
protected void btnIndex_Click(object sender, EventArgs e)
{
string indexPath = @"D:\temp\LuceneIndex1";
Lucene.Net.Store.Directory directory = FSDirectory.Open(indexPath);
Analyzer analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
IndexWriter writer = new IndexWriter(directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED);
IndexReader red = IndexReader.Open(directory, true);
int totDocs = red.MaxDoc;
red.Close();
//Add documents to the index
string text = String.Empty;
text = "One thing that may be of interest, is that if you are dealing with vast quantites of data you may want to create static Field fields and reuse them rather than creating new one each time you rebuild the index. Obviously for this demo the Lucene index is only created once per application run, but in a production application you may build the index every 5 mins or something like that, in which case I would recommend reusing the Field objects by making static fields that get re-used.";
int txts = totDocs;
AddTextToIndex(txts++, text, writer);
writer.Optimize();
writer.Dispose();
//Setup searcher
IndexSearcher searcher = new IndexSearcher(directory);
QueryParser parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, "postBody", analyzer);
text = txtSearchData.Text;
Label1.Text = Search(text, searcher, parser, analyzer);
//Clean up everything
searcher.Close();
directory.Close();
}
private static void AddTextToIndex(int txts, string text, IndexWriter writer)
{
Document doc = new Document();
doc.Add(new Field("id", txts.ToString(), Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
doc.Add(new Field("postBody", text, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
writer.AddDocument(doc);
}
private string Search(string text, IndexSearcher searcher, QueryParser parser, Analyzer analyzer)
{
string indexPath = @"D:\temp\LuceneIndex1";
Lucene.Net.Store.Directory directory = FSDirectory.Open(indexPath);
string result = "";
string snip = "";
var booleanQuery = new BooleanQuery();
var fuzzyQuery = new FuzzyQuery(new Term("postBody", text), 0.7f, 3);
booleanQuery.Add(new BooleanClause(fuzzyQuery, Occur.SHOULD));
//Supply conditions
Query query = parser.Parse(text);
FastVectorHighlighter highlighter = getHighlighter();
parser.AllowLeadingWildcard = true;
query = parser.Parse(text);
BooleanQuery.MaxClauseCount = 10;
query = query.Rewrite(IndexReader.Open(directory, true));
query.Rewrite(IndexReader.Open(directory, true));
FieldQuery fieldQuery = highlighter.GetFieldQuery(booleanQuery);
TopScoreDocCollector collector = TopScoreDocCollector.Create(100, true);
searcher.Search(query, collector);
ScoreDoc[] hits = collector.TopDocs().ScoreDocs;
int results = hits.Length;
Console.WriteLine("Found {0} results", results);
for (int i = 0; i < hits.Length; i++)
{
int docId = hits[i].Doc;
float score = hits[i].Score;
Lucene.Net.Documents.Document doc = searcher.Doc(docId);
result = "Score: " + score.ToString() +
" Field: " + doc.Get("id") +
" Field2: " + doc.Get("postBody");
string text1 = doc.Get("postBody");
string[] hight = getFragmentsWithHighlightedTerms(analyzer, query, "postBody", text1, 5, 100, directory);
}
return result + " :::: " + snip;
}
private FastVectorHighlighter getHighlighter()
{
FragListBuilder fragListBuilder = new SimpleFragListBuilder();
FragmentsBuilder fragmentsBuilder = new ScoreOrderFragmentsBuilder(
BaseFragmentsBuilder.COLORED_PRE_TAGS,
BaseFragmentsBuilder.COLORED_POST_TAGS);
return new FastVectorHighlighter(true, true, fragListBuilder,
fragmentsBuilder);
}
private static String[] getFragmentsWithHighlightedTerms(Analyzer analyzer, Query query, string fieldName, string fieldContents, int fragmentSize, int maxsize, Lucene.Net.Store.Directory directory)
{
TokenStream stream = TokenSources.GetTokenStream(fieldName, fieldContents, analyzer);
// SpanScorer scorer = new SpanScorer();//(query, fieldName, new CachingTokenFilter(stream));
query = query.Rewrite(IndexReader.Open(directory, true));
QueryScorer scorer = new QueryScorer(query, fieldName);
scorer.IsExpandMultiTermQuery = true;
SimpleSpanFragmenter fragmenter = new SimpleSpanFragmenter(scorer, fragmentSize);
Highlighter highlighter = new Highlighter(scorer);
highlighter.TextFragmenter = fragmenter;
highlighter.MaxDocCharsToAnalyze = maxsize;
String[] fragments = highlighter.GetBestFragments(stream, fieldContents, 10);
return fragments;
}
The search method of IndexSearcher in Lucene is not returning any results; the number of documents returned by the query is always 0. I built the index using the following code:
void buildIndex(File indexDir, File trainDir, HashMap<String,Integer> dictionary)
throws IOException, FileNotFoundException {
Directory fsDir = FSDirectory.open(indexDir);
IndexWriterConfig iwConf
= new IndexWriterConfig(VERSION,mAnalyzer);
iwConf.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
IndexWriter indexWriter
= new IndexWriter(fsDir,iwConf);
File file = trainDir;
String csvFilename = "/home/serene/Downloads/IndustryClassification/Train/Training.csv";
CSVReader csvReader = new CSVReader(new FileReader(csvFilename),'\t');
String[] row = null;
while((row = csvReader.readNext()) != null) {
Document d = new Document();
String companyname = row[1];
String NAICSID = row[2];
//System.out.println(NAICSID);
String description = row[4];
d.add(new TextField("company",companyname,Store.YES));
d.add(new StringField("category",NAICSID,Store.YES));
dictionary.put(NAICSID, 1);
d.add(new TextField("description", description, Store.NO));
//System.out.println(d.toString());
indexWriter.addDocument(d);
}
csvReader.close();
int numDocs = indexWriter.numDocs();
indexWriter.forceMerge(1);
indexWriter.commit();
indexWriter.close();
System.out.println("index=" + indexDir.getName());
System.out.println("num docs=" + numDocs);
}
When I try to get the output for a test query using the following code, I get no categories back: scoreDocs.length is always 0, so the code inside the for loop is never executed.
void testIndex(File indexDir, File testDir, Set<String>NEWSGROUPS)
throws IOException, FileNotFoundException, ParseException {
Directory fsDir = FSDirectory.open(indexDir);
DirectoryReader reader = DirectoryReader.open(fsDir);
IndexSearcher searcher = new IndexSearcher(reader);
Analyzer analyzer = new StandardAnalyzer(VERSION);
System.out.print("inside testIndex");
int[][] confusionMatrix
= new int[NEWSGROUPS.size()][NEWSGROUPS.size()];
String csvFilename = "/home/serene/Downloads/IndustryClassification/Test/Test.csv";
CSVReader csvReader = new CSVReader(new FileReader(csvFilename), '\t');
String[] row = null;
while((row = csvReader.readNext()) != null) {
String companyname = row[1];
String NAICSID = row[2];
String description = row[4];
Query query = new QueryParser(Version.LUCENE_44,"contents",analyzer).parse(QueryParser.escape(description));
System.out.print(query +"\n");
TopDocs hits = searcher.search(query,3);
ScoreDoc[] scoreDocs = hits.scoreDocs;
System.out.println(hits.totalHits);
for (int n = 0; n < scoreDocs.length; ++n) {
ScoreDoc sd = scoreDocs[n];
int docId = sd.doc;
Document d = searcher.doc(docId);
String category = d.get("category");
System.out.println(category);
}
}
csvReader.close();
}
Replace "contents" with one of the fields you actually indexed (company, category, or description).
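For example, a minimal sketch of the corrected line (same Lucene 4.4 classes and variables as in the question), targeting the description field that buildIndex() actually created:
// "description" (or "company") was indexed; there is no "contents" field
Query query = new QueryParser(Version.LUCENE_44, "description", analyzer)
        .parse(QueryParser.escape(description));
TopDocs hits = searcher.search(query, 3);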
I'm new to Lucene. I have some news texts which I am indexing using the fields below:
doc.add(new Field("url", article.getUrl(), TextField.TYPE_STORED));
doc.add(new Field("source", article.getSource(), TextField.TYPE_STORED));
doc.add(new Field("title", article.getArticleTitle(), TextField.TYPE_STORED));
doc.add(new Field("content", article.getArticleContent(), TextField.TYPE_STORED));
doc.add(new Field("date", DateTools.dateToString(article.getArticleDate(), DateTools.Resolution.DAY), TextField.TYPE_STORED));
doc.add(new Field("type", article.getType().getName(), TextField.TYPE_STORED));
When I run a query against these fields, in some instances the same document is returned twice. I query the index using the following code:
StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);
String contentQueryString = buildQuery(contentKeywords);
Query contentKeywordsQuery = null;
if (!StringUtils.isEmpty(contentQueryString)) {
contentKeywordsQuery = new QueryParser(Version.LUCENE_40, "content", analyzer).parse(contentQueryString);
}
Query titleKeywordsQuery = null;
String titleQueryString = buildQuery(titleKeywords);
if (!StringUtils.isEmpty(titleQueryString)) {
titleKeywordsQuery = new QueryParser(Version.LUCENE_40, "title", analyzer).parse(titleQueryString);
}
String sFrom = DateTools.dateToString(from, DateTools.Resolution.DAY);
String sTo = DateTools.dateToString(to, DateTools.Resolution.DAY);
Term lowerTerm = new Term("date", sFrom);
Term upperTerm = new Term("date", sTo);
Query dateQuery = new TermRangeQuery("date", lowerTerm.bytes(), upperTerm.bytes(), true, true);
Term sourceTerm = new Term("source", source);
Query sourceQuery = new TermQuery(sourceTerm);
Term typeTerm = new Term("type", type);
Query typeQuery = new TermQuery(typeTerm);
BooleanQuery q = new BooleanQuery();
q.add(dateQuery, BooleanClause.Occur.MUST);
q.add(sourceQuery, BooleanClause.Occur.MUST);
q.add(typeQuery, BooleanClause.Occur.MUST);
if (null != titleKeywordsQuery) {
q.add(titleKeywordsQuery, BooleanClause.Occur.MUST);
}
if (null != contentKeywordsQuery) {
q.add(contentKeywordsQuery, BooleanClause.Occur.MUST);
}
Directory index = new SimpleFSDirectory(new File("resources/lucene_index"));
int hitsPerPage = 5;
IndexReader reader = DirectoryReader.open(index);
IndexSearcher searcher = new IndexSearcher(reader);
TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);
searcher.search(q, collector);
ScoreDoc[] hits = collector.topDocs().scoreDocs;
for (int i = 0; i < hits.length; ++i) {
int docId = hits[i].doc;
Document d = searcher.doc(docId);
QueryResult r = new QueryResult(d.get("url"), d.get("title"), d.get("date"), hits[i].score);
results.add(r);
}
This line in particular is giving two hits for the same document:
ScoreDoc[] hits = collector.topDocs().scoreDocs;
There shouldn't be any duplicate documents in the index; I've checked.
I want to output the score of documents. The code I wrote for this is:
IndexReader reader = IndexReader.open(FSDirectory.open(indexDir));
IndexSearcher searcher = new IndexSearcher(reader);
Analyzer analyzer = new IKAnalyzer();
QueryParser parser = new QueryParser(Version.LUCENE_31, "title",
analyzer);
Query q = null;
q = parser.parse("MacOS");
TopDocs docs = searcher.search(q, 10);
ScoreDoc[] hits = docs.scoreDocs;
for(int i=0;i<hits.length;++i){
System.out.println(hits[i].score);
}
But the output is NaN. I want to know how to get the score of each document.
In addition to daulet's answer, you have to enable scoring in the IndexSearcher:
...
searcher.setDefaultFieldSortScoring(true, true);
...
I think that's what you meant, remy, but this way it should be clearer :)
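Put together, a minimal end-to-end sketch of where that call goes (reusing the Lucene 3.x setup and the IKAnalyzer from the question):
IndexReader reader = IndexReader.open(FSDirectory.open(indexDir));
IndexSearcher searcher = new IndexSearcher(reader);
// enable score (and max-score) tracking for field-sorted searches
searcher.setDefaultFieldSortScoring(true, true);
Analyzer analyzer = new IKAnalyzer();
Query q = new QueryParser(Version.LUCENE_31, "title", analyzer).parse("MacOS");
TopDocs docs = searcher.search(q, 10);
for (ScoreDoc hit : docs.scoreDocs) {
    System.out.println(hit.score);
}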
IndexReader reader = IndexReader.open(FSDirectory.open(indexDir));
IndexSearcher searcher = new IndexSearcher(reader);
Analyzer analyzer = new IKAnalyzer();
QueryParser parser = new QueryParser(Version.LUCENE_31, "title", analyzer);
Query q = null;
q = parser.parse("MacOS");
TopDocs docs = searcher.search(q, 10);
ScoreDoc[] filterScoreDosArray = docs.scoreDocs;
for (int i = 0; i < filterScoreDosArray.length; ++i) {
int docId = filterScoreDosArray[i].doc;
Document d = searcher.doc(docId);
System.out.println((i + 1) + ". " + d.get("docno")+" Score: "+ filterScoreDosArray[i].score);
}
try this.
To print the score, I should call setDefaultFieldSortScoring(true, true).
I have already seen a few similar questions, but I still don't have an answer. I think I have a simple problem.
In the sentence
In this text, only Meta Files are important, and Test Generation.
Anything else is irrelevant
I want to index only "Meta Files" and "Test Generation". That means I need an exact match.
Could someone please explain how to achieve this?
And here is the code:
Analyzer analyzer = new StandardAnalyzer();
Lucene.Net.Store.Directory directory = new RAMDirectory();
IndexWriter iwriter = new IndexWriter(directory, analyzer, true);
iwriter.SetMaxFieldLength(10000);
Document doc = new Document();
doc.Add(new Field("textFragment", text, Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.YES));
iwriter.AddDocument(doc);
iwriter.Close();
IndexSearcher isearcher = new IndexSearcher(directory);
QueryParser parser = new QueryParser("textFragment", analyzer);
foreach (DictionaryEntry de in OntologyLayer.OntologyLayer.HashTable)
{
List<string> buffer = new List<string>();
double weight = 0;
List<OntologyLayer.Term> list = (List<OntologyLayer.Term>)de.Value;
foreach (OntologyLayer.Term t in list)
{
Hits hits = null;
string label = t.Label;
string[] words = label.Split(' ');
int numOfWords = words.Length;
double wordWeight = 1 / (double)numOfWords;
double localWeight = 0;
foreach (string a in words)
{
try
{
if (!buffer.Contains(a))
{
Lucene.Net.Search.Query query = parser.Parse(a);
hits = isearcher.Search(query);
if (hits != null && hits.Length() > 0)
{
localWeight = localWeight + t.Weight * wordWeight * hits.Length();
}
buffer.Add(a);
}
}
catch (Exception ex)
{}
}
weight = weight + localWeight;
}
sbWeight.AppendLine(weight.ToString());
if (weight > 0)
{
string objectURI = (string)de.Key;
conceptList.Add(objectURI);
}
}
Take a look at Stupid Lucene Tricks: Exact Match, Starts With, Ends With.