BM25 in Lucene 4.9

BM25 in Lucene 4.9 - lucene

I'm struggling with with BM25Similarity class in Lucene (link). All examples provided on the Web refers to older implementation (link). I kindly ask for a pointer how to modify the standard toy example below to include BM25 similarity (create index and perform search).
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import java.io.IOException;
public class HelloLucene {
public static void main(String[] args) throws IOException, ParseException {
// Specify the analyzer for tokenizing text.
// The same analyzer should be used for indexing and searching
StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_4_9);
// Create the index
Directory index = new RAMDirectory();
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_9, analyzer);
IndexWriter w = new IndexWriter(index, config);
addDoc(w, "Lucene in Action", "193398817");
addDoc(w, "Lucene for Dummies", "55320055Z");
addDoc(w, "Managing Gigabytes", "55063554A");
addDoc(w, "The Art of Computer Science", "9900333X");
w.close();
// Query
String querystr = args.length > 0 ? args[0] : "lucene";
// the "title" arg specifies the default field to use
// when no field is explicitly specified in the query.
Query q = new QueryParser(Version.LUCENE_4_9, "title", analyzer).parse(querystr);
// Search
int hitsPerPage = 10;
IndexReader reader = DirectoryReader.open(index);
IndexSearcher searcher = new IndexSearcher(reader);
TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);
searcher.search(q, collector);
ScoreDoc[] hits = collector.topDocs().scoreDocs;
// Display results
System.out.println("Found " + hits.length + " hits.");
for(int i=0;i<hits.length;++i) {
int docId = hits[i].doc;
Document d = searcher.doc(docId);
System.out.println((i + 1) + ". " + d.get("isbn") + "\t" + d.get("title"));
}
reader.close();
}
private static void addDoc(IndexWriter w, String title, String isbn) throws IOException {
Document doc = new Document();
doc.add(new TextField("title", title, Field.Store.YES));
// use a string field for isbn because we don't want it tokenized
doc.add(new StringField("isbn", isbn, Field.Store.YES));
w.addDocument(doc);
}
}

You just need to set the similarity in IndexSearcher:
searcher.setSimilarity(new BM25Similarity(1.2, 0.75));
And IndexWriterConfig:
config.setSimilarity(new BM25Similarity(1.2, 0.75));

Related

Is there a way to get the word-count from a Lucene index?

I'm using the latest Apache-Lucene version 8.1.1
Is it possible and how can I get the word-count of all the (non stop words) terms stored in a Lucene index? The result should be:
term1 453443
term2 445484
term3 443333
etc.
I need this in Java or Scala but any language will be fine to illustrate the API for it ...

You can find an example implementation below.
Note that count gives number of documents not number of occurences (lucene word count 4, document count 3). Also stop words are not omitted.
import java.io.IOException;
import java.nio.file.Paths;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.misc.HighFreqTerms;
import org.apache.lucene.misc.TermStats;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
public class LuceneTest2 {
final static String index = "index";
final static String field = "text";
public static void index() {
try {
Directory dir = FSDirectory.open(Paths.get(index));
Analyzer analyzer = new StandardAnalyzer();
IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
iwc.setOpenMode(OpenMode.CREATE);
IndexWriter writer = new IndexWriter(dir, iwc);
String[] lines = {
"lucene java lucene mark",
"lucene",
"lucene an example",
"java python"
};
for (int i = 0; i < lines.length; i++) {
String line = lines[i];
Document doc = new Document();
doc.add(new StringField("id", "" + i, Field.Store.YES));
doc.add(new TextField(field, line.trim(), Field.Store.YES));
writer.addDocument(doc);
}
System.out.println("indexed " + lines.length + " sentences");
writer.close();
} catch (IOException e) {
System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
}
}
public static void count() {
try {
IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(index)));
int numTerms = 100;
TermStats[] stats = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.DocFreqComparator());
for (TermStats termStats : stats) {
String termText = termStats.termtext.utf8ToString();
System.out.println(termText + " " + termStats.docFreq);
}
reader.close();
} catch (Exception e) {
System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
}
}
public static void main(String[] args) {
index();
count();
}
}
This outputs:
lucene 3
java 2
python 1
mark 1
example 1
an 1

Lucene Document and Java ArrayList

I have a set of data: ID, Name and a ArrayList associated with the ID. I wanted this data to be stored in the Lucene document. The search will be based on the Id and name. The List should not be indexed.
I do not know how a List/ArrayList can be stored in the Lucene Document. What is best way of doing this?
Apache Lucene verison I am using is 7.1.0.
Thanks.
Document doc = new Document();
String id = "something";
String name = "someName";
List someList = new ArrayList();
doc.add(new StringField("Id", id, Field.Store.YES));
doc.add(new TextField("name", name, Field.Store.YES));
How do I do something similar for the 'someList'?

You can use StoredField.
A simple aproach might be serializing the list. Here is a sample code:
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
public class LuceneTest {
public static byte[] serialize(Object obj) throws IOException {
ByteArrayOutputStream bos = new ByteArrayOutputStream(200000);
ObjectOutputStream os = new ObjectOutputStream(bos);
os.writeObject(obj);
os.close();
return bos.toByteArray();
}
public static Object unserialize(byte[] bytes) throws ClassNotFoundException, IOException {
ObjectInputStream is = new ObjectInputStream(new ByteArrayInputStream(bytes));
Object result = is.readObject();
is.close();
return result;
}
public static void index() {
String indexPath = "index";
try {
System.out.println("Indexing to directory '" + indexPath + "'...");
Directory dir = FSDirectory.open(Paths.get(indexPath));
Analyzer analyzer = new StandardAnalyzer();
IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
iwc.setOpenMode(OpenMode.CREATE);
IndexWriter writer = new IndexWriter(dir, iwc);
for (int i = 0; i < 100; i++) {
Document doc = new Document();
String id = "id" + i;
String name = "jack " + i;
ArrayList<Object> someList = new ArrayList<>();
someList.add(id + " => " + name);
someList.add("index" + i);
doc.add(new StringField("Id", id, Field.Store.YES));
doc.add(new TextField("name", name, Field.Store.YES));
doc.add(new StoredField("list", serialize(someList)));
writer.addDocument(doc);
}
writer.close();
} catch (IOException e) {
System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
}
}
public static void search() {
String index = "index";
String field = "name";
int hitsPerPage = 10;
try {
IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(index)));
IndexSearcher searcher = new IndexSearcher(reader);
Analyzer analyzer = new StandardAnalyzer();
QueryParser parser = new QueryParser(field, analyzer);
Random rnd = new Random();
for (int i = 0; i < 10; i++) {
int randIndex = rnd.nextInt(100);
String nameQuery = "" + randIndex;
Query query = parser.parse(nameQuery);
System.out.println("Searching for: " + query.toString(field));
TopDocs results = searcher.search(query, hitsPerPage);
ScoreDoc[] hits = results.scoreDocs;
for (ScoreDoc scoreDoc : hits) {
Document doc = searcher.doc(scoreDoc.doc);
String id = doc.get("Id");
String name = doc.get("name");
ArrayList<Object> list = (ArrayList<Object>) unserialize(doc.getBinaryValue("list").bytes);
System.out.println("id: " + id + " name: " + name + " list: " + list);
}
System.out.println();
}
reader.close();
} catch (Exception e) {
System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
}
}
public static void main(String[] args) {
index();
search();
}
}

Empty Query - More Like This (Lucene)

I'm new at Java and Lucene.
I'm trying a simple MLT test, but I'm not getting any results.
import java.io.File;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;
import org.apache.lucene.index.DirectoryReader;
import java.io.IOException;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.queries.mlt.MoreLikeThis;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.SimpleFSDirectory;
public class HelloLucene {
public static void main(String[] args) throws IOException, ParseException {
StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);
String indexPath = "C:/Users/Name/Desktop/Lucene";
Directory index = new SimpleFSDirectory(new File(indexPath));
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_40, analyzer);
IndexWriter w = new IndexWriter(index, config);
addDoc(w, "Lucene in Action", "193398817");
addDoc(w, "Lucene for Dummies", "55320055Z");
addDoc(w, "Managing Gigabytes", "55063554A");
addDoc(w, "The Art of Computer Science", "9900333X");
w.close();
IndexReader reader = DirectoryReader.open(index);
System.out.println("Reader has " + reader.numDocs() + " docs on it.");
MoreLikeThis mlt = new MoreLikeThis(reader);
mlt.setMinTermFreq(1);
mlt.setMinDocFreq(1);
Query query = mlt.like(0); //doc ID
IndexSearcher searcher = new IndexSearcher(reader);
TopDocs topDocs = searcher.search(query,5);
System.out.println("Found " + topDocs.totalHits + " hits.");
for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
Document doc = searcher.doc(scoreDoc.doc);
System.out.println(scoreDoc.doc + ". " + doc.get("isbn") + "\t" + doc.get("title"));
}
reader.close();
}
private static void addDoc(IndexWriter w, String title, String isbn) throws IOException {
Document doc = new Document();
doc.add(new TextField("title", title, Field.Store.YES));
// use a string field for isbn because we don't want it tokenized
doc.add(new StringField("isbn", isbn, Field.Store.YES));
w.addDocument(doc);
}
}
And this is what I get:
Reader has 4 docs on it.
Found 0 hits.
I tried using Luke to check the Doc's ID and there is nothing wrong apparently.
I even did some tests there, I dont know whats wrong :(
Did a lot of searching on the web, someone said something about the settings as MinTermFreq and MinDocFreq, I tried 1 and 0, but got nothing.
Does anyone have an idea?
Thanks in advance!
[SOLVED] Edited:
its working now!
I just had to add this:
mlt.setAnalyzer(analyzer);
mlt.setFieldNames(new String[] {"title"});

its working now!
I just had to add this:
mlt.setAnalyzer(analyzer);
mlt.setFieldNames(new String[] {"title"});

Lucene Highlighter with stemming analyzer

I am using Lucene's Highlighter class to highlight fragments of matched search results and it works well. I would like to switch from searching with the StandardAnalyzer to the EnglishAnalyzer, which will perform stemming of terms.
The search results are good, but now the highlighter doesn't always find a match. Here's an example of what I'm looking at:
document field text 1: Everyone likes goats.
document field text 2: I have a goat that eats everything.
Using the EnglishAnalyzer and searching for "goat", both documents are matched, but the highlighter is only able to find a matched fragment from document 2. Is there a way to have the highlighter return data for both documents?
I understand that the characters are different for the tokens, but the same tokens are still there, so it seems reasonable for it to just highlight whatever token is present at that location.
If it helps, this is using Lucene 3.5.

I found a solution to this problem. I changed from using the Highlighter class to using the FastVectorHighlighter. It looks like I'll pick up some speed improvements too (at the expense of storage of term vector data). For the benefit of anyone coming across this question later, here's a unit test showing how this all works together:
package com.sample.index;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.vectorhighlight.*;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.junit.Before;
import org.junit.Test;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import static junit.framework.Assert.assertEquals;
public class TestIndexStuff {
public static final String FIELD_NORMAL = "normal";
public static final String[] PRE_TAGS = new String[]{"["};
public static final String[] POST_TAGS = new String[]{"]"};
private IndexSearcher searcher;
private Analyzer analyzer = new EnglishAnalyzer(Version.LUCENE_35);
#Before
public void init() throws IOException {
RAMDirectory idx = new RAMDirectory();
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_35, analyzer);
IndexWriter writer = new IndexWriter(idx, config);
addDocs(writer);
writer.close();
searcher = new IndexSearcher(IndexReader.open(idx));
}
private void addDocs(IndexWriter writer) throws IOException {
for (String text : new String[] {
"Pretty much everyone likes goats.",
"I have a goat that eats everything.",
"goats goats goats goats goats"}) {
Document doc = new Document();
doc.add(new Field(FIELD_NORMAL, text, Field.Store.YES,
Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
writer.addDocument(doc);
}
}
private FastVectorHighlighter makeHighlighter() {
FragListBuilder fragListBuilder = new SimpleFragListBuilder(200);
FragmentsBuilder fragmentBuilder = new SimpleFragmentsBuilder(PRE_TAGS, POST_TAGS);
return new FastVectorHighlighter(true, true, fragListBuilder, fragmentBuilder);
}
#Test
public void highlight() throws ParseException, IOException {
Query query = new QueryParser(Version.LUCENE_35, FIELD_NORMAL, analyzer)
.parse("goat");
FastVectorHighlighter highlighter = makeHighlighter();
FieldQuery fieldQuery = highlighter.getFieldQuery(query);
TopDocs topDocs = searcher.search(query, 10);
List<String> fragments = new ArrayList<String>();
for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
fragments.add(highlighter.getBestFragment(fieldQuery, searcher.getIndexReader(),
scoreDoc.doc, FIELD_NORMAL, 10000));
}
assertEquals(3, fragments.size());
assertEquals("[goats] [goats] [goats] [goats] [goats]", fragments.get(0).trim());
assertEquals("Pretty much everyone likes [goats].", fragments.get(1).trim());
assertEquals("I have a [goat] that eats everything.", fragments.get(2).trim());
}
}

Lucene ,highlighting and NullPointerException

I am trying to highlight some results . I index the body (the text) of my documents in the field "contents" and when I try to highilight using highlighter.getBestFragment(...) I get a NullPointerException .
But when,for exemple I try to highlight the fileName it works properly.
I know since I use only one field with the fileReader or (ParsingReader) my text is tokenized which is different from a file name .
Here's my code ,please help me .
package xxxxxx;
import java.io.File;
import java.io.FileFilter;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.apache.tika.parser.ParsingReader;
public class Indexer {
static long start = 0;
public static void main(String[] args) throws Exception {
System.out.println("l'index se trouve Ã  " + args[0]);
System.out.println("le dossier ou s'effectue l'indexation est :" + args[1]);
if (args.length != 2) {
throw new IllegalArgumentException("Usage: java " + Indexer.class.getName()
+ " <index dir> <data dir>");
}
String indexDir = args[0];
String dataDir = args[1];
start = System.currentTimeMillis();
Indexer indexer = new Indexer(indexDir);
int numIndexed;
try {
numIndexed = indexer.index(dataDir, new TextFilesFilter());
} finally {
indexer.close();
}
long end = System.currentTimeMillis();
System.out.println("Indexing " + numIndexed + " files took "
+ (end - start) + " milliseconds");
}
private IndexWriter writer;
public Indexer(String indexDir) throws IOException, InterruptedException {
Directory dir = FSDirectory.open(new File(indexDir));
writer = new IndexWriter(dir, new StandardAnalyzer(Version.LUCENE_30), true,
IndexWriter.MaxFieldLength.UNLIMITED);
writer.setUseCompoundFile(true);
}
public void close() throws IOException {
writer.optimize();
writer.close();
}
public int index(String dataDir, FileFilter filter) throws Exception {
File[] files = new File(dataDir).listFiles();
for (File f : files) {
if (!f.isDirectory() && !f.isHidden() && f.exists() && f.canRead() && (filter == null || filter.accept(f))) {
if (!(f.getCanonicalPath().endsWith("~"))) {
indexFile(f);
}
} else {
index(f.toString(), filter);
}
}
return writer.numDocs();
}
private static class TextFilesFilter implements FileFilter {
public boolean accept(File path) {
return true;
}
}
protected Document getDocument(File f) throws Exception {
// FileReader frf = new FileReader(f);
Document doc = new Document();
Reader reader = new ParsingReader(f);
doc.add(new Field("contents", reader, Field.TermVector.WITH_POSITIONS_OFFSETS));
doc.add(new Field("filename", f.getName(), Field.Store.YES, Field.Index.ANALYZED ));
doc.add(new Field("fullpath", f.getCanonicalPath(),Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
return doc;
}
private void indexFile(File f) throws Exception {
System.out.println("Indexing " + f.getCanonicalPath());
Document doc = getDocument(f);
writer.addDocument(doc);
System.out.println(System.currentTimeMillis() - start);
}
}
-------------------------------------------------------------------
package xxxxxxxxxxxxxxxxxxxx;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
public class Searcher {
public static void main(String[] args) throws IllegalArgumentException,
IOException, ParseException, InvalidTokenOffsetsException {
System.out.println("endroit ou se situe l'index " + args[0]);
System.out.println(args[1]);
if (args.length != 2) {
throw new IllegalArgumentException("Usage: java "
+ Searcher.class.getName()
+ " <index dir> <query>");
}
String indexDir = args[0];
String q = args[1];
search(indexDir, q);
}
public static void search(String indexDir, String q) throws IOException, ParseException, InvalidTokenOffsetsException {
Directory dir = FSDirectory.open(new File(indexDir));
IndexSearcher indexSearcher = new IndexSearcher(dir);
QueryParser parserC = new QueryParser(Version.LUCENE_30, "contents", new StandardAnalyzer(Version.LUCENE_30));
// QueryParser parserN = new QueryParser(Version.LUCENE_30, "filename", new StandardAnalyzer(Version.LUCENE_30));
QueryParser parserP = new QueryParser(Version.LUCENE_30, "fullpath", new StandardAnalyzer(Version.LUCENE_30));
parserC.setDefaultOperator(QueryParser.Operator.OR);
// parserN.setDefaultOperator(QueryParser.Operator.OR);
parserC.setPhraseSlop(10);
// parserN.setPhraseSlop(10);
DisjunctionMaxQuery dmq = new DisjunctionMaxQuery(6);
Query query = new MultiFieldQueryParser(Version.LUCENE_30, new String[]{"contents", "filename"},
new CustomAnalyzer()).parse(q);
Query queryC = parserC.parse(q);
//Query queryN = parserN.parse(q);
dmq.add(queryC);
//dmq.add(queryN);
// dmq.add(query) ;
QueryScorer scorer = new QueryScorer(dmq, "contents");
Highlighter highlighter = new Highlighter(scorer);
highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));
System.out.println(query.toString());
long start = System.currentTimeMillis();
TopDocs hits = indexSearcher.search(dmq, 15);
System.out.println(hits.totalHits);
long end = System.currentTimeMillis();
System.err.println("Found " + hits.totalHits
+ " document(s) (in " + (end - start)
+ " milliseconds) that matched query '"
+ q + "':");
for (ScoreDoc scoreDoc : hits.scoreDocs) {
Document doc = indexSearcher.doc(scoreDoc.doc);
System.out.print(scoreDoc.score);
System.out.println(doc.get("fullpath"));
String contents = doc.get("contents"); // I am pretty sure the mistake is here , contents is always Null
//But what can I do to make this thing work ?
TokenStream stream =
TokenSources.getAnyTokenStream(indexSearcher.getIndexReader(),
scoreDoc.doc,
"contents",
doc,
new StandardAnalyzer(Version.LUCENE_30));
String fragment =
highlighter.getBestFragment(stream, contents);
System.out.println(fragment);
}
indexSearcher.close();
}
}
----------------------------------------------------------------------

You need it to be stored if you want to use that highlighter. "filename" is stored but "contents" isn't, which is why you see them behaving differently:
doc.add(new Field("contents", reader, Field.TermVector.WITH_POSITIONS_OFFSETS));
doc.add(new Field("filename", f.getName(), Field.Store.YES, Field.Index.ANALYZED ));

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

BM25 in Lucene 4.9 - lucene

You just need to set the similarity in IndexSearcher: searcher.setSimilarity(new BM25Similarity(1.2, 0.75)); And IndexWriterConfig: config.setSimilarity(new BM25Similarity(1.2, 0.75));

Related

Is there a way to get the word-count from a Lucene index?

Lucene Document and Java ArrayList

Empty Query - More Like This (Lucene)

Lucene Highlighter with stemming analyzer

Lucene ,highlighting and NullPointerException

Categories

Resources