I have updated the Indexer and Searcher examples from the Lucene in Action 2nd edition book.
Indexer works OK, but Searcher does not.
I have indexed a bunch of txt files (Indexer filters txt files).
When I search with the Searcher class for a word I'm sure the txt files contain (this can be verified with grep), it finds 0 matching documents.
There must be a problem with the code.
Here are the files:
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.learning</groupId>
<artifactId>lucenebook</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>6.6.0</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queryparser</artifactId>
<version>6.6.0</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
</plugins>
</build>
</project>
Indexer:
package lia.meetlucene;
/**
* Copyright Manning Publications Co.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and limitations under the License.
*/
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.io.FileReader;
import java.nio.file.Paths;
// From chapter 1
/**
* This code was originally written for
* Erik's Lucene intro java.net article
*/
public class Indexer {
public static void main(String[] args) throws Exception {
if (args.length != 2) {
throw new IllegalArgumentException("Usage: java " + Indexer.class.getName()
+ " <index dir> <data dir>");
}
String indexDir = args[0]; //1
String dataDir = args[1]; //2
long start = System.currentTimeMillis();
Indexer indexer = new Indexer(indexDir);
int numIndexed;
try {
numIndexed = indexer.index(dataDir, new TextFilesFilter());
} finally {
indexer.close();
}
long end = System.currentTimeMillis();
System.out.println("Indexing " + numIndexed + " files took "
+ (end - start) + " milliseconds");
}
private IndexWriter writer;
public Indexer(String indexDir) throws IOException {
Directory dir = FSDirectory.open(Paths.get(indexDir));
writer = new IndexWriter(dir, new IndexWriterConfig()); //3
}
public void close() throws IOException {
writer.close(); //4
}
public int index(String dataDir, FileFilter filter)
throws Exception {
File[] files = new File(dataDir).listFiles();
for (File f: files) {
if (!f.isDirectory() &&
!f.isHidden() &&
f.exists() &&
f.canRead() &&
(filter == null || filter.accept(f))) {
indexFile(f);
}
}
return writer.numDocs(); //5
}
private static class TextFilesFilter implements FileFilter {
public boolean accept(File path) {
return path.getName().toLowerCase() //6
.endsWith(".txt"); //6
}
}
protected Document getDocument(File f) throws Exception {
Document doc = new Document();
doc.add(new Field("contents", new FileReader(f), new FieldType())); //7
FieldType notAnalyzed = new FieldType();
notAnalyzed.setTokenized(false);
notAnalyzed.setStored(true);
doc.add(new Field("filename", f.getName(), notAnalyzed //8
));//8
doc.add(new Field("fullpath", f.getCanonicalPath(), //9
notAnalyzed));//9
return doc;
}
private void indexFile(File f) throws Exception {
System.out.println("Indexing " + f.getCanonicalPath());
Document doc = getDocument(f);
writer.addDocument(doc); //10
}
}
/*
#1 Create index in this directory
#2 Index *.txt files from this directory
#3 Create Lucene IndexWriter
#4 Close IndexWriter
#5 Return number of documents indexed
#6 Index .txt files only, using FileFilter
#7 Index file content
#8 Index file name
#9 Index file full path
#10 Add document to Lucene index
*/
and Searcher:
package lia.meetlucene;
/**
* Copyright Manning Publications Co.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and limitations under the License.
*/
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import java.io.IOException;
import java.nio.file.Paths;
// From chapter 1
/**
* This code was originally written for
* Erik's Lucene intro java.net article
*/
public class Searcher {
public static void main(String[] args) throws IllegalArgumentException,
IOException, ParseException {
if (args.length != 2) {
throw new IllegalArgumentException("Usage: java " + Searcher.class.getName()
+ " <index dir> <query>");
}
String indexDir = args[0]; //1
String q = args[1]; //2
search(indexDir, q);
}
public static void search(String indexDir, String q)
throws IOException, ParseException {
Directory dir = FSDirectory.open(Paths.get(indexDir)); //3
DirectoryReader directoryReader = DirectoryReader.open(dir);
IndexSearcher is = new IndexSearcher(directoryReader); //3
QueryParser parser = new QueryParser( // 4
"f", //4
new StandardAnalyzer( )); //4
Query query = parser.parse(q); //4
long start = System.currentTimeMillis();
TopDocs hits = is.search(query, 10); //5
long end = System.currentTimeMillis();
System.err.println("Found " + hits.totalHits + //6
" document(s) (in " + (end - start) + // 6
" milliseconds) that matched query '" + // 6
q + "':"); // 6
for(ScoreDoc scoreDoc : hits.scoreDocs) {
Document doc = is.doc(scoreDoc.doc); //7
System.out.println(doc.get("fullpath")); //8
}
//9
}
}
/*
#1 Parse provided index directory
#2 Parse provided query string
#3 Open index
#4 Parse query
#5 Search index
#6 Write search stats
#7 Retrieve matching document
#8 Display filename
#9 Close IndexSearcher
*/
Your biggest problem is how you are handling Exceptions. Here is the exception you should be seeing, if you weren't throwing them all away:
Exception in thread "main" java.lang.IllegalArgumentException: it doesn't make sense to have a field that is neither indexed nor stored
at org.apache.lucene.document.Field.&lt;init&gt;(Field.java:249)
If you are trying to figure out what is wrong with your program, don't just toss out all the exceptions! Those are the tools to help you find and handle problems!
What that exception is telling you is that one of your fields ("contents", specifically) is neither indexed nor stored. You should never use a FieldType with all default values; you need to set it up with the appropriate values for the field. In this case, it might make sense to just use TextField instead.
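In the posted Indexer that means replacing the "contents" field construction. A minimal sketch of the change (TextField tokenizes and indexes the reader's content without storing it, so the text becomes searchable; it also needs the org.apache.lucene.document.TextField import):
// before: a FieldType with all default values is neither indexed nor stored
// doc.add(new Field("contents", new FileReader(f), new FieldType()));
// after:
doc.add(new TextField("contents", new FileReader(f)));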
PS - Lucene in Action 2nd ed. is 7 years old, and its code is for Lucene 3.0. It's quite outdated, and you would probably be better served trying the current Lucene demo.
The problem was that the Searcher was not querying the field that is actually indexed, which is named "contents"; it was querying "f", so nothing matched.
Here is the corrected code. Searcher only changes "f" to "contents", and Indexer uses a TextField and handles exceptions better (the changes to Indexer are the ones suggested by the answer submitted by femtoRgon).
Note that it is better to use the current Lucene demo, as femtoRgon suggests; in fact, comparing it with this code was the path to the solution.
Corrected Indexer.java:
package lia.meetlucene;
/**
* Copyright Manning Publications Co.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and limitations under the License.
*/
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.io.FileReader;
import java.nio.file.Paths;
// From chapter 1
/**
* This code was originally written for
* Erik's Lucene intro java.net article
*/
public class Indexer {
public static void main(String[] args) {
if (args.length != 2) {
throw new IllegalArgumentException("Usage: java " + Indexer.class.getName()
+ " <index dir> <data dir>");
}
String indexDir = args[0]; //1
String dataDir = args[1];//2
Indexer indexer = null;
long start = System.currentTimeMillis();
int numIndexed = 0;
try {
indexer = new Indexer(indexDir);
numIndexed = indexer.index(dataDir, new TextFilesFilter());
} catch(Exception e) {
e.printStackTrace();
} finally {
if (indexer != null)
try {
indexer.close();
} catch (IOException e) {
// ignored
}
}
long end = System.currentTimeMillis();
System.out.println("Indexing " + numIndexed + " files took "
+ (end - start) + " milliseconds");
}
private IndexWriter writer;
public Indexer(String indexDir) throws IOException {
Directory dir = FSDirectory.open(Paths.get(indexDir));
writer = new IndexWriter(dir, new IndexWriterConfig()); //3
}
public void close() throws IOException {
writer.close(); //4
}
public int index(String dataDir, FileFilter filter)
throws Exception {
File[] files = new File(dataDir).listFiles();
for (File f: files) {
if (!f.isDirectory() &&
!f.isHidden() &&
f.exists() &&
f.canRead() &&
(filter == null || filter.accept(f))) {
indexFile(f);
}
}
return writer.numDocs(); //5
}
private static class TextFilesFilter implements FileFilter {
public boolean accept(File path) {
return path.getName().toLowerCase() //6
.endsWith(".txt"); //6
}
}
protected Document getDocument(File f) throws Exception {
Document doc = new Document();
doc.add(new TextField("contents", new FileReader(f))); //7
FieldType notAnalyzed = new FieldType();
notAnalyzed.setTokenized(false);
notAnalyzed.setStored(true);
doc.add(new Field("filename", f.getName(), notAnalyzed //8
));//8
doc.add(new Field("fullpath", f.getCanonicalPath(), //9
notAnalyzed));//9
return doc;
}
private void indexFile(File f) throws Exception {
System.out.println("Indexing " + f.getCanonicalPath());
Document doc = getDocument(f);
writer.addDocument(doc); //10
}
}
/*
#1 Create index in this directory
#2 Index *.txt files from this directory
#3 Create Lucene IndexWriter
#4 Close IndexWriter
#5 Return number of documents indexed
#6 Index .txt files only, using FileFilter
#7 Index file content
#8 Index file name
#9 Index file full path
#10 Add document to Lucene index
*/
and Searcher.java:
package lia.meetlucene;
/**
* Copyright Manning Publications Co.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and limitations under the License.
*/
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import java.io.IOException;
import java.nio.file.Paths;
// From chapter 1
/**
* This code was originally written for
* Erik's Lucene intro java.net article
*/
public class Searcher {
public static void main(String[] args) throws IllegalArgumentException,
IOException, ParseException {
if (args.length != 2) {
throw new IllegalArgumentException("Usage: java " + Searcher.class.getName()
+ " <index dir> <query>");
}
String indexDir = args[0]; //1
String q = args[1]; //2
search(indexDir, q);
}
public static void search(String indexDir, String q)
throws IOException, ParseException {
Directory dir = FSDirectory.open(Paths.get(indexDir)); //3
DirectoryReader directoryReader = DirectoryReader.open(dir);
IndexSearcher is = new IndexSearcher(directoryReader); //3
QueryParser parser = new QueryParser( // 4
"contents", //4
new StandardAnalyzer( )); //4
Query query = parser.parse(q); //4
long start = System.currentTimeMillis();
TopDocs hits = is.search(query, 10); //5
long end = System.currentTimeMillis();
System.err.println("Found " + hits.totalHits + //6
" document(s) (in " + (end - start) + // 6
" milliseconds) that matched query '" + // 6
q + "':"); // 6
for(ScoreDoc scoreDoc : hits.scoreDocs) {
Document doc = is.doc(scoreDoc.doc); //7
System.out.println(doc.get("fullpath")); //8
}
//9
}
}
/*
#1 Parse provided index directory
#2 Parse provided query string
#3 Open index
#4 Parse query
#5 Search index
#6 Write search stats
#7 Retrieve matching document
#8 Display filename
#9 Close IndexSearcher
*/
Related
I'm new to Apache Lucene and am using Apache Lucene 8.4.1. I can do Lucene indexing and searching, but I don't know how to read and list / print the index using Java.
How do I get the list of indexed fields and terms?
I was able to get the field list by using the following function, grabbed from another Stack Overflow article.
public static String[] getFieldNames(IndexReader reader) {
List<String> fieldNames = new ArrayList<String>();
//For a simple reader over only one index, reader.leaves() should only return one LeafReaderContext
for (LeafReaderContext readerCtx : reader.leaves()) {
FieldInfos fields = readerCtx.reader().getFieldInfos();
for (FieldInfo field : fields) {
//Check whether the field is indexed and searchable, perhaps?
fieldNames.add(field.name);
}
}
return fieldNames.toArray(new String[fieldNames.size()]);
}
Thanks
package com.lucene.ram;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.RAMDirectory;
/**
*
* @author W.P.Roshan
* @email sunone5 at gmail.com
*
* RAMDirectory is deprecated; instead you can use
*
* import org.apache.lucene.index.memory.MemoryIndex;
*
*/
public class RAMDirectoryExample {
public RAMDirectoryExample() {
// TODO Auto-generated constructor stub
}
static void writeIndex(RAMDirectory ramDir, Analyzer analyzer) {
try {
// IndexWriter Configuration
IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
iwc.setOpenMode(OpenMode.CREATE);
// IndexWriter writes new index files to the directory
IndexWriter writer = new IndexWriter(ramDir, iwc);
// Create some docs with name and content
indexDoc(writer, "document-1", "hello world");
indexDoc(writer, "document-2", "hello happy world");
indexDoc(writer, "document-3", "hello happy world");
indexDoc(writer, "document-4", "hello hello world");
// don't forget to close the writer
writer.close();
} catch (IOException e) {
// Any error goes here
e.printStackTrace();
}
}
static void indexDoc(IndexWriter writer, String name, String content) throws IOException {
Document doc = new Document();
doc.add(new TextField("name", name, Store.YES));
doc.add(new TextField("content", content, Store.YES));
writer.addDocument(doc);
}
static void searchIndex(RAMDirectory ramDir, Analyzer analyzer) {
IndexReader reader = null;
try {
// Create Reader
reader = DirectoryReader.open(ramDir);
// Create index searcher
IndexSearcher searcher = new IndexSearcher(reader);
// Build query
QueryParser qp = new QueryParser("content", analyzer);
Query query = qp.parse("happy");
// Search the index
TopDocs foundDocs = searcher.search(query, 10);
// Total found documents
System.out.println("Total Results :: " + foundDocs.totalHits);
// Let's print found doc names and their content along with score
for (ScoreDoc sd : foundDocs.scoreDocs) {
Document d = searcher.doc(sd.doc);
System.out.println("Document Number : " + sd.doc + " :: Document Name : " + d.get("name")
+ " :: Content : " + d.get("content") + " :: Score : " + sd.score);
}
System.out.println("");
// don't forget to close the reader
reader.close();
} catch (IOException e) {
// Any error goes here
e.printStackTrace();
} catch (ParseException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
static void readIndex_Get_Documents(RAMDirectory ramDir) {
IndexReader reader = null;
try {
// Create Reader
reader = DirectoryReader.open(ramDir);
// Create index searcher
IndexSearcher searcher = new IndexSearcher(reader);
System.out.println("-----------------------Document List-----------------------");
int maxDoc = reader.maxDoc();
for (int i = 0; i < maxDoc; i++) {
Document d = reader.document(i);
/**
* There are three types of method to retrieve indexed document name list
*/
/**
* Method 1 for get document name list
*/
// System.out.println(""+d.getFields().iterator().next().stringValue());
/**
* Method 2 for get document name list
*/
// System.out.println(""+d.iterator().next().stringValue());
/**
* Method 3 for get document name list
*/
String[] vls = d.getValues("name");
for (int j = 0; j < vls.length; j++) {
System.out.println("" + vls[j].toString());
}
}
// don't forget to close the reader
reader.close();
} catch (IOException e) {
// Any error goes here
e.printStackTrace();
}
}
static void readIndex_Get_Terms(RAMDirectory ramDir) {
IndexReader reader = null;
try {
// Create Reader
reader = DirectoryReader.open(ramDir);
// Create index searcher
IndexSearcher searcher = new IndexSearcher(reader);
System.out.println("");
System.out.println("--------------------------Term List------------------------");
int maxDoc = reader.maxDoc();
for (int i = 0; i < maxDoc; i++) {
Document d = reader.document(i);
/**
* There are three types of methods to retrieve indexed term list
*/
/**
* Method 1 for retrieve terms list
*/
// System.out.println(""+d.get("content").toString());
/**
* Method 2 for retrieve terms list
*/
// System.out.println(""+d.getField("content").stringValue());
/**
* Method 3 for retrieve terms list
*/
String[] vl = searcher.doc(i).getValues("content");
for (int k = 0; k < vl.length; k++) {
System.out.println("" + vl[k].toString());
}
}
// don't forget to close the reader
reader.close();
} catch (IOException e) {
// Any error goes here
e.printStackTrace();
}
}
public static void main(String[] args) {
// Create RAMDirectory instance
RAMDirectory ramDir = new RAMDirectory();
// Builds an analyzer with the default stop words
Analyzer analyzer = new StandardAnalyzer();
// Write some docs to RAMDirectory
writeIndex(ramDir, analyzer);
// Search indexed docs in RAMDirectory
searchIndex(ramDir, analyzer);
// read Index get indexed document list
readIndex_Get_Documents(ramDir);
// read Index get indexed terms list
readIndex_Get_Terms(ramDir);
}
}
I found a way to list the documents and the term list. The complete example is available on GitHub for anyone's reference: https://github.com/sunone5/lucene-ramdirectory-index
I'm using the latest Apache Lucene, version 8.1.1.
Is it possible, and how, to get the word count of every (non-stop-word) term stored in a Lucene index? The result should be:
term1 453443
term2 445484
term3 443333
etc.
I need this in Java or Scala, but any language will be fine to illustrate the API.
You can find an example implementation below.
Note that this count is the number of documents containing a term, not the number of occurrences ("lucene" occurs 4 times but appears in only 3 documents). Also, stop words are not omitted. A sketch that counts actual occurrences follows the output below.
import java.io.IOException;
import java.nio.file.Paths;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.misc.HighFreqTerms;
import org.apache.lucene.misc.TermStats;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
public class LuceneTest2 {
final static String index = "index";
final static String field = "text";
public static void index() {
try {
Directory dir = FSDirectory.open(Paths.get(index));
Analyzer analyzer = new StandardAnalyzer();
IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
iwc.setOpenMode(OpenMode.CREATE);
IndexWriter writer = new IndexWriter(dir, iwc);
String[] lines = {
"lucene java lucene mark",
"lucene",
"lucene an example",
"java python"
};
for (int i = 0; i < lines.length; i++) {
String line = lines[i];
Document doc = new Document();
doc.add(new StringField("id", "" + i, Field.Store.YES));
doc.add(new TextField(field, line.trim(), Field.Store.YES));
writer.addDocument(doc);
}
System.out.println("indexed " + lines.length + " sentences");
writer.close();
} catch (IOException e) {
System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
}
}
public static void count() {
try {
IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(index)));
int numTerms = 100;
TermStats[] stats = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.DocFreqComparator());
for (TermStats termStats : stats) {
String termText = termStats.termtext.utf8ToString();
System.out.println(termText + " " + termStats.docFreq);
}
reader.close();
} catch (Exception e) {
System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
}
}
public static void main(String[] args) {
index();
count();
}
}
This outputs:
lucene 3
java 2
python 1
mark 1
example 1
an 1
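If you want the number of occurrences rather than the document count, a minimal sketch (assuming Lucene 8.x, where MultiTerms.getTerms merges the per-segment terms; adjust the index path and field name to your setup) is to iterate the field's terms and read totalTermFreq() instead of docFreq():
import java.nio.file.Paths;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
public class TermOccurrenceCounts {
    public static void main(String[] args) throws Exception {
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("index")))) {
            Terms terms = MultiTerms.getTerms(reader, "text");
            if (terms == null) return; // field does not exist or is not indexed
            TermsEnum te = terms.iterator();
            BytesRef term;
            while ((term = te.next()) != null) {
                // totalTermFreq() = occurrences across all documents,
                // docFreq() = number of documents containing the term
                System.out.println(term.utf8ToString() + " " + te.totalTermFreq());
            }
        }
    }
}
Against the index built above this would print "lucene 4" instead of "lucene 3".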
I'm trying to split a PDF document into multiple documents where each document includes the maximum number of pages it can contain where the file size is less than a maximum file size.
My code currently works when run from Eclipse, but when I click on the .jar file, the static method in a Java class seems to crash (I can't seem to catch an exception, however).
The code that isn't working is:
myListOfDocuments=mysplitter.split(document);
Somehow the JVM bails on the static method when the above line is called. The load seems to work fine, as follows:
PDDocument document=PDDocument.load(aFile);
Any ideas?
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import org.apache.pdfbox.multipdf.Splitter;
import org.apache.pdfbox.pdmodel.PDDocument;
public class PDFMaxSizeSplitter {
public static void main(String[] args) {
}
public static ArrayList<File> splitTheFile(File aFile,long maxSize){
ArrayList<File> resultFiles = new ArrayList<File>();
//Checks to see if file is already small enough
if (aFile.length() <= maxSize){
resultFiles.add(aFile);
return resultFiles;
}
//checks to see if it's a directory
if (aFile.isDirectory()){
resultFiles.add(aFile);
return resultFiles;
}
try {
PDDocument document = PDDocument.load(aFile);
Splitter mysplitter = new Splitter();
List<PDDocument> myListOfDocuments = mysplitter.split(document);
int docNumber = 0;
while (myListOfDocuments.size()>0){
long theResults = 0;
theResults = getChunk(myListOfDocuments,0,(long) (myListOfDocuments.size()-1),maxSize);
PDDocument newPDFDoc = new PDDocument();
for (long pageindex=0; pageindex<=theResults; pageindex++){
newPDFDoc.addPage(myListOfDocuments.get((int) pageindex).getPage(0));
}
File newFile = new File(aFile.getParentFile() +
File.separator +
aFile.getName().replace(".pdf", "") +
"Part" +
String.format("%03d", docNumber) +
".pdf");
//System.out.println(newFile.getCanonicalFile());
newPDFDoc.save(newFile);
resultFiles.add(newFile);
myListOfDocuments=myListOfDocuments.subList((int) (theResults)+1, (myListOfDocuments.size()));
newPDFDoc.close();
docNumber++;
}
document.close();
} catch (IOException e) {
e.printStackTrace();
}
return resultFiles;
}
private static long getChunk(List<PDDocument> thePages, long lowPage, long highPage, long maxSize) throws IOException{
//System.out.println("low " + lowPage + " high page: " + highPage);
if ( (highPage-lowPage)<=1 ){
if(PDFMaxSizeSplitter.testSize(thePages,0,highPage)<=maxSize){
return highPage;
} else{
return lowPage;
}
} else if (PDFMaxSizeSplitter.testSize(thePages, 0,lowPage+ (highPage-lowPage)/2)<=maxSize){
return PDFMaxSizeSplitter.getChunk(thePages, lowPage + (highPage-lowPage)/2, highPage,maxSize);
}
else {
return PDFMaxSizeSplitter.getChunk(thePages, lowPage, lowPage + (highPage-lowPage)/2,maxSize);
}
}
private static long testSize(List<PDDocument> thePages, long start, long stop) throws IOException{
//System.out.println("Trying: " + (new Long(start)).toString() + " to " + (new Long(stop)).toString());
PDDocument testerdocument = new PDDocument();
//Path tempPath = Files.createTempFile((new Long(start)).toString(), (new Long(stop)).toString());
//System.out.println("Creating tempPath " +tempPath.toString());
//File tempFile=new File(tempPath.toString());
ByteArrayOutputStream tempFile = new ByteArrayOutputStream();
for (long pageindex=start; pageindex<=stop; pageindex++){
testerdocument.addPage(thePages.get((int) pageindex).getPage(0));
}
testerdocument.save(tempFile);
long thefilesize = tempFile.size();
//long thefilesize = (tempFile.length());
//Files.deleteIfExists(tempPath);
tempFile.reset();
testerdocument.close();
return thefilesize;
}
}
-----------edit--------------
It turns out the JVM was running out of memory.
It turns out the JVM was running out of memory. I added a JVM argument to increase the heap, switched to the 64-bit JVM with the -d64 argument, and used the disk-backed memory management found in PDFBox, e.g. PDDocument.load(aFile, MemoryUsageSetting.setupTempFileOnly()).
With these settings I can handle files of several gigabytes. In the code I now try to load the document into main memory first and catch the out-of-memory error to switch to a low-memory mode; in low-memory mode I use MemoryUsageSetting.setupTempFileOnly() to avoid using too much of the heap.
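A minimal sketch of that load-with-fallback approach (assuming PDFBox 2.x, where PDDocument.load accepts a MemoryUsageSetting; the splitting logic above stays the same):
import java.io.File;
import java.io.IOException;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.pdmodel.PDDocument;
public class PdfLoader {
    // Try to load fully in memory first; fall back to temp-file buffering
    // when the heap is too small, as described above.
    static PDDocument loadWithFallback(File aFile) throws IOException {
        try {
            return PDDocument.load(aFile, MemoryUsageSetting.setupMainMemoryOnly());
        } catch (OutOfMemoryError oom) {
            // Low-memory mode: page data is buffered in a temp file instead of the heap.
            return PDDocument.load(aFile, MemoryUsageSetting.setupTempFileOnly());
        }
    }
}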
I am pasting the following code:
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import java.io.*;
import java.util.ArrayList;
/**
* This terminal application creates an Apache Lucene index in a folder and adds files into this index
* based on the input of the user.
*/
public class TextFileIndexer {
private static StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
private Analyzer anal = new WhitespaceAnalyzer(Version.LUCENE_44);
private IndexWriter writer;
private ArrayList<File> queue = new ArrayList<File>();
public static void main(String[] args) throws IOException {
System.out.println("Enter the path where the index will be created: (e.g. /tmp/index or c:/temp/index)");
String indexLocation = null;
BufferedReader br = new BufferedReader(
new InputStreamReader(System.in));
String s = br.readLine();
TextFileIndexer indexer = null;
try {
indexLocation = s;
indexer = new TextFileIndexer(s);
} catch (Exception ex) {
System.out.println("Cannot create index..." + ex.getMessage());
System.exit(-1);
}
//===================================================
//read input from user until he enters q for quit
//===================================================
while (!s.equalsIgnoreCase("q")) {
try {
System.out.println("Enter the full path to add into the index (q=quit): (e.g. /home/ron/mydir or c:\\Users\\ron\\mydir)");
System.out.println("[Acceptable file types: .xml, .html, .html, .txt]");
s = br.readLine();
if (s.equalsIgnoreCase("q")) {
break;
}
//try to add file into the index
indexer.indexFileOrDirectory(s);
} catch (Exception e) {
System.out.println("Error indexing " + s + " : " + e.getMessage());
}
}
//===================================================
//after adding, we always have to call the
//closeIndex, otherwise the index is not created
//===================================================
indexer.closeIndex();
//=========================================================
// Now search
//=========================================================
IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexLocation)));
IndexSearcher searcher = new IndexSearcher(reader);
TopScoreDocCollector collector = TopScoreDocCollector.create(5, true);
s = "";
while (!s.equalsIgnoreCase("q")) {
try {
System.out.println("Enter the search query (q=quit):");
s = br.readLine();
if (s.equalsIgnoreCase("q")) {
break;
}
Query q = new QueryParser(Version.LUCENE_44, "contents", analyzer).parse(s);
searcher.search(q, collector);
ScoreDoc[] hits = collector.topDocs().scoreDocs;
// 4. display results
System.out.println("Found " + hits.length + " hits.");
for(int i=0;i<hits.length;++i) {
int docId = hits[i].doc;
Document d = searcher.doc(docId);
System.out.println((i + 1) + ". " + d.get("path") + " score=" + hits[i].score);
}
} catch (Exception e) {
System.out.println("Error searching " + s + " : " + e.getMessage());
}
}
}
/**
* Constructor
* @param indexDir the name of the folder in which the index should be created
* @throws java.io.IOException when exception creating index.
*/
TextFileIndexer(String indexDir) throws IOException {
// the boolean true parameter means to create a new index everytime,
// potentially overwriting any existing files there.
FSDirectory dir = FSDirectory.open(new File(indexDir));
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_44, analyzer);
writer = new IndexWriter(dir, config);
}
/**
* Indexes a file or directory
* @param fileName the name of a text file or a folder we wish to add to the index
* @throws java.io.IOException when exception
*/
public void indexFileOrDirectory(String fileName) throws IOException {
//===================================================
//gets the list of files in a folder (if user has submitted
//the name of a folder) or gets a single file name (is user
//has submitted only the file name)
//===================================================
addFiles(new File(fileName));
int originalNumDocs = writer.numDocs();
for (File f : queue) {
FileReader fr = null;
try {
Document doc = new Document();
//===================================================
// add contents of file
//===================================================
fr = new FileReader(f);
// doc.add(new TextField("contents", fr));
doc.add(new StringField("path", f.getPath(), Field.Store.YES));
doc.add(new StringField("filename", f.getName(), Field.Store.YES));
writer.addDocument(doc);
System.out.println("Added: " + f);
BufferedReader br = new BufferedReader(new FileReader(fileName));
Field field = new StringField("contents", br.readLine().toString(),
Field.Store.YES);
doc.add(field);
writer.addDocument(doc);
} catch (Exception e) {
System.out.println("Could not add: " + f);
} finally {
fr.close();
}
}
int newNumDocs = writer.numDocs();
System.out.println("");
System.out.println("************************");
System.out.println((newNumDocs - originalNumDocs) + " documents added.");
System.out.println("************************");
queue.clear();
}
private void addFiles(File file) {
if (!file.exists()) {
System.out.println(file + " does not exist.");
}
if (file.isDirectory()) {
for (File f : file.listFiles()) {
addFiles(f);
}
} else {
String filename = file.getName().toLowerCase();
//===================================================
// Only index text files
//===================================================
if (filename.endsWith(".htm") || filename.endsWith(".html") ||
filename.endsWith(".xml") || filename.endsWith(".txt") || filename.endsWith(".pdf") ) {
queue.add(file);
} else {
System.out.println("Skipped " + filename);
}
}
}
/**
* Close the index.
* @throws java.io.IOException when exception closing
*/
public void closeIndex() throws IOException {
writer.close();
}
}
But when I search for a particular string in the files, the string is not found. The output is as follows:
Enter the path where the index will be created: (e.g. /tmp/index or c:/temp/index)
D:/svn/phase2/JavaSource/test/test/
Enter the full path to add into the index (q=quit): (e.g. /home/ron/mydir or c:\Users\ron\mydir)
[Acceptable file types: .xml, .html, .html, .txt]
D:/svn/phase2/JavaSource/test/test
Skipped segments.gen
Skipped segments_1
Skipped write.lock
Added fileName : D:/svn/phase2/JavaSource/test/test
Added: D:\svn\phase2\JavaSource\test\test\demo.xml
Added fileName : D:/svn/phase2/JavaSource/test/test
Added: D:\svn\phase2\JavaSource\test\test\exe.xml
Added fileName : D:/svn/phase2/JavaSource/test/test
Added: D:\svn\phase2\JavaSource\test\test\Fruit.XML
Added fileName : D:/svn/phase2/JavaSource/test/test
Added: D:\svn\phase2\JavaSource\test\test\Influence_People.pdf
Added fileName : D:/svn/phase2/JavaSource/test/test
Added: D:\svn\phase2\JavaSource\test\test\new.html
Added fileName : D:/svn/phase2/JavaSource/test/test
Added: D:\svn\phase2\JavaSource\test\test\Toy.xml
************************
6 documents added.
************************
Enter the full path to add into the index (q=quit): (e.g. /home/ron/mydir or c:\Users\ron\mydir)
[Acceptable file types: .xml, .html, .html, .txt]
q
Enter the search query (q=quit):
for
Entered String is : for
fieldName =for
Found : 0 hits.
Enter the search query (q=quit):
i
Entered String is : i
Error searching i : this IndexReader is closed
Enter the search query (q=quit):
q
Entered String is : q
"for" and "i" are both stopwords, by default, in StandardAnalyzer, and so can not eally be searched for. The full list of default stop words is:
"a", "an", "and", "are", "as", "at", "be", "but", "by",
"for", "if", "in", "into", "is", "it",
"no", "not", "of", "on", "or", "such",
"that", "the", "their", "then", "there", "these",
"they", "this", "to", "was", "will", "with"
It seems likely there are other issues at work. I don't know why your reader would be closed for the second query, and I don't know where the output "fieldName =for" is coming from either. But hopefully this gets you started on debugging.
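If you do need those words to be searchable, one option (a sketch assuming the Lucene 4.4 API used in the question) is to build the StandardAnalyzer with an empty stop-word set, and use that same analyzer for both indexing and querying:
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
// Drop-in replacement for the analyzer field in TextFileIndexer:
// keeps "for", "i", "the", etc. as searchable terms.
private static StandardAnalyzer analyzer =
    new StandardAnalyzer(Version.LUCENE_44, CharArraySet.EMPTY_SET);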
Have you tried debugging your code in Luke? (Lucene Index Toolbox)
http://code.google.com/p/luke/
Luke is really nice for performing searches using different analyzers, inspecting the index storage, understanding how documents are scored based on searches, etc. It can help eliminate any problems with your search code, since it works directly on the index files.
Luke works for both Java and .NET versions of Lucene.
I want to search for special characters in my index.
I escaped all the special characters in the query string, but when I run a query such as + against the index, Lucene builds the query +().
Hence it searches on no fields.
How do I solve this problem? My index contains these special characters.
If you are using the StandardAnalyzer, it will discard non-alphanumeric characters. Try indexing the same value with a WhitespaceAnalyzer and see if that preserves the characters you need. It might also keep stuff you don't want: that's when you might consider writing your own Analyzer, which basically means creating a TokenStream stack that does exactly the kind of processing you need.
For example, the SimpleAnalyzer implements the following pipeline:
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
return new LowerCaseTokenizer(reader);
}
which just lower-cases the tokens.
The StandardAnalyzer does much more:
/** Constructs a {@link StandardTokenizer} filtered by a {@link
StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
StandardTokenizer tokenStream = new StandardTokenizer(matchVersion, reader);
tokenStream.setMaxTokenLength(maxTokenLength);
TokenStream result = new StandardFilter(tokenStream);
result = new LowerCaseFilter(result);
result = new StopFilter(enableStopPositionIncrements, result, stopSet);
return result;
}
You can mix & match from these and other components in org.apache.lucene.analysis, or you can write your own specialized TokenStream instances that are wrapped into a processing pipeline by your custom Analyzer.
One other thing to look at is what sort of CharTokenizer you're using. CharTokenizer is an abstract class that specifies the machinery for tokenizing text strings. It's used by some simpler Analyzers (but not by the StandardAnalyzer). Lucene comes with two subclasses: a LetterTokenizer and a WhitespaceTokenizer. You can create your own that keeps the characters you need and breaks on those you don't by implementing the boolean isTokenChar(char c) method.
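As a sketch of that last idea (using the pre-3.1 CharTokenizer API with boolean isTokenChar(char) that this answer describes; newer Lucene versions pass int code points instead), a hypothetical analyzer that keeps letters, digits, '+' and '#' could look like this:
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharTokenizer;
import org.apache.lucene.analysis.TokenStream;
public class CodeTermAnalyzer extends Analyzer {
    // Tokenizer that treats '+' and '#' as token characters, so terms
    // like "c++" and "c#" survive analysis instead of being split apart.
    private static class CodeTermTokenizer extends CharTokenizer {
        CodeTermTokenizer(Reader input) {
            super(input);
        }
        @Override
        protected boolean isTokenChar(char c) {
            return Character.isLetterOrDigit(c) || c == '+' || c == '#';
        }
    }
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        return new CodeTermTokenizer(reader);
    }
}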
Maybe this is no longer relevant for the author, but to be able to search for special characters you need to:
Create a custom analyzer
Use it for both indexing and searching
Here is an example of how it works for me:
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.store.RAMDirectory;
import org.junit.Test;
import java.io.IOException;
import static org.hamcrest.Matchers.equalTo;
import static org.junit.Assert.assertThat;
public class LuceneSpecialCharactersSearchTest {
/**
* Test that tries to search a string by some substring with each special character separately.
*/
@Test
public void testSpecialCharacterSearch() throws Exception {
// GIVEN
LuceneSpecialCharactersSearch service = new LuceneSpecialCharactersSearch();
String[] luceneSpecialCharacters = new String[]{"+", "-", "&&", "||", "!", "(", ")", "{", "}", "[", "]", "^", "\"", "~", "*", "?", ":", "\\"};
// WHEN
for (String specialCharacter : luceneSpecialCharacters) {
String actual = service.search("list's special-characters " + specialCharacter);
// THEN
assertThat(actual, equalTo(LuceneSpecialCharactersSearch.TEXT_WITH_SPECIAL_CHARACTERS));
}
}
private static class LuceneSpecialCharactersSearch {
private static final String TEXT_WITH_SPECIAL_CHARACTERS = "This is the list's of special-characters + - && || ! ( ) { } [ ] ^ \" ~ ? : \\ *";
private final IndexWriter writer;
public LuceneSpecialCharactersSearch() throws Exception {
Document document = new Document();
document.add(new TextField("body", TEXT_WITH_SPECIAL_CHARACTERS, Field.Store.YES));
RAMDirectory directory = new RAMDirectory();
writer = new IndexWriter(directory, new IndexWriterConfig(buildAnalyzer()));
writer.addDocument(document);
writer.commit();
}
public String search(String queryString) throws Exception {
try (IndexReader reader = DirectoryReader.open(writer, false)) {
IndexSearcher searcher = new IndexSearcher(reader);
String escapedQueryString = QueryParser.escape(queryString).toLowerCase();
Analyzer analyzer = buildAnalyzer();
QueryParser bodyQueryParser = new QueryParser("body", analyzer);
bodyQueryParser.setDefaultOperator(QueryParser.Operator.AND);
Query bodyQuery = bodyQueryParser.parse(escapedQueryString);
BooleanQuery query = new BooleanQuery.Builder()
.add(new BooleanClause(bodyQuery, BooleanClause.Occur.SHOULD))
.build();
TopDocs searchResult = searcher.search(query, 1);
return searcher.doc(searchResult.scoreDocs[0].doc).getField("body").stringValue();
}
}
/**
* Builds analyzer that is used for indexing and searching.
*/
private static Analyzer buildAnalyzer() throws IOException {
return CustomAnalyzer.builder()
.withTokenizer("whitespace")
.addTokenFilter("lowercase")
.addTokenFilter("standard")
.build();
}
}
}