error is occurring while creating custom tokenizer in lucene 7.3 - lucene

I am trying to create a new tokenizer by referring to the book Taming Text (which uses the Lucene 3.x API) using the new Lucene 7.3 API, but it gives me the error below:
java.lang.IllegalStateException: TokenStream contract violation: reset()/close() call missing, reset() called multiple times, or subclass does not call super.reset(). Please see Javadocs of TokenStream class for more information about the correct consuming workflow.
at org.apache.lucene.analysis.Tokenizer$1.read(Tokenizer.java:109)
at java.io.Reader.read(Reader.java:140)
at solr.SentenceTokenizer.fillSentences(SentenceTokenizer.java:43)
at solr.SentenceTokenizer.incrementToken(SentenceTokenizer.java:55)
at solr.NameFilter.fillSpans(NameFilter.java:56)
at solr.NameFilter.incrementToken(NameFilter.java:88)
at spec.solr.NameFilterTest.testNameFilter(NameFilterTest.java:81)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
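For reference, the consuming workflow that the exception's Javadoc pointer describes looks like this (a minimal sketch; stream stands for any TokenStream):
stream.reset();                    // exactly once, before consuming
while (stream.incrementToken()) {
    // read attributes for the current token here
}
stream.end();   // signals end-of-stream
stream.close(); // releases resources; on a Tokenizer, call setReader() again before the next reset()
The exception above fires when this sequence is violated anywhere in the chain.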
Here is my SentenceTokenizer class
Initializing method: in the older API there was super(reader), but in the current API there is no access to the Reader in the constructor.
public SentenceTokenizer(SentenceDetector detector) {
super();
setReader(reader);
this.sentenceDetector = detector;
}
Here is my reset method
@Override
public void reset() throws IOException {
super.reset();
sentenceDetector = null;
}
When I try to access this method from my custom TokenFilter, I get the above error:
public void fillSentences() throws IOException {
char[] c = new char[256];
int size = 0;
StringBuilder stringBuilder = new StringBuilder();
while ((size = input.read(c)) >= 0) {
stringBuilder.append(c, 0, size);
}
String temp = stringBuilder.toString();
inputSentence = temp.toCharArray();
sentenceSpans = sentenceDetector.sentPosDetect(temp);
tokenOffset = 0;
}
@Override
public final boolean incrementToken() throws IOException {
if (sentenceSpans == null) {
//invoking the following method
fillSentences();
}
if (tokenOffset == sentenceSpans.length) {
return false;
}
Span sentenceSpan = sentenceSpans[tokenOffset];
clearAttributes();
int start = sentenceSpan.getStart();
int end = sentenceSpan.getEnd();
charTermAttribute.copyBuffer(inputSentence, start, end - start);
positionIncrementAttribute.setPositionIncrement(1);
offsetAttribute.setOffset(start, end);
tokenOffset++;
return true;
}
Here is my custom TokenFilter class
public final class NameFilter extends TokenFilter {
public static final String NE_PREFIX = "NE_";
private final Tokenizer tokenizer;
private final String[] tokenTypeNames;
private final NameFinderME[] nameFinderME;
private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);
private final PositionIncrementAttribute positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
private final CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
private String text;
private int baseOffset;
private Span[] spans;
private String[] tokens;
private Span[][] foundNames;
private boolean[][] tokenTypes;
private int spanOffsets = 0;
private final Queue<AttributeSource.State> tokenQueue =
new LinkedList<>();
public NameFilter(TokenStream in, String[] modelNames, NameFinderME[] nameFinderME) {
super(in);
this.tokenizer = SimpleTokenizer.INSTANCE;
this.nameFinderME = nameFinderME;
this.tokenTypeNames = new String[modelNames.length];
for (int i = 0; i < modelNames.length; i++) {
this.tokenTypeNames[i] = NE_PREFIX + modelNames[i];
}
}
//consumes tokens from the upstream tokenizer and buffers them in a
//StringBuilder whose contents will be passed to OpenNLP
protected boolean fillSpans() throws IOException {
if (!this.input.incrementToken()) return false;
//process the next sentence from the upstream tokenizer
this.text = input.getAttribute(CharTermAttribute.class).toString();
this.baseOffset = this.input.getAttribute(OffsetAttribute.class).startOffset();
this.spans = this.tokenizer.tokenizePos(text);
this.tokens = Span.spansToStrings(spans, text);
this.foundNames = new Span[this.nameFinderME.length][];
for (int i = 0; i < nameFinderME.length; i++) {
this.foundNames[i] = nameFinderME[i].find(tokens);
}
//initialize the token-type table
this.tokenTypes = new boolean[this.tokens.length][this.nameFinderME.length];
for (int i = 0; i < nameFinderME.length; i++) {
Span[] spans = foundNames[i];
for (int j = 0; j < spans.length; j++) {
int start = spans[j].getStart();
int end = spans[j].getEnd();
for (int k = start; k < end; k++) {
this.tokenTypes[k][i] = true;
}
}
}
spanOffsets = 0;
return true;
}
@Override
public boolean incrementToken() throws IOException {
//if there's nothing in the queue
if(tokenQueue.peek()==null){
//no span or spans consumed
if (spans==null||spanOffsets>=spans.length){
if (!fillSpans())return false;
}
if (spanOffsets>=spans.length)return false;
//copy the token and any types
clearAttributes();
keywordAttribute.setKeyword(false);
positionIncrementAttribute.setPositionIncrement(1);
int startOffset = baseOffset +spans[spanOffsets].getStart();
int endOffset = baseOffset+spans[spanOffsets].getEnd();
offsetAttribute.setOffset(startOffset,endOffset);
charTermAttribute.setEmpty()
.append(tokens[spanOffsets]);
//determine if the current token is of a named entity type; if so,
//push the current state into the queue and add a token reflecting
// any matching entity types.
boolean [] types = tokenTypes[spanOffsets];
for (int i = 0; i < nameFinderME.length; i++) {
if (types[i]){
keywordAttribute.setKeyword(true);
positionIncrementAttribute.setPositionIncrement(0);
tokenQueue.add(captureState());
positionIncrementAttribute.setPositionIncrement(1);
charTermAttribute.setEmpty().append(tokenTypeNames[i]);
}
}
}
spanOffsets++;
return true;
}
@Override
public void close() throws IOException {
super.close();
reset();
}
@Override
public void reset() throws IOException {
super.reset();
this.spanOffsets = 0;
this.spans = null;
}
@Override
public void end() throws IOException {
super.end();
reset();
}
}
Here is my test case for the above classes:
@Test
public void testNameFilter() throws IOException {
Reader in = new StringReader(input);
Tokenizer tokenizer = new SentenceTokenizer( detector);
tokenizer.reset();
NameFilter nameFilter = new NameFilter(tokenizer, modelName, nameFinderMES);
nameFilter.reset();
CharTermAttribute charTermAttribute;
PositionIncrementAttribute positionIncrementAttribute;
OffsetAttribute offsetAttribute;
int pass = 0;
while (pass < 2) {
int pos = 0;
int lastStart = 0;
int lastEnd = 0;
//error occurs on the invocation below
while (nameFilter.incrementToken()) {
}
}
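Note that the test resets the tokenizer and then the filter that wraps it; TokenFilter.reset() forwards to its input, so the tokenizer ends up reset twice, which is exactly the "reset() called multiple times" condition named in the exception (the filter's close() and end() overrides above also call reset(), triggering the same check). A minimal corrected consumption sketch, assuming a constructor that takes only the detector and leaves the Reader to setReader(), as sketched below:
Tokenizer tokenizer = new SentenceTokenizer(detector);
tokenizer.setReader(new StringReader(input)); // supply the Reader here, not in the constructor
NameFilter nameFilter = new NameFilter(tokenizer, modelName, nameFinderMES);
nameFilter.reset();                           // one reset(), on the outermost stream only
while (nameFilter.incrementToken()) {
    // assert on attributes here
}
nameFilter.end();
nameFilter.close();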

I have added the following changes in my code and it works fine, but I am not sure it is the correct answer:
public SentenceTokenizer(Reader reader,SentenceDetector sentenceDetector) {
super();
this.input =reader;
this.sentenceDetector = sentenceDetector;
}
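Assigning this.input directly compiles, because input is a protected field of Tokenizer, but it bypasses the state tracking that setReader() performs, so the contract checks can still fire later. A sketch closer to what Lucene 7.x expects, which also lets reset() clear the per-stream state instead of the detector (nulling sentenceDetector would NPE in fillSentences() on reuse, since incrementToken() only refills when sentenceSpans is null):
public SentenceTokenizer(SentenceDetector detector) {
    super();
    this.sentenceDetector = detector; // the consumer supplies the Reader via setReader()
}

@Override
public void reset() throws IOException {
    super.reset();
    sentenceSpans = null; // force fillSentences() on the next incrementToken()
    tokenOffset = 0;
}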

Related

Optimizing Transposing and Comparison of Concurrent List in Java 8

I have a Java 8 application that collects data from multiple threads using a BlockingQueue.
I need to perform comparisons of samples.
Because my real application is very large, I implemented a mock application (on GitHub) in Java 8.
I generate chunks of bytes (in random order).
The bytes are stored in a ChunkDTO class.
I implemented a capturer that collects the ChunkDTOs in a List; see the Capturer class.
Each ChunkDTO in the List is translated into a List of samples (TimePitchValue, to be exact), producing a nested List (a List of Lists of TimePitchValue).
The nested List is then transposed in order to perform comparisons between TimePitchValue instances with the same time value.
Due to the enormous volume of TimePitchValue instances, this consumes a huge amount of time in my application.
Here is some code (the complete functional code is on GitHub, since it is too large for this site).
public class Generator {
final static Logger LOGGER = Logger.getLogger("SampleComparator");
public static void main(String[] args) {
long previous = System.nanoTime();
final int minBufferSize = 2048;
int sampleRate = 8192;
int numChannels = 1;
int numBytesPerSample = 1;
int samplesChunkPerSecond = sampleRate / minBufferSize;
int minutes = 0;
int seconds = 10;
int time = 60 * minutes + seconds;
int chunksBySecond = samplesChunkPerSecond * numBytesPerSample * numChannels;
int pitchs = 32;
boolean signed = false;
boolean endianness = false;
AudioFormat audioformat = new AudioFormat(sampleRate, 8 * numBytesPerSample, numChannels, signed, endianness);
ControlDSP controlDSP = new ControlDSP(audioformat);
BlockingQueue<ChunkDTO> generatorBlockingQueue = new LinkedBlockingQueue<>();
Capturer capturer = new Capturer(controlDSP, pitchs, pitchs * time * chunksBySecond, generatorBlockingQueue);
controlDSP.getListFuture().add(controlDSP.getExecutorService().submit(capturer));
for (int i = 0; i < time * chunksBySecond; i++) {
for (int p = 0; p < pitchs; p++) {
ChunkDTO chunkDTO = new ChunkDTO(UtilClass.getArrayByte(minBufferSize), i, p);
LOGGER.info(String.format("chunkDTO: %s", chunkDTO));
try {
generatorBlockingQueue.put(chunkDTO);
} catch (InterruptedException ex) {
LOGGER.info(ex.getMessage());
}
}
try {
Thread.sleep(1000 / chunksBySecond);
} catch (Exception ex) {
}
}
controlDSP.tryFinishThreads(Thread.currentThread());
long current = System.nanoTime();
long interval = TimeUnit.NANOSECONDS.toSeconds(current - previous);
System.out.println("Seconds Interval: " + interval);
}
}
Capturer Class
public class Capturer implements Callable<Void> {
private final ControlDSP controlDSP;
private final int pitchs;
private final int totalChunks;
private final BlockingQueue<ChunkDTO> capturerBlockingQueue;
private final Counter intCounter;
private final Map<Long, List<ChunkDTO>> mapIndexListChunkDTO = Collections.synchronizedMap(new HashMap<>());
private volatile boolean isRunning = false;
private final String threadName;
private static final Logger LOGGER = Logger.getLogger("SampleComparator");
public Capturer(ControlDSP controlDSP, int pitchs, int totalChunks, BlockingQueue<ChunkDTO> capturerBlockingQueue) {
this.controlDSP = controlDSP;
this.pitchs = pitchs;
this.totalChunks = totalChunks;
this.capturerBlockingQueue = capturerBlockingQueue;
this.intCounter = new Counter();
this.controlDSP.getListFuture().add(this.controlDSP.getExecutorService().submit(() -> {
while (intCounter.getValue() < totalChunks) {
try {
Thread.sleep(100);
} catch (InterruptedException ex) {
LOGGER.log(Level.SEVERE, null, ex);
}
}
capturerBlockingQueue.add(new ChunkDTOStopper());
}));
this.threadName = this.getClass().getSimpleName();
}
@Override
public Void call() throws Exception {
long quantity = 0;
isRunning = true;
while (isRunning) {
try {
ChunkDTO chunkDTO = capturerBlockingQueue.take();
if (chunkDTO instanceof ChunkDTOStopper) {
break;
}
//Find or Create List (according to Index) to add the incoming Chunk
long index = chunkDTO.getIndex();
int sizeChunk = chunkDTO.getChunk().length;
List<ChunkDTO> listChunkDTOWithIndex = getListChunkDTOByIndex(chunkDTO);
//When the List (according to Index) is completed and processed
if (listChunkDTOWithIndex.size() == pitchs) {
mapIndexListChunkDTO.remove(index);
TransposerComparator transposerComparator = new TransposerComparator(controlDSP, controlDSP.getAudioformat(), index, sizeChunk, listChunkDTOWithIndex);
controlDSP.getListFuture().add(controlDSP.getExecutorService().submit(transposerComparator));
}
quantity++;
intCounter.setValue(quantity);
LOGGER.info(String.format("%s\tConsumes:%s\ttotal:%05d", threadName, chunkDTO, quantity));
} catch (Exception ex) {
LOGGER.log(Level.SEVERE, null, ex);
}
}
LOGGER.info(String.format("%s\tReceived:%05d\tQty:%s\tPitchs:%s\tEND\n", threadName, quantity, quantity / pitchs, pitchs));
return null;
}
private List<ChunkDTO> getListChunkDTOByIndex(ChunkDTO chunkDTO) {
List<ChunkDTO> listChunkDTOWithIndex = mapIndexListChunkDTO.get(chunkDTO.getIndex());
if (listChunkDTOWithIndex == null) {
listChunkDTOWithIndex = new ArrayList<>();
mapIndexListChunkDTO.put(chunkDTO.getIndex(), listChunkDTOWithIndex);
listChunkDTOWithIndex = mapIndexListChunkDTO.get(chunkDTO.getIndex());
}
listChunkDTOWithIndex.add(chunkDTO);
return listChunkDTOWithIndex;
}
}
TransposerComparator class.
The optimization required is in this code, specifically in the transposedNestedList method.
public class TransposerComparator implements Callable<Void> {
private final ControlDSP controlDSP;
private final AudioFormat audioformat;
private final long index;
private final int sizeChunk;
private final List<ChunkDTO> listChunkDTOWithIndex;
private final String threadName;
private static final Logger LOGGER = Logger.getLogger("SampleComparator");
public TransposerComparator(ControlDSP controlDSP, AudioFormat audioformat, long index, int sizeChunk, List<ChunkDTO> listChunkDTOWithIndex) {
this.controlDSP = controlDSP;
this.audioformat = audioformat;
this.index = index;
this.sizeChunk = sizeChunk;
this.listChunkDTOWithIndex = listChunkDTOWithIndex;
this.threadName = this.getClass().getSimpleName() + "_" + String.format("%05d", index);
}
@Override
public Void call() throws Exception {
Thread.currentThread().setName(threadName);
LOGGER.info(String.format("%s\tINI", threadName));
try {
int numBytesPerSample = audioformat.getSampleSizeInBits() / 8;
int quantitySamples = sizeChunk / numBytesPerSample;
long baseTime = quantitySamples * index;
// Convert the List of Chunk Bytes to Nested List of TimePitchValue
List<List<TimePitchValue>> nestedListTimePitchValue = listChunkDTOWithIndex
.stream()
.map(chunkDTO -> {
return IntStream
.range(0, quantitySamples)
.mapToObj(time -> {
int value = extractValue(chunkDTO.getChunk(), numBytesPerSample, time);
return new TimePitchValue(chunkDTO.getPitch(), baseTime + time, value);
}).collect(Collectors.toList());
}).collect(Collectors.toList());
List<List<TimePitchValue>> timeNestedListTimePitchValue = transposedNestedList(nestedListTimePitchValue);
} catch (Exception ex) {
ex.printStackTrace();
LOGGER.log(Level.SEVERE, null, ex);
throw ex;
}
return null;
}
private static int extractValue(byte[] bytesSamples, int numBytesPerSample, int time) {
byte[] bytesSingleNumber = Arrays.copyOfRange(bytesSamples, time * numBytesPerSample, (time + 1) * numBytesPerSample);
int value = numBytesPerSample == 2
? (UtilClass.Byte2IntLit(bytesSingleNumber[0], bytesSingleNumber[1]))
: (UtilClass.byte2intSmpl(bytesSingleNumber[0]));
return value;
}
private static List<List<TimePitchValue>> transposedNestedList(List<List<TimePitchValue>> nestedList) {
List<List<TimePitchValue>> outNestedList = new ArrayList<>();
nestedList.forEach(pitchList -> {
pitchList.forEach(pitchValue -> {
List<TimePitchValue> listTimePitchValueWithTime = listTimePitchValueWithTime(outNestedList, pitchValue.getTime());
if (!outNestedList.contains(listTimePitchValueWithTime)) {
outNestedList.add(listTimePitchValueWithTime);
}
listTimePitchValueWithTime.add(pitchValue);
});
});
outNestedList.forEach(pitchList -> {
pitchList.sort(Comparator.comparingInt(TimePitchValue::getValue).reversed());
});
return outNestedList;
}
private static List<TimePitchValue> listTimePitchValueWithTime(List<List<TimePitchValue>> nestedList, long time) {
List<TimePitchValue> listTimePitchValueWithTime = nestedList
.stream()
.filter(innerList -> innerList.stream()
.anyMatch(timePitchValue -> timePitchValue.getTime() == time))
.findAny()
.orElseGet(ArrayList::new);
return listTimePitchValueWithTime;
}
}
I was testing:
With 5 seconds in the Generator class and the transposedNestedList(nestedListTimePitchValue) line in the TransposerComparator class: commented out, 7 seconds were needed; uncommented, 211 seconds.
With 10 seconds in the Generator class: commented out, 12 seconds were needed; uncommented, 574 seconds.
I need to run the application for at least 60 minutes.
To reduce the consumed time, I see two ways:
The short way, which I chose, is to optimize the methods I am currently using.
The longer way, which should also succeed, is to use GPGPU, but I don't know where to start implementing it yet.
QUESTIONS
This question is about the first way: what changes do you recommend in the code of the transposedNestedList method in order to improve speed?
Is there a better alternative to this comparison?
outNestedList.forEach(pitchList -> {
pitchList.sort(Comparator.comparingInt(TimePitchValue::getValue).reversed());
});
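Not an official answer, but a common fix for this pattern: the posted transposedNestedList is quadratic, because listTimePitchValueWithTime rescans the output list for every single element. Grouping by time with a map makes it linear. A sketch, assuming getTime() returns long and getValue() returns int as the code above implies (uses java.util.LinkedHashMap to keep first-encounter order, like the original):
private static List<List<TimePitchValue>> transposedNestedList(List<List<TimePitchValue>> nestedList) {
    // Bucket every TimePitchValue by its time in one pass.
    Map<Long, List<TimePitchValue>> byTime = new LinkedHashMap<>();
    for (List<TimePitchValue> pitchList : nestedList) {
        for (TimePitchValue value : pitchList) {
            byTime.computeIfAbsent(value.getTime(), t -> new ArrayList<>()).add(value);
        }
    }
    // Sort each time-bucket by value, descending, as the original code did.
    List<List<TimePitchValue>> outNestedList = new ArrayList<>(byTime.values());
    for (List<TimePitchValue> pitchList : outNestedList) {
        pitchList.sort(Comparator.comparingInt(TimePitchValue::getValue).reversed());
    }
    return outNestedList;
}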

Find the largest String in the list

I am required to find the largest string in the ArrayList, and then print it. My code is not currently working though.
public class Solution {
private static List<String> strings;
public static void main(String[] args) throws Exception {
BufferedReader reader = new BufferedReader(new InputStreamReader(System.in));
strings = new ArrayList<String>();
for (int i = 0; i < 5; i++) {
String n = reader.readLine();
strings.add(n);
}
String largestString = strings.get(0);
for (int i = 0; i < strings.size(); i++) {
if (strings.get(i).length() > largestString.length()) {
largestString = strings.get(i);
System.out.println(largestString);
}
}
}
}
public static void main(String[] args) {
Scanner reader = new Scanner(System.in);
String value = null;
int length = 0;
for (int i = 0; i < 5; i++) {
String n = reader.nextLine();
if (n != null && n.length() > length) {
length = n.length();
value = n;
}
}
System.out.println("Largest String : " + value + " of length : " + length);
}
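For what it's worth, the original code's main problem is that the println sits inside the loop, so every intermediate maximum is printed (and nothing at all if the first string is the longest); moving it after the loop fixes it. A stream-based alternative, a sketch assuming the same strings list:
String largestString = strings.stream()
        .max(Comparator.comparingInt(String::length)) // compare by length
        .orElse(null);                                // null if the list is empty
System.out.println(largestString);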

hadoop mapreduce common friends reducer spillage

I am trying to run the code below to find the common friends between two persons. The input is as follows:
A : B C D
B : A C D E
C : A B D E
D : A B C E
E : B C D
I am not able to get any output in the output file and there is no exception.
Please find my code below:
public class Friend {
public static class Mapperfriend extends Mapper<Object, Text, Text, Text>{
private Text vendor = new Text();
@Override
protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
StringTokenizer tokenizer = new StringTokenizer(value.toString(), "\n");
String line = null;
String[] lineArray = null;
String[] friendArray = null;
String[] tempArray = null;
while(tokenizer.hasMoreTokens()){
line = tokenizer.nextToken();
lineArray = line.split(":");
friendArray = lineArray[1].split(" ");
tempArray = new String[2];
for(int i = 0; i < friendArray.length; i++){
tempArray[0] = friendArray[i];
tempArray[1] = lineArray[0];
Arrays.sort(tempArray);
context.write(new Text(tempArray[0] + " " + tempArray[1]), new Text(lineArray[1]));
}
}
}}
public static class ReducerFriend extends Reducer<Text,Text,Text,Text>{
@Override
protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
Text[] texts = new Text[2];
int index = 0;
for(Text val: values)
{
texts[index++] = new Text(val);
}
String[] list1 = texts[0].toString().split(" ");
String[] list2 = texts[1].toString().split(" ");
List<String> list = new LinkedList<String>();
for(String friend1 : list1){
for(String friend2 : list2){
if(friend1.equals(friend2)){
list.add(friend1);
}
}
}
StringBuffer sb = new StringBuffer();
for(int i = 0; i < list.size(); i++){
sb.append(list.get(i));
if(i != list.size() - 1)
sb.append(" ");
}
context.write(key, new Text(sb.toString()));
}
}
/**
* @param args the command line arguments
*/
public static void main(String[] args) throws Exception {
// TODO code application logic here
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "Friends");
job.setJarByClass(Friend.class);
job.setMapperClass(Mapperfriend.class);
job.setReducerClass(ReducerFriend.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
The mapper class was emitting the entire key from the first element instead of picking values out of the array. When I removed that, it works fine.
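That report is terse, so here is one concrete issue visible in the posted mapper: with input like A : B C D, line.split(":") leaves a trailing space in lineArray[0] and a leading empty string in friendArray, so the emitted pair keys never match up across lines. A trimmed sketch of the loop body (a hypothetical fix, not necessarily the poster's exact change):
String[] lineArray = line.split(":");
String person = lineArray[0].trim();
String[] friendArray = lineArray[1].trim().split(" ");
for (String friend : friendArray) {
    String[] pair = {person, friend};
    Arrays.sort(pair); // canonical order, so (A,B) and (B,A) reach the same reducer
    context.write(new Text(pair[0] + " " + pair[1]), new Text(lineArray[1].trim()));
}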

With Lucene 4.3.1, How to get all terms which occur in sub-range of all docs

Suppose a Lucene index with the fields date and content.
I want to get all term values and their frequencies from the docs whose date is yesterday. The date field is a keyword field; the content field is analyzed and indexed.
Please help me with sample code.
My solution source is as follows...
/**
*
*
* @param searcher
* @param fromDateTime
* - yyyymmddhhmmss
* @param toDateTime
* - yyyymmddhhmmss
* @return
*/
static public String top10(IndexSearcher searcher, String fromDateTime,
String toDateTime) {
String top10Query = "";
try {
Query query = new TermRangeQuery("tweetDate", new BytesRef(
fromDateTime), new BytesRef(toDateTime), true, false);
final BitSet bits = new BitSet(searcher.getIndexReader().maxDoc());
searcher.search(query, new Collector() {
private int docBase;
@Override
public void setScorer(Scorer scorer) throws IOException {
}
@Override
public void setNextReader(AtomicReaderContext context)
throws IOException {
this.docBase = context.docBase;
}
@Override
public void collect(int doc) throws IOException {
bits.set(doc + docBase);
}
@Override
public boolean acceptsDocsOutOfOrder() {
return false;
}
});
//
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43,
EnglishStopWords.getEnglishStopWords());
//
HashMap<String, Long> wordFrequency = new HashMap<>();
for (int wx = 0; wx < bits.length(); ++wx) {
if (bits.get(wx)) {
Document wd = searcher.doc(wx);
//
TokenStream tokenStream = analyzer.tokenStream("temp",
new StringReader(wd.get("content")));
// OffsetAttribute offsetAttribute = tokenStream
// .addAttribute(OffsetAttribute.class);
CharTermAttribute charTermAttribute = tokenStream
.addAttribute(CharTermAttribute.class);
tokenStream.reset();
while (tokenStream.incrementToken()) {
// int startOffset = offsetAttribute.startOffset();
// int endOffset = offsetAttribute.endOffset();
String term = charTermAttribute.toString();
if (term.length() < 2)
continue;
Long wl;
if ((wl = wordFrequency.get(term)) == null)
wordFrequency.put(term, 1L);
else {
wl += 1;
wordFrequency.put(term, wl);
}
}
tokenStream.end();
tokenStream.close();
}
}
analyzer.close();
// sort
List<String> occurterm = new ArrayList<String>();
for (String ws : wordFrequency.keySet()) {
occurterm.add(String.format("%06d\t%s", wordFrequency.get(ws),
ws));
}
Collections.sort(occurterm, Collections.reverseOrder());
// make query string by top 10 words
int topCount = 10;
for (String ws : occurterm) {
if (topCount-- == 0)
break;
String[] tks = ws.split("\\t");
top10Query += tks[1] + " ";
}
top10Query = top10Query.trim(); // trim() returns a new string; the original statement discarded the result
} catch (IOException e) {
e.printStackTrace();
} finally {
}
// return top10 word string
return top10Query;
}
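A hypothetical call site (searcher and the date strings are placeholders; the format is yyyymmddhhmmss as documented above):
// Top-10 terms for tweets dated May 1st, 2013.
String top10Words = top10(searcher, "20130501000000", "20130502000000");
System.out.println(top10Words);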

What's wrong with this Lucene TokenFilter?

Disclaimer: I've been coding for 36 of the last 41 hours. I have a headache. And I can't figure out why this combining TokenFilter is returning 2 tokens, both the first token from the source stream.
public class TokenCombiner extends TokenFilter {
/*
* Recombines all tokens back into a single token using the specified delimiter.
*/
public TokenCombiner(TokenStream in, int delimiter) {
super(in);
this.delimiter = delimiter;
}
int delimiter;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private boolean firstToken = true;
int startOffset = 0;
@Override
public final boolean incrementToken() throws IOException {
while (true){
boolean eos = input.incrementToken(); //We have to process tokens even if they return end of file.
CharTermAttribute token = input.getAttribute(CharTermAttribute.class);
if (eos && token.length() == 0) break; //Break early to avoid extra whitespace.
if (firstToken){
startOffset = input.getAttribute(OffsetAttribute.class).startOffset();
firstToken = false;
}else{
termAtt.append(Character.toString((char)delimiter));
}
termAtt.append(token);
if (eos) break;
}
offsetAtt.setOffset(startOffset, input.getAttribute(OffsetAttribute.class).endOffset());
return false;
}
@Override
public void reset() throws IOException {
super.reset();
firstToken = true;
startOffset = 0;
}
}
I think the fundamental problem here is that you must realize both TokenCombiner and the producer it consumes (input) share and reuse the same attributes! So token == termAtt always (try adding an assert!).
Man, that sucks if you have been coding for 36 hours on a weekend... try this:
public class TokenCombiner extends TokenFilter {
private final StringBuilder sb = new StringBuilder();
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final char separator;
private boolean consumed; // true if we already consumed
protected TokenCombiner(TokenStream input, char separator) {
super(input);
this.separator = separator;
}
@Override
public final boolean incrementToken() throws IOException {
if (consumed) {
return false; // don't call input.incrementToken() after it returns false
}
consumed = true;
int startOffset = 0;
int endOffset = 0;
boolean found = false; // true if we actually consumed any tokens
while (input.incrementToken()) {
if (!found) {
startOffset = offsetAtt.startOffset();
found = true;
}
sb.append(termAtt);
sb.append(separator);
endOffset = offsetAtt.endOffset();
}
if (found) {
assert sb.length() > 0; // always: because we append separator
sb.setLength(sb.length() - 1);
clearAttributes();
termAtt.setEmpty().append(sb);
offsetAtt.setOffset(startOffset, endOffset);
return true;
} else {
return false;
}
}
@Override
public void reset() throws IOException {
super.reset();
sb.setLength(0);
consumed = false;
}
}
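A hypothetical consumption sketch (source stands for any Tokenizer already wired to its Reader; note the constructor above is protected, so this assumes same-package access):
TokenStream stream = new TokenCombiner(source, ' ');
CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
stream.reset();
while (stream.incrementToken()) {
    System.out.println(term); // the single recombined token
}
stream.end();
stream.close();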