Hadoop MapReduce common friends reducer spillage - Apache

I am trying to run the code below to find the common friends between two persons. The input is as follows:
A : B C D
B : A C D E
C : A B D E
D : A B C E
E : B C D
I am not able to get any output in the output file, and there is no exception.
Please find my code below:
public class Friend {
public static class Mapperfriend extends Mapper<Object, Text, Text, Text>{
private Text vendor = new Text();
@Override
protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
StringTokenizer tokenizer = new StringTokenizer(value.toString(), "\n");
String line = null;
String[] lineArray = null;
String[] friendArray = null;
String[] tempArray = null;
while(tokenizer.hasMoreTokens()){
line = tokenizer.nextToken();
lineArray = line.split(":");
friendArray = lineArray[1].split(" ");
tempArray = new String[2];
for(int i = 0; i < friendArray.length; i++){
tempArray[0] = friendArray[i];
tempArray[1] = lineArray[0];
Arrays.sort(tempArray);
context.write(new Text(tempArray[0] + " " + tempArray[1]), new Text(lineArray[1]));
}
}
}}
public static class ReducerFriend extends Reducer<Text,Text,Text,Text>{
@Override
protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
Text[] texts = new Text[2];
int index = 0;
for(Text val: values)
{
texts[index++] = new Text(val);
}
String[] list1 = texts[0].toString().split(" ");
String[] list2 = texts[1].toString().split(" ");
List<String> list = new LinkedList<String>();
for(String friend1 : list1){
for(String friend2 : list2){
if(friend1.equals(friend2)){
list.add(friend1);
}
}
}
StringBuffer sb = new StringBuffer();
for(int i = 0; i < list.size(); i++){
sb.append(list.get(i));
if(i != list.size() - 1)
sb.append(" ");
}
context.write(key, new Text(sb.toString()));
}
}
/**
* @param args the command line arguments
*/
public static void main(String[] args) throws Exception {
// TODO code application logic here
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "Friends");
job.setJarByClass(Friend.class);
job.setMapperClass(Mapperfriend.class);
job.setReducerClass(ReducerFriend.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}

The mapper class emitted the entire key from the first element instead of picking it up from the array; when that was removed, it works fine.
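For reference, here is a hedged sketch of the mapper with the whitespace handled explicitly (assuming the root cause is the spaces around the ':' in lines like "A : B C D"): trimming the person name and skipping empty tokens makes the pair keys emitted for both sides of a friendship match exactly.
protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
    // hypothetical corrected version, not the original poster's final code
    String[] lineArray = value.toString().split(":");
    if (lineArray.length < 2) return;
    String person = lineArray[0].trim();
    String friends = lineArray[1].trim();
    for (String friend : friends.split("\\s+")) {
        if (friend.isEmpty()) continue;
        String[] pair = {person, friend};
        Arrays.sort(pair);
        // key: the sorted pair, e.g. "A B"; value: this person's full friend list
        context.write(new Text(pair[0] + " " + pair[1]), new Text(friends));
    }
}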

Related

Query with distinct keyword and subquery not working in Hive with UDF

Not working query:
select lookup(city, state, tax,'addresslookup')
from (select distinct city, state, tax
from open_glory.addylookup) a
Working Query (without distinct):
select lookup(city, state, tax,'addresslookup')
from (select city, state, tax
from open_glory.addylookup) a
Any help would be appreciated.
UDF code:
public class Lookup extends GenericUDF {
private static String delimiter = "|";
private ConcurrentHashMap < String, HashMap < String, String >> fileMap = new ConcurrentHashMap < String, HashMap < String, String >> ();
protected String loggedInUser;
protected String loggedInApplication;
private transient GenericUDFUtils.StringHelper returnHelper;
private transient StringConverter[] stringConverter;
private static final Logger LOG = LoggerFactory.getLogger(Lookup.class.getName());
@Override
public ObjectInspector initialize(ObjectInspector[] arguments)
throws UDFArgumentException {
if (arguments.length < 2) {
throw new UDFArgumentLengthException(
"lookup takes 2 or more arguments");
}
stringConverter = new StringConverter[arguments.length];
for (int i = 0; i < arguments.length; i++) {
if (arguments[0].getCategory() != Category.PRIMITIVE) {
throw new UDFArgumentException(
"lookup only takes primitive types");
}
stringConverter[i] = new PrimitiveObjectInspectorConverter.StringConverter(
(PrimitiveObjectInspector) arguments[i]);
}
setLoggedInUser();
returnHelper = new GenericUDFUtils.StringHelper(
PrimitiveCategory.STRING);
LOG.info("initialize successful");
return PrimitiveObjectInspectorFactory.writableStringObjectInspector;
}
private void setLoggedInUser() {
if (loggedInUser == null) {
loggedInUser = SessionState.get().getUserName();
if (loggedInUser != null) {
int idx = loggedInUser.indexOf('.');
loggedInApplication = idx > -1 ? loggedInUser.substring(0, idx) : null;
}
}
}
private void initMap(String f) {
LOG.info("initMap involked");
if (loggedInApplication == null)
throw new NullPointerException(
"Unable to retrieve application name from user.");
String filePath = "/basepath/" + loggedInApplication.toLowerCase() + "/" + f +
".txt";
String line = null;
try {
LOG.info("filePath =" + filePath);
FileSystem fs = FileSystem.get(new Configuration());
FSDataInputStream in = fs.open(new Path(filePath));
BufferedReader br = new BufferedReader(new InputStreamReader( in ));
HashMap < String, String > map = new HashMap < String, String > ();
while ((line = br.readLine()) != null) {
// ignore comment lines
if (line.startsWith("#")) {
continue;
}
String[] strs = line.split("\t");
if (strs.length == 2) {
map.put(strs[0].toUpperCase().trim(), strs[1].trim());
} else if (strs.length > 2) {
map.put(getKey(strs), strs[strs.length - 1].trim());
}
}
fileMap.put(f, map);
br.close();
} catch (Exception e) {
LOG.error(e.getMessage(), e);
}
}
public Text getValue(String s, String f) {
initMap(f);
HashMap < String, String > map = fileMap.get(f);
LOG.info("getValue() fileMap =" + fileMap);
String v = map.get(s);
return v == null ? null : new Text(v);
}
@Override
public Object evaluate(DeferredObject[] arguments) throws HiveException {
String val = buildVal(arguments);
String lookupFile = (String) stringConverter[arguments.length - 1].convert(arguments[arguments.length - 1].get());
Text returnVal = getValue(val.toUpperCase(), lookupFile.toLowerCase());
return returnVal == null ? null : returnHelper.setReturnValue(returnVal.toString());
}
@Override
public String getDisplayString(String[] arg0) {
return "lookup()";
}
private String buildVal(DeferredObject[] arguments) throws HiveException {
StringBuilder builder = new StringBuilder();
int cnt = arguments.length - 1;
for (int i = 0; i < cnt; i++) {
builder.append((String) stringConverter[i].convert(arguments[i].get()));
if (i < cnt - 1) {
builder.append(delimiter);
}
}
return builder.toString();
}
private String getKey(String[] strs) {
StringBuilder builder = new StringBuilder();
int cnt = strs.length - 1;
for (int i = 0; i < cnt; i++) {
builder.append(strs[i].toUpperCase().trim());
if (i < cnt - 1) {
builder.append(delimiter);
}
}
return builder.toString();
}
}

Find the largest String in the list

I am required to find the largest string in the ArrayList and then print it. My code is not currently working though.
public class Solution {
private static List<String> strings;
public static void main(String[] args) throws Exception {
BufferedReader reader = new BufferedReader(new InputStreamReader(System.in));
strings = new ArrayList<String>();
for (int i = 0; i < 5; i++) {
String n = reader.readLine();
strings.add(n);
}
String largestString = strings.get(0);
for (int i = 0; i < strings.size(); i++) {
if (strings.get(i).length() > largestString.length()) {
largestString = strings.get(i);
System.out.println(largestString);
}
}
}
}
public static void main(String[] args) {
Scanner reader = new Scanner(System.in);
String value = null;
int length = 0;
for (int i = 0; i < 5; i++) {
String n = reader.nextLine();
if (n != null && n.length() > length) {
length = n.length();
value = n;
}
}
System.out.println("Largest String : " + value + " of length : " + length);
}
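If only the longest of the five lines is needed, a shorter variant is possible; this is a sketch assuming Java 8+ and the same imports as above plus java.util.Collections and java.util.Comparator.
public static void main(String[] args) throws Exception {
    BufferedReader reader = new BufferedReader(new InputStreamReader(System.in));
    List<String> lines = new ArrayList<>();
    for (int i = 0; i < 5; i++) {
        lines.add(reader.readLine());
    }
    // Collections.max with a length comparator; on ties the first occurrence wins
    String longest = Collections.max(lines, Comparator.comparingInt(String::length));
    System.out.println("Largest String : " + longest + " of length : " + longest.length());
}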

Error occurring while creating custom tokenizer in Lucene 7.3

I'm trying to create a new tokenizer by referring to the book Taming Text (which uses the Lucene 3.x API) using the new Lucene 7.3 API, but it is giving me the error mentioned below:
java.lang.IllegalStateException: TokenStream contract violation: reset()/close() call missing, reset() called multiple times, or subclass does not call super.reset(). Please see Javadocs of TokenStream class for more information about the correct consuming workflow.
at org.apache.lucene.analysis.Tokenizer$1.read(Tokenizer.java:109)
at java.io.Reader.read(Reader.java:140)
at solr.SentenceTokenizer.fillSentences(SentenceTokenizer.java:43)
at solr.SentenceTokenizer.incrementToken(SentenceTokenizer.java:55)
at solr.NameFilter.fillSpans(NameFilter.java:56)
at solr.NameFilter.incrementToken(NameFilter.java:88)
at spec.solr.NameFilterTest.testNameFilter(NameFilterTest.java:81)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
Here is my SentenceTokenizer class
Initializing method: in the older API there was super(reader), but in the current API there is no access to the Reader in the constructor.
public SentenceTokenizer(SentenceDetector detector) {
super();
setReader(reader);
this.sentenceDetector = detector;
}
Here is my reset method
@Override
public void reset() throws IOException {
super.reset();
sentenceDetector = null;
}
When I tried to access this method from my custom TokenFilter, I got the above error:
public void fillSentences() throws IOException {
char[] c = new char[256];
int size = 0;
StringBuilder stringBuilder = new StringBuilder();
while ((size = input.read(c)) >= 0) {
stringBuilder.append(c, 0, size);
}
String temp = stringBuilder.toString();
inputSentence = temp.toCharArray();
sentenceSpans = sentenceDetector.sentPosDetect(temp);
tokenOffset = 0;
}
@Override
public final boolean incrementToken() throws IOException {
if (sentenceSpans == null) {
//invoking following method
fillSentences();
}
if (tokenOffset == sentenceSpans.length) {
return false;
}
Span sentenceSpan = sentenceSpans[tokenOffset];
clearAttributes();
int start = sentenceSpan.getStart();
int end = sentenceSpan.getEnd();
charTermAttribute.copyBuffer(inputSentence, start, end - start);
positionIncrementAttribute.setPositionIncrement(1);
offsetAttribute.setOffset(start, end);
tokenOffset++;
return true;
}
Here is my custom TokenFilter class
public final class NameFilter extends TokenFilter {
public static final String NE_PREFIX = "NE_";
private final Tokenizer tokenizer;
private final String[] tokenTypeNames;
private final NameFinderME[] nameFinderME;
private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);
private final PositionIncrementAttribute positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
private final CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
private String text;
private int baseOffset;
private Span[] spans;
private String[] tokens;
private Span[][] foundNames;
private boolean[][] tokenTypes;
private int spanOffsets = 0;
private final Queue<AttributeSource.State> tokenQueue =
new LinkedList<>();
public NameFilter(TokenStream in, String[] modelNames, NameFinderME[] nameFinderME) {
super(in);
this.tokenizer = SimpleTokenizer.INSTANCE;
this.nameFinderME = nameFinderME;
this.tokenTypeNames = new String[modelNames.length];
for (int i = 0; i < modelNames.length; i++) {
this.tokenTypeNames[i] = NE_PREFIX + modelNames[i];
}
}
//consumes tokens from the upstream tokenizer and buffers them in a
//StringBuilder whose contents will be passed to OpenNLP
protected boolean fillSpans() throws IOException {
if (!this.input.incrementToken()) return false;
//process the next sentence from the upstream tokenizer
this.text = input.getAttribute(CharTermAttribute.class).toString();
this.baseOffset = this.input.getAttribute(OffsetAttribute.class).startOffset();
this.spans = this.tokenizer.tokenizePos(text);
this.tokens = Span.spansToStrings(spans, text);
this.foundNames = new Span[this.nameFinderME.length][];
for (int i = 0; i < nameFinderME.length; i++) {
this.foundNames[i] = nameFinderME[i].find(tokens);
}
//initialize the per-token entity-type flags
this.tokenTypes = new boolean[this.tokens.length][this.nameFinderME.length];
for (int i = 0; i < nameFinderME.length; i++) {
Span[] spans = foundNames[i];
for (int j = 0; j < spans.length; j++) {
int start = spans[j].getStart();
int end = spans[j].getEnd();
for (int k = start; k < end; k++) {
this.tokenTypes[k][i] = true;
}
}
}
spanOffsets = 0;
return true;
}
@Override
public boolean incrementToken() throws IOException {
//if there's nothing in the queue
if(tokenQueue.peek()==null){
//no span or spans consumed
if (spans==null||spanOffsets>=spans.length){
if (!fillSpans())return false;
}
if (spanOffsets>=spans.length)return false;
//copy the token and any types
clearAttributes();
keywordAttribute.setKeyword(false);
positionIncrementAttribute.setPositionIncrement(1);
int startOffset = baseOffset +spans[spanOffsets].getStart();
int endOffset = baseOffset+spans[spanOffsets].getEnd();
offsetAttribute.setOffset(startOffset,endOffset);
charTermAttribute.setEmpty()
.append(tokens[spanOffsets]);
//determine if the current token is of a named entity type; if so
//push the current state into the queue and add a token reflecting
// any matching entity types.
boolean [] types = tokenTypes[spanOffsets];
for (int i = 0; i < nameFinderME.length; i++) {
if (types[i]){
keywordAttribute.setKeyword(true);
positionIncrementAttribute.setPositionIncrement(0);
tokenQueue.add(captureState());
positionIncrementAttribute.setPositionIncrement(1);
charTermAttribute.setEmpty().append(tokenTypeNames[i]);
}
}
}
spanOffsets++;
return true;
}
@Override
public void close() throws IOException {
super.close();
reset();
}
@Override
public void reset() throws IOException {
super.reset();
this.spanOffsets = 0;
this.spans = null;
}
@Override
public void end() throws IOException {
super.end();
reset();
}
}
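One detail worth checking against the TokenStream contract quoted in the exception: end() and close() should not call reset(). A hedged sketch of the lifecycle overrides for this filter (same fields as above) would be:
@Override
public void reset() throws IOException {
    super.reset();
    // clear only this filter's per-stream state
    spanOffsets = 0;
    spans = null;
    tokenQueue.clear();
}
@Override
public void end() throws IOException {
    super.end(); // no reset() here
}
@Override
public void close() throws IOException {
    super.close(); // no reset() here
}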
Here is my test case for the above classes:
@Test
public void testNameFilter() throws IOException {
Reader in = new StringReader(input);
Tokenizer tokenizer = new SentenceTokenizer( detector);
tokenizer.reset();
NameFilter nameFilter = new NameFilter(tokenizer, modelName, nameFinderMES);
nameFilter.reset();
CharTermAttribute charTermAttribute;
PositionIncrementAttribute positionIncrementAttribute;
OffsetAttribute offsetAttribute;
int pass = 0;
while (pass < 2) {
int pos = 0;
int lastStart = 0;
int lastEnd = 0;
//error occur on below invoke
while (nameFilter.incrementToken()) {
}
}
I have added the following changes in my code and it works fine, but I'm not sure it is the correct answer.
public SentenceTokenizer(Reader reader,SentenceDetector sentenceDetector) {
super();
this.input =reader;
this.sentenceDetector = sentenceDetector;
}
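Setting this.input directly bypasses the Tokenizer lifecycle; in the 7.x API the consumer is expected to supply the Reader via setReader() and then call reset(). A hedged sketch of the constructor and reset() under that assumption:
public SentenceTokenizer(SentenceDetector detector) {
    // no Reader in the constructor; the caller provides it via setReader(...)
    this.sentenceDetector = detector;
}
@Override
public void reset() throws IOException {
    super.reset();
    // reset per-stream state, but keep the detector
    inputSentence = null;
    sentenceSpans = null;
    tokenOffset = 0;
}
// usage (hypothetical names):
// SentenceTokenizer tokenizer = new SentenceTokenizer(detector);
// tokenizer.setReader(new StringReader(input));
// tokenizer.reset();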

How to deserialize a JSON array to a singly linked list by using Jackson

I want to deserialize a JSON array to a singly linked list in Java.
The definition of the singly linked list is as follows:
public class SinglyLinkedListNode<T> {
public T value;
public SinglyLinkedListNode next;
public SinglyLinkedListNode(final T value) {
this.value = value;
}
}
How to deserialize a JSON string such as [1,2,3,4,5] into a singly linked list?
public void typeReferenceTest() throws IOException {
ObjectMapper objectMapper = new ObjectMapper();
final ArrayList<Integer> intArray = objectMapper.readValue("[1,2,3,4,5]",
new TypeReference<ArrayList<Integer>>() {});
System.out.println(intArray);
// How to achieve this?
final ArrayList<Integer> intList = objectMapper.readValue("[1,2,3,4,5]",
new TypeReference<SinglyLinkedListNode<Integer>>() {});
System.out.println(intList);
}
Moreover, I want the SinglyLinkedListNode to be a first-class citizen the same as ArrayList, which can be used in all kinds of combinations, such as HashSet<SinglyLinkedListNode<Integer>>, SinglyLinkedListNode<HashMap<String, Integer>>.
For example, what happens if I want to deserialize [[1,2,3], [4,5,6]] into a ArrayList<SinglyLinkedListNode<Integer>> ?
As far as I know, a customized deserializer extending JsonDeserializer is not enough to do this.
When you want it to be deserialized to ArrayList<SinglyLinkedListNode<Integer>>, for example, your code specifies that as the expected type. Therefore, if a deserializer for SinglyLinkedListNode<Integer> is registered, it should succeed.
From the jackson-user Google group I got the right answer from @Tatu Saloranta.
The answer is simple: just implement the java.util.List interface, and Jackson will automatically serialize/deserialize between JSON array and SinglyLinkedListNode.
So I implemented the java.util.List interface for SinglyLinkedListNode; the code is as follows:
import java.util.*;
import java.util.function.Consumer;
/**
* Singly Linked List.
*
* <p>As to singly linked list, a node can be viewed as a single node,
* and it can be viewed as a list too.</p>
*
* @param <E> the type of elements held in this collection
* @see java.util.LinkedList
*/
public class SinglyLinkedListNode<E>
extends AbstractSequentialList<E>
implements Cloneable, java.io.Serializable {
public E value;
public SinglyLinkedListNode<E> next;
/**
* Constructs an empty list.
*/
public SinglyLinkedListNode() {
value = null;
next = null;
}
/**
* Constructs a list with one element.
*/
public SinglyLinkedListNode(final E value) {
this.value = value;
next = null;
}
/**
* Constructs a list containing the elements of the specified
* collection, in the order they are returned by the collection's
* iterator.
*
* @param c the collection whose elements are to be placed into this list
* @throws NullPointerException if the specified collection is null
*/
public SinglyLinkedListNode(Collection<? extends E> c) {
this();
addAll(c);
}
/**
* Links e as last element.
*/
void linkLast(E e) {
final SinglyLinkedListNode<E> l = last();
final SinglyLinkedListNode<E> newNode = new SinglyLinkedListNode<>(e);
if (l == null)
this.value = e;
else
l.next = newNode;
modCount++;
}
/**
* Inserts element e before non-null Node succ.
*/
void linkBefore(E e, SinglyLinkedListNode<E> succ) {
assert succ != null;
final SinglyLinkedListNode<E> prev = this.previous(succ);
final SinglyLinkedListNode<E> newNode = new SinglyLinkedListNode<>(e);
if (prev == null)
this.value = e;
else
prev.next = newNode;
modCount++;
}
/**
* Return the node before x.
*
* #param x current node
* #return the node before x
*/
private SinglyLinkedListNode<E> previous(final SinglyLinkedListNode<E> x) {
assert (x != null);
if (size() < 2) return null;
if (this == x) return null;
SinglyLinkedListNode<E> prev = new SinglyLinkedListNode<>();
prev.next = this;
SinglyLinkedListNode<E> cur = this;
while (cur != x) {
prev = prev.next;
cur = cur.next;
}
return prev;
}
/**
* Return the last node.
* #return the last node.
*/
private SinglyLinkedListNode<E> last() {
if (size() == 0) return null;
if (size() == 1) return this;
SinglyLinkedListNode<E> prev = new SinglyLinkedListNode<>();
prev.next = this;
SinglyLinkedListNode<E> cur = this;
while (cur != null) {
prev = prev.next;
cur = cur.next;
}
return prev;
}
/**
* Unlinks non-null node x.
*/
E unlink(SinglyLinkedListNode<E> x) {
assert x != null;
final E element = x.value;
final SinglyLinkedListNode<E> next = x.next;
final SinglyLinkedListNode<E> prev = previous(x);
if (prev == null) {
this.value = next.value;
this.next = next.next;
} else {
prev.next = next;
}
x.next = null;
modCount++;
return element;
}
/**
* {@inheritDoc}
*/
public ListIterator<E> listIterator(int index) {
checkPositionIndex(index);
return new ListItr(index);
}
private class ListItr implements ListIterator<E> {
private SinglyLinkedListNode<E> lastReturned;
private SinglyLinkedListNode<E> next;
private int nextIndex;
private int expectedModCount = modCount;
ListItr(int index) {
assert isPositionIndex(index);
next = (index == size()) ? null : node(index);
nextIndex = index;
}
public boolean hasNext() {
return nextIndex < size();
}
public E next() {
checkForComodification();
if (!hasNext())
throw new NoSuchElementException();
lastReturned = next;
next = next.next;
nextIndex++;
return lastReturned.value;
}
public boolean hasPrevious() {
return nextIndex > 0;
}
public E previous() {
throw new UnsupportedOperationException();
}
public int nextIndex() {
return nextIndex;
}
public int previousIndex() {
return nextIndex - 1;
}
public void remove() {
checkForComodification();
if (lastReturned == null)
throw new IllegalStateException();
unlink(lastReturned);
nextIndex--;
lastReturned = null;
expectedModCount++;
}
public void set(E e) {
if (lastReturned == null)
throw new IllegalStateException();
checkForComodification();
lastReturned.value = e;
}
public void add(E e) {
checkForComodification();
lastReturned = null;
if (next == null)
linkLast(e);
else
linkBefore(e, next);
nextIndex++;
expectedModCount++;
}
public void forEachRemaining(Consumer<? super E> action) {
Objects.requireNonNull(action);
while (modCount == expectedModCount && nextIndex < size()) {
action.accept(next.value);
lastReturned = next;
next = next.next;
nextIndex++;
}
checkForComodification();
}
final void checkForComodification() {
if (modCount != expectedModCount)
throw new ConcurrentModificationException();
}
}
/**
* {@inheritDoc}
*/
public int size() {
int size = 0;
if (value == null) return size;
SinglyLinkedListNode<E> cur = this;
while (cur != null) {
size++;
cur = cur.next;
}
return size;
}
private void checkPositionIndex(int index) {
if (!isPositionIndex(index))
throw new IndexOutOfBoundsException(outOfBoundsMsg(index));
}
/**
* Returns the (non-null) Node at the specified element index.
*/
SinglyLinkedListNode<E> node(int index) {
assert isElementIndex(index);
SinglyLinkedListNode<E> x = this;
for (int i = 0; i < index; i++)
x = x.next;
return x;
}
/**
* Tells if the argument is the index of an existing element.
*/
private boolean isElementIndex(int index) {
return index >= 0 && index < size();
}
/**
* Tells if the argument is the index of a valid position for an
* iterator or an add operation.
*/
private boolean isPositionIndex(int index) {
return index >= 0 && index <= size();
}
/**
* Constructs an IndexOutOfBoundsException detail message.
* Of the many possible refactorings of the error handling code,
* this "outlining" performs best with both server and client VMs.
*/
private String outOfBoundsMsg(int index) {
return "Index: "+index+", Size: " + size();
}
}
Here is the unit test code:
@Test public void typeReferenceTest() throws IOException {
ObjectMapper objectMapper = new ObjectMapper();
final SinglyLinkedListNode<Integer> intList = objectMapper.readValue("[1,2,3,4,5]",
new TypeReference<SinglyLinkedListNode<Integer>>() {});
System.out.println(intList);
final ArrayList<SinglyLinkedListNode<Integer>> arrayOfList = objectMapper.readValue("[[1,2,3], [4,5,6]]",
new TypeReference<ArrayList<SinglyLinkedListNode<Integer>>>() {});
System.out.println(arrayOfList);
}
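Since SinglyLinkedListNode now behaves as a java.util.List, nested combinations and serialization back to JSON should work as well; a small hedged sketch (not from the original test, same ObjectMapper setup assumed):
ObjectMapper objectMapper = new ObjectMapper();
// a set of linked lists, read from a JSON array of arrays
HashSet<SinglyLinkedListNode<Integer>> set = objectMapper.readValue("[[1,2,3],[4,5,6]]",
        new TypeReference<HashSet<SinglyLinkedListNode<Integer>>>() {});
// and written back out as a JSON array of arrays
System.out.println(objectMapper.writeValueAsString(set));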
@Tatu Saloranta Thank you very much!
Here is my original blog post: Deserialize a JSON Array to a Singly Linked List.

With Lucene 4.3.1, how to get all terms which occur in a sub-range of all docs

Suppose a Lucene index with the fields date and content.
I want to get all term values and frequencies from docs whose date is yesterday. The date field is a keyword field; the content field is analyzed and indexed.
Please help me with sample code.
My solution source is as follows...
/**
 * @param searcher
 * @param fromDateTime - yyyymmddhhmmss
 * @param toDateTime - yyyymmddhhmmss
 * @return query string built from the top-10 terms
 */
static public String top10(IndexSearcher searcher, String fromDateTime,
String toDateTime) {
String top10Query = "";
try {
Query query = new TermRangeQuery("tweetDate", new BytesRef(
fromDateTime), new BytesRef(toDateTime), true, false);
final BitSet bits = new BitSet(searcher.getIndexReader().maxDoc());
searcher.search(query, new Collector() {
private int docBase;
@Override
public void setScorer(Scorer scorer) throws IOException {
}
@Override
public void setNextReader(AtomicReaderContext context)
throws IOException {
this.docBase = context.docBase;
}
@Override
public void collect(int doc) throws IOException {
bits.set(doc + docBase);
}
@Override
public boolean acceptsDocsOutOfOrder() {
return false;
}
});
//
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43,
EnglishStopWords.getEnglishStopWords());
//
HashMap<String, Long> wordFrequency = new HashMap<>();
for (int wx = 0; wx < bits.length(); ++wx) {
if (bits.get(wx)) {
Document wd = searcher.doc(wx);
//
TokenStream tokenStream = analyzer.tokenStream("temp",
new StringReader(wd.get("content")));
// OffsetAttribute offsetAttribute = tokenStream
// .addAttribute(OffsetAttribute.class);
CharTermAttribute charTermAttribute = tokenStream
.addAttribute(CharTermAttribute.class);
tokenStream.reset();
while (tokenStream.incrementToken()) {
// int startOffset = offsetAttribute.startOffset();
// int endOffset = offsetAttribute.endOffset();
String term = charTermAttribute.toString();
if (term.length() < 2)
continue;
Long wl;
if ((wl = wordFrequency.get(term)) == null)
wordFrequency.put(term, 1L);
else {
wl += 1;
wordFrequency.put(term, wl);
}
}
tokenStream.end();
tokenStream.close();
}
}
analyzer.close();
// sort
List<String> occurterm = new ArrayList<String>();
for (String ws : wordFrequency.keySet()) {
occurterm.add(String.format("%06d\t%s", wordFrequency.get(ws),
ws));
}
Collections.sort(occurterm, Collections.reverseOrder());
// make query string by top 10 words
int topCount = 10;
for (String ws : occurterm) {
if (topCount-- == 0)
break;
String[] tks = ws.split("\\t");
top10Query += tks[1] + " ";
}
top10Query = top10Query.trim();
} catch (IOException e) {
e.printStackTrace();
} finally {
}
// return top10 word string
return top10Query;
}
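For completeness, a hedged usage sketch of the method above (the index path, date format, and values are assumptions, not from the original post):
try (Directory dir = FSDirectory.open(new File("/path/to/index"));
     IndexReader reader = DirectoryReader.open(dir)) {
    IndexSearcher searcher = new IndexSearcher(reader);
    // tweetDate is assumed to be indexed as a yyyymmddhhmmss keyword string
    String top10Words = top10(searcher, "20130601000000", "20130602000000");
    System.out.println(top10Words);
}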