Apache Pig Java UDF - changing values in attributes doesn't seem to stick

I'm trying to write a Java UDF that ranks the tuples in a bag.
The tuples have a value column, which is the criterion for the ranking, and a rank column, which is initially set to 0.
The tuples are sorted on the value column.
All the tuples are placed in a bag, and that bag is placed inside a new tuple which is passed to the UDF.
The UDF modifies the rank column; however, once the method exits, the values have all become 0 again. I'm not sure how to get the values to "stick".
Any help would be greatly appreciated.
Here is my Java class:
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.pig.FilterFunc;
import org.apache.pig.EvalFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.DataBag;
import org.apache.pig.impl.logicalLayer.FrontendException;
import java.util.Iterator;
import org.apache.pig.PigWarning;

/**
 *
 * @author Winter
 */
public class Ranker extends EvalFunc<String> {

    @Override
    public String exec(Tuple tuple) throws IOException {
        if (tuple == null || tuple.size() == 0) {
            return null;
        }
        List<Object> list = tuple.getAll();
        DataBag db = (DataBag) list.get(0);
        Integer num = (Integer) list.get(1);
        Iterator<Tuple> itr = db.iterator();
        boolean containsNonNull = false;
        int i = 1;
        double previous = 0;
        while (itr.hasNext()) {
            Tuple t = itr.next();
            double d = (Double) t.get(num.intValue());
            int rankCol = t.size() - 1;
            Integer rankVal = (Integer) t.get(rankCol);
            if (i == 0) {
                System.out.println("i==0");
                previous = d;
                t.set(rankCol, i);
            } else {
                if (d == previous) {
                    t.set(rankCol, i);
                } else {
                    System.out.print("d!==previous|" + d + "|" + previous + "|" + rankVal);
                    t.set(rankCol, ++i);
                    rankVal = (Integer) t.get(rankCol);
                    System.out.println("|now rank val" + rankVal);
                    previous = d;
                }
            }
        }
        return "Y";
    }
}
Here is how I am calling everything in Pig -
REGISTER /myJar.jar;
A = LOAD '/Users/Winter/milk-tea-coffee.tsv' as (year:chararray, milk:double);
B = foreach A generate year, milk, 0 as rank;
C = order B by milk asc;
D = group C by rank order C by milk;
E = foreach D generate D.C.year,D.C.milk,D.C.rank, piglet3.evalFunctions.Ranker(D.C,1);
dump E;
I can tell it's working inside the UDF because of the print statements:
d!==previous|21.2|0.0|0|now rank val2
d!==previous|21.6|21.2|0|now rank val3
d!==previous|21.9|21.6|0|now rank val4
d!==previous|22.0|21.9|0|now rank val5
d!==previous|22.5|22.0|0|now rank val6
d!==previous|22.9|22.5|0|now rank val7
d!==previous|23.0|22.9|0|now rank val8
d!==previous|23.4|23.0|0|now rank val9
d!==previous|23.8|23.4|0|now rank val10
d!==previous|23.9|23.8|0|now rank val11
But when I dump E (or D, or C), the rank column contains only 0s.

The exec function must return the output you want from the UDF. You are currently modifying the Tuple that is passed to the exec function and then returning the String "Y", so all that Pig sees as output from your UDF is "Y". In this case, you should return the Tuple instead of "Y".
I think the following code is close to your intent, though I'm not quite clear on what you are trying to do:
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.pig.FilterFunc;
import org.apache.pig.EvalFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.DataBag;
import org.apache.pig.impl.logicalLayer.FrontendException;
import java.util.Iterator;
import org.apache.pig.PigWarning;

/**
 *
 * @author Winter
 */
public class Ranker extends EvalFunc<Tuple> {

    @Override
    public Tuple exec(Tuple tuple) throws IOException {
        if (tuple == null || tuple.size() == 0) {
            return null;
        }
        List<Object> list = tuple.getAll();
        DataBag db = (DataBag) list.get(0);
        Integer num = (Integer) list.get(1);
        Iterator<Tuple> itr = db.iterator();
        boolean containsNonNull = false;
        int i = 1;
        double previous = 0;
        while (itr.hasNext()) {
            Tuple t = itr.next();
            double d = (Double) t.get(num.intValue());
            int rankCol = t.size() - 1;
            Integer rankVal = (Integer) t.get(rankCol);
            if (i == 0) {
                System.out.println("i==0");
                previous = d;
                t.set(rankCol, i);
            } else {
                if (d == previous) {
                    t.set(rankCol, i);
                } else {
                    System.out.print("d!==previous|" + d + "|" + previous + "|" + rankVal);
                    t.set(rankCol, ++i);
                    rankVal = (Integer) t.get(rankCol);
                    System.out.println("|now rank val" + rankVal);
                    previous = d;
                }
            }
        }
        return tuple;
    }
}
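Since the UDF now returns a tuple, it can also help to declare an output schema so Pig knows the shape of the result; in the script you would then FLATTEN the returned tuple rather than projecting the original relation. Below is a minimal sketch using the standard EvalFunc.outputSchema hook; the alias "ranked" is illustrative, and it assumes one extra import of org.apache.pig.impl.logicalLayer.schema.Schema:
// Hedged sketch: optional output schema for the Tuple-returning UDF.
// Requires: import org.apache.pig.impl.logicalLayer.schema.Schema;
// (DataType and FrontendException are already imported above.)
@Override
public Schema outputSchema(Schema input) {
    try {
        // Wrap the input schema in a single tuple field; "ranked" is an illustrative alias.
        return new Schema(new Schema.FieldSchema("ranked", input, DataType.TUPLE));
    } catch (FrontendException e) {
        return null;
    }
}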

Related

Selenium - Unable to pick particular rows from TestData Excel file

I am trying to build the TestDataReader.java file. My code should pick only the rows that are tagged with a particular test case ID (e.g., for Test Case 1 it should pick only rows with TC_001; for Test Case 2, only rows with TC_002) and return the data to the DataProvider method. With the code attached below, when I pass the Test Case ID TC_002, it puts null in the testData[rowNum][0] column for rows with TC_001.
My expectation is that I should get rows only for TC_002; it should not process rows for TC_001.
Please suggest what is incorrect in this code.
Below is my TestDataReader class:
package com.automation.website.kaspick.utils;

import java.io.FileInputStream;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.HashMap;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;

public class TestDataReader {
    XSSFWorkbook xlWorkbook = null;
    XSSFSheet sheet = null;
    XSSFRow row = null;
    XSSFCell cell = null;
    String currentDir = Paths.get("").toAbsolutePath().toString();
    String testDataFileName = "\\TestData\\TestData.xlsx";
    String sheetName = null;
    String xlFilePath = currentDir + testDataFileName;

    public Object[][] readTestData(String sheetName, String testCaseID) throws IOException {
        String rowHeader = null;
        String columnValue = null;
        FileInputStream fileInputStream = new FileInputStream(xlFilePath);
        xlWorkbook = new XSSFWorkbook(fileInputStream);
        sheet = xlWorkbook.getSheet(sheetName);
        int rowCount = sheet.getLastRowNum();
        Object[][] testData = new Object[rowCount][1]; // This object is created to store the MapData as an object for DataProvider
        if (sheet.getSheetName().equalsIgnoreCase(sheetName)) {
            for (int rowNum = 0; rowNum < rowCount; rowNum++) {
                HashMap<Object, Object> testDataInEachRow = new HashMap<Object, Object>();
                row = sheet.getRow(rowNum + 1); // Since the first row will be the header, parse from the second row
                int columnCount = row.getLastCellNum();
                for (int colNum = 0; colNum < columnCount; colNum++) {
                    rowHeader = sheet.getRow(0).getCell(colNum).toString();
                    if (colNum == 0 && !row.getCell(colNum).toString().equalsIgnoreCase(testCaseID)) {
                        break;
                    }
                    if (row.getCell(colNum) != null) {
                        columnValue = row.getCell(colNum).toString();
                    } else {
                        columnValue = "";
                    }
                    testDataInEachRow.put(rowHeader, columnValue);
                }
                testData[rowNum][0] = testDataInEachRow;
            }
        }
        return testData;
    }
}
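For what it's worth, the core intent here (treat a data row as relevant only when its first cell matches the requested test case ID) can be sketched separately from the POI bookkeeping. This is a hypothetical helper; the names RowFilterSketch, rowMatches, and countMatches are illustrative, not from the code above. Sizing the result array from the number of matching rows, and skipping non-matching rows in the caller, avoids leaving empty slots for rows tagged with other IDs:
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;

// Hypothetical sketch: filter rows by test case ID before building the
// DataProvider array, so rows tagged with other IDs never occupy slots.
class RowFilterSketch {

    // A data row is relevant only when its first cell equals the test case ID.
    static boolean rowMatches(XSSFSheet sheet, int rowNum, String testCaseID) {
        XSSFRow row = sheet.getRow(rowNum);
        return row != null
                && row.getCell(0) != null
                && row.getCell(0).toString().equalsIgnoreCase(testCaseID);
    }

    // Count matches first, so the result array can be sized exactly.
    static int countMatches(XSSFSheet sheet, String testCaseID) {
        int matches = 0;
        // data rows start at 1; row 0 is the header
        for (int rowNum = 1; rowNum <= sheet.getLastRowNum(); rowNum++) {
            if (rowMatches(sheet, rowNum, testCaseID)) {
                matches++;
            }
        }
        return matches;
    }
}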
Below is my test data (I have masked the data): https://i.stack.imgur.com/h6cMf.png

Why doesn't my number sequence print from the 2d arraylist correctly?

I cannot get the loop in the buildDimArray method to store the number combinations "11+11", "11+12", "11+21", "11+22", "12+11", "12+12", "12+21", "12+22", "21+11", "21+12", "21+21", "21+22", "22+11", "22+12", "22+21", and "22+22" into the 2D ArrayList, with each expression going into one column of row dimBase-1. The loop may work for other people, but for some reason mine isn't functioning correctly. Execution takes the dimBase == 1 branch, but the other branches never seem to work: "WTF" is not printed from the buildDimArray method. If dimBase is 1, it prints successfully, but it doesn't for the other integers. The dimBase == 3 branch will eventually need a loop too; the "WTF" prints are for illustrative purposes. I could get away with a 1D ArrayList, but I will likely need the 2D ArrayList once the program is completed. (A nested-loop sketch follows the code below.)
package jordanNumberApp;

import java.util.Scanner;
import java.util.ArrayList;

/*
 * Dev Wills
 * Purpose: This code contains some methods that aren't developed. This program is supposed to
 *          store all possible number combinations from numbers 1-dimBase for the math expression
 *          "##+##" into a 2d arraylist at index row dimBase-1 and the columns storing the
 *          individual combinations. After storing the values in the arraylist, the print method
 *          pours the contents in order from the arraylist as string values.
 */
public class JordanNumberSystem {

    // a-d are digits, assembled as a math expression, stored in outcomeOutput, outcomeAnswer
    public static int dimBase, outcomeAnswer, a, b, c, d;
    public static String inputOutcome, outcomeOutput;
    public static final int NUM_OF_DIMENSIONS = 9; // Eventually # combinations go up to 9
    public static ArrayList<ArrayList<String>> dimBaseArray;
    public static Scanner keyboard;

    /*
     * Constructor for JordanNumberSystem
     * accepts no parameters
     */
    public JordanNumberSystem() // Defunct constructor
    {
        // Declare and Initialize public variables
        this.dimBase = dimBase;
        this.outcomeOutput = outcomeOutput;
        this.outcomeAnswer = outcomeAnswer;
    }

    // Set all values of variables
    public static void setAllValues()
    {
        // Initialize
        dimBase = 1;
        outcomeAnswer = 22; // variables not used for now
        outcomeOutput = "1"; // variables not used for now
        //a = 1;
        //b = 1;
        //c = 1;
        //d = 1;
        dimBaseArray = new ArrayList<ArrayList<String>>();
        keyboard = new Scanner(System.in);
    }

    public static void buildDimArray(int dim)
    {
        dimBase = dim;
        try
        {
            // create first row
            dimBaseArray.add(dimBase - 1, new ArrayList<String>());
            if (dimBase == 1)
            {
                a = b = c = d = dimBase;
                dimBaseArray.get(0).add(a + "" + b + "+" + c + "" + d);
                System.out.println("WTF"); // SHOWS
            }
            else if (dimBase == 2)
            { // dim = 2
                a = b = c = d = 1;
                System.out.println("WTF"); // doesn't show
                // dimBaseArray.get(dimBase-1).add(a+""+b+"+"+c+""+d);
                for (int i = 1; i <= dim; i++)
                    a = i;
                for (int j = 1; j <= dim; j++)
                    b = j;
                for (int k = 1; k <= dim; k++)
                    c = k;
                for (int l = 1; l <= dim; l++)
                {
                    d = l;
                    dimBaseArray.get(dim - 1).add(a + "" + b + "+" + c + "" + d);
                }
            }
            else if (dimBase == 3)
            {
                a = b = c = d = dimBase;
                dimBaseArray.get(2).add(a + "" + b + "+" + c + "" + d);
                System.out.println("WTF");
            }
        } catch (IndexOutOfBoundsException e)
        {
            System.out.println(e.getMessage());
        }
    }

    public static void printArray(int num) // Prints the contents of the array
    { // Fixing the printing method
        try
        {
            int i = num - 1;
            for (String string : dimBaseArray.get(i))
            {
                System.out.println(string);
                System.out.println("");
            }
        } catch (IndexOutOfBoundsException e)
        {
            System.out.println(e.getMessage());
        }
    }

    public static void main(String[] args) throws java.lang.IndexOutOfBoundsException
    {
        setAllValues(); // sets the initial a,b,c,d values and dimBase, initializes 2d arraylist
        // Get the Dimension Base number
        System.out.println("Enter Dimension Base Number. Input an integer: ");
        int dimBaseInput = keyboard.nextInt(); // Receives integer
        dimBase = dimBaseInput;
        if (dimBase != 1 && dimBase != 2 && dimBase != 3)
        { // Error checking
            System.out.println("invalid Dimension Base Number should be 1 or 2 ");
            System.exit(1);
        }
        // Build the arraylist, print, clear, exit
        buildDimArray(dimBase);
        printArray(dimBase);
        dimBaseArray.clear();
        System.exit(1);
    }
} // End of class
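For reference, producing every combination "ab+cd" with digits from 1 to dimBase requires the four loops to be nested with braces; as written above, the first three loops run to completion on their own (leaving a, b, and c at dim) before the last loop starts. A minimal standalone sketch of the nested version, independent of the class above:
import java.util.ArrayList;
import java.util.List;

// Standalone sketch (not the poster's class): build all "ab+cd" combinations
// for digits 1..dimBase and store them in row dimBase-1 of a 2-D list.
public class CombinationSketch {
    public static void main(String[] args) {
        int dimBase = 2;
        List<List<String>> rows = new ArrayList<>();
        for (int r = 0; r < dimBase; r++) {
            rows.add(new ArrayList<String>());
        }
        // The four loops must be nested so every (a, b, c, d) tuple is visited.
        for (int a = 1; a <= dimBase; a++)
            for (int b = 1; b <= dimBase; b++)
                for (int c = 1; c <= dimBase; c++)
                    for (int d = 1; d <= dimBase; d++)
                        rows.get(dimBase - 1).add("" + a + b + "+" + c + d);
        // Prints [11+11, 11+12, 11+21, 11+22, 12+11, ..., 22+22]
        System.out.println(rows.get(dimBase - 1));
    }
}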

Generate N-grams while preserving spaces in Apache Lucene

I am trying to generate N-grams using Apache Lucene 5.5.4 for a given input text. Following is my Java code to do the same.
public static void main( String[] args )
{
    Analyzer analyzer = createAnalyzer( 2 );
    List<String> nGrams = generateNgrams( analyzer, "blah1 blah2 blah3" );
    for ( String nGram : nGrams ) {
        System.out.println( nGram );
    }
}

public static Analyzer createAnalyzer( final int shingles )
{
    return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents( @NotNull String field )
        {
            final Tokenizer source = new WhitespaceTokenizer();
            final ShingleFilter shingleFilter = new ShingleFilter( new LowerCaseFilter( source ), shingles );
            shingleFilter.setOutputUnigrams( true );
            return new TokenStreamComponents( source, shingleFilter );
        }
    };
}

public static List<String> generateNgrams( Analyzer analyzer, String str )
{
    List<String> result = new ArrayList<>();
    try {
        TokenStream stream = analyzer.tokenStream( null, new StringReader( str ) );
        stream.reset();
        while ( stream.incrementToken() ) {
            String nGram = stream.getAttribute( CharTermAttribute.class ).toString();
            result.add( nGram );
            LOG.debug( "Generated N-gram = {}", nGram );
        }
    } catch ( IOException e ) {
        LOG.error( "IO Exception occured! {}", e );
    }
    return result;
}
For my input blah1 blah2 blah3, the output is as follows, and I am okay with it:
blah1
blah1 blah2
blah2
blah2 blah3
blah3
However, when the input is Foo bar Foo2, my requirement is to generate the following output:
Foo
Foo bar
bar
bar Foo2
Foo2
If you noticed, I have to preserve the whitespace between two words exactly as it appears in the input: when two words are separated by more than one space, the generated shingle should keep all of those spaces rather than collapsing them into a single space.
Can I make any tweaks and ask Lucene to handle it internally?
Maybe it's a minor tweak, like adding a filter or something; since I am new to Lucene, I don't know where to start.
Thanks in advance.
I had to write a custom tokenizer and a trim filter to achieve this.
1) I created an abstract class DelimiterPreservingCharTokenizer by extending org.apache.lucene.analysis.Tokenizer and gave my own implementation of the incrementToken method. I would have extended org.apache.lucene.analysis.util.CharTokenizer instead, but its incrementToken method is final and cannot be overridden. DelimiterPreservingCharTokenizer looks like this:
package lucene.tokenizers;

import java.io.IOException;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.analysis.util.CharacterUtils.CharacterBuffer;
import org.apache.lucene.util.AttributeFactory;

/**
 *
 * @author Arun Gowda.
 * This class is exactly the same as {@link CharTokenizer}, except that tokens keep their trailing delimiters. This is to support N-gram vicinity matches.
 *
 * We are creating a new class instead of extending CharTokenizer because its incrementToken method is final and we cannot override it.
 *
 */
public abstract class DelimiterPreservingCharTokenizer extends Tokenizer
{
    /**
     * Creates a new {@link DelimiterPreservingCharTokenizer} instance
     */
    public DelimiterPreservingCharTokenizer()
    {}

    /**
     * Creates a new {@link DelimiterPreservingCharTokenizer} instance
     *
     * @param factory
     *          the attribute factory to use for this {@link Tokenizer}
     */
    public DelimiterPreservingCharTokenizer( AttributeFactory factory )
    {
        super( factory );
    }

    private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0;
    private static final int MAX_WORD_LEN = 255;
    private static final int IO_BUFFER_SIZE = 4096;

    private final CharTermAttribute termAtt = addAttribute( CharTermAttribute.class );
    private final OffsetAttribute offsetAtt = addAttribute( OffsetAttribute.class );

    private final CharacterUtils charUtils = CharacterUtils.getInstance();
    private final CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer( IO_BUFFER_SIZE );

    /**
     * Returns true iff a codepoint should be included in a token. This tokenizer
     * generates as tokens adjacent sequences of codepoints which satisfy this
     * predicate. Codepoints for which this is false are used to define token
     * boundaries and are not included in tokens.
     */
    protected abstract boolean isTokenChar( int c );

    /**
     * Called on each token character to normalize it before it is added to the
     * token. The default implementation does nothing. Subclasses may use this to,
     * e.g., lowercase tokens.
     */
    protected int normalize( int c )
    {
        return c;
    }

    @Override
    public final boolean incrementToken() throws IOException
    {
        clearAttributes();
        int length = 0;
        int start = -1; // this variable is always initialized
        int end = -1;
        char[] buffer = termAtt.buffer();
        while ( true ) {
            if ( bufferIndex >= dataLen ) {
                offset += dataLen;
                charUtils.fill( ioBuffer, input ); // read supplementary char aware with CharacterUtils
                if ( ioBuffer.getLength() == 0 ) {
                    dataLen = 0; // so next offset += dataLen won't decrement offset
                    if ( length > 0 ) {
                        break;
                    } else {
                        finalOffset = correctOffset( offset );
                        return false;
                    }
                }
                dataLen = ioBuffer.getLength();
                bufferIndex = 0;
            }
            // use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based methods are gone
            final int c = charUtils.codePointAt( ioBuffer.getBuffer(), bufferIndex, ioBuffer.getLength() );
            final int charCount = Character.charCount( c );
            bufferIndex += charCount;
            if ( isTokenChar( c ) ) { // if it's a token char
                if ( length == 0 ) { // start of token
                    assert start == -1;
                    start = offset + bufferIndex - charCount;
                    end = start;
                } else if ( length >= buffer.length - 1 ) { // check if a supplementary could run out of bounds
                    buffer = termAtt.resizeBuffer( 2 + length ); // make sure a supplementary fits in the buffer
                }
                end += charCount;
                length += Character.toChars( normalize( c ), buffer, length ); // buffer it, normalized
                if ( length >= MAX_WORD_LEN ) // buffer overflow! make sure to check for >= surrogate pair could break == test
                    break;
            } else if ( length > 0 ) // at non-Letter w/ chars
                break; // return 'em
        }

        if ( length > 0 && bufferIndex < ioBuffer.getLength() ) { // If at least one token is found,
            // THIS IS THE PART WHICH IS DIFFERENT FROM LUCENE'S CHARTOKENIZER
            // use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based methods are gone
            int c = charUtils.codePointAt( ioBuffer.getBuffer(), bufferIndex, ioBuffer.getLength() );
            int charCount = Character.charCount( c );
            bufferIndex += charCount;
            while ( !isTokenChar( c ) && bufferIndex < ioBuffer.getLength() ) { // As long as we find a delimiter (not a token char), keep appending it to the output stream.
                if ( length >= buffer.length - 1 ) { // check if a supplementary could run out of bounds
                    buffer = termAtt.resizeBuffer( 2 + length ); // make sure a supplementary fits in the buffer
                }
                end += charCount;
                length += Character.toChars( normalize( c ), buffer, length ); // buffer it, normalized
                if ( length >= MAX_WORD_LEN ) { // buffer overflow! make sure to check for >= surrogate pair could break == test
                    break;
                }
                c = charUtils.codePointAt( ioBuffer.getBuffer(), bufferIndex, ioBuffer.getLength() );
                charCount = Character.charCount( c );
                bufferIndex += charCount;
            }
            // ShingleFilter will add a delimiter. Hence, the last iteration is skipped.
            // That is, for input `abc   def   ghi`, this tokenizer will return `abc  ` (2 trailing spaces only). Then ShingleFilter will by default add another delimiter, making it `abc   ` (3 spaces, as in the input).
            // If there are N delimiters, this tokenizer will return at most N-1 of them.
            bufferIndex -= charCount;
        }
        termAtt.setLength( length );
        assert start != -1;
        offsetAtt.setOffset( correctOffset( start ), finalOffset = correctOffset( end ) );
        return true;
    }

    @Override
    public final void end() throws IOException
    {
        super.end();
        // set final offset
        offsetAtt.setOffset( finalOffset, finalOffset );
    }

    @Override
    public void reset() throws IOException
    {
        super.reset();
        bufferIndex = 0;
        offset = 0;
        dataLen = 0;
        finalOffset = 0;
        ioBuffer.reset(); // make sure to reset the IO buffer!!
    }
}
2) A concrete class, WhiteSpacePreservingTokenizer, extending the abstract class above to define the delimiter:
package spellcheck.lucene.tokenizers;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.util.AttributeFactory;

/**
 *
 * @author Arun Gowda
 *
 * This class is exactly the same as {@link WhitespaceTokenizer}. The only difference is that it extends DelimiterPreservingCharTokenizer instead of CharTokenizer.
 */
public class WhiteSpacePreservingTokenizer extends DelimiterPreservingCharTokenizer
{
    /**
     * Construct a new WhiteSpacePreservingTokenizer.
     */
    public WhiteSpacePreservingTokenizer()
    {}

    /**
     * Construct a new WhiteSpacePreservingTokenizer using a given
     * {@link org.apache.lucene.util.AttributeFactory}.
     *
     * @param factory
     *          the attribute factory to use for this {@link Tokenizer}
     */
    public WhiteSpacePreservingTokenizer( AttributeFactory factory )
    {
        super( factory );
    }

    /** Collects only characters which do not satisfy
     * {@link Character#isWhitespace(int)}. */
    @Override
    protected boolean isTokenChar( int c )
    {
        return !Character.isWhitespace( c );
    }
}
3) The tokenizer above will leave trailing spaces (e.g., blah____), so we need to add a filter to trim those spaces: a DelimiterTrimFilter, as follows. (We could also just trim with Java's String.trim(), but doing so would be quite inefficient, since it creates a new string each time.)
package spellcheck.lucene.filters;

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class DelimiterTrimFilter extends TokenFilter
{
    private final CharTermAttribute termAtt = addAttribute( CharTermAttribute.class );

    private char delimiter;

    /**
     * Create a new {@link DelimiterTrimFilter}.
     * @param in the stream to consume
     * @param delimiterToTrim delimiter that should be trimmed
     */
    public DelimiterTrimFilter( TokenStream in, char delimiterToTrim )
    {
        super( in );
        this.delimiter = delimiterToTrim;
    }

    @Override
    public boolean incrementToken() throws IOException
    {
        if ( !input.incrementToken() )
            return false;
        char[] termBuffer = termAtt.buffer();
        int len = termAtt.length();
        if ( len == 0 ) {
            return true;
        }
        int start = 0;
        int end = 0;
        // eat the first characters
        for ( start = 0; start < len && termBuffer[start] == delimiter; start++ ) {
        }
        // eat the end characters
        for ( end = len; end >= start && termBuffer[end - 1] == delimiter; end-- ) {
        }
        if ( start > 0 || end < len ) {
            if ( start < end ) {
                termAtt.copyBuffer( termBuffer, start, ( end - start ) );
            } else {
                termAtt.setEmpty();
            }
        }
        return true;
    }
}
4) My createAnalyzer method now looks like this:
public static Analyzer createAnalyzer( final int shingles )
{
    return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents( @NotNull String field )
        {
            final Tokenizer source = new WhiteSpacePreservingTokenizer();
            TokenStream filter = new ShingleFilter( new LowerCaseFilter( source ), shingles );
            filter = new DelimiterTrimFilter( filter, ' ' );
            return new TokenStreamComponents( source, filter );
        }
    };
}
The rest of the code remains the same.
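A quick usage sketch, reusing the generateNgrams helper from the question (the input's exact spacing here is illustrative, not from the original post):
// Hypothetical check that shingles keep the input's internal spacing;
// brackets make the preserved spaces visible in the printed output.
Analyzer analyzer = createAnalyzer( 2 );
for ( String nGram : generateNgrams( analyzer, "foo  bar   foo2" ) ) {
    System.out.println( "[" + nGram + "]" );
}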

Get count for rolling date value using Apache Pig

How can we achieve the following using Apache Pig?
File:
A 2014/10/01
A 2014/09/01
A 2014/08/01
A 2014/02/01
The result should be A count 3, since I want to count the number of records that fall within a rolling window of 30 days between consecutive records, grouped by A.
Please find the solution below; I hope you can enhance it further if required. Try executing it with your input and let me know how it works.
input.txt
A 2014/12/01
A 2014/11/01
A 2014/10/01
A 2014/07/01
A 2014/05/01
A 2014/04/01
B 2014/09/01
B 2014/07/01
B 2014/06/01
B 2014/02/01
C 2014/09/01
C 2014/07/01
C 2014/05/01
Expected output (each maximal run of k consecutive 30/31-day gaps between records contributes k+1 records to the count):
A 5
B 2
C 0
PigScript:
REGISTER rollingCount.jar;
A = LOAD 'input.txt' Using PigStorage(' ') AS (f1:chararray,f2:chararray);
B = GROUP A BY f1;
C = FOREACH B GENERATE mypackage.ROLLINGCOUNT(BagToString($1)) AS rollingCnt;
DUMP C;
Output from the script:
(A,5)
(B,2)
(C,0)
Java Code:
1. Compile the Java code below and create a jar file named rollingCount.jar.
2. I wrote this code quickly; you can optimize it if required.
ROLLINGCOUNT.java
package mypackage;

import java.io.*;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;
import java.text.SimpleDateFormat;
import java.util.concurrent.TimeUnit;
import java.util.*;

public class ROLLINGCOUNT extends EvalFunc<Integer> {

    public Integer exec(Tuple input) throws IOException {
        // Get the input String from the request
        String inputString = (String) input.get(0);
        Date[] arrayOfDates = getArrayOfDate(inputString);
        long diffDays[] = getDaysBetweenList(arrayOfDates);
        int rollingCount = getRollingCount(diffDays);
        return rollingCount;
    }

    // Function to convert the input string to an array of dates
    static protected Date[] getArrayOfDate(String inputString)
    {
        // Get the 1st column; this will be the Id
        String ID = inputString.split("_")[0];
        // Strip out all the Ids, because they are duplicate columns
        String modifiedString = inputString.replace(ID + "_", "");
        // Split the string into multiple columns using '_' as the delimiter
        String list[] = modifiedString.split("_");
        // Convert the strings to an array of dates
        Date[] dateList = new Date[list.length];
        int index = 0;
        for (String dateString : list)
        {
            try
            {
                // Convert the date string to a date object in the given format
                SimpleDateFormat dFormat = new SimpleDateFormat("yyyy/MM/dd");
                dateList[index++] = dFormat.parse(dateString);
            }
            catch (Exception e)
            {
                // error handling goes here
            }
        }
        return dateList;
    }

    // Function to get the difference in days between consecutive dates
    static protected long[] getDaysBetweenList(Date[] arrayOfDate)
    {
        long diffDays[] = new long[arrayOfDate.length - 1];
        int cnt = 0;
        for (int index = 0; index < arrayOfDate.length - 1; index++)
        {
            long diff = Math.abs(arrayOfDate[index + 1].getTime() - arrayOfDate[index].getTime());
            long days = TimeUnit.DAYS.convert(diff, TimeUnit.MILLISECONDS);
            diffDays[cnt++] = days;
        }
        return diffDays;
    }

    // Function to get the total rolling count
    static protected int getRollingCount(long diffDays[])
    {
        int result = 0;
        for (int index = 0; index < diffDays.length; index++)
        {
            int cnt = 0;
            // hardcoded the values of 30 and 31 days; may need to handle February with 28 or 29 days
            while ((index < diffDays.length) && ((diffDays[index] == 30) || (diffDays[index] == 31)))
            {
                cnt++;
                index++;
            }
            if (cnt > 0)
            {
                result = result + cnt + 1;
            }
        }
        return result;
    }
}
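As a quick sanity check of the counting rule (a standalone sketch, not part of the answer's jar): each maximal run of k consecutive 30/31-day gaps contributes k+1 records. For A in the sample input, the gaps between consecutive dates are 30, 31, 92, 61, 30, giving 3 + 2 = 5:
// Standalone check of the rule implemented by getRollingCount above.
public class RollingCountCheck {
    public static void main(String[] args) {
        // Day gaps between A's consecutive dates in the sample input.
        long[] gaps = {30, 31, 92, 61, 30};
        int result = 0;
        for (int i = 0; i < gaps.length; i++) {
            int cnt = 0;
            // walk each maximal run of 30/31-day gaps
            while (i < gaps.length && (gaps[i] == 30 || gaps[i] == 31)) {
                cnt++;
                i++;
            }
            if (cnt > 0) {
                result += cnt + 1; // a run of k gaps spans k+1 records
            }
        }
        System.out.println(result); // prints 5, matching the expected "A 5"
    }
}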

How to write this Pig query?

I have a many-to-many mapping table between two collections. Each row in the mapping table represents a possible mapping with a weight score.
mapping(id1, id2, weight)
Query: Generate a one-to-one mapping between id1 and id2. Use the lowest weight to remove duplicate mappings. If there is a tie, output an arbitrary one.
Example input:
(1, X, 1)
(1, Y, 2)
(2, X, 3)
(2, Y, 1)
(3, Z, 2)
Output
(1, X)
(2, Y)
(3, Z)
1 and 2 are both mapped to X and Y. We pick the mappings (1, X) and (2, Y) because they have the lowest weights.
I will assume that you are only interested in mappings where the weight is the lowest of any mapping involving id1, and also the lowest of any mapping involving id2. For example, if you additionally had the mapping (2, Y, 4), it would not conflict with (1, X, 1). I will exclude such mappings because the weight is smaller than (1, Y, 2) and (2, X, 3), which were disqualified.
My solution proceeds as follows: find the minimum mapping weight for each id1, and then join that back into the mapping relation for future reference. Use a nested foreach to go through each id2: use ORDER and LIMIT to select the record with the smallest weight for that id2, and then keep it only if its weight is also the minimum for that id1.
Here is the full script, tested on your input:
mapping = LOAD 'input' AS (id1:chararray, id2:chararray, weight:double);
id1_weights =
FOREACH (GROUP mapping BY id1)
GENERATE group AS id1, MIN(mapping.weight) AS id1_min_weight;
mapping_with_id1_mins =
FOREACH (JOIN mapping BY id1, id1_weights BY id1)
GENERATE mapping::id1, id2, weight, id1_min_weight;
accepted_mappings =
FOREACH (GROUP mapping_with_id1_mins BY id2)
{
ordered = ORDER mapping_with_id1_mins BY weight;
selected = LIMIT ordered 1;
acceptable = FILTER selected BY weight == id1_min_weight;
GENERATE FLATTEN(acceptable);
};
DUMP accepted_mappings;
I solved it by using a Java UDF. It's not perfect, in the sense that it won't maximize the number of one-to-one mappings, but it's good enough.
Pig:
d = load 'test' as (fid, iid, priority:double);
g = group d by fid;
o = foreach g generate FLATTEN(com.propeld.pig.DEDUP(d)) as (fid, iid, priority);
store o into 'output';
g2 = group o by iid;
o2 = foreach g2 generate FLATTEN(com.propeld.pig.DEDUP(o)) as (fid, iid, priority);
store o2 into 'output2';
Java UDF:
package com.propeld.pig;

import java.io.IOException;
import java.util.Iterator;
import org.apache.pig.Algebraic;
import org.apache.pig.EvalFunc;
import org.apache.pig.PigException;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

public class DEDUP extends EvalFunc<Tuple> implements Algebraic {

    public String getInitial() { return Initial.class.getName(); }

    public String getIntermed() { return Intermed.class.getName(); }

    public String getFinal() { return Final.class.getName(); }

    static public class Initial extends EvalFunc<Tuple> {
        private static TupleFactory tfact = TupleFactory.getInstance();

        public Tuple exec(Tuple input) throws IOException {
            // Initial is called in the map.
            // we just send the tuple down
            try {
                // input is a bag with one tuple containing
                // the column we are trying to operate on
                DataBag bg = (DataBag) input.get(0);
                if (bg.iterator().hasNext()) {
                    Tuple dba = (Tuple) bg.iterator().next();
                    return dba;
                } else {
                    // make sure that we call the object constructor, not the list constructor
                    return tfact.newTuple((Object) null);
                }
            } catch (ExecException e) {
                throw e;
            } catch (Exception e) {
                int errCode = 2106;
                throw new ExecException("Error executing an algebraic function", errCode, PigException.BUG, e);
            }
        }
    }

    static public class Intermed extends EvalFunc<Tuple> {
        public Tuple exec(Tuple input) throws IOException {
            return dedup(input);
        }
    }

    static public class Final extends EvalFunc<Tuple> {
        public Tuple exec(Tuple input) throws IOException { return dedup(input); }
    }

    static protected Tuple dedup(Tuple input) throws ExecException, NumberFormatException {
        DataBag values = (DataBag) input.get(0);
        Double min = Double.MAX_VALUE;
        Tuple result = null;
        for (Iterator<Tuple> it = values.iterator(); it.hasNext();) {
            Tuple t = (Tuple) it.next();
            if ((Double) t.get(2) < min) {
                min = (Double) t.get(2);
                result = t;
            }
        }
        return result;
    }

    @Override
    public Tuple exec(Tuple input) throws IOException {
        return dedup(input);
    }
}