get count for rolling date value Using Apache Pig - apache-pig

How can we achieve the following using Apache Pig?
File :
A 2014/10/01
A 2014/09/01
A 2014/08/01
A 2014/02/01
The result for A should be a count of 3, since I want to count the number of records using a rolling window of 30 days between consecutive records, grouped by A.

Please find the solution below; I hope you can enhance it further if required. Try executing it with your input and let me know how it works.
input.txt
A 2014/12/01
A 2014/11/01
A 2014/10/01
A 2014/07/01
A 2014/05/01
A 2014/04/01
B 2014/09/01
B 2014/07/01
B 2014/06/01
B 2014/02/01
C 2014/09/01
C 2014/07/01
C 2014/05/01
Expected output
A 5
B 2
C 0
PigScript:
-- Make the ROLLINGCOUNT UDF available to the script.
REGISTER rollingCount.jar;
-- Each input line is "<id> <yyyy/MM/dd>", separated by a single space.
A = LOAD 'input.txt' USING PigStorage(' ') AS (f1:chararray,f2:chararray);
B = GROUP A BY f1;
-- Bags are unordered, so sort each group's dates before flattening them to a
-- string; the UDF compares neighbouring dates. yyyy/MM/dd sorts
-- chronologically as plain text.
C = FOREACH B {
    sorted = ORDER A BY f2 DESC;
    -- Emit the group key as well, so the output is (id, rollingCnt).
    GENERATE group, mypackage.ROLLINGCOUNT(BagToString(sorted)) AS rollingCnt;
};
DUMP C;
OutPut from the Script:
(A,5)
(B,2)
(C,0)
Java Code:
1. Compile the Java code below and package it into a jar file named rollingCount.jar
2. I wrote this code quickly; you can optimize it if required.
ROLLINGCOUNT.java
package mypackage;
import java.io.*;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;
import java.text.SimpleDateFormat;
import java.util.concurrent.TimeUnit;
import java.util.*;
public class ROLLINGCOUNT extends EvalFunc<Integer> {
public Integer exec(Tuple input) throws IOException {
//Get the input String from request
String inputString = (String)input.get(0);
Date[] arrayOfDates = getArrayOfDate(inputString);
long diffDays[] = getDaysBetweenList(arrayOfDates);
int rollingCount = getRollingCount(diffDays);
return rollingCount;
}
//Function to convert strings to array of dates
static protected Date[] getArrayOfDate(String inputString)
{
//Get the 1st column, this will be the Id
String ID = inputString.split("_")[0];
//Replace all the Ids with Null, bcoz its a duplicate columns
String modifiedString = inputString.replace(ID+"_","");
//Split the string into multiple columns using '_' as delimiter
String list[] = modifiedString.split("_");
//Convert the string to list of array dates
Date[] dateList = new Date[list.length];
int index=0;
for (String dateString: list)
{
try
{
//Convert the date string to date object in the give format
SimpleDateFormat dFormat = new SimpleDateFormat("yyyy/MM/dd");
dateList[index++] = dFormat.parse(dateString);
}
catch(Exception e)
{
// error handling goes here
}
}
return dateList;
}
//Function to get difference between two dates
static protected long[] getDaysBetweenList(Date[] arrayOfDate)
{
long diffDays[] = new long[arrayOfDate.length-1];
int cnt=0;
for (int index=0; index<arrayOfDate.length-1;index++)
{
long diff = Math.abs(arrayOfDate[index+1].getTime() - arrayOfDate[index].getTime());
long days = TimeUnit.DAYS.convert(diff, TimeUnit.MILLISECONDS);
diffDays[cnt++] = days;
}
return diffDays;
}
//Function to get the total rolling count
static protected int getRollingCount(long diffDays[])
{
int result =0;
for(int index=0;index<diffDays.length;index++)
{
int cnt =0;
//hardcoded the values of 30 and 31 days, may need to handle Feb month 28 or 29 days
while((index<diffDays.length)&&((diffDays[index]==30)||(diffDays[index]==31)))
{
cnt++;
index++;
}
if(cnt>0)
{
result = result + cnt+1;
}
}
return result;
}
}

Related

Why doesn't my number sequence print from the 2d arraylist correctly?

I cannot get the loop in the buildDimArray method to store the number combinations "11+11", "11+12", "11+21", "11+22", "12+11", "12+12", "12+21", "12+22", "21+11", "21+12", "21+21", "21+22", "22+11", "22+12", "22+21", and "22+22" into the 2d arraylist, with each expression going into one column of the row at index dimBase-1. The loop may work for other people, but for some reason mine isn't functioning correctly. The JVM sees the dimBase==1 condition, but seems to refuse to check the other conditions. As a result, the "WTF" in the buildDimArray method is not printed. If dimBase=1, it prints successfully, but it doesn't for the other integers. The dimBase==3 condition will need a loop eventually. The "WTF" is for illustrative purposes. I could get away with a 1d arraylist, but I will likely need the 2d arraylist once the program is completed.
package jordanNumberApp;
import java.util.Scanner;
import java.util.ArrayList;
/*
* Dev Wills
* Purpose: This code contains some methods that aren't developed. This program is supposed to
* store all possible number combinations from numbers 1-dimBase for the math expression
* "##+##" into a 2d arraylist at index row dimBase-1 and the columns storing the
* individual combinations. After storing the values in the arraylist, the print method
* pours the contents in order from the arraylist as string values.
*/
/**
 * Builds and prints every "##+##" expression whose four digits each range
 * over 1..dimBase. The expressions for a given base are stored in row
 * {@code dimBase - 1} of {@link #dimBaseArray}.
 */
public class JordanNumberSystem {
    // a-d are the four digits of the expression generated most recently.
    public static int dimBase, outcomeAnswer, a, b, c, d;
    public static String inputOutcome, outcomeOutput;
    public static final int NUM_OF_DIMENSIONS = 9; //Eventually # combinations go up to 9
    public static ArrayList<ArrayList<String>> dimBaseArray;
    public static Scanner keyboard;

    /**
     * Constructor for JordanNumberSystem; accepts no parameters.
     * All state is static, so there is nothing to set up per instance.
     */
    public JordanNumberSystem() {
    }

    /** Resets the static state and creates an empty 2d list and the input scanner. */
    public static void setAllValues() {
        dimBase = 1;
        outcomeAnswer = 22; // variables not used for now
        outcomeOutput = "1"; // variables not used for now
        dimBaseArray = new ArrayList<ArrayList<String>>();
        keyboard = new Scanner(System.in);
    }

    /**
     * Fills row {@code dim - 1} with every expression "ab+cd" where each of
     * a, b, c and d runs from 1 to {@code dim}, in ascending order.
     *
     * @param dim the dimension base; values below 1 are rejected with a message
     */
    public static void buildDimArray(int dim) {
        if (dim < 1) {
            System.out.println("invalid Dimension Base Number: " + dim);
            return;
        }
        dimBase = dim;
        if (dimBaseArray == null) {
            dimBaseArray = new ArrayList<ArrayList<String>>();
        }
        // ArrayList.add(index, element) throws when index > size(), so the
        // rows below the target one have to exist before it can be addressed.
        while (dimBaseArray.size() < dim) {
            dimBaseArray.add(new ArrayList<String>());
        }
        ArrayList<String> row = dimBaseArray.get(dim - 1);
        row.clear();

        // Every loop needs its own braces; without them only the innermost
        // statement is repeated and the loops do not nest.
        for (int i = 1; i <= dim; i++) {
            a = i;
            for (int j = 1; j <= dim; j++) {
                b = j;
                for (int k = 1; k <= dim; k++) {
                    c = k;
                    for (int l = 1; l <= dim; l++) {
                        d = l;
                        row.add(a + "" + b + "+" + c + "" + d);
                    }
                }
            }
        }
    }

    /**
     * Prints each expression stored for the given base, followed by a blank line.
     *
     * @param num the dimension base whose row should be printed
     */
    public static void printArray(int num) {
        try {
            for (String expression : dimBaseArray.get(num - 1)) {
                System.out.println(expression);
                System.out.println("");
            }
        } catch (IndexOutOfBoundsException e) {
            // No row has been built for this base.
            System.out.println(e.getMessage());
        }
    }

    /** Reads a dimension base (1, 2 or 3) from standard input, builds its row and prints it. */
    public static void main(String[] args) {
        setAllValues(); // initializes dimBase and the 2d arraylist
        System.out.println("Enter Dimension Base Number. Input an integer: ");
        if (!keyboard.hasNextInt()) {
            System.out.println("invalid Dimension Base Number should be 1, 2 or 3 ");
            System.exit(1);
        }
        int dimBaseInput = keyboard.nextInt();
        if (dimBaseInput < 1 || dimBaseInput > 3) {
            System.out.println("invalid Dimension Base Number should be 1, 2 or 3 ");
            System.exit(1);
        }
        // Build the arraylist, print, clear, exit
        buildDimArray(dimBaseInput);
        printArray(dimBaseInput);
        dimBaseArray.clear();
        System.exit(0); // normal termination
    }
}// End of class

hive querying records for a specific uniontype

I have a sample hive table created as
CREATE TABLE union_test(foo UNIONTYPE<int, double, array<string>, struct<a:int,b:string>>);
The data can be viewed as
SELECT foo FROM union_test;
The output is
{0:1}
{1:2.0}
{2:["three","four"]}
{3:{"a":5,"b":"five"}}
{2:["six","seven"]}
{3:{"a":8,"b":"eight"}}
{0:9}
{1:10.0}
the first field (tag) denotes the type of the union ( 0 for int, 1 for double, 2 for array etc).
My problem is: if I want to select only those records where the union type is 2 (array), how should I frame my query?
There is no function in Hive to read data from a UnionType, so I wrote two UDFs: one to get the union tag (which is what you are trying to do) and a second to get the struct from the union, as an example.
get_union_tag() function:
package HiveUDF;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
#Description(name = "get_union_tag", value = "_FUNC_(unionObject)"
+ " - Returns union object Tag", extended = "Example:\n" + " > SELECT _FUNC_(unionObject) FROM src LIMIT 1;\n one")
public class GetUnionTag extends GenericUDF {
// Global variables that inspect the input.
// These are set up during the initialize() call, and are then used during the
// calls to evaluate()
private transient UnionObjectInspector uoi;
#Override
// This is what we do in the initialize() method:
// Verify that the input is of the type expected
// Set up the ObjectInspectors for the input in global variables
// Return the ObjectInspector for the output
public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
// Verify the input is of the required type.
// Set the global variables (the various ObjectInspectors) while we're doing this
// Exactly one input argument
if( arguments.length != 1 ){
throw new UDFArgumentLengthException("_FUNC_(unionObject) accepts exactly one argument.");
}
// Is the input an array<>
if( arguments[0].getCategory() != ObjectInspector.Category.UNION ){
throw new UDFArgumentTypeException(0,"The single argument to AddExternalIdToPurchaseDetails should be "
+ "Union<>"
+ " but " + arguments[0].getTypeName() + " is found");
}
// Store the ObjectInspectors for use later in the evaluate() method
uoi = ((UnionObjectInspector)arguments[0]);
// Set up the object inspector for the output, and return it
return PrimitiveObjectInspectorFactory.javaByteObjectInspector;
}
#Override
public Object evaluate(DeferredObject[] arguments) throws HiveException {
byte tag = uoi.getTag(arguments[0].get());
return tag;
}
#Override
public String getDisplayString(String[] children) {
StringBuilder sb = new StringBuilder();
sb.append("get_union_tag(");
for (int i = 0; i < children.length; i++) {
if (i > 0) {
sb.append(',');
}
sb.append(children[i]);
}
sb.append(')');
return sb.toString();
}
}
function get_struct_from_union() UDF :
package HiveUDF;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector;
#Description(name = "get_union_struct", value = "_FUNC_(unionObject)"
+ " - Returns struct ", extended = "Example:\n" + " > _FUNC_(unionObject).value \n 90.0121")
public class GetUnionStruct extends GenericUDF {
// Global variables that inspect the input.
// These are set up during the initialize() call, and are then used during the
// calls to evaluate()
//
// ObjectInspector for the list (input array<>)
// ObjectInspector for the struct<>
// ObjectInspectors for the elements of the struct<>, target, quantity and price
private UnionObjectInspector unionObjectInspector;
private StructObjectInspector structObjectInspector;
#Override
// This is what we do in the initialize() method:
// Verify that the input is of the type expected
// Set up the ObjectInspectors for the input in global variables
// Return the ObjectInspector for the output
public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
// Verify the input is of the required type.
// Set the global variables (the various ObjectInspectors) while we're doing this
// Exactly one input argument
if( arguments.length != 1 ){
throw new UDFArgumentLengthException("_FUNC_(unionObject) accepts exactly one argument.");
}
// Is the input an array<>
if( arguments[0].getCategory() != ObjectInspector.Category.UNION ){
throw new UDFArgumentTypeException(0,"The single argument to AddExternalIdToPurchaseDetails should be "
+ "Union<Struct>"
+ " but " + arguments[0].getTypeName() + " is found");
}
// Set up the object inspector for the output, and return it
return structObjectInspector;
}
#Override
public Object evaluate(DeferredObject[] arguments) throws HiveException {
return ((UnionObjectInspector) unionObjectInspector).getField(arguments[0].get());
}
#Override
public String getDisplayString(String[] children) {
StringBuilder sb = new StringBuilder();
sb.append("get_union_vqtstruct(");
for (int i = 0; i < children.length; i++) {
if (i > 0) {
sb.append(',');
}
sb.append(children[i]);
}
sb.append(')');
return sb.toString();
}
}
To use these UDFs, compile them and create a jar file. Then upload it into Hive (in my case HDInsight). Then just use
add jar wasb:///hive/HiveGUDF.jar;
-- Register both UDFs from the jar; each function must be created before a query can call it.
CREATE TEMPORARY FUNCTION get_union_tag AS 'HiveUDF.GetUnionTag';
CREATE TEMPORARY FUNCTION get_union_struct AS 'HiveUDF.GetUnionStruct';
before you run, e.g.
SELECT get_union_tag(exposed) FROM test;

How do i create a calculated measure that will filter data by days overdue

I have a field in my fact table called days overdue. I would like to create a set that will do the following: if the days overdue is between 0 and 29, then '0 - 29 days overdue'; if it is between 30 and 59, then '30 - 59 days overdue'. How would I create this?
We need to know what kind of array you're using, or linked list, or my favorite for these things, a vector, etc.
If you were using a vector, you would create your own class to be used as a datatype with things like:
/** One item to classify: a name and the number of days it is past due. */
class MyData
{
    String name;
    int daysPastDue; // how you want to factor this is up to you,
                     // i suggest looking into java.util.Date or java.util.Calendar

    /** Creates an item with an empty name that is zero days past due. */
    public MyData()
    {
        name = "";
        daysPastDue = 0;
    }
}
Class DoWork
{
public void myWork() // excuse the indent, forgot to put in the class name
{
vector <MyData> input;
MyData 0To29 [] = new MyData[input.size()];
MyData 33To59 [] = new MyData[input.size()];
MyData item = new MyData();
int 0To29count = 0;
int 30To59count = 0;
for (i = 0; i <= list.size(); i++)
{
item = input.elementAt(i)
if (item.daysPastDue <= 29)
{
0To29[0To29Count] = input;
0To29Count ++;
}
elseif (item.daysPastDue >= 30 && item.daysPastDue <= 59)
{
30To59[30To59Count] = input;
30To59Count ++;
}
}
}
}
Then you have your 2 arrays and can output them as you wish. However, I would recommend starting at daysPastDue = 100000, decrementing it, and checking that number against the vector until you have listed all the items in the vector. That way they're all in order from the most past due to the least, and you get output showing exactly how long they've been past due.

apache pig Java UDF - changing values in attributes doesn't seem to stick

I'm trying to write a Java UDF that will rank tuples in a bag using a java UDF.
The tuples have a value column that is the criteria for the ranking and a rank column which is initially set to 0.
The tuples are sorted based on the value column.
All the tuples are placed in a bag and that bag is placed inside a new tuple which is passed to the UDF.
The UDF is modifying the rank column however - once the method exits the values have all become 0 again. I'm not sure how to get the values to "Stick".
Any help would be greatly appreciated.
Here is my java class
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.pig.FilterFunc;
import org.apache.pig.EvalFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.DataBag;
import org.apache.pig.impl.logicalLayer.FrontendException;
import java.util.Iterator;
import org.apache.pig.PigWarning;
/**
*
* #author Winter
*/
public class Ranker extends EvalFunc<String>{
#Override
public String exec(Tuple tuple) throws IOException {
if (tuple == null || tuple.size() == 0) {
return null;
}
List<Object> list = tuple.getAll();
DataBag db = (DataBag) list.get(0);
Integer num = (Integer)list.get(1);
Iterator<Tuple>itr = db.iterator();
boolean containsNonNull = false;
int i = 1;
double previous=0;
while (itr.hasNext()) {
Tuple t= itr.next();
double d = (Double)t.get(num.intValue());
int rankCol = t.size()-1;
Integer rankVal = (Integer)t.get(rankCol);
if(i == 0){
System.out.println("i==0");
previous = d;
t.set(rankCol, i);
} else {
if(d == previous)
t.set(rankCol, i);
else{
System.out.print("d!==previous|" + d + "|"+ previous+"|"+rankVal);
t.set(rankCol, ++i);
rankVal = (Integer)t.get(rankCol);
System.out.println("|now rank val" + rankVal);
previous = d;
}
}
}
return "Y";
}
}
Here is how I am calling everything in Pig -
-- Register the jar that contains the UDF.
REGISTER /myJar.jar;
-- Load (year, milk) rows.
A = LOAD '/Users/Winter/milk-tea-coffee.tsv' as (year:chararray, milk:double);
-- Append a rank column initialised to 0.
B = foreach A generate year, milk, 0 as rank;
-- Sort by milk, ascending.
C = order B by milk asc;
-- NOTE(review): GROUP followed by an inline "order ... by" clause may not be valid on every Pig version -- confirm.
D = group C by rank order C by milk;
-- Call Ranker with the bag and 1 as the index of the value column.
E = foreach D generate D.C.year,D.C.milk,D.C.rank, piglet3.evalFunctions.Ranker(D.C,1);
dump E;
I can tell its working inside the UDF because of the print statements inside the UDF -
d!==previous|21.2|0.0|0|now rank val2
d!==previous|21.6|21.2|0|now rank val3
d!==previous|21.9|21.6|0|now rank val4
d!==previous|22.0|21.9|0|now rank val5
d!==previous|22.5|22.0|0|now rank val6
d!==previous|22.9|22.5|0|now rank val7
d!==previous|23.0|22.9|0|now rank val8
d!==previous|23.4|23.0|0|now rank val9
d!==previous|23.8|23.4|0|now rank val10
d!==previous|23.9|23.8|0|now rank val11
but when I dump out E or D or C the rank column only contains 0s.
The exec function must return the output you want from the UDF. You are currently modifying the Tuple that is being passed to the exec function, then returning the String "Y" -- all that Pig sees as output from your UDF is "Y". In this case, you should return the Tuple instead of "Y".
I think the following code is close to your intent, but I'm not quite clear on what you are trying to do:
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.pig.FilterFunc;
import org.apache.pig.EvalFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.DataBag;
import org.apache.pig.impl.logicalLayer.FrontendException;
import java.util.Iterator;
import org.apache.pig.PigWarning;
/**
*
* #author Winter
*/
public class Ranker extends EvalFunc<Tuple>{
#Override
public Tuple exec(Tuple tuple) throws IOException {
if (tuple == null || tuple.size() == 0) {
return null;
}
List<Object> list = tuple.getAll();
DataBag db = (DataBag) list.get(0);
Integer num = (Integer)list.get(1);
Iterator<Tuple>itr = db.iterator();
boolean containsNonNull = false;
int i = 1;
double previous=0;
while (itr.hasNext()) {
Tuple t= itr.next();
double d = (Double)t.get(num.intValue());
int rankCol = t.size()-1;
Integer rankVal = (Integer)t.get(rankCol);
if(i == 0){
System.out.println("i==0");
previous = d;
t.set(rankCol, i);
} else {
if(d == previous)
t.set(rankCol, i);
else{
System.out.print("d!==previous|" + d + "|"+ previous+"|"+rankVal);
t.set(rankCol, ++i);
rankVal = (Integer)t.get(rankCol);
System.out.println("|now rank val" + rankVal);
previous = d;
}
}
}
return tuple;
}
}

How to multiply several fields in a tuple by a given field of the tuple

For each row of data, I would like to multiply fields 1 through N by field 0. The data could have hundreds of fields per row (or a variable number of fields for that matter), so writing out each pair is not feasible. Is there a way to specify a range of fields, sort of like the following (incorrect) snippet?
A = LOAD 'foo.csv' USING PigStorage(',');
B = FOREACH A GENERATE $0*($1,..);
A UDF could come in handy here.
Implement exec(Tuple input) and iterate over all fields of the tuple as follows (not tested):
public class MultiplyField extends EvalFunc<Long> {
public Long exec(Tuple input) throws IOException {
if (input == null || input.size() == 0) {
return null;
}
try {
Long retVal = 1;
for (int i = 0; i < input.size(); i++) {
Long j = (Long)input.get(i);
retVal *= j;
}
return retVal;
} catch(Exception e) {
throw WrappedIOException.wrap("Caught exception processing input row ", e);
}
}
}
Then register your UDF and call it from your FOREACH.