Apache Pig: Combine multiple records within a bag - apache-pig

Any help with this would be greatly appreciated! The best way to explain is with an example:
Input:
Schema:
Name|phone_type|phone_num
Example data:
Kyle|Cell|555-222-3333
Kyle|Home|453-444-5555
Tom|Home|555-555-5555
Tom|Pager|555-555-4344
Desired output:
Schema:
Name|Home_num|Cell_num|Pager_num
Example:
Kyle|453-444-5555|555-222-3333|null
Tom|555-555-5555|null|555-555-4344
Code:
data = LOAD 'test.txt' USING PigStorage('|');
grpd = GROUP data BY $0;
FOREACH grpd {
    ???
}

After the comment from #Murali lao, I have rewritten the solution.
I now use FILTER; the trick to keep FLATTEN from dropping records with an empty bag is to substitute a bag containing an empty string whenever the bag is empty.
Here are my test data:
tom,home,555
tom,pager,666
tom,cell,777
bob,home,111
bob,cell,222
Here is my solution:
data = LOAD 'phone' USING PigStorage(',') AS (name:chararray, phone_type:chararray, phone_num:chararray);
user = FOREACH (GROUP data BY name) {
    home = FILTER $1 BY phone_type == 'home';
    -- substitute a bag with an empty string if the bag is empty,
    -- otherwise FLATTEN would drop the whole record
    homenum = (IsEmpty(home) ? {('')} : home.phone_num);
    pager = FILTER $1 BY phone_type == 'pager';
    pagernum = (IsEmpty(pager) ? {('')} : pager.phone_num);
    cell = FILTER $1 BY phone_type == 'cell';
    cellnum = (IsEmpty(cell) ? {('')} : cell.phone_num);
    GENERATE group AS name, FLATTEN(homenum) AS home, FLATTEN(pagernum) AS pager, FLATTEN(cellnum) AS cell;
};
After a dump, I obtain the following result:
(bob,111,,222)
(tom,555,666,777)
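If you also want the pipe-delimited layout and the literal null placeholders from the original question, a follow-up FOREACH plus STORE should get you there. This is only a minimal sketch building on the user relation above; the output path 'phones_pivoted' is made up:
-- turn the empty placeholders into the literal string 'null' and
-- reorder the fields to Name|Home_num|Cell_num|Pager_num
final = FOREACH user GENERATE
    name,
    (home == '' ? 'null' : home) AS home_num,
    (cell == '' ? 'null' : cell) AS cell_num,
    (pager == '' ? 'null' : pager) AS pager_num;
STORE final INTO 'phones_pivoted' USING PigStorage('|');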

Related

Dynamic column search in multiple tables with gorm golang

My scenario: I have a grid with a search option where the user can select a column and search on it; the grid data comes from various tables. I have attached a sample screen of the grid.
User Screen
So I'm trying to create a dynamic query for the search, but the problem is that I can only search the main table (schema.Robot), not the preloaded tables. Whenever I try to search data from a preloaded table, say RobotModel, I get the error below:
pq: missing FROM-clause entry for table "robot_models"
Here is my Go code:
func (r *RobotsRepository) GetRobotsSummary(listParams viewmodel.ListParams, companyID uint) ([]*schema.Robot, int, error) {
    mrobots := []*schema.Robot{}
    var count int
    var order string
    if listParams.SortColumn == "" {
        listParams.SortColumn = "id"
        listParams.SortOrder = 1
    } else {
        listParams.SortColumn = util.Underscore(listParams.SortColumn)
    }
    if listParams.SortOrder == 0 {
        order = "ASC"
    } else {
        order = "DESC"
    }
    var searchQuery string
    if listParams.SearchText != "" {
        switch listParams.SearchColumn {
        case "Robot":
            listParams.SearchColumn = "name"
        case "Model":
            listParams.SearchColumn = "robot_models.name"
        }
        searchQuery = listParams.SearchColumn + " LIKE '%" + listParams.SearchText + "%' and Company_ID = " + fmt.Sprint(companyID)
    } else {
        searchQuery = "Company_ID = " + fmt.Sprint(companyID)
    }
    orderBy := fmt.Sprintf("%s %s", listParams.SortColumn, order)
    err := r.Conn.
        Preload("RobotModel", func(db *gorm.DB) *gorm.DB {
            return db.Select("ID,Name")
        }).
        Preload("Task", func(db *gorm.DB) *gorm.DB {
            return db.Where("Task_Status in ('In-Progress','Pending')").Select("ID, Task_Status")
        }).
        Preload("CreatedUser", func(db *gorm.DB) *gorm.DB {
            return db.Select("ID,Display_Name")
        }).
        Preload("UpdatedUser", func(db *gorm.DB) *gorm.DB {
            return db.Select("ID,Display_Name")
        }).
        Where(searchQuery).
        Order(orderBy).
        Offset(listParams.PageSize * (listParams.PageNo - 1)).
        Limit(listParams.PageSize).
        Find(&mrobots).Error
    r.Conn.Model(&schema.Robot{}).Where(searchQuery).Count(&count)
    return mrobots, count, err
}
I'm storing my dynamic query in the searchQuery variable.
My question is: how can I search on the preloaded tables' columns?
Here is the SQL query I'm trying to achieve using gorm:
SELECT robots.id, robots.name, robot_models.name AS model_name,
       count(tasks.task_status) AS task_on_hand,
       robots.updated_at, users.user_name AS updated_by
FROM rfm.robots AS robots
LEFT JOIN rfm.tasks AS tasks ON tasks.robot_id = robots.id
       AND tasks.task_status IN ('In-Progress', 'Pending')
LEFT JOIN rfm.robot_models AS robot_models ON robot_models.id = robots.robot_model_id
LEFT JOIN rfm.users AS users ON users.id = robots.updated_by
WHERE robot_models.name::varchar LIKE '%RNR%' AND robots.deleted_at IS NULL
GROUP BY robots.id, robot_models.name, users.user_name
ORDER BY task_on_hand DESC LIMIT 2 OFFSET 0
And sorry for my bad English!
Even though you are preloading, you are still required to explicitly use joins when filtering and ordering on columns on other tables. Preloading is used to eagerly load the data to map into your models, not to join tables.
Chain on something like this:
.Joins("LEFT JOIN rfm.robot_models AS robot_models ON robot_models.id = robots.robot_model_id")
I'm not positive if you can use the AS keyword using this technique, but if not, it should be easy enough to adjust your query accordingly.

How to return a JSON array from sql table with PhalconPHP

I have several tables that have JSON arrays stored within fields.
Using PHP PDO I am able to retrieve this data without issue using:
$query1 = $database->prepare("SELECT * FROM module_settings
WHERE project_token = ? AND module_id = ? ORDER BY id DESC LIMIT 1");
$query1->execute(array($page["project_token"], 2));
$idx = $query1->fetch(PDO::FETCH_ASSOC);
$idx["settings"] = json_decode($idx["settings"]);
This returns a string like:
{"mid":"","module_id":"1","force_reg_enable":"1","force_reg_page_delay":"2"}
Attempting to gather the same data via PhalconPHP:
$result = Modulesettings::findFirst(array(
    'conditions' => 'project_token = "' . $token . '"',
    'columns' => 'settings'
));
var_dump($result);
provides a result of:
object(Phalcon\Mvc\Model\Row)#61 (1) { ["settings"]=> string(167) "{"text":"<\/a>
<\/a>
","class":""}" }
What do I need to do differently in Phalcon to return the string as it is stored in the table?
Thank you.
You have two approaches.
First, get the settings directly from the result object:
$settings = $result->settings;
var_dump($settings);
Second, convert the resultset to an array first, then use the array element:
$res = $result->toArray();
var_dump($res['settings']);
Try it.
You can decode json right in your Modulesettings model declaration:
// handling result
function afterFetch() {
$this->settings = json_decode($this->settings);
}
// saving. Can use beforeCreate+beforeSave+beforeUpdate
// or write a Json filter.
function beforeValidation() {
$this->settings = json_encode($this->settings);
}

How to get intersection of bag and tuple in Pig?

I have a bag like this: (url:chararray, mal:float), and another like this: (url:chararray, links:chararray).
I want to parse the links field and intersect the first bag with the parsed links:
src = LOAD 'hbase://$collection' USING org.apache.pig.backend.hadoop.hbase.HBaseStorage('info:url anchors:links', '-loadKey true') AS (id:bytearray, url:chararray, links:chararray);
mals = LOAD '/tmp/prepare' AS (url:chararray, mal:float);
urls = FILTER src BY (links IS NOT null);
urls2 = FOREACH urls GENERATE TOKENIZE(links, '\t') AS links, id, url;
processed = FOREACH urls2 {
    grouped = COGROUP links BY $0, mals BY url;
    intersected = FILTER grouped BY NOT IsEmpty(urls) AND NOT IsEmpty(links4);
    weights = FOREACH intersected GENERATE mal;
    GENERATE id, AVG(weights) AS mal;
};
This code isn't working; the parser fails with:
[main] ERROR org.apache.pig.tools.grunt.Grunt - ERROR 1200: <file ./Rank.pig, line 11, column 19> [query, statement, foreach_statement, foreach_complex_statement, foreach_clause_complex, foreach_plan_complex, nested_blk, nested_command_list, nested_command, expr, add_expr, multi_expr, cast_expr, unary_expr, expr_eval, var_expr, projectable_expr, func_eval, recoverFromMismatchedToken] mismatched input 'links' expecting LEFT_PAREN
I'm using Pig 0.11.0.
As far as I understand, links is a tuple and mals is a bag, so they cannot be COGROUPed. How can I create a bag with links to do the COGROUP? (See the sketch after the example data below.)
UPD:
Example dataset:
/tmp/prepare:
http://1 1.0
http://2 0.9
http://3 0.8
http://4 0.0
HBase:
id: ID
url: http://4
links: http://1 http://2 http://3
Expected output:
{(id: ID, mal: 0.9)}
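As far as I know, COGROUP is not one of the operators allowed inside a nested FOREACH, which is why the parser rejects it. Here is a minimal sketch of one possible workaround, assuming the src, mals and urls2 relations from the script above (I have not run this against HBase): flatten the tokenized links first, JOIN with mals, then re-group by id and average.
-- flatten the bag produced by TOKENIZE so each link becomes its own record
flat_links = FOREACH urls2 GENERATE id, FLATTEN(links) AS link;
-- keep only links that also appear in mals (the intersection), carrying the mal score
joined = JOIN flat_links BY link, mals BY url;
pairs = FOREACH joined GENERATE flat_links::id AS id, mals::mal AS mal;
-- one record per id with the average mal of its intersected links
by_id = GROUP pairs BY id;
processed = FOREACH by_id GENERATE group AS id, AVG(pairs.mal) AS mal;
For the example data this should give (ID, 0.9), matching the expected output.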

Linq query using list output as input

I am using LINQPad and have an OData connection set up.
I have a query as follows:
QUERY1
void Main()
{
    var a = from cpuid in Computers
            where cpuid.DnsHostName == "xyz"
            select new {
                ID = cpuid.TechnicalProductsHosted.Select(x => new { Id = x.Id }),
                System_Dept = cpuid.SystemDepartment,
            };
    Console.WriteLine(a);
}
The output: it returns 4 IDs but only one department, which is common among all four IDs. When I query the other way round, i.e.
QUERY2
var a = from id in TechnicalProducts
        where id.Id == "ID-15784"
        select new {
            System_Dept = id.Computers.Select(x => x.SystemDepartment),
            Support_Team = id.Computers.Select(x => x.SupportTeam)
        };
Console.WriteLine(a);
The output: 4 departments for the ID. I wish to have the whole list of departments in the first case. How is that possible? In query 1, can I take the ID as input for the system department and query it somehow?
The output samples are attached as screenshots.

Can I generate nested bags using nested FOREACH statements in Pig Latin?

Let's say I have a data set of restaurant reviews:
User,City,Restaurant,Rating
Jim,New York,Mecurials,3
Jim,New York,Whapme,4.5
Jim,London,Pint Size,2
Lisa,London,Pint Size,4
Lisa,London,Rabbit Whole,3.5
And I want to produce a list of average ratings by user and city, i.e. the output:
User,City,AverageRating
Jim,New York,3.75
Jim,London,2
Lisa,London,3.75
I could write a Pig script as follows:
Data = LOAD 'data.txt' USING PigStorage(',') AS (
    user:chararray, city:chararray, restaurant:chararray, rating:float
);
PerUserCity = GROUP Data BY (user, city);
ResultSet = FOREACH PerUserCity {
    GENERATE group.user, group.city, AVG(Data.rating);
};
However, I'm curious whether I can first group at the higher level (the users) and then sub-group at the next level (the cities) later, i.e.:
PerUser = GROUP Data BY user;
Intermediate = FOREACH PerUser {
    B = GROUP Data BY city;
    GENERATE group AS user, B;
};
I get:
Error during parsing.
Invalid alias: GROUP in {
group: chararray,
Data: {
user: chararray,
city: chararray,
restaurant: chararray,
rating: float
}
}
Has anyone tried this with success? Is it simply not possible to GROUP within a FOREACH?
My goal is to do something like:
ResultSet = FOREACH PerUser {
    FOREACH City {
        GENERATE user, city, AVG(City.rating)
    }
}
Currently the only operations allowed inside a FOREACH are DISTINCT, FILTER, LIMIT, and ORDER BY.
For now, grouping directly by (user, city) is the right way to do it, as you said.
The release notes for Pig 0.10 suggest that nested FOREACH operations are now supported.
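For what it's worth, here is a minimal sketch (my own illustration, not from the thread) of what the nested FOREACH added in 0.10 looks like, assuming the Data relation from the question. It can reshape the bag per user, but a nested GROUP is still not allowed, so the per-(user, city) average still seems to need the two-key GROUP shown below:
PerUser = GROUP Data BY user;
Projected = FOREACH PerUser {
    -- nested FOREACH (Pig 0.10+): transform the inner bag; a nested GROUP is still rejected
    ratings = FOREACH Data GENERATE city, rating;
    GENERATE group AS user, ratings;
};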
Try this:
Records = load 'data_rating.txt' using PigStorage(',') as (user:chararray, city:chararray, restaurant:chararray, rating:float);
grpRecs = group Records by (user, city);
avgRating_Byuser_perCity = foreach grpRecs generate group, AVG(Records.rating) as average;
Result = foreach avgRating_Byuser_perCity generate flatten(group), average;
rawdata = load 'data' using PigStorage(',') as (user:chararray, city:chararray, restaurant:chararray, rating:float);
data = filter rawdata by user != 'User';
groupbyusercity = group data by (user, city);
-- describe groupbyusercity;
-- groupbyusercity: {group: (user: chararray,city: chararray),data: {(user: chararray,city: chararray,restaurant: chararray,rating: float)}}
average = foreach groupbyusercity {
    generate group.user, group.city, AVG(data.rating);
};
dump average;
Grouping by two keys and then flattening the structure leads to the same result:
Load the data like you did:
Data = LOAD 'data.txt' USING PigStorage(',') AS (
    user:chararray, city:chararray, restaurant:chararray, rating:float);
Group by user and city
ByUserByCity = GROUP Data BY (user, city);
Add the rating average of the groups (you can add more aggregates, like COUNT(Data) AS count_res), then flatten the group structure back to the original one:
ByUserByCityAvg = FOREACH ByUserByCity GENERATE
FLATTEN(group) AS (user, city),
AVG(Data.rating) as user_city_avg;
Results in:
Jim,London,2.0
Jim,New York,3.75
Lisa,London,3.75