Edit: example DataFrame for the original error-message found and posted.
(As I just recognized, the Error does only appear, if the tuple has a certain length. The example is now adapted.)
Original text:
I need to group by tuple of different length.
For the grouping I'm applying a summary_function.
import pandas as pd
def summary_function(df):
value_mean = df['value'].mean()
df1 = pd.DataFrame({'value_mean':[value_mean]
})
return df1
tuple_list = [(1,2,1,1,1,1,1,1,1,1,1,1,1),(2,3,1,1,1,1,1,1,1,1,1,1,1), \
(1,2,1,1,1,1,1,1,1,1,1,1,1), \
(2,3,4,4,4,4,4,4,4,4,4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1)]
value = [1,2,3,4]
letter = list('abab')
df = pd.DataFrame({'letter':letter, 'tuple':tuple_list, 'value':value})
df
> letter tuple value
>0 a (1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) 1
>1 b (2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) 2
>2 a (1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) 3
>3 b (2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ... 4
If I'm using a direct mean() function, the result is how expected:
df.groupby(['letter','tuple']).mean()
> value
>letter tuple
>a (1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) 2
>b (2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) 2
> (2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...) 4
But if I apply the function. (which I need to use since I have dozens of summaries) The tupel is empty while using the simple
df.groupby(['letter','tuple']).apply(lambda x:summary_function(x))
I get a ValueError:
>ValueError: Values not found in passed level: MultiIndex([(2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4)],
)
It would be awesome to get some ideas on how to solve this.
In your case, do not return the dataframe, return the series.
When you return the series, Pandas will align the series horizontally. For example:
def summary_function(df):
return df['value'].agg(['min','mean','max'])
df.groupby(['letter','tuple']).apply(summary_function)
Output:
value min mean max
letter tuple
a (1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) 1.0 2.0 3.0
b (2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) 2.0 2.0 2.0
(2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1... 4.0 4.0 4.0
The even shorter solution was just to replace "pd.DataFrame" with "pd.Series".
def summary_function(df):
value_mean = df['value'].mean()
df1 = pd.Series({'value_mean':[value_mean]
})
(Inspired by the answer of Quang Hoang)
I have the following SQL statement. The inner query ('SELECT ARRAY...ORDER BY 1,2') works correctly and gives the correct totals for each row_name. When I run the crosstab, the result is incorrect. Changing the 'ORDER BY' in the innner query doesn't seem to change its result, but changes the outer query result. I have verified the types match for crosstab(text,text) for column headings.
SELECT
ct.row_name[1:2] AS zonenumber,
sum(ct.amount1) AS "sumEmploymentamount",
sum(ct.amount3) AS "sumExport_Consumersamount"
FROM output.crosstab('
SELECT
ARRAY[
zonenumber::text,
comTypes.commodity_type_name::text,
year_run::text
] as row_name,
tab.activity_type_id as attribute,
amount as value
FROM
output.all_zonalmakeuse_3 tab,
output.activity_numbers actNums,
output.activity_types actTypes,
output.commodity_numbers comNums,
output.commodity_types comTypes
WHERE
scenario = ''S03'' AND year_run = ''2005'' AND
amount != ''-Infinity'' AND moru = ''M'' AND
actNums.activity_type_id = ActTypes.activity_type_id AND
tab.activity = actNums.activitynumber AND
comNums.commodity_type_id = comTypes.commodity_type_id AND
tab.commodity = comNums.commoditynumber AND
(
comTypes.commodity_type_name =''Financial''OR
comNums.commodity = ''Financial'' OR
comTypes.commodity_type_name =''Goods''OR
comNums.commodity = ''Goods''
) AND
(
actTypes.activity_type_name =''Employment'' OR
actNums.activity = ''Employment'' OR
actTypes.activity_type_name =''Export Consumers'' OR
actNums.activity = ''Export Consumers''
)
ORDER BY 1,2
'::text, '
SELECT activity_type_id AS activity
FROM output.activity_types
WHERE activity_type_id = 1 OR activity_type_id = 3
'::text
) ct (row_name text[], amount1 double precision, amount3 double precision)
GROUP BY ct.row_name[1:2]
ORDER BY ct.row_name[1:2]::text;
Tables
CREATE TABLE activity_numbers
("activitynumber" int, "activity" varchar(46), "activity_type_id" int)
;
INSERT INTO activity_numbers
("activitynumber", "activity", "activity_type_id")
VALUES
(0, '"AI01AgMinMan"', 1),
(1, '"AI02AgMinProd"', 1),
(2, '"AI03ConMan"', 1),
(3, '"AI04ConProd"', 1),
(4, '"AI05MfgMan"', 1),
(5, '"AI06MfgProd"', 1),
(6, '"AI07TCUMan"', 1),
(7, '"AI08TCUProd"', 1),
(8, '"AI09Whole"', 1),
(9, '"AI10Retail"', 1),
(10, '"AI11FIRE"', 1),
(11, '"AI12PTSci"', 1),
(12, '"AI13ManServ"', 1),
(13, '"AI14PBSOff"', 1),
(14, '"AI15PBSRet"', 1),
(15, '"AI16PSInd"', 1),
(16, '"AI17Religion"', 1),
(17, '"AI18BSOnsite"', 1),
(18, '"AI19PSOnsite"', 1);
CREATE TABLE activity_types
("activity_type_id" int, "activity_type_name" varchar(18))
;
INSERT INTO activity_types
("activity_type_id", "activity_type_name")
VALUES
(1, '"Employment"'),
(2, '"Households"'),
(3, '"Export Consumers"')
;
CREATE TABLE commodity_numbers
("commoditynumber" int, "commodity" varchar(29), "commodity_type_id" int)
;
INSERT INTO commodity_numbers
("commoditynumber", "commodity", "commodity_type_id")
VALUES
(0, '"CG01AgMinDirection"', 1),
(1, '"CG02AgMinOutput"', 1),
(2, '"CG03ConDirection"', 1),
(3, '"CG04ConOutput"', 1),
(4, '"CG05MfgDirection"', 1),
(5, '"CG06MfgOutput"', 1),
(6, '"CS07TCUDirection"', 2),
(7, '"CS08TCUOutput"', 2),
(8, '"CS09WsOutput"', 2),
(9, '"CS10RetailOutput"', 2),
(10, '"CS11FIREOutput"', 2),
(11, '"CS13OthServOutput"', 2),
(12, '"CS14HealthOutput"', 2),
(13, '"CS15GSEdOutput"', 2),
(14, '"CS16HiEdOutput"', 2),
(15, '"CS17GovOutput"', 2),
(16, '"CF18TaxReceipts"', 4),
(17, '"CF19GovSupReceipts"', 4),
(18, '"CF20InvestReceipts"', 4),
(19, '"CF21ReturnInvestReceipts"', 4),
(20, '"CF22CapitalTransferReceipts"', 4)
;
CREATE TABLE commodity_types
("commodity_type_id" int, "commodity_type_name" varchar(23))
;
INSERT INTO commodity_types
("commodity_type_id", "commodity_type_name")
VALUES
(1, '"Goods"'),
(4, '"Financial"')
;
CREATE TABLE all_zonalmakeuse_3
("year_run" int, "scenario" varchar(6), "activity" int, "zonenumber" int, "commodity" int, "moru" varchar(3), "amount" numeric, "activity_type_id" int, "commodity_type_id" int)
;
INSERT INTO all_zonalmakeuse_3
("year_run", "scenario", "activity", "zonenumber", "commodity", "moru", "amount", "activity_type_id", "commodity_type_id")
VALUES
(2005, '"C11a"', 0, 1, 0, '"M"', 1752708.30900861, 1, 1),
(2005, '"C11a"', 0, 3, 0, '"M"', 2785972.97039016, 1, 1),
(2005, '"C11a"', 0, 4, 0, '"M"', 3847879.45910403, 1, 1),
(2005, '"C11a"', 1, 1, 1, '"M"', 26154618.3893068, 1, 1),
(2005, '"C11a"', 1, 3, 1, '"M"', 1663.49609248196, 1, 1),
(2005, '"C11a"', 1, 4, 1, '"M"', 91727.9065950723, 1, 1),
(2005, '"C11a"', 1, 1, 5, '"M"', 855899.319689473, 1, 1),
(2005, '"C11a"', 1, 3, 5, '"M"', 54.4372375336784, 1, 1),
(2005, '"C11a"', 1, 4, 5, '"M"', 3001.75868302327, 1, 1),
(2005, '"C11a"', 2, 1, 2, '"M"', 150885191.664482, 1, 1),
(2005, '"C11a"', 2, 2, 2, '"M"', 99242746.1181359, 1, 1),
(2005, '"C11a"', 2, 3, 2, '"M"', 90993266.1879518, 1, 1),
(2005, '"C11a"', 2, 4, 2, '"M"', 60169908.2975819, 1, 1),
(2005, '"C11a"', 3, 1, 3, '"M"', 642982844.104623, 1, 1),
(2005, '"C11a"', 3, 2, 3, '"M"', 421379496.576106, 1, 1),
(2005, '"C11a"', 3, 3, 3, '"M"', 592125233.320609, 1, 1),
(2005, '"C11a"', 3, 4, 3, '"M"', 400206994.693349, 1, 1),
(2005, '"C11a"', 4, 1, 4, '"M"', 449206658.578704, 1, 1),
(2005, '"C11a"', 4, 2, 4, '"M"', 103823580.173348, 1, 1),
(2005, '"C11a"', 4, 3, 4, '"M"', 181300924.388112, 1, 1),
(2005, '"C11a"', 4, 4, 4, '"M"', 143113096.547075, 1, 1),
(2005, '"C11a"', 5, 1, 1, '"M"', 83889.8852772168, 1, 1),
(2005, '"C11a"', 5, 2, 1, '"M"', 25716.5837854808, 1, 1),
(2005, '"C11a"', 5, 3, 1, '"M"', 10243.7021847824, 1, 1),
(2005, '"C11a"', 5, 4, 1, '"M"', 22406.3296935502, 1, 1),
(2005, '"C11a"', 5, 1, 5, '"M"', 408669650.696034, 1, 1),
(2005, '"C11a"', 5, 2, 5, '"M"', 125278360.769936, 1, 1),
(2005, '"C11a"', 5, 3, 5, '"M"', 49902204.2985933, 1, 1),
(2005, '"C11a"', 5, 4, 5, '"M"', 109152455.018677, 1, 1),
(2005, '"C11a"', 5, 1, 20, '"M"', 161822.743734245, 1, 4),
(2005, '"C11a"', 5, 2, 20, '"M"', 49607.031096612, 1, 4),
(2005, '"C11a"', 5, 3, 20, '"M"', 19759.998336631, 1, 4),
(2005, '"C11a"', 5, 4, 20, '"M"', 43221.5842952059, 1, 4),
(2005, '"C11a"', 7, 1, 1, '"M"', 122316.017730318, 1, 1),
(2005, '"C11a"', 7, 2, 1, '"M"', 20514.5008361246, 1, 1),
(2005, '"C11a"', 7, 3, 1, '"M"', 8431.33094615992, 1, 1),
(2005, '"C11a"', 7, 4, 1, '"M"', 75842.631567318, 1, 1),
(2005, '"C11a"', 13, 1, 5, '"M"', 1195626.97941868, 1, 1),
(2005, '"C11a"', 13, 2, 5, '"M"', 567002.352487648, 1, 1),
(2005, '"C11a"', 13, 3, 5, '"M"', 1104908.87426762, 1, 1),
(2005, '"C11a"', 13, 4, 5, '"M"', 1071325.74253601, 1, 1),
(2005, '"C11a"', 17, 1, 1, '"M"', 751648.370711072, 1, 1),
(2005, '"C11a"', 17, 2, 1, '"M"', 340439.936040081, 1, 1),
(2005, '"C11a"', 17, 3, 1, '"M"', 800477.767008582, 1, 1),
(2005, '"C11a"', 17, 4, 1, '"M"', 489745.223392316, 1, 1),
(2005, '"C11a"', 17, 1, 20, '"M"', 3154907.39011312, 1, 4),
(2005, '"C11a"', 17, 2, 20, '"M"', 1428934.74123601, 1, 4),
(2005, '"C11a"', 17, 3, 20, '"M"', 3359859.9041298, 1, 4),
(2005, '"C11a"', 17, 4, 20, '"M"', 2055616.54193613, 1, 4),
(2005, '"C11a"', 18, 1, 20, '"M"', 2088003.66854949, 1, 4),
(2005, '"C11a"', 18, 2, 20, '"M"', 1310122.52506653, 1, 4),
(2005, '"C11a"', 18, 3, 20, '"M"', 1481450.29636847, 1, 4),
(2005, '"C11a"', 18, 4, 20, '"M"', 3035710.53213605, 1, 4)
;
I have manipulated the query in several ways (changed type casting, order by, etc), but always get incorrect values. The row and column headers are at least consistently correct.