Aggregate hstore in Postgres within GROUP BY - sql

I have data with an hstore like this:
|brand|account|likes|views |
|-----|-------|-----|----------------------|
|Ford |ford_uk|1 |"3"=>"100" |
|Ford |ford_us|2 |"3"=>"200", "5"=>"10" |
|Jeep |jeep_uk|3 |"3"=>"300" |
|Jeep |jeep_us|4 |"3"=>"400", "5"=>"20" |
I would like to be able to sum the hstores by key, grouped by brand:
|brand|likes|views |
|-----|-----|----------------------|
|Ford |3 |"3"=>"300", "5"=>"10" |
|Jeep |7 |"3"=>"700", "5"=>"20" |
This answer gives a good solution for how to do this without a GROUP BY. Adapting it to this situation gives something like:
SELECT
sum(likes) AS total_likes,
(SELECT hstore(array_agg(key), array_agg(value::text))
FROM (
SELECT s.key, sum(s.value::integer)
FROM (
-- "views" is a column of the outer GROUP BY query referenced here without
-- an aggregate -- this is exactly what raises the "ungrouped column" error
SELECT((each(views)).*)
) AS s(key, value)
GROUP BY key
) x(key, value)) AS total_views
FROM my_table
GROUP BY brand
However this gives:
ERROR: subquery uses ungrouped column "my_table.views" from outer query
Any help appreciated!

This happens because the views column is referenced without an aggregate function in a GROUP BY query.
Very quick workaround:
-- Workaround: array_agg(views) is a legal aggregate in the GROUP BY query;
-- the correlated subquery then unnests that array and sums values per key.
-- Sample data fixed to match the question (ford_us / jeep_us accounts,
-- consistent ::hstore casts on every row).
with my_table(brand,account,likes,views) as (
values
('Ford', 'ford_uk', 1, '"3"=>"100"'::hstore),
('Ford', 'ford_us', 2, '"3"=>"200", "5"=>"10"'::hstore),
('Jeep', 'jeep_uk', 3, '"3"=>"300"'::hstore),
('Jeep', 'jeep_us', 4, '"3"=>"400", "5"=>"20"'::hstore))
SELECT
brand,
sum(likes) AS total_likes,
(SELECT hstore(array_agg(key), array_agg(value::text))
FROM (
SELECT s.key, sum(s.value::integer)
FROM
unnest(array_agg(views)) AS h, --<< aggregate views according to the group by, then unnest it into the table
each(h) as s(key,value)
GROUP BY key
) x(key, value)) AS total_views
FROM my_table
GROUP BY brand
Update
You can also create an aggregate for such tasks:
--drop aggregate if exists hstore_sum(hstore);
--drop function if exists hstore_sum_ffunc(hstore[]);
-- Final function: takes the array of hstores collected by the aggregate,
-- unnests each one into key/value pairs, sums the values per key (as
-- numeric), and rebuilds a single hstore from the summed pairs.
create function hstore_sum_ffunc(hstore[]) returns hstore language sql immutable as $$
select hstore(array_agg(key), array_agg(value::text))
from
(select s.key, sum(s.value::numeric) as value
from unnest($1) as h, each(h) as s(key, value) group by s.key) as t
$$;
-- The state transition merely appends each input hstore to an array;
-- all of the real work happens in the final function above.
create aggregate hstore_sum(hstore)
(
SFUNC = array_append,
STYPE = hstore[],
FINALFUNC = hstore_sum_ffunc,
INITCOND = '{}'
);
After that your query will be simpler and more "canonical":
-- Per-brand totals using the custom hstore_sum aggregate.
SELECT brand,
       SUM(likes)        AS total_likes,
       hstore_sum(views) AS total_views
FROM my_table
GROUP BY brand;
Update 2
Even without CREATE AGGREGATE, the function hstore_sum_ffunc can be useful on its own:
-- Same result without CREATE AGGREGATE: collect the hstores with
-- array_agg and hand the array straight to the final function.
SELECT brand,
       SUM(likes)                         AS total_likes,
       hstore_sum_ffunc(array_agg(views)) AS total_views
FROM my_table
GROUP BY brand;

If you create an aggregate for hstore, this gets a bit easier:
-- Aggregate that merges hstores using hs_concat (the function behind the
-- || operator; on duplicate keys the later value wins).
-- SFUNC takes only the function name -- the argument types come from the
-- aggregate declaration -- so "hs_concat(hstore, hstore)" is a syntax error.
create aggregate hstore_agg(hstore)
(
sfunc = hs_concat,
stype = hstore
);
Then you can do this:
with totals as (
-- one row per (brand, key): the key's summed value as a one-entry hstore
select t1.brand,
hstore(k, sum(v::int)::text) as views
from my_table t1, each(views) x(k,v)
group by brand, k
)
select brand,
-- correlated subquery re-aggregates likes per brand from the base table
(select sum(likes) from my_table t2 where t1.brand = t2.brand) as likes,
hstore_agg(views) as views
from totals t1
group by brand;
Another option is to move the co-related sub-query which might be slow into a CTE:
with vals as (
-- one row per (brand, key) with the per-key sum as a single-entry hstore
select t1.brand,
hstore(k, sum(v::int)::text) as views
from my_table t1, each(views) x(k,v)
group by brand, k
), view_totals as (
-- merge the per-key hstores back into one hstore per brand
select brand,
hstore_agg(views) as views
from vals
group by brand
), like_totals as (
-- plain per-brand sum of likes, computed once instead of per output row
select brand,
sum(likes) as likes
from my_table
group by brand
)
select vt.brand,
lt.likes,
vt.views
from view_totals vt
join like_totals lt on vt.brand = lt.brand
order by brand;

Related

Compare a single-column row-set with another single-column row set in Oracle SQL

Is there any Oracle SQL operator or function which compares two result sets and tells whether they are exactly the same? Currently my idea is to use the MINUS operator in both directions, but I am looking for a better-performing solution. One result set is fixed (see below); the other depends on the records.
Very important: I am not allowed to change the schema and structure. So CREATE TABLE and CREATE TYPE etc. are not allowed here for me. Also important that oracle11g version is used where the solution must be found.
The schema for SQL Fiddle is:
-- Fixture: MAIN_ID groups rows; VALUE is the set member compared below.
CREATE TABLE DETAILS (ID INT, MAIN_ID INT, VALUE INT);
INSERT INTO DETAILS VALUES (1,1,1);
INSERT INTO DETAILS VALUES (2,1,2);
INSERT INTO DETAILS VALUES (3,1,3);
INSERT INTO DETAILS VALUES (4,1,4);
INSERT INTO DETAILS VALUES (5,2,1);
INSERT INTO DETAILS VALUES (6,2,2);
INSERT INTO DETAILS VALUES (7,3,1);
-- was a duplicate ID 7; 8 keeps ID unique across the fixture
INSERT INTO DETAILS VALUES (8,3,2);
Now this is my SQL query for doing the job well (selects MAIN_IDs of those, whose 'VALUE's are exactly the same as the given lists'):
-- A MAIN_ID qualifies only when its VALUE set and the fixed list contain
-- each other, i.e. both MINUS results are empty (set equality).
SELECT DISTINCT D.MAIN_ID FROM DETAILS D WHERE NOT EXISTS
(SELECT VALUE FROM DETAILS WHERE MAIN_ID=D.MAIN_ID
MINUS
SELECT * FROM TABLE(SYS.ODCINUMBERLIST(1, 2)))
AND NOT EXISTS
(SELECT * FROM TABLE(SYS.ODCINUMBERLIST(1, 2))
MINUS
SELECT VALUE FROM DETAILS WHERE MAIN_ID=D.MAIN_ID)
The SQL Fiddle link: http://sqlfiddle.com/#!4/25dde/7/0
If you use a collection (rather than a VARRAY) then you can aggregate the values into a collection and directly compare two collections:
CREATE TYPE int_list AS TABLE OF INT;
Then:
-- COLLECT aggregates the values into a nested table; CAST makes it an
-- int_list so it can be compared directly with the int_list literal.
-- Nested-table equality is multiset-based, so element order is irrelevant.
SELECT main_id
FROM details
GROUP BY main_id
HAVING CAST( COLLECT( value ) AS int_list ) = int_list( 1, 2 );
Outputs:
| MAIN_ID |
| ------: |
| 2 |
| 3 |
db<>fiddle here
Update
Based on your expanded fiddle in comments, you can use:
SELECT B.ID
FROM BUSINESS_DATA B
INNER JOIN BUSINESS_NAME N
ON ( B.NAME_ID=N.ID )
WHERE N.NAME='B1'
AND EXISTS (
SELECT business_id
FROM ORDERS O
-- outer-join each order date against the fixed date list; an order date
-- not in the list leaves d.COLUMN_VALUE NULL
LEFT OUTER JOIN TABLE(
SYS.ODCIDATELIST( DATE '2021-01-03', DATE '2020-04-07', DATE '2020-05-07' )
) d
ON ( o.orderdate = d.COLUMN_VALUE )
WHERE O.BUSINESS_ID=B.ID
GROUP BY business_id
-- no order date may fall outside the list...
HAVING COUNT( CASE WHEN d.COLUMN_VALUE IS NULL THEN 1 END ) = 0
-- ...and every distinct date in the list must be covered by an order
AND COUNT( DISTINCT o.orderdate )
= ( SELECT COUNT(DISTINCT COLUMN_VALUE) FROM TABLE( SYS.ODCIDATELIST( DATE '2021-01-03', DATE '2020-04-07', DATE '2020-05-07' ) ) )
)
(Note: Do not implicitly create dates from strings; it will cause the query to fail, without there being any changes to the query text, if a user changes their NLS_DATE_FORMAT session parameter. Instead use TO_DATE with an appropriate format model or a DATE literal.)
db<>fiddle here

Find IDs with only one observation in PostgreSQL

I have the following table:
CREATE TABLE my_table (
the_visitor_id varchar(5) NOT NULL,
the_visitor_visit timestamp NOT NULL,
the_visitor_returning text
);
INSERT INTO my_table
VALUES ('VIS01', '2019-05-02 09:00:00','YES' ),
('VIS01', '2019-05-04 12:00:00',NULL ),
('VIS01', '2019-05-05 18:00:00',NULL ),
('VIS02', '2019-05-06 18:30:00',NULL),
('VIS02', '2019-05-15 12:00:00',NULL),
('VIS03', '2019-06-30 18:00:00','YES'),
-- NOTE(review): 'NULL' on the next row is the four-character string, not
-- SQL NULL -- presumably a typo in the sample data; confirm which was meant
('VIS04', '2019-06-30 18:00:00','NULL');
And I would like to filter out all visitor_id's that have only one observation (or record). In this case VIS03 and VIS04, so I must end up with VIS01 and VIS02. I tried this:
SELECT DISTINCT ON(the_visitor_id) the_visitor_id,
the_visitor_visit, the_visitor_returning
FROM my_table
The expected result should be:
the_visitor_id the_visitor_visit the_visitor_returning
VIS01 2019-05-02 09:00:00 YES
VIS01 2019-05-04 12:00:00
VIS01 2019-05-05 18:00:00
VIS02 2019-05-06 18:30:00
VIS02 2019-05-15 12:00:00
But I guess that something like a rank is needed. Any help will be greatly appreciated.
There are probably other ways of doing this, but if you create a derived-table CTE of only the visitor_ids that have more than one row, you can then use it in a join to your table. Obviously, if my_table is large, an index would enhance the performance.
-- Restrict to visitors that appear on more than one row, then join back
-- to the base table to keep all of their rows.
WITH multi_visit AS (
    SELECT the_visitor_id
    FROM my_table
    GROUP BY the_visitor_id
    HAVING count(*) > 1
)
SELECT my_table.*
FROM my_table
INNER JOIN multi_visit
    ON multi_visit.the_visitor_id = my_table.the_visitor_id
EXISTS can use an index:
SELECT the_visitor_id, the_visitor_visit, the_visitor_returning
FROM my_table t1
WHERE EXISTS (
-- another physical row (different ctid) for the same visitor
SELECT FROM my_table
WHERE the_visitor_id = t1.the_visitor_id
AND ctid <> t1.ctid
);
Using ctid because you didn't disclose the PK or any UNIQUE column of the table. About ctid:
Postgresql group by for multiple lines
Ideally, you would have a UNIQUE index on (the_visitor_id, any_notnull_column) and use that column in the query. Substantially faster than a full sequential scan, count, join (another seq or idx scan).
Barring any usable index, using a window function allows us to at least keep it to a single sequential scan:
-- Single sequential scan: count rows per visitor with a window function
-- (which does not collapse rows), then keep visitors with more than one.
SELECT the_visitor_id, the_visitor_visit, the_visitor_returning
FROM (
    SELECT my_table.*,
           count(*) OVER (PARTITION BY the_visitor_id) AS visit_count
    FROM my_table
) counted
WHERE visit_count > 1;
db<>fiddle here

Filtering data from joined table in PostgreSQL

In my database I have the following schema:
-- Schema: one survey_result has many slide_results, each tagged.
CREATE TABLE survey_results (
id integer NOT NULL
);
CREATE TABLE slide_results (
id integer NOT NULL,
survey_result_id integer,
tags character varying[] DEFAULT '{}'::character varying[],
content character varying,
created_at timestamp with time zone NOT NULL
);
INSERT INTO survey_results (id)
VALUES (1);
INSERT INTO slide_results (id, survey_result_id, tags, content, created_at)
VALUES (1, 1, '{food}', 'Food slide', now());
INSERT INTO slide_results (id, survey_result_id, tags, content, created_at)
VALUES (2, 1, '{motivation}', 'Motivation slide', now());
Now I want to have an SQL query that will return survey result id and content for slide results with specified tags. I wrote something like this:
-- Join slide_results twice (once per tag) and keep one row per survey id.
select distinct on(sr.id)
sr.id,
slr.content AS food,
slr2.content AS motivation
from survey_results sr
LEFT JOIN slide_results slr ON slr.survey_result_id = sr.id AND slr.id IN (
SELECT id as id
FROM slide_results
WHERE 'food' = ANY(tags)
ORDER BY created_at desc
)
LEFT JOIN slide_results slr2 ON slr2.survey_result_id = sr.id AND slr2.id IN (
SELECT id as id
FROM slide_results
WHERE 'motivation' = ANY(tags)
ORDER BY created_at desc
)
group by slr.content, slr2.content, sr.id
which returns:
| id | food | motivation |
| --- | ---------- | ---------------- |
| 1 | Food slide | Motivation slide |
This query works fine, but I'm wondering if there is better way of doing this?
EDIT:
I forgot to add link do db-fiddle:
https://www.db-fiddle.com/f/gP761psywgmovfdTT7DjP4/0
I would write the query like this:
-- Latest food/motivation slide per survey via DISTINCT ON.
-- Fix: the array containment operator is <@ ("is contained by"); <# is not
-- a PostgreSQL operator. <@ is supported by a GIN index on tags.
SELECT DISTINCT ON (sr.id)
sr.id,
slr.content AS food,
slr2.content AS motivation
FROM survey_results AS sr
LEFT JOIN (SELECT survey_result_id, content, created_at
FROM slide_results
WHERE '{food}' <@ tags) AS slr
ON slr.survey_result_id = sr.id
LEFT JOIN (SELECT survey_result_id, content, created_at
FROM slide_results
WHERE '{motivation}' <@ tags) AS slr2
ON slr2.survey_result_id = sr.id
ORDER BY sr.id, slr.created_at DESC, slr2.created_at DESC;
The ORDER BY has to be in the outer query to be effective.
Using <@ rather than = ANY allows you to use a GIN index on slide_results.tags.
Using a subselect in the FROM list avoids an unnecessary join and an inefficient IN subquery.
I can't promise this is any better than what you have, but it seems slightly more scalable. Without seeing your full dataset and desired results, it's hard to know if this will backfire in any way:
-- One pass over the join: pick content per tag with FILTERed aggregates;
-- multiple matching slides are joined into a comma-separated string.
select
sl.survey_result_id,
array_to_string (array_agg (distinct sl.content) filter
(where 'food' = any (sl.tags)), ',') as food,
array_to_string (array_agg (distinct sl.content) filter
(where 'motivation' = any (sl.tags)), ',') as motivation
from
survey_results s
join slide_results sl on s.id = sl.survey_result_id
group by survey_result_id

Pivot / crosstab with more than one value column

I have a view that produces the following resultset:
-- Fixture: (client_id, asset_type, current_value, future_value) to pivot.
CREATE TABLE foo
AS
SELECT client_id, asset_type, current_value, future_value
FROM ( VALUES
( 1, 0, 10 , 20 ),
( 1, 1, 5 , 10 ),
( 1, 2, 7 , 15 ),
( 2, 1, 0 , 2 ),
( 2, 2, 150, 300 )
) AS t(client_id, asset_type, current_value, future_value);
And I need to transform it into this:
client_id a0_cur_val a0_fut_val a1_cur_val a1_fut_val ...
1 10 20 5 10
2 NULL NULL 0 2
I know how to do this if I use just the current_value column, using crosstab. How can I use current_value and future_value to produce new columns in the destination resultset? If I just add future_value column to the crosstab(text) query it complains about "invalid source data SQL statement".
I'm using PostgreSQL 9.3.6.
One way would be to use a composite type:
CREATE TYPE i2 AS (a int, b int);
Or, for ad-hoc use (registers the type for the duration of the session):
CREATE TEMP TABLE i2 (a int, b int);
Then run the crosstab as you know it and decompose the composite type:
-- Pack (current_value, future_value) into the composite type i2 so one
-- crosstab() call pivots both values; decompose with (col).a / (col).b.
SELECT client_id
, (a0).a AS a0_cur_val, (a0).b AS a0_fut_val
, (a1).a AS a1_cur_val, (a1).b AS a1_fut_val
, (a2).a AS a2_cur_val, (a2).b AS a2_fut_val
FROM crosstab(
'SELECT client_id, asset_type, (current_value, future_value)::i2
FROM foo
ORDER BY 1,2'
,'SELECT * FROM generate_series(0,2)'
) AS ct (client_id int, a0 i2, a1 i2, a2 i2);
All parentheses are required!
Basics for crosstab():
PostgreSQL Crosstab Query
Another option would be to construct a join out of the two crosstabs queries that you can use to recover any of the two sets of values independently... Meaning:
-- Pivot current_value and future_value in two separate crosstab() calls,
-- then FULL OUTER JOIN on client_id so a client missing from either side
-- still appears (hence the coalesce on client_id).
select coalesce(cur.client_id, fut.client_id) client_id
, c0, f0, c1, f1, c2, f2
from
(select client_id, c0, c1, c2
from crosstab
('select client_id, asset_type, current_value
from foo
order by client_id, asset_type')
as sal1 (client_id int4, c0 int4 , c1 int4 , c2 int4)) cur
full outer join
(select client_id, f0, f1, f2
from crosstab
('select client_id, asset_type, future_value
from foo
order by client_id, asset_type')
as sal1 (client_id int4, f0 int4 , f1 int4 , f2 int4)) fut
on fut.client_id = cur.client_id
Meaning... Get current_value and future_value in two different crosstab queries and then join them to get the result in a join query
I used full outer join and coalesce for the client_id just in case any of the clients could not be present in first query containing the current value, if we would know current_value is always present we could do with left join and if both, current and future values were required then inner join would do

Simple Query to Grab Max Value for each ID

OK I have a table like this:
ID Signal Station OwnerID
111 -120 Home 1
111 -130 Car 1
111 -135 Work 2
222 -98 Home 2
222 -95 Work 1
222 -103 Work 2
This is all for the same day. I just need the Query to return the max signal for each ID:
ID Signal Station OwnerID
111 -120 Home 1
222 -95 Work 1
I tried using MAX() and the aggregation messes up with the Station and OwnerID being different for each record. Do I need to do a JOIN?
Something like this? Join your table with itself, and exclude the rows for which a higher signal was found.
-- Keep a row only when no other row of the same id has a stronger signal.
SELECT t.id, t.signal, t.station, t.ownerid
FROM yourtable t
WHERE NOT EXISTS (
    SELECT 1
    FROM yourtable stronger
    WHERE stronger.id = t.id
      AND stronger.signal > t.signal
)
This would list one row for each highest signal, so there might be multiple rows per id.
You are doing a group-wise maximum/minimum operation. This is a common trap: it feels like something that should be easy to do, but in SQL it aggravatingly isn't.
There are a number of approaches (both standard ANSI and vendor-specific) to this problem, most of which are sub-optimal in many situations. Some will give you multiple rows when more than one row shares the same maximum/minimum value; some won't. Some work well on tables with a small number of groups; others are more efficient for a larger number of groups with smaller rows per group.
Here's a discussion of some of the common ones (MySQL-biased but generally applicable). Personally, if I know there are no multiple maxima (or don't care about getting them) I often tend towards the null-left-self-join method, which I'll post as no-one else has yet:
-- Null-left-self-join: keep a reading only when no other reading of the
-- same ID has a strictly higher Signal (anti-join via IS NULL).
SELECT reading.ID, reading.Signal, reading.Station, reading.OwnerID
FROM readings AS reading
LEFT JOIN readings AS highersignal
ON highersignal.ID=reading.ID AND highersignal.Signal>reading.Signal
WHERE highersignal.ID IS NULL;
In classic SQL-92 (not using the OLAP operations used by Quassnoi), then you can use:
-- Derived table g computes the per-id maximum; joining back on
-- (id, signal) recovers the full row(s) carrying that maximum.
SELECT g.ID, g.MaxSignal, t.Station, t.OwnerID
FROM (SELECT id, MAX(Signal) AS MaxSignal
FROM t
GROUP BY id) AS g
JOIN t ON g.id = t.id AND g.MaxSignal = t.Signal;
(Unchecked syntax; assumes your table is 't'.)
The sub-query in the FROM clause identifies the maximum signal value for each id; the join combines that with the corresponding data row from the main table.
NB: if there are several entries for a specific ID that all have the same signal strength and that strength is the MAX(), then you will get several output rows for that ID.
Tested against IBM Informix Dynamic Server 11.50.FC3 running on Solaris 10:
+ CREATE TEMP TABLE signal_info
(
id INTEGER NOT NULL,
signal INTEGER NOT NULL,
station CHAR(5) NOT NULL,
ownerid INTEGER NOT NULL
);
+ INSERT INTO signal_info VALUES(111, -120, 'Home', 1);
+ INSERT INTO signal_info VALUES(111, -130, 'Car' , 1);
+ INSERT INTO signal_info VALUES(111, -135, 'Work', 2);
+ INSERT INTO signal_info VALUES(222, -98 , 'Home', 2);
+ INSERT INTO signal_info VALUES(222, -95 , 'Work', 1);
+ INSERT INTO signal_info VALUES(222, -103, 'Work', 2);
+ SELECT g.ID, g.MaxSignal, t.Station, t.OwnerID
FROM (SELECT id, MAX(Signal) AS MaxSignal
FROM signal_info
GROUP BY id) AS g
JOIN signal_info AS t ON g.id = t.id AND g.MaxSignal = t.Signal;
111 -120 Home 1
222 -95 Work 1
I named the table Signal_Info for this test - but it seems to produce the right answer.
This only shows that there is at least one DBMS that supports the notation. However, I am a little surprised that MS SQL Server does not - which version are you using?
It never ceases to surprise me how often SQL questions are submitted without table names.
-- Rank rows per id by signal (highest first) and keep the top row of each.
-- Fix: the original wrote "c.*" but never aliased mytable as "c".
WITH q AS
(
SELECT c.*, ROW_NUMBER() OVER (PARTITION BY id ORDER BY signal DESC) rn
FROM mytable AS c
)
SELECT *
FROM q
WHERE rn = 1
This will return one row even if there are duplicates of MAX(signal) for a given ID.
Having an index on (id, signal) will greatly improve this query.
-- SQL Server: for each id in tabG, CROSS APPLY fetches the single row of
-- tab with the highest sig (TOP(1) ... ORDER BY sig DESC).
with tab(id, sig, sta, oid) as
(
select 111 as id, -120 as signal, 'Home' as station, 1 as ownerId union all
select 111, -130, 'Car', 1 union all
select 111, -135, 'Work', 2 union all
select 222, -98, 'Home', 2 union all
select 222, -95, 'Work', 1 union all
select 222, -103, 'Work', 2
) ,
tabG(id, maxS) as
(
select id, max(sig) as sig from tab group by id
)
select g.*, p.* from tabG g
cross apply ( select top(1) * from tab t where t.id=g.id order by t.sig desc ) p
We can do using self join
-- Derived table T1 holds each id's maximum; joining back on (ID, Signal)
-- recovers the matching full row(s).
SELECT T1.ID,T1.Signal,T2.Station,T2.OwnerID
FROM (select ID,max(Signal) as Signal from mytable group by ID) T1
LEFT JOIN mytable T2
ON T1.ID=T2.ID and T1.Signal=T2.Signal;
Or you can also use the following query
-- Anti-join variant: keep rows for which no stronger signal exists.
SELECT t0.ID,t0.Signal,t0.Station,t0.OwnerID
FROM mytable t0
LEFT JOIN mytable t1 ON t0.ID=t1.ID AND t1.Signal>t0.Signal
WHERE t1.ID IS NULL;
-- Join the base table to its per-id maxima.
-- Fix: the table has no "owner" column -- it is "ownerid" per the question.
select a.id, b.signal, a.station, a.ownerid
from mytable a
join
(SELECT ID, MAX(Signal) as Signal FROM mytable GROUP BY ID) b
on a.id = b.id AND a.Signal = b.Signal
-- Fix: filtering on Signal alone matched rows of OTHER ids that happen to
-- share a maximum value. Pair the ID with its maximum so a row qualifies
-- only when its own (ID, Signal) is a per-id maximum.
SELECT *
FROM StatusTable
WHERE (ID, Signal) IN (
SELECT ID, MAX(Signal)
FROM StatusTable
GROUP BY ID
);
-- Rank rows per id by signal (highest first) and keep the top-ranked ones.
-- Fixes: the derived table needs an alias, "table" is a reserved word, and
-- the column list now matches the table (station/ownerid, not owner).
-- Note rank() returns ALL rows tied for the maximum.
select
id,
signal,
station,
ownerid
FROM (
select *, rank() over(partition by id order by signal desc) as rnk
from mytable
) ranked
where rnk = 1;