Postgres SQL - How do I aggregated groups of multiple consecutive rows? - sql

I have a data representing tagged continuous-spans in a single table with a <tag, start & stop>.
Example below.
I'm trying to combine multiple rows into a single row where the condition is that they create a "continuous span".
In the query below - I would like the functionality that LEFT_MOST_CONTINUOUS returns the minimum v_start of a continuous span (same for RIGHT_MOST_CONTINUOUS for maximum v_stop). Note that there might be more than a single continuous span (that should have different v_start and v_stop values).
Input:
WITH data AS (
SELECT *
FROM (VALUES
('a', 2, 3),
('a', 3, 5),
('a', 5, 7),
('a', 8, 10),
('a', 10, 12),
('a', 12, 14),
('b', 7, 8),
('b', 8, 10),
('b', 12, 15),
('c', 10, 11)
) AS T(tag, v_start, v_stop)
ORDER BY tag, v_start, v_stop
)
SELECT tag,
LEFT_MOST_CONTINUOUS(v_start) OVER (PARTITION BY tag),
RIGHT_MOST_CONTINUOUS(v_stop) OVER (PARTITION BY tag)
FROM data
ORDER BY 1, 2, 3
Where I expect to get the following output:
"a" 2 7
"a" 8 14
"b" 7 10
"b" 12 15
"c" 10 11
Since I want to merge the first 3 tuples (for tag "a") which are consecutive into a single value representing the entire span; same for the next 3 tuples (again for "a").
Then for "b" we can merge the next 2, but leave out the 3rd (which has it's v_start != the other's v_stop).
And "c" there is nothing to merge with.
Help appreciated,
Tal

You can use a gaps-and-islands approach by marking the first record of each group when either there is no previous record for the tag or the v_start is greater than v_stop of the previous record:
select tag, v_start, v_stop,
coalesce(lag(v_stop) over w < v_start, true) as is_end_grp
from data
window w as (partition by tag order by v_start)
Use a windowed sum() of the boolean is_end_grp cast to int (1 if true, 0 if false) to number the groups:
select tag, sum(is_end_grp::int) over (partition by tag
order by v_start) as grp_num,
v_start, v_stop
from mark_gaps
Aggregation over (tag, grp_num) will produce your desired result:
select tag, min(v_start) as v_start, max(v_stop) as v_stop
from numbered_groups
group by tag, grp_num
order by tag, v_start
Working DB<>Fiddle

Using the numbered_groups logic from #Mike Organek answer. I just started from a different place
WITH data AS (
SELECT *
, case when lead(v_start) over(partition by tag order by v_start) = v_stop then 0 else 1 end stopcheck
, case when lag(v_stop) over(partition by tag order by v_stop) = v_start then 0 else 1 end startcheck
FROM (VALUES
('a' , 2 , 3),
('a', 3, 5),
('a', 5, 7),
('a', 8, 10),
('a', 10, 12),
('a', 12, 14),
('b', 7, 8),
('b', 8, 10),
('b', 12, 15),
('c', 10, 11)
) AS T(tag, v_start, v_stop)
ORDER BY tag, v_start, v_stop
)
,cnt as (
select *
, sum(startcheck) over (partition by tag order by v_start) grpn
from data)
select c1.tag, c1.v_start, c2.v_stop
from cnt c1
inner join cnt c2
on c1.tag = c2.tag and c1.grpn = c2.grpn
where c1.startcheck = 1 and c2.stopcheck = 1
This logic is all based on the assumption that your data always starts where the last row left off, there is no overlap etc.
Create a startcheck and stopcheck by comparing the prior row and next row relatively. From here use another window function sum() over to order the start records (so we don't match start of second batch to stop of first batch)
Join the table to itself matching like tag and groups. Filtering start and stop records

You can use following query
WITH data AS (
SELECT *
FROM (VALUES
('a', 2, 3),
('a', 3, 5),
('a', 5, 7),
('a', 8, 10),
('a', 10, 12),
('a', 12, 14),
('b', 7, 8),
('b', 8, 10),
('b', 12, 15),
('c', 10, 11)
) AS T(tag, v_start, v_stop)
ORDER BY tag, v_start, v_stop
),
cte1 as(
select *,
case
when lag(v_stop)over(partition by tag order by(select null)) = v_start
then 0
else 1
end as grp
from data
),
cte2 as(
select *,
sum(grp) over (partition by tag order by v_start) as rnk
from cte1
)
select tag,min(v_start)v_start,max(v_stop)v_stop
from cte2
group by tag,rnk
order by tag
Demo in db<>fiddle

Related

prestoSQL aggregate columns and rows into one column

I would like to aggregate some columns and rows into one column in prestoSQL table.
with example_table as (
select * from (
values ('A', 'nh', 7), ('A', 'mn', 4), ('A', 'sv', 3),
('B', 'tb', 6), ('B', 'ty', 5), ('A', 'rw', 2),
('C', 'op', 9), ('C', 'au', 8)
) example_table("id", "time", "value")
)
select id, agg(value, time) # Unexpected parameters (integer, VARCHAR(2)) for function array_agg. Expected: array_agg(T) T
from example_table
group by id
I would like to combine column "time" and "value" as one column and then aggregate all rows by "id" such that
id. time_value_agg
A. [['nh', 7], ['mn', 4], ['sv', 3], ['rw', 2]
B. [['tb', 6], ['tv',5]
C. [['op', 9], ['au', 8]]
the column
time_value_agg
should be an array of str. If the "time" col is not str, cast it to str.
I am not sure which function can be used for this ?
thanks
array_agg can be applied to single column only. If times are unique per id you can turn data into map:
select id, map(array_agg(time), array_agg(value)) time_value_agg
from example_table
group by id
Output:
id
time_value_agg
C
{op=9, au=8}
A
{mn=4, sv=3, rw=2, nh=7}
B
{ty=5, tb=6}
Or turn data into ROW type (or map) before aggregation:
select id,
array_agg(arr) time_value_agg
from (
select id, cast (row(time, value) as row(time varchar, value integer))arr
from example_table
)
group by id
Output:
id
time_value_agg
C
[{time=op, value=9}, {time=au, value=8}]
A
[{time=nh, value=7}, {time=mn, value=4}, {time=sv, value=3}, {time=rw, value=2}]
B
[{time=tb, value=6}, {time=ty, value=5}]

Group elements of a column into mulitple subgroups SQL

I am looking at different breeds of cattle and their AnimalTypeCode , BreedCateoryID and resultant Growth.
I have the following query
SELECT DATEPART(yyyy,[KillDate])
,[AnimalTypeCode]
,AVG([Growth])
,[BreedCategoryID]
FROM [dbo].[tblAnimal]
WHERE (AnimalTypeCode='C'
or AnimalTypeCode= 'E')
GROUP BY DATEPART(yyyy,[KillDate])
,[AnimalTypeCode]
,[BreedCategoryID]
GO
This query is good and gives me almost what I want, but BreedCategoryID is numbered 1 through 7 and I would like to group them:
(1 = Pure Dairy),
(2 and 3 = Dairy)
(4, 5, 6 and 7 = Beef)
So instead of getting the mean Growthrate for each BreedCategoryID I would like to get the average for Pure Dairy, Dairy, and Beef.
Any help greatly appreciated!
You can assign a new "variable" using cross apply in the from clause:
SELECT YEAR(KillDate]), a.AnimalTypeCode, v.grp,
AVG([Growth])
FROM [dbo].[tblAnimal] a CROSS APPLY
(VALUES (CASE WHEN a.BreedCategoryID IN (1) THEN 'Pure Dairy'
WHEN a.BreedCategoryID IN (2, 3) THEN 'Dairy'
WHEN a.BreedCategoryID IN (4, 5, 6, 7) THEN 'Beef'
END)
) as v(grp)
WHERE a.AnimalTypeCode IN ('C', 'E')
GROUP BY YEAR(KillDate]), a.AnimalTypeCode, v.grp;
Note that I also introduced table aliases and qualified all the column references.
Do the calculations in a derived table (the subquery). GROUP BY its result:
select killyear, [AnimalTypeCode], AVG([Growth]), BreedCat
(
SELECT DATEPART(yyyy,[KillDate]) killyear
,[AnimalTypeCode]
,[Growth]
,case when [BreedCategoryID] = 1 then 'Pure Dairy'
when [BreedCategoryID] in (2, 3) then 'Dairy'
when [BreedCategoryID] in (4, 5, 6, 7) then 'Beef'
end BreedCat
FROM [dbo].[tblAnimal]
WHERE (AnimalTypeCode='C'
or AnimalTypeCode= 'E')
) dt
GROUP BY killyear
,[AnimalTypeCode]
,BreedCat

Conditional group by with window function in Snowflake query

I have a table in Snowflake in following format:
create temp_test(name string, split string, value int)
insert into temp_test
values ('A','a', 100), ('A','b', 200), ('A','c',300), ('A', 'd', 400), ('A', 'e',500), ('B', 'a', 1000), ('B','b', 2000), ('B','c', 3000), ('B', 'd',4000), ('B','e', 5000)
First step, I needed only top 2 value per name (sorted on value), so I used following query to get that:
select name, split, value,
row_number() over (PARTITION BY (name) order by value desc) as row_num
from temp_test
qualify row_num <= 2
Which gives me following resultset:
NAME SPLIT VALUE ROW_NUM
A e 500 1
A d 400 2
B e 5000 1
B d 4000 2
Now, I need to sum values other than Top 2 and put it in a different Split named as "Others", like this:
NAME SPLIT VALUE
A e 500
A d 400
A Others 600
B e 5000
B d 4000
B Others 6000
How to do that in Snowflake query or SQL in general?
with data as (
select name, split, value,
row_number() over (partition by (name) order by value desc) as row_num
from temp_test
)
select
name,
case when row_num <= 2 then split else 'Others' end as split,
sum(value) as value
from data
group by name, case when row_num <= 2 then row_num else 3 end
Shawnt00's answer is good, but for the record in Snowflake this can be written simpler:
Firstly the group by at the end can refer to the results by index or name:
GROUP BY 1,2
or
GROUP BY name, split
also as the CASE only has too branches an IFF can be used and seems you are using a CTE to add the row_number you can push the IFF into the CTE also
WITH data AS (
SELECT name, value,
ROW_NUMBER() OVER (PARTITION BY name ORDER BY value DESC) AS row_num,
IFF(row_num < 3, split, 'Others') as n_split
FROM VALUES ('A','a', 100), ('A','b', 200), ('A','c',300), ('A', 'd', 400),
('A', 'e',500), ('B', 'a', 1000), ('B','b', 2000), ('B','c', 3000),
('B', 'd',4000), ('B','e', 5000)
v(name, split, value)
)
SELECT
name,
n_split,
SUM(value) AS value
FROM data
GROUP BY name, n_split;
and if super keen on small SQL push the ROW_NUMBER into the IFF:
WITH data AS (
SELECT name, value,
IFF(ROW_NUMBER() OVER (PARTITION BY name ORDER BY value DESC) < 3, split, 'Others') as n_split
FROM VALUES ('A','a', 100), ('A','b', 200), ('A','c',300), ('A', 'd', 400),
('A', 'e',500), ('B', 'a', 1000), ('B','b', 2000), ('B','c', 3000),
('B', 'd',4000), ('B','e', 5000)
v(name, split, value)
)
SELECT
name,
n_split AS split,
SUM(value) AS value
FROM data
GROUP BY name, n_split;
gives:
NAME SPLIT VALUE
A e 500
A d 400
A Others 600
B e 5000
B d 4000
B Others 6000

Logic to check if exact ids are present in a group in SQL Server

I have some sample data like:
INSERT INTO mytable ([ID], [FK_ID], [TYPE_ID])
VALUES
(1, 10, 1),
(2, 11, 1), (3, 11, 2),
(4, 12, 1), (5, 12, 2), (6, 12, 3),
(7, 14, 2), (8, 14, 3)
Now, here I am trying to check if in each group by FK_ID we have exact match of TYPE_ID values 1 & 2.
So, the expected output is like:
(1, 10, 1) this should fail
As in group FK_ID = 10 we only have one record
(2, 11, 1), (3, 11, 2) this should pass
As in group FK_ID = 11 we have two records.
And both the TYPE_ID are matching 1 & 2 values.
(4, 12, 1), (5, 12, 2), (6, 12, 3) this should also fail
As we have 3 records here.
(7, 14, 2), (8, 14, 3) this should also fail
Even though we have exact two records, it should fail as the TYPE_ID here are not matching with 1 & 2 values.
Here is my attempt:
select *
from mytable t1
where exists (select count(t2.TYPE_ID)
from mytable t2
where t2.FK_ID = t1.FK_ID
and t2.TYPE_ID in (1, 2)
group by t2.FK_ID
having count(t2.TYPE_ID) = 2);
This is not working as expected, because it also pass for FK_ID = 12 which has three records.
Demo: SQL Fiddle
There are probably several different ways of doing this. One could be:
SELECT FK_ID
FROM mytable
GROUP BY FK_ID
HAVING COUNT(*) = 2
AND MIN(TYPE_ID) = 1
AND MAX(TYPE_ID) = 2
We can add min and max to the group by query
select t1.* from mytable t1,
( select fk_id, count(*) As cnt from mytable
Group by fk_id
Having count(*) = 2
AND max(type_id)=2
ANd min(Type_id) = 1) As t2
Where t1.fk_id = t2.fk_id
Another way, but less optimal than Nenad's, is to use SELECT INTO (with output to temporary table) and then with another query SELECT only these rows that have proper TYPE_ID values.

Logic to check if exact ids (3+ records) are present in a group in SQL Server

I have some sample data like:
INSERT INTO mytable
([FK_ID], [TYPE_ID])
VALUES
(10, 1),
(11, 1), (11, 2),
(12, 1), (12, 2), (12, 3),
(14, 1), (14, 2), (14, 3), (14, 4),
(15, 1), (15, 2), (15, 4)
Now, here I am trying to check if in each group by FK_ID we have exact match of TYPE_ID values for 1, 2 & 3.
So, the expected output is like:
(10, 1) this should fail
As in group FK_ID = 10 we only have one record
(11, 1), (11, 2) this should also fail
As in group FK_ID = 11 we have two records.
(12, 1), (12, 2), (12, 3) this should pass
As in group FK_ID = 12 we have two records.
And all the TYPE_ID are exactly matching 1, 2 & 3 values.
(14, 1), (14, 2), (14, 3), (14, 4) this should also fail
As we have 4 records here.
(15, 1), (15, 2), (15, 4) this should also fail
Even though we have three records, it should fail as the TYPE_ID here (1, 2, 4) are not matching with required match (1, 2, 3).
Here is my attempt:
select * from mytable t1
where exists (select COUNT(t2.TYPE_ID)
from mytable t2 where t2.FK_ID = t1.FK_ID
and t2.TYPE_ID IN (1, 2, 3)
group by t2.FK_ID having COUNT(t2.TYPE_ID) = 3);
This is not working as expected, because it also pass for FK_ID = 14 which has four records.
Demo: SQL Fiddle
Also, how we can make it generic so that if we need to check for 4 or more TYPE_ID values like (1,2,3,4) or (1,2,3,4,5), we can do that easily by updating few values.
The following query will do what you want:
select fk_id
from t
group by fk_id
having sum(case when type_id in (1, 2, 3) then 1 else 0 end) = 3 and
sum(case when type_id not in (1, 2, 3) then 1 else 0 end) = 0;
This assumes that you have no duplicate pairs (although depending on how you want to handle duplicates, it might be as easy as using, from (select distinct * from t) t).
As for "genericness", you need to update the in lists and the 3.
If you want something more generic:
with vals as (
select id
from (values (1), (2), (3)) v(id)
)
select fk_id
from t
group by fk_id
having sum(case when type_id in (select id from vals) then 1 else 0 end) = (select count(*) from vals) and
sum(case when type_id not in (select id from vals) then 1 else 0 end) = 0;
You can use this code:
SELECT y.fk_id FROM
(SELECT x.fk_id, COUNT(x.type_id) AS count, SUM(x.type_id) AS sum
FROM mytable x GROUP BY (x.fk_id)) AS y
WHERE y.count = 3 AND y.sum = 6
For making it generic, you can equal y.count with N and y.sum with N*(N-1)/2, where N is the number you are looking for (1, 2, ..., N).
You can try this query. COUNT and DISTINCT used for eliminate duplicate records.
SELECT
[FK_ID]
FROM
#mytable T
GROUP BY
[FK_ID]
HAVING
COUNT(DISTINCT CASE WHEN [TYPE_ID] IN (1,2,3) THEN [TYPE_ID] END) = 3
AND COUNT(CASE WHEN [TYPE_ID] NOT IN (1,2,3) THEN [TYPE_ID] END) = 0
Try this:
select FK_ID,count(distinct TYPE_ID) from mytable
where TYPE_ID<=3
group by FK_ID
having count(distinct TYPE_ID)=3
You should use CTE with Dynamic pass Value which you have mentioned in Q.
WITH CTE
AS (
SELECT FK_ID,
COUNT(*) CNT
FROM #mytable
GROUP BY FK_ID
HAVING COUNT(*) = 3) <----- Pass Value here What you want to Display Result,
CTE1
AS (
SELECT T.[ID],
T.[FK_ID],
T.[TYPE_ID],
ROW_NUMBER() OVER(PARTITION BY T.[FK_ID] ORDER BY
(
SELECT NULL
)) RN
FROM #mytable T
INNER JOIN CTE C ON C.FK_ID = T.FK_ID),
CTE2
AS (
SELECT C1.FK_ID
FROM CTE1 C1
GROUP BY C1.FK_ID
HAVING SUM(C1.TYPE_ID) = SUM(C1.RN))
SELECT TT1.*
FROM CTE2 C2
INNER JOIN #mytable TT1 ON TT1.FK_ID = C2.FK_ID;
From above SQL Command which will produce Result (I have passed 3) :
ID FK_ID TYPE_ID
4 12 1
5 12 2
6 12 3