SQL query return only unique results - sql

I have the following SQL query which is working correctly...
SELECT *
FROM
wp_rg_lead
INNER JOIN wp_rg_lead_detail ON
wp_rg_lead.id=wp_rg_lead_detail.lead_id
WHERE wp_rg_lead.form_id = '7'
and cast(date_created as date) >= current_date - interval '7' day
I would like to modify this to only return unique records. The problem I have is that each 'entry' is actually made up of several like so....
id | form_id | entry_id | value
--------------------------------------------
1 1 75 red
2 1 75 broken
3 1 75 apple
4 7 33 yellow
5 7 33 faulty
6 7 33 banana
7 7 33 ripe
8 7 22 red
9 7 22 strawberry
10 7 22 broken
11 7 22 squashed
Using the above data I would only want it to return 2 results

Does this do what you want?
SELECT distinct form_id
FROM wp_rg_lead INNER JOIN
wp_rg_lead_detail
ON wp_rg_lead.id = wp_rg_lead_detail.lead_id
WHERE cast(date_created as date) >= current_date - interval '7' day;
If you want an entire arbitrary row, it looks like you are using Postgres, so you can use distinct on:
SELECT distinct on (form_id) *
FROM wp_rg_lead INNER JOIN
wp_rg_lead_detail
ON wp_rg_lead.id = wp_rg_lead_detail.lead_id
WHERE cast(date_created as date) >= current_date - interval '7' day
ORDER BY form_id, id;

Related

Snowflake SQL - Count Distinct Users within descending time interval

I want to count the distinct amount of users over the last 60 days, and then, count the distinct amount of users over the last 59 days, and so on and so forth.
Ideally, the output would look like this (TARGET OUTPUT)
Day Distinct Users
60 200
59 200
58 188
57 185
56 180
[...] [...]
where 60 days is the max total possible distinct users, and then 59 would have a little less and so on and so forth.
my query looks like this.
select
count(distinct (case when datediff(day,DATE,current_date) <= 60 then USER_ID end)) as day_60,
count(distinct (case when datediff(day,DATE,current_date) <= 59 then USER_ID end)) as day_59,
count(distinct (case when datediff(day,DATE,current_date) <= 58 then USER_ID end)) as day_58
FROM Table
The issue with my query is that This outputs the data by column instead of by rows (like shown below) AND, most importantly, I have to write out this logic 60x for each of the 60 days.
Current Output:
Day_60 Day_59 Day_58
209 207 207
Is it possible to write the SQL in a way that creates the target as shown initially above?
Using below data in CTE format -
with data_cte(dates,userid) as
(select * from values
('2022-05-01'::date,'UID1'),
('2022-05-01'::date,'UID2'),
('2022-05-02'::date,'UID1'),
('2022-05-02'::date,'UID2'),
('2022-05-03'::date,'UID1'),
('2022-05-03'::date,'UID2'),
('2022-05-03'::date,'UID3'),
('2022-05-04'::date,'UID1'),
('2022-05-04'::date,'UID1'),
('2022-05-04'::date,'UID2'),
('2022-05-04'::date,'UID3'),
('2022-05-04'::date,'UID4'),
('2022-05-05'::date,'UID1'),
('2022-05-06'::date,'UID1'),
('2022-05-07'::date,'UID1'),
('2022-05-07'::date,'UID2'),
('2022-05-08'::date,'UID1')
)
Query to get all dates and count and distinct counts -
select dates,count(userid) cnt, count(distinct userid) cnt_d
from data_cte
group by dates;
DATES
CNT
CNT_D
2022-05-01
2
2
2022-05-02
2
2
2022-05-03
3
3
2022-05-04
5
4
2022-05-05
1
1
2022-05-06
1
1
2022-05-08
1
1
2022-05-07
2
2
Query to get difference of date from current date
select dates,datediff(day,dates,current_date()) ddiff,
count(userid) cnt,
count(distinct userid) cnt_d
from data_cte
group by dates;
DATES
DDIFF
CNT
CNT_D
2022-05-01
45
2
2
2022-05-02
44
2
2
2022-05-03
43
3
3
2022-05-04
42
5
4
2022-05-05
41
1
1
2022-05-06
40
1
1
2022-05-08
38
1
1
2022-05-07
39
2
2
Get records with date difference beyond a certain range only -
include clause having
select datediff(day,dates,current_date()) ddiff,
count(userid) cnt,
count(distinct userid) cnt_d
from data_cte
group by dates
having ddiff<=43;
DDIFF
CNT
CNT_D
43
3
3
42
5
4
41
1
1
39
2
2
38
1
1
40
1
1
If you need to prefix 'day' to each date diff count, you can
add and outer query to previously fetched data-set and add the needed prefix to the date diff column as following -
I am using CTE syntax, but you may use sub-query given you will select from table -
,cte_1 as (
select datediff(day,dates,current_date()) ddiff,
count(userid) cnt,
count(distinct userid) cnt_d
from data_cte
group by dates
having ddiff<=43)
select 'day_'||to_char(ddiff) days,
cnt,
cnt_d
from cte_1;
DAYS
CNT
CNT_D
day_43
3
3
day_42
5
4
day_41
1
1
day_39
2
2
day_38
1
1
day_40
1
1
Updated the answer to get distinct user count for number of days range.
A clause can be included in the final query to limit to number of days needed.
with data_cte(dates,userid) as
(select * from values
('2022-05-01'::date,'UID1'),
('2022-05-01'::date,'UID2'),
('2022-05-02'::date,'UID1'),
('2022-05-02'::date,'UID2'),
('2022-05-03'::date,'UID5'),
('2022-05-03'::date,'UID2'),
('2022-05-03'::date,'UID3'),
('2022-05-04'::date,'UID1'),
('2022-05-04'::date,'UID6'),
('2022-05-04'::date,'UID2'),
('2022-05-04'::date,'UID3'),
('2022-05-04'::date,'UID4'),
('2022-05-05'::date,'UID7'),
('2022-05-06'::date,'UID1'),
('2022-05-07'::date,'UID8'),
('2022-05-07'::date,'UID2'),
('2022-05-08'::date,'UID9')
),cte_1 as
(select datediff(day,dates,current_date()) ddiff,userid
from data_cte), cte_2 as
(select distinct ddiff from cte_1 )
select cte_2.ddiff,
(select count(distinct userid)
from cte_1 where cte_1.ddiff <= cte_2.ddiff) cnt
from cte_2
order by cte_2.ddiff desc
DDIFF
CNT
47
9
46
9
45
9
44
8
43
5
42
4
41
3
40
1
You can do unpivot after getting your current output.
sample one.
select
*
from (
select
209 Day_60,
207 Day_59,
207 Day_58
)unpivot ( cnt for days in (Day_60,Day_59,Day_58));

Select max of nested id from amazon redshift

My database is an amazon redshift.
I have a table that looks like this -
id
nested_id
date
value
1
10
'2021-01-01'
5
1
20
'2021-01-01'
10
1
10
'2021-01-02'
6
1
20
'2021-01-02'
11
1
10
'2021-01-03'
7
1
20
'2021-01-03'
12
2
30
'2021-01-01'
5
2
40
'2021-01-01'
10
2
30
'2021-01-02'
6
2
40
'2021-01-02'
11
2
30
'2021-01-03'
7
2
40
'2021-01-03'
12
So this is basically a table that tracks values by id over time, except for every id there can be a nested_id. And the dates and values are primarily connected to the nested_id.
However, let's say I'm starting with the id field, but for each id I want to only return the points over time for the nested_id that has the greater sum of points.
So right now I'm just grabbing it like this...
select *
from mytable
where id in (1, 2)
except I only want it to return nested_id rows where the maximum value of that nested_id is the greatest.
So here's how I would do this manually.
For id of 1, the maximum value is 12, and the nested_id of that value is 20
For id of 2, the maximum value is 12, and the nested_id of that value is 40
So my return table should be
id
nested_id
date
value
1
20
'2021-01-01'
10
1
20
'2021-01-02'
11
1
20
'2021-01-03'
12
2
40
'2021-01-01'
10
2
40
'2021-01-02'
11
2
40
'2021-01-03'
12
Is there an easy way of performing this query? I'm assuming you have to partition somehow?
You can solve this with row_number window functions
with maxs as (
select id,
nested_id,
value,
row_number() over (partition by id order by value desc) rn
from mytable
)
select mt.*
from mytable mt
left join maxs on mt.id = maxs.id and mt.nested_id = maxs.nested_id
where maxs.rn = 1

How to Count Distinct for SAS PROC SQL with Rolling Date Window?

I have a SAS dataset that looks like the one below:
MEMBER_ID VAR1 VAR2 DATE
1 12 5 01/04/2020
1 12 5 02/06/2020
1 16 5 04/14/2020
1 12 7 09/10/2020
2 10 5 02/20/2020
2 10 6 04/20/2020
2 10 5 04/25/2020
2 10 5 05/15/2020
3 15 3 01/15/2020
3 16 4 01/25/2020
4 10 5 05/15/2020
5 11 7 03/03/2020
5 12 8 04/03/2020
5 13 9 05/03/2020
My goal is to count the distinct values in VAR1 and VAR2 grouped by MEMBER_ID and a rolling date range of 180 days. So if the date in row 2 is within the 180 days of row 1 for member 1, then they will be counted (distinctly). My current code looks as follows:
PROC SQL;
CREATE TABLE WORK.WANT AS
SELECT t1.MEMBER_ID,
t1.VAR1,
t1.VAR2,
t1.DATE,
/* var1Count */
(COUNT(DISTINCT(t1.VAR1))) FORMAT=10. LABEL="var1Count " WHERE (t1 BETWEEN t1.DATE- 180 AND t1.DATE) AS var1Count ,
/* var2Count */
(COUNT(DISTINCT(t1.VAR2))) FORMAT=10. LABEL="var2Count " WHERE (t1 BETWEEN t1.DATE- 180 AND t1.DATE) AS var2Count ,
FROM WORK.HAVE t1
GROUP BY t1.MEMBER_ID
HAVING (CALCULATED var1Count ) >= 2 AND (CALCULATED var2Count ) >= 2
ORDER BY t1.MEMBER_ID,
t1.DATE;
QUIT;
But while I think this WHERE statement in the column calculation may work for regular SQL code, it's giving me errors here. Any other ideas? It may be that I need to do this COUNT(DISTINCT VAR) in a different SAS data step, but I'm unsure (and fairly new to SAS for that matter). Any help at all is greatly appreciated!
I think you need to use correlated subqueries for this in SAS:
SELECT h.* ,
(SELECT COUNT(DISTINCT h2.VAR1)
FROM WORK.HAVE h2
WHERE h2.MEMBER_ID = h.MEMBER_ID AND
h2.DATE BETWEEN h.DATE - 180 AND h.DATE
) as var1count,
(SELECT COUNT(DISTINCT h2.VAR2)
FROM WORK.HAVE h2
WHERE h2.MEMBER_ID = h.MEMBER_ID AND
h2.DATE BETWEEN h.DATE - 180 AND h.DATE
) as var2count
FROM WORK.HAVE h;
If you want to filter on the counts, you can use a subquery.

How to group records by hours considering start date and end date

I'm trying to group records by hours with consideration of duration. Assume there are long running processes and there is log data when process has been started and finished. I'm trying to get report by hours how many processes were running
The data looks like this
Process_name Start End
'A' '2019/01/01 14:10' '2019/01/01/ 14:55'
'B' '2019/01/01 14:20' '2019/01/01/ 16:30'
'C' '2019/01/01 15:05' '2019/01/01/ 15:10'
The result should be like this
Hour ProcessQount
14 2
15 2
16 1
You can do it if you join a recursive cte which returns all the hours of the day to the table:
with cte as (
select 0 as hour
union all
select hour + 1
from cte
where hour < 23
)
select c.hour Hour, count(*) ProcessQount
from cte c inner join tablename t
on c.hour between datepart(hh, t.[Start]) and datepart(hh, t.[End])
group by c.hour
See the demo.
Results:
> Hour | ProcessQount
> ---: | -----------:
> 14 | 2
> 15 | 2
> 16 | 1
If you change to a LEFT JOIN and count([Process_name]) then you get results for all the hours of the day:
> Hour | ProcessQount
.........................
> 12 | 0
> 13 | 0
> 14 | 2
> 15 | 2
> 16 | 1
> 17 | 0
> 18 | 0
.........................
Generate the hours and then use inequalities and aggregation:
select h, count(t.process_name)
from (values (14), (15), (16)) v(h) left join
t
on datepart(hour, start <= v.h) and
datepart(hour, end >= v.h)
group by v.h
order by v.h;
For reasonable results, this assumes that all the data you are looking at is for one day, as in your sample data.

Count parts of total value as columns per row (pivot table)

I'm stuck with a seemingly easy query, but couldn't manage to get it working the last hours.
I have a table files that holds file names and some values like records in this file, DATE of creation (create_date), DATE of processing (processing_date) and so on. There can be multiple files for a create date in different hours and it is likely that they will not get processed in the same day of creaton, in fact it can even take up to three days or longer for them to get processed.
So let's assume I have these rows, as an example:
create_date | processing_date
------------------------------
2012-09-10 11:10:55.0 | 2012-09-11 18:00:18.0
2012-09-10 15:20:18.0 | 2012-09-11 13:38:19.0
2012-09-10 19:30:48.0 | 2012-09-12 10:59:00.0
2012-09-11 08:19:11.0 | 2012-09-11 18:14:44.0
2012-09-11 22:31:42.0 | 2012-09-21 03:51:09.0
What I want in a single query is to get a grouped column truncated to the day create_date with 11 additional columns for the differences between the processing_date and the create_date, so that the result should roughly look like this:
create_date | diff0days | diff1days | diff2days | ... | diff10days
------------------------------------------------------------------------
2012-09-10 | 0 2 1 ... 0
2012-09-11 | 1 0 0 ... 1
and so on, I hope you get the point :)
I have tried this and so far it works getting a single aggregated column for a create_date with a difference of - for example - 3:
SELECT TRUNC(f.create_date, 'DD') as created, count(1) FROM files f WHERE TRUNC(f.process_date, 'DD') - trunc(f.create_date, 'DD') = 3 GROUP BY TRUNC(f.create_date, 'DD')
I tried combining the single queries and I tried sub-queries, but that didn't help or at least my knowledge about SQL is not sufficient.
What I need is a hint so that I can include the various differences as columns, like shown above. How could I possibly achieve this?
That's basically the pivoting problem:
SELECT TRUNC(f.create_date, 'DD') as created
, sum(case TRUNC(f.process_date, 'DD') - trunc(f.create_date, 'DD')
when 0 then 1 end) as diff0days
, sum(case TRUNC(f.process_date, 'DD') - trunc(f.create_date, 'DD')
when 1 then 1 end) as diff1days
, sum(case TRUNC(f.process_date, 'DD') - trunc(f.create_date, 'DD')
when 2 then 1 end) as diff2days
, ...
FROM files f
GROUP BY
TRUNC(f.create_date, 'DD')
SELECT CreateDate,
sum(CASE WHEN DateDiff(day, CreateDate, ProcessDate) = 1 THEN 1 ELSE 0 END) AS Diff1,
sum(CASE WHEN DateDiff(day, CreateDate, ProcessDate) = 2 THEN 1 ELSE 0 END) AS Diff2,
...
FROM table
GROUP BY CreateDate
ORDER BY CreateDate
As you are using Oracle 11g you can also get desired result by using pivot query.
Here is an example:
-- sample of data from your question
SQL> create table Your_table(create_date, processing_date) as
2 (
3 select '2012-09-10', '2012-09-11' from dual union all
4 select '2012-09-10', '2012-09-11' from dual union all
5 select '2012-09-10', '2012-09-12' from dual union all
6 select '2012-09-11', '2012-09-11' from dual union all
7 select '2012-09-11', '2012-09-21' from dual
8 )
9 ;
Table created
SQL> with t2 as(
2 select create_date
3 , processing_date
4 , to_date(processing_date, 'YYYY-MM-DD')
- To_Date(create_date, 'YYYY-MM-DD') dif
5 from your_table
6 )
7 select create_date
8 , max(diff0) diff0
9 , max(diff1) diff1
10 , max(diff2) diff2
11 , max(diff3) diff3
12 , max(diff4) diff4
13 , max(diff5) diff5
14 , max(diff6) diff6
15 , max(diff7) diff7
16 , max(diff8) diff8
17 , max(diff9) diff9
18 , max(diff10) diff10
19 from (select *
20 from t2
21 pivot(
22 count(dif)
23 for dif in ( 0 diff0
24 , 1 diff1
25 , 2 diff2
26 , 3 diff3
27 , 4 diff4
28 , 5 diff5
29 , 6 diff6
30 , 7 diff7
31 , 8 diff8
32 , 9 diff9
33 , 10 diff10
34 )
35 ) pd
36 ) res
37 group by create_date
38 ;
Result:
Create_Date Diff0 Diff1 Diff2 Diff3 Diff4 Diff5 Diff6 Diff7 Diff8 Diff9 Diff10
--------------------------------------------------------------------------------
2012-09-10 0 2 1 0 0 0 0 0 0 0 0
2012-09-11 1 0 0 0 0 0 0 0 0 0 1