SQL count DISTINCT ONCE user_id multiple attributes - sql

Hello there I cant manage to get a good result for the following case:
I have a table which is like this:
UserID | Label
-------- ------
1 | Private
1 | Public
2 | Private
3 | Hidden
4 | Public
5 | Hidden
I want to have the following happening if a User has following assigned he is:
Private and Hidden are treaten the same: lets say Business
Public: BtoC
Public and Private and/or Hidden: both
So in the end I have a count(DISTINCT UserID) of
Business 3
BtoC 1
both 1
I have tried to use CASE WHEN but it doesn't work my current total query looks like this:
SELECT gen_month,
count(DISTINCT cu.id) as leads,
a.label
FROM generate_series(DATE_TRUNC('month', CURRENT_DATE::date - 96*INTERVAL '1 month'), CURRENT_DATE::date, '1 month') m(gen_month)
LEFT OUTER JOIN company_user AS cu
ON (date_trunc('month', cu.creation_date) = date_trunc('month', gen_month))
LEFT JOIN user u
ON u.user_id = cu.id
LEFT join user_account_status as uas
on cu.id = uas.user_id
LEFT JOIN account as a
on uas.account_id = a.id
where gen_month >= DATE_TRUNC('month',NOW() - INTERVAL '5 months')
group by m.gen_month, a.label
order by gen_month
So my main problem now is that the count appears in every attribute once.
How can I make a userid only count once under condition CASE WHEN user_id appears Public and (Private or Hidden) THEN count(DISTINCT user_id) as Both?
Addition: its mySQL mariaDB and postgreSQL. But first I would happy with Postgres

This is not implemented in your total query, but for counting users for each category, you can:
with the_table(UserID , Label) as(
select 1 ,'Private' union all
select 1 ,'Public' union all
select 2 ,'Private' union all
select 3 ,'Hidden' union all
select 4 ,'Public' union all
select 5 ,'Hidden'
)
select result, count(*) from (
select UserID, case when min(Label) = 'Public' then 'BtoC' when max(Label) in('Private','Hidden') then 'Business' else 'both' end as result
from the_table
group by UserID
) t
group by result

with
my_table(user_id, label) as (values
(1,'Private'),
(1,'Public'),
(2,'Private'),
(3,'Hidden'),
(4,'Public'),
(5,'Hidden')),
t as (
select
user_id,
string_agg('{'||label||'}', '') as labels
from my_table
group by user_id),
tt as (
select
user_id,
labels,
case
when
position('{Public}' in labels) > 0 and (position('{Private}' in labels) > 0 or position('{Hidden}' in labels) > 0) then 'Both'
when
position('{Private}' in labels) > 0 or position('{Hidden}' in labels) > 0 then 'Business'
when
position('{Public}' in labels) > 0 then 'BtoC'
end as kind
from t)
select kind, count(*) from tt group by kind;
For MariaDB use GROUP_CONCAT() instead of PostgreSQL string_agg().
Note that the case statement check conditions in order of appearance and returns the value for the first satisfied condition.
PS: Using PostgreSQL's arrays the conditions would be more elegant.

Related

Consolidate information (time serie) from two tables

MS SQL Server
I have two tables with different accounts from the same customer:
Table1:
ID
ACCOUNT
FROM
TO
1
A
01.10.2019
01.12.2019
1
A
01.02.2020
09.09.9999
and table2:
ID
ACCOUNT
FROM
TO
1
B
01.12.2019
01.01.2020
As result I want a table that summarize the story of this costumer and shows when he had an active account and when he doesn't.
Result:
ID
FROM
TO
ACTIV Y/N
1
01.10.2019
01.01.2020
Y
1
02.01.2020
31.01.2020
N
1
01.02.2020
09.09.9999
Y
Can someone help me with some ideas how to proceed?
This is the typical gaps and island problem, and it's not usually easy to solve.
You can achieve your goal using this query, I will explain it a little bit.
You can test on this db<>fiddle.
First of all... I have unified your two tables into one to simplify the query.
-- ##table1
select 1 as ID, 'A' as ACCOUNT, convert(date,'2019-10-01') as F, convert(date,'2019-12-01') as T into ##table1
union all
select 1 as ID, 'A' as ACCOUNT, convert(date,'2020-02-01') as F, convert(date,'9999-09-09') as T
-- ##table2
select 1 as ID, 'B' as ACCOUNT, convert(date,'2019-12-01') as F, convert(date,'2020-01-01') as T into ##table2
-- ##table3
select * into ##table3 from ##table1 union all select * from ##table2
You can then get your gaps and island using, for example, a query like this.
It combines recursive cte to generate a calendar (cte_cal) and lag and lead operations to get the previous/next record information to build the gaps.
with
cte_cal as (
select min(F) as D from ##table3
union all
select dateadd(day,1,D) from cte_cal where d < = '2021-01-01'
),
table4 as (
select t1.ID, t1.ACCOUNT, t1.F, isnull(t2.T, t1.T) as T, lag(t2.F, 1,null) over (order by t1.F) as SUP
from ##table3 t1
left join ##table3 t2
on t1.T=t2.F
)
select
ID,
case when T = D then F else D end as "FROM",
isnull(dateadd(day,-1,lead(D,1,null) over (order by D)),'9999-09-09') as "TO",
case when case when T = D then F else D end = F then 'Y' else 'N' end as "ACTIV Y/N"
from (
select *
from cte_cal c
cross apply (
select t.*
from table4 t
where t.SUP is null
and (
c.D = t or
c.D = dateadd(day,1,t.T)
)
) t
union all
select F, * from table4 where T = '9999-09-09'
) p
order by 1
option (maxrecursion 0)
Dates like '9999-09-09' must be treated like exceptions, otherwise I would have to create a calendar until that date, so the query would take long time to resolve.

User Life Cycle SQL Query Logic in Snowflake

I am working on building a query to track the life cycle of an user through the platform via events. The table EVENTS has 3 columns USER_ID, DATE_TIME and EVENT_NAME. Below is a snapshot of the table,
My query should return the below result (the first timestamp for the registered event followed by the immediate/next timestamp of the following log_in event and finally followed by the immediate/next timestamp of the final landing_page event),
Below is my query ,
WITH FIRST_STEP AS
(SELECT
USER_ID,
MIN(CASE WHEN EVENT_NAME = 'registered' THEN DATE_TIME ELSE NULL END) AS REGISTERED_TIMESTAMP
FROM EVENTS
GROUP BY 1
),
SECOND_STEP AS
(SELECT * FROM EVENTS
WHERE EVENT_NAME = 'log_in'
ORDER BY DATE_TIME
),
THIRD_STEP AS
(SELECT * FROM EVENTS
WHERE EVENT_NAME = 'landing_page'
ORDER BY DATE_TIME
)
SELECT
a.USER_ID,
a.REGISTERED_TIMESTAMP,
(SELECT
CASE WHEN b.DATE_TIME >= a.REGISTRATIONS_TIMESTAMP THEN b.DATE_TIME END AS LOG_IN_TIMESTAMP
FROM SECOND_STEP
LIMIT 1
),
(SELECT
CASE WHEN c.DATE_TIME >= LOG_IN_TIMESTAMP THEN c.DATE_TIME END AS LANDING_PAGE_TIMESTAMP
FROM THIRD_STEP
LIMIT 1
)
FROM FIRST_STEP AS a
LEFT JOIN SECOND_STEP AS b ON a.USER_ID = b.USER_ID
LEFT JOIN THIRD_STEP AS c ON b.USER_ID = c.USER_ID;
Unfortunately I am getting the "SQL compilation error: Unsupported subquery type cannot be evaluated" error when I try to run the query
This is a perfect use case for MATCH_RECOGNIZE.
The pattern you are looking for is register anything* login anything* landing and the measures are the min(iff(event_name='x', date_time, null)) for each.
Check:
https://towardsdatascience.com/funnel-analytics-with-sql-match-recognize-on-snowflake-8bd576d9b7b1
https://docs.snowflake.com/en/user-guide/match-recognize-introduction.html
Set the output to one row per match.
Untested sample query:
select *
from data
match_recognize(
partition by user_id
order by date_time
measures min(iff(event_name='register', date_time, null)) as t1
, min(iff(event_name='log_in', date_time, null)) as t2
, min(iff(event_name='landing_page', date_time, null)) as t3
one row per match
pattern(register anything* login anything* landing)
define
register as event_name = 'register'
, login as event_name = 'log_in'
, landing as event_name = 'landing_page'
);

Non duplicate records with max date query on oracle

Hello i have a problem with a simple query. I need to see the max date of some articles in two direfent sites.
This is my actual query:
SELECT a.aa_codart, MAX(t.tr_fechafac), t.tr_tipo
FROM ARTALM a, traspaso t
WHERE t.tr_codart = a.aa_codart
and t.tr_tipomov > 1
and a.aa_codalm = '1'
and (t.tr_tipo >= 1 and t.tr_tipo <=2)
group by a.aa_codart, t.tr_tipo;
And the result:
01..FRB10X80 30/11/07 2
01..FRB10X80 08/03/01 1
01.32122RS 05/02/16 1
01.32122RS 02/07/10 2
01.33052Z 21/09/15 1
01.60042Z 24/02/16 2
I want, for example in the two first rows, see only one row, like this:
01..FRB10X80 30/11/07 2
01.32122RS 05/02/16 1
01.33052Z 21/09/15 1
01.60042Z 24/02/16 2
Taking the max date
Thanks
This calls for an analytical query. This query shows how the ROW_NUMBER() function will assign the value 1 to the row with the article's most recent date. Give it a try first to help understand the final query, coming up next:
SELECT
a.aa_codart,
t.tr_fechafac,
t.tr_tipo,
ROW_NUMBER() OVER (PARTITION BY a.aa_codart ORDER BY t.tr_fechafac DESC) as rnk
FROM artalm a
INNER JOIN trapaso t ON a.aa_codart = t.tr_codart
WHERE t.tr_tipomov > 1
AND a.aa_codalm = '1'
AND t.tr_tipo BETWEEN 1 AND 2
You can't apply the WHERE clause to the rnk column because the column is calculated after the WHERE clause. You can get around this using a nested query:
SELECT * FROM (
SELECT
a.aa_codart,
t.tr_fechafac,
t.tr_tipo,
ROW_NUMBER() OVER (PARTITION BY a.aa_codart ORDER BY t.tr_fechafac DESC) as rnk
FROM artalm a
INNER JOIN trapaso t ON a.aa_codart = t.tr_codart
WHERE t.tr_tipomov > 1
AND a.aa_codalm = '1'
AND t.tr_tipo BETWEEN 1 AND 2
) WHERE rnk = 1;
I apologize in advance for any column names I may have retyped badly. The Oracle syntax should be fine; the column names maybe not so much :)
I think you may want to look at row_number() (then just pick the ones where it is one) something like this.
WITH t
AS (SELECT 'A' aa_codart,
TO_DATE ('17/05/00', 'dd/mm/yy') mydt,
1 tr_tipo
FROM DUAL
UNION ALL
SELECT 'A', TO_DATE ('12/04/00', 'dd/mm/yy'), 2 FROM DUAL
UNION ALL
SELECT 'B', TO_DATE ('30/06/98', 'dd/mm/yy'), 2 FROM DUAL
UNION ALL
SELECT 'C', TO_DATE ('30/06/98 ', 'dd/mm/yy'), 2 FROM DUAL),
t2
AS (SELECT aa_codart,
mydt,
tr_tipo,
ROW_NUMBER ()
OVER (PARTITION BY aa_codart ORDER BY mydt DESC)
rn
FROM t)
SELECT *
FROM t2
WHERE rn = 1

SQL query top 2 columns of joined table?

I am having no luck attempting to get the top (x number) of rows from a joined table. I want the top 2 resources (ordered by name) which in this case should be Katie and Simon and regardless of what I've tried, I can't seem to get it right. You can see below what I've commented out - and what looks like it should work (but doesn't). I cannot use a union. Any ideas?
select distinct
RTRESOURCE.RNAME as Resource,
RTTASK.TASK as taskname, SUM(distinct SOTRAN.QTY2BILL) AS quantitytobill from SOTRAN AS SOTRAN INNER JOIN RTTASK AS RTTASK ON sotran.taskid = rttask.taskid
left outer JOIN RTRESOURCE AS RTRESOURCE ON rtresource.keyno=sotran.resid
WHERE sotran.phantom<>'y' and sotran.pgroup = 'L' and sotran.timesheet = 'y' and sotran.taskid >0 AND RTRESOURCE.KEYNO in ('193','159','200') AND ( SOTRAN.ADDDATE>='8/15/2015 12:00:00 AM' AND SOTRAN.ADDDATE<'9/3/2015 11:59:59 PM' )
//and RTRESOURCE.RNAME in ( select distinct top 2 RTRESOURCE.RNAME from RTRESOURCE order by RTRESOURCE.RNAME)
//and ( select count(*) from RTRESOURCE RTRESOURCE2 where RTRESOURCE2.RNAME = RTRESOURCE.RNAME ) <= 2
GROUP BY RTRESOURCE.rname,RTTASK.task,RTTASK.taskid,RTTASK.mdsstring ORDER BY Resource,taskname
You should provide a schema.
But lets assume your query work. You create a CTE.
WITH youQuery as (
SELECT *
FROM < you big join query>
), maxBill as (
SELECT Resource, Max(quantitytobill) as Bill
FROM yourQuery
)
SELECT top 2 *
FROM maxBill
ORDER BY Bill
IF you want top 2 alphabetical
WITH youQuery as (
SELECT *
FROM < you big join query>
), Names as (
SELECT distinct Resource
FROM yourQuery
Order by Resource
)
SELECT top 2 *
FROM Names

T-sql problem with running sum

I am trying to write T-sql script which will find "open" records for one table
Structure of data is following
Id (int PK) Ts (datetime) Art_id (int) Amount (float)
1 '2009-01-01' 1 1
2 '2009-01-05' 1 -1
3 '2009-01-10' 1 1
4 '2009-01-11' 1 -1
5 '2009-01-13' 1 1
6 '2009-01-14' 1 1
7 '2009-01-15' 2 1
8 '2009-01-17' 2 -1
9 '2009-01-18' 2 1
According to my needs I am trying to show only records after last sum for every one articles where 0 sorting by date of last running sum of zero value. So I am trying to abstract (show) records 5 and 6 for Art_id=1 and record 9 for art_id=2. I am using MSSQL2005 and my table has around 30K records with 6000 distinct values of ART_ID.
In this solution I simply want to find all the rows where there isn't a subsequent row for that Art_id where the running sum was 0. I am assuming we can use the ID as a better tiebreaker than TS, since two rows can come in with the same timestamp but they will get sequential identity values.
;WITH base AS
(
SELECT
ID, Art_id, TS, Amount,
RunningSum = Amount + COALESCE
(
(
SELECT SUM(Amount)
FROM dbo.foo
WHERE Art_id = f.Art_id
AND ID < f.ID
)
, 0
)
FROM dbo.[table name] AS f
)
SELECT ID, Art_id, TS, Amount
FROM base AS b1
WHERE NOT EXISTS
(
SELECT 1
FROM base AS b2
WHERE Art_id = b1.Art_id
AND ID >= b1.ID
AND RunningSum = 0
)
ORDER BY ID;
Complete working query:
SELECT
*
FROM TABLE_NAME E
JOIN
(SELECT
C.ART_ID,
MAX(TS) MAX_TS
FROM
(SELECT
ART_ID,
TS,
COALESCE((SELECT SUM(AMOUNT) FROM TABLE_NAME B WHERE (B.Art_id = A.Art_id) AND (B.Ts < A.Ts)),0) ROW_SUM
FROM TABLE_NAME A) C
WHERE C.ROW_SUM = 0
GROUP BY C.ART_ID) D
ON
(D.ART_ID = E.ART_ID) AND
(E.TS >= D.MAX_TS)
First we calculate running sums for every row:
SELECT
ART_ID,
TS,
COALESCE((SELECT SUM(AMOUNT) FROM TABLE_NAME B WHERE (B.Art_id = A.Art_id) AND (B.Ts < A.Ts)),0) ROW_SUM
FROM TABLE_NAME A
Then we look for last article with 0:
SELECT
C.ART_ID,
MAX(TS) MAX_TS
FROM
(SELECT
ART_ID,
TS,
COALESCE((SELECT SUM(AMOUNT) FROM TABLE_NAME B WHERE (B.Art_id = A.Art_id) AND (B.Ts < A.Ts)),0) ROW_SUM
FROM TABLE_NAME A) C
WHERE C.ROW_SUM = 0
GROUP BY C.ART_ID
You can find all rows where the running sum is zero with:
select cur.id, cur.art_id
from #articles cur
left join #articles prev
on prev.art_id = cur.art_id
and prev.id <= cur.id
group by cur.id, cur.art_id
having sum(prev.amount) = 0
Then you can query all rows that come after the rows with a zero running sum:
select a.*
from #articles a
left join (
select cur.id, cur.art_id, running = sum(prev.amount)
from #articles cur
left join #articles prev
on prev.art_id = cur.art_id
and prev.ts <= cur.ts
group by cur.id, cur.art_id
having sum(prev.amount) = 0
) later_zero_running on
a.art_id = later_zero_running.art_id
and a.id <= later_zero_running.id
where later_zero_running.id is null
The LEFT JOIN in combination with the WHERE says: there can not be a row after this row, where the running sum is zero.