Aggregate total number of orders using Invoice Line Items - SQL

I am trying to count the number of unique bookings per customer by looking at invoice_id; however, my query isn't producing the correct results.
Schema
CREATE TABLE invoice_line_items (
    invoice_id int,
    customer_id int,
    tstamp datetime
);
Data
INSERT INTO invoice_line_items (invoice_id, customer_id, tstamp)
VALUES (1, 123, '2018-12-21 10:00:00'),
       (1, 123, '2018-12-21 10:00:00'),
       (2, 123, '2018-12-22 10:00:00'),
       (2, 124, '2018-12-22 10:00:00'),
       (3, 124, '2018-12-22 10:00:00'),
       (4, 124, '2018-12-22 10:00:00'),
       (5, 124, '2018-12-22 10:00:00'),
       (5, 124, '2018-12-22 10:00:00');
Query
select customer_id, count(*) as number_of_orders
from invoice_line_items
where tstamp >= '2018-01-01'
group by customer_id, invoice_id
Desired Output
customer_id | number_of_orders
123 | 2
124 | 4

You don't need to include invoice_id in the GROUP BY:
select customer_id, count(distinct invoice_id) as number_of_orders
from invoice_line_items
where tstamp >= '2018-01-01'
group by customer_id;
Use DISTINCT inside COUNT() to count unique bookings.
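For context on why the original query was off: grouping by both customer_id and invoice_id produces one row per invoice (counting its line items), so the counts never roll up to one row per customer. If you prefer to keep the per-invoice grouping, a sketch that should be equivalent to the COUNT(DISTINCT ...) version is to wrap it in an outer aggregation:
select customer_id, count(*) as number_of_orders
from
(
    select customer_id, invoice_id
    from invoice_line_items
    where tstamp >= '2018-01-01'
    group by customer_id, invoice_id
) per_invoice
group by customer_id;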

How to create a SQL query to get the 2nd to max result in a table?

I have a table in Snowflake and I want to see the results for the max date and the 2nd to max date. I wrote the query below, which gives the results for the max date:
SELECT MAX(FILE_DATE) "MAX_DATE"
,UPPER(RETAIL) "SHOP"
FROM PRODUCTS
GROUP BY UPPER(RETAIL)
The results are something like this:
MAX_DATE SHOP
2022-09-27 SHOP1
2022-08-01 SHOP2
I am looking for:
2nd_MAX_DATE MAX_DATE SHOP
2022-08-27 2022-09-27 SHOP1
2022-07-01 2022-08-01 SHOP2
I tried something with a WHERE clause, WHERE FILE_DATE < ( SELECT MAX(FILE_DATE) FROM "PRODUCTS" GROUP BY UPPER(RETAILER) ), but this isn't working.
One way is to use a cross join.
Here's a full sample roughly altered to your schema using SQL Server syntax.
CREATE TABLE Products (
Id INT PRIMARY KEY,
Retail VARCHAR(100),
FileDate DATETIME
);
INSERT INTO Products (Id, Retail, FileDate)
VALUES (1, 'shop 1', '2022-09-01'),
(2, 'shop 1', '2022-09-01'),
(3, 'shop 1', '2022-09-01'),
(4, 'shop 2', '2022-09-01'),
(5, 'shop 2', '2022-09-01'),
(6, 'shop 2', '2022-09-30'),
(7, 'shop 3', '2022-09-01'),
(8, 'shop 3', '2022-09-01'),
(9, 'shop 3', '2022-09-20');
SELECT DISTINCT
MAX(p1.FileDate) "MAX_DATE",
UPPER(p1.Retail) "SHOP"
FROM Products p1
CROSS JOIN Products p2
WHERE
p1.FileDate > p2.FileDate
GROUP BY
p1.Retail,
p2.Retail
Output
MAX_DATE | SHOP
2022-09-30 | SHOP 2
2022-09-20 | SHOP 3
As you want the latest two file_dates for every shop, you can use a window function:
WITH CTE as(
SELECT FILE_DATE
,UPPER(RETAIL) "SHOP"
, ROW_NUMBER() OVER(PARTITION BY UPPER(RETAIL) ORDER BY FILE_DATE DESC) rn
FROM PRODUCTS
)
SELECT "SHOP",MIN(FILE_DATE) second_date, MAX(FILE_DATE) first_date
FROM CTE
WHERE rn < 3
GROUP BY "SHOP"
create or replace TABLE SNOWPARK_QUICKSTART.TELCO.Products (
ID INT,
FileDate TIMESTAMP,
Retail VARCHAR(16777216)
);
INSERT INTO Products (Id, Retail, FileDate)
VALUES (1, 'shop 1', '2022-09-01'),
(2, 'shop 1', '2022-09-01'),
(3, 'shop 1', '2022-09-01'),
(4, 'shop 2', '2022-09-01'),
(5, 'shop 2', '2022-09-01'),
(6, 'shop 2', '2022-09-30'),
(7, 'shop 3', '2022-09-01'),
(8, 'shop 3', '2022-09-01'),
(9, 'shop 3', '2022-09-20');
SELECT RETAIL
,FILEDATE
,IFF(RANK()OVER(PARTITION BY RETAIL ORDER BY FILEDATE DESC)=2,'SECOND MAX DATE','MAX DATE') DAY_DESC
FROM
Products
GROUP BY
RETAIL,FILEDATE
QUALIFY
RANK()OVER(PARTITION BY RETAIL ORDER BY FILEDATE DESC)<3
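If you'd rather have one row per shop, as in the desired output above, a sketch (untested) that pivots the same ranked dates with conditional aggregation:
SELECT RETAIL
,MAX(IFF(RNK = 1, FILEDATE, NULL)) MAX_DATE
,MAX(IFF(RNK = 2, FILEDATE, NULL)) SECOND_MAX_DATE
FROM
(
SELECT RETAIL, FILEDATE
,RANK()OVER(PARTITION BY RETAIL ORDER BY FILEDATE DESC) RNK
FROM Products
GROUP BY RETAIL, FILEDATE
)
GROUP BY RETAIL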
Using the data provided by PeteGo as a CTE:
with Products (Id, Retail, FileDate) as
(
select * from values
(1, 'shop 1', '2022-09-01'::date),
(2, 'shop 1', '2022-09-01'::date),
(3, 'shop 1', '2022-09-01'::date),
(4, 'shop 2', '2022-09-01'::date),
(5, 'shop 2', '2022-09-01'::date),
(6, 'shop 2', '2022-09-30'::date),
(7, 'shop 3', '2022-09-01'::date),
(8, 'shop 3', '2022-09-01'::date),
(9, 'shop 3', '2022-09-20'::date)
)
you can use the window function NTH_VALUE and then combine with DISTINCT to de-dup:
select distinct
Retail
,nth_value(FileDate, 1) over(partition by retail order by filedate desc) as "1st_MAX_DATE"
,nth_value(FileDate, 2) over(partition by retail order by filedate desc) as "2nd_MAX_DATE"
from products
gives:
RETAIL | 1st_MAX_DATE | 2nd_MAX_DATE
shop 1 | 2022-09-01 | 2022-09-01
shop 2 | 2022-09-30 | 2022-09-01
shop 3 | 2022-09-20 | 2022-09-01
But you can see the two values of 2022-09-01 for shop 1 are getting picked.
This can be shown if you also select the ids:
select distinct
Retail
,nth_value(FileDate, 1) over(partition by retail order by filedate, id desc) as "1st_MAX_DATE"
,nth_value(FileDate, 2) over(partition by retail order by filedate, id desc) as "2nd_MAX_DATE"
,nth_value(id, 1) over(partition by retail order by filedate, id desc) as "1st_id"
,nth_value(id, 2) over(partition by retail order by filedate, id desc) as "2nd_id"
from products
RETAIL | 1st_MAX_DATE | 2nd_MAX_DATE | 1st_id | 2nd_id
shop 1 | 2022-09-01 | 2022-09-01 | 3 | 2
shop 2 | 2022-09-01 | 2022-09-01 | 5 | 4
shop 3 | 2022-09-01 | 2022-09-01 | 8 | 7
you can therefore de-dup the shops/dates:
select distinct
Retail
,nth_value(FileDate, 1) over(partition by retail order by filedate desc) as "1st_MAX_DATE"
,nth_value(FileDate, 2) over(partition by retail order by filedate desc) as "2nd_MAX_DATE"
from (
select distinct Retail, FileDate from products
)
This now gives a null, as I originally expected:
RETAIL | 1st_MAX_DATE | 2nd_MAX_DATE
shop 1 | 2022-09-01 | null
shop 2 | 2022-09-30 | 2022-09-01
shop 3 | 2022-09-20 | 2022-09-01

Compare values between two tables with over partition criteria

DB-Fiddle
/* Table Campaigns */
CREATE TABLE campaigns (
id SERIAL PRIMARY KEY,
insert_time DATE,
campaign VARCHAR,
tranches VARCHAR,
quantity DECIMAL);
INSERT INTO campaigns
(insert_time, campaign, tranches, quantity)
VALUES
('2021-01-01', 'C001', 't', '500'),
('2021-01-01', 'C002', 't', '600'),
('2021-01-02', 'C001', 't', '500'),
('2021-01-02', 'C002', 't', '600');
/* Table Tranches */
CREATE TABLE tranches (
id SERIAL PRIMARY KEY,
insert_time DATE,
campaign VARCHAR,
tranches VARCHAR,
quantity DECIMAL);
INSERT INTO tranches
(insert_time, campaign, tranches, quantity)
VALUES
('2021-01-01', 'C001', 't1', '200'),
('2021-01-01', 'C001', 't2', '120'),
('2021-01-01', 'C001', 't3', '180'),
('2021-01-01','C002', 't1', '350'),
('2021-01-01','C002', 't2', '250'),
('2021-01-02', 'C001', 't1', '400'),
('2021-01-02', 'C001', 't2', '120'),
('2021-01-02', 'C001', 't3', '180'),
('2021-01-02','C002', 't1', '350'),
('2021-01-02','C002', 't2', '250');
Expected Result:
insert_time | campaign | tranches | quantity_campaigns | quantity_tranches | check
--------------|------------|------------|---------------------|---------------------|-----------
2021-01-01 | C001 | t | 500 | 500 | ok
2021-01-01 | C002 | t | 600 | 600 | ok
--------------|------------|------------|---------------------|---------------------|------------
2021-01-02 | C001 | t | 500 | 700 | error
2021-01-02 | C002 | t | 600 | 600 | ok
I want to compare the total quantity per campaign in table campaigns with the total quantity per campaign in table tranches.
So far I have been able to develop this query:
SELECT
c.insert_time AS insert_time,
c.campaign AS campaign,
c.tranches AS tranches,
c.quantity AS quantity_campaigns,
t.quantity AS quantity_tranches,
(CASE WHEN
MAX(c.quantity) OVER(PARTITION BY c.insert_time, c.campaign) = SUM(t.quantity) OVER(PARTITION BY t.insert_time, t.campaign)
THEN 'ok' ELSE 'error' END) AS check
FROM campaigns c
LEFT JOIN tranches t ON c.campaign = t.campaign
ORDER BY 1,2,3,4,5;
However, it does not give me the expected result.
What do I need to change to make it work?
I think the result you're looking for should be something like this. The problem is that you're trying to aggregate over two groupings after a join, which will either yield too many rows or incorrect calculations. By aggregating in CTEs, and then joining the CTEs after the aggregation has occurred, you can achieve the results you are looking for. See my example below:
WITH campaign_agg AS(
SELECT c.insert_time, c.campaign, c.tranches, MAX(c.quantity) c_quantity
FROM campaigns c
GROUP BY c.insert_time, c.campaign, c.tranches
), tranch_agg AS(
SELECT t.insert_time, t.campaign, SUM(t.quantity) as t_sum
FROM tranches t
GROUP BY t.insert_time, t.campaign
)
SELECT c.insert_time, c.campaign, c.tranches, c.c_quantity, t.t_sum,
CASE WHEN c.c_quantity = t.t_sum THEN 'ok' ELSE 'error' END as check
FROM campaign_agg c
JOIN
tranch_agg t ON
t.insert_time = c.insert_time
AND t.campaign = c.campaign
ORDER BY c.insert_time, c.campaign
I have a db-fiddle for this as well: https://www.db-fiddle.com/f/33x4upVEcgTMNehiHCKzfN/1
DB-Fiddle
Alternatively, pre-aggregate the tranches in a subquery and join it back to campaigns:
SELECT
c.insert_time AS insert_time,
c.campaign AS campaign,
c.tranches AS tranches,
SUM(c.quantity) AS quantity_campaigns,
SUM(t1.quantity) AS quantity_tranches,
(CASE WHEN SUM(c.quantity) <> SUM(t1.quantity) THEN 'error' ELSE 'ok' END) AS check
FROM campaigns c
LEFT JOIN
(SELECT
t.insert_time AS insert_time,
t.campaign AS campaign,
SUM(t.quantity) AS quantity
FROM tranches t
GROUP BY 1,2
ORDER BY 1,2) t1 on t1.insert_time = c.insert_time AND t1.campaign = c.campaign
GROUP BY 1,2,3
ORDER BY 1,2,3;
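Against the sample data, both versions should flag only the 2021-01-02 / C001 row, whose tranches sum to 400 + 120 + 180 = 700 against a campaign quantity of 500 (my arithmetic, not a fiddle run):
insert_time | campaign | tranches | quantity_campaigns | quantity_tranches | check
2021-01-01 | C001 | t | 500 | 500 | ok
2021-01-01 | C002 | t | 600 | 600 | ok
2021-01-02 | C001 | t | 500 | 700 | error
2021-01-02 | C002 | t | 600 | 600 | ok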

SQL: Identify first and last date within each consecutive group of days

Objective:
The objective is to find the first and last observation date for which the room has a constant price, using PostgreSQL queries.
We are completely lost so any guidance would be highly appreciated.
Create example:
CREATE TABLE table_prices
(
pk int GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY,
room_id character varying(50) COLLATE pg_catalog."default",
check_in date,
price integer,
observation_date date
)
Insert data:
insert into table_prices (room_id, check_in, price, observation_date) values
('1', '2019-05-01', 100, '2019-01-01'),
('1', '2019-05-01', 100, '2019-01-02'),
('1', '2019-05-01', 100, '2019-01-03'),
('1', '2019-05-01', 150, '2019-01-04'),
('1', '2019-05-01', 150, '2019-01-05'),
('1', '2019-05-01', 150, '2019-01-06'),
('1', '2019-05-01', 150, '2019-01-07'),
('1', '2019-05-01', 100, '2019-01-08'),
('1', '2019-05-01', 100, '2019-01-09'),
('2', '2019-05-01', 200, '2019-01-01'),
('2', '2019-05-01', 200, '2019-01-02'),
('2', '2019-05-01', 200, '2019-01-03'),
('2', '2019-05-01', 200, '2019-01-04'),
('2', '2019-05-01', 200, '2019-01-05'),
('2', '2019-05-01', 200, '2019-01-06'),
('2', '2019-05-01', 200, '2019-01-07'),
('2', '2019-05-01', 200, '2019-01-08'),
('2', '2019-05-01', 200, '2019-01-09')
Expected outcome:
room_id, check_in, first_observation, last_observation, price
1, 2019-05-01, 2019-01-01, 2019-01-03, 100
1, 2019-05-01, 2019-01-04, 2019-01-07, 150
1, 2019-05-01, 2019-01-08, 2019-01-09, 100
2, 2019-05-01, 2019-01-01, 2019-01-09, 200
This is a gaps-and-islands problem; you can try using row_number():
DEMO
select room_id, check_in, min(observation_date) first_observation,
       max(observation_date) last_observation, price
from
(
    select *,
           row_number() over(partition by room_id order by observation_date) -
           row_number() over(partition by room_id, price order by observation_date) as island
    from table_prices
) a
group by room_id, check_in, island, price
OUTPUT:
room_id | check_in | first_observation | last_observation | price
1 | 2019-05-01 | 2019-01-01 | 2019-01-03 | 100
1 | 2019-05-01 | 2019-01-04 | 2019-01-07 | 150
1 | 2019-05-01 | 2019-01-08 | 2019-01-09 | 100
2 | 2019-05-01 | 2019-01-01 | 2019-01-09 | 200
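To see why the difference of the two row_number() calls pins down each island, it can help to run the inner query on its own; a sketch for room 1:
select observation_date, price,
       row_number() over(partition by room_id order by observation_date) as rn_room,
       row_number() over(partition by room_id, price order by observation_date) as rn_price,
       row_number() over(partition by room_id order by observation_date)
     - row_number() over(partition by room_id, price order by observation_date) as island
from table_prices
where room_id = '1'
order by observation_date;
For room 1 this should give island = 0 for the first run of 100s, 3 for the 150s, and 4 for the final run of 100s: constant within each price run and distinct across runs.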

Postgres query different COUNT and ROW_NUMBER()

I have a table messages with the following columns
group_id BIGINT,
user_id BIGINT,
message_date timestamp
For a given user_id I would like to count the total rows with that user_id, the distinct groups with that user_id, and, considering a leaderboard made from each user's message count, that user's position too.
I tried this query
SELECT main.total_m, main.group_number, main.pos
FROM (
SELECT user_id, COUNT(group_id) AS group_number, COUNT(user_id) AS total_m,
ROW_NUMBER() OVER (
PARTITION BY COUNT(user_id)
ORDER BY COUNT(user_id) DESC
) AS pos
FROM messages
WHERE message_date > date_trunc('week', now())
GROUP BY user_id, group_id
) AS main
WHERE user_id = %s
But I don't get the result I would like to have. Where am I wrong?
The power of "sample data" and "expected result" is that they enable others to answer efficiently. The following is a complete guess, but perhaps it will prompt you to prepare a "Minimal, Complete, and Verifiable Example" (MCVE).
The details below can be accessed at SQL Fiddle.
PostgreSQL 9.6 Schema Setup:
CREATE TABLE Messages
(USER_ID int, GROUP_ID int, MESSAGE_DATE timestamp)
;
INSERT INTO Messages
(USER_ID, GROUP_ID, MESSAGE_DATE)
VALUES
(1, 7, '2017-09-01 10:00:00'),
(1, 6, '2017-09-02 10:00:00'),
(1, 5, '2017-09-03 10:00:00'),
(1, 4, '2017-09-04 10:00:00'),
(1, 7, '2017-09-05 10:00:00'),
(2, 6, '2017-09-01 10:00:00'),
(2, 5, '2017-09-02 10:00:00'),
(2, 7, '2017-09-03 10:00:00'),
(2, 6, '2017-09-04 10:00:00'),
(2, 4, '2017-09-05 10:00:00'),
(2, 8, '2017-09-11 10:00:00')
;
Query 1:
select
user_id
, num_grps
, num_msgs
, dense_rank() over(order by num_grps DESC, num_msgs DESC, max_date DESC, user_id) rnk
from (
select
user_id
, count(distinct group_id) num_grps
, count(*) num_msgs
, max(message_date) max_date
from messages
group by
user_id
) d
Results:
| user_id | num_grps | num_msgs | rnk |
|---------|----------|----------|-----|
| 2 | 5 | 6 | 1 |
| 1 | 4 | 5 | 2 |
Looking at just the inner query, I see this in the select:
SELECT user_id, COUNT(group_id), ...
But this in the GROUP BY:
GROUP BY user_id, group_id
Put those together, and COUNT(group_id) counts the rows within each (user_id, group_id) pair rather than the number of groups per user, because each group_id gets its own group. The same applies to the total_m column.
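Pulling that together, a sketch (untested) of a query that returns the totals and leaderboard position for a single user, keeping the %s placeholder from the question:
SELECT main.total_m, main.group_number, main.pos
FROM (
SELECT user_id, COUNT(DISTINCT group_id) AS group_number, COUNT(*) AS total_m,
DENSE_RANK() OVER (ORDER BY COUNT(*) DESC) AS pos
FROM messages
WHERE message_date > date_trunc('week', now())
GROUP BY user_id
) AS main
WHERE main.user_id = %s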

PostgreSQL: Merge and split date ranges from two tables by a set of keys

I'm trying to combine multiple date ranges from two tables that share the same structure and hold the same or different data (PostgreSQL 9.*).
Tables structure:
CREATE TABLE "first_activities" (
"id" int4 NOT NULL DEFAULT nextval('first_activities_id_seq'::regclass),
"start_time" timestamptz,
"end_time" timestamptz,
"activity_type" int2,
"user_id" int4
)
WITH (OIDS=FALSE);
ALTER TABLE "first_activities" ADD PRIMARY KEY ("id") NOT DEFERRABLE INITIALLY IMMEDIATE;
CREATE TABLE "second_activities" (
"id" int4 NOT NULL DEFAULT nextval('second_activities_id_seq'::regclass),
"start_time" timestamptz,
"end_time" timestamptz,
"activity_type" int2,
"user_id" int4
)
WITH (OIDS=FALSE);
ALTER TABLE "second_activities" ADD PRIMARY KEY ("id") NOT DEFERRABLE INITIALLY IMMEDIATE;
Data in First table:
INSERT INTO "first_activities" VALUES
(NULL, '2014-10-31 01:00:00', '2014-10-31 02:00:00', '3', '1'),
(NULL, '2014-10-31 02:00:00', '2014-10-31 03:00:00', '4', '1'),
(NULL, '2014-10-31 03:00:00', '2014-10-31 04:00:00', '2', '1'),
(NULL, '2014-10-31 04:30:00', '2014-10-31 05:00:00', '3', '1'),
(NULL, '2014-10-31 05:30:00', '2014-11-01 06:00:00', '4', '1'),
(NULL, '2014-11-01 06:30:00', '2014-11-01 07:00:00', '2', '1'),
(NULL, '2014-11-01 07:30:00', '2014-11-01 08:00:00', '1', '1'),
(NULL, '2014-11-01 08:00:00', '2014-11-01 09:00:00', '3', '1'),
(NULL, '2014-11-01 09:00:00', '2014-11-02 10:00:00', '4', '1'),
(NULL, '2014-08-27 10:00:00', '2014-08-27 11:00:00', '2', '1'),
(NULL, '2014-08-27 11:00:00', '2014-08-27 12:00:00', '1', '1'),
Data in Second table:
INSERT INTO "second_activities" VALUES
(NULL, '2014-10-31 01:00:00', '2014-10-31 02:00:00', '3', '1'),
(NULL, '2014-10-31 02:00:00', '2014-10-31 03:00:00', '4', '1'),
-- Differece from first table
(NULL, '2014-10-31 03:30:00', '2014-10-31 04:00:00', '1', '1'),
(NULL, '2014-10-31 04:25:00', '2014-10-31 04:35:00', '3', '1'),
(NULL, '2014-10-31 04:45:00', '2014-10-31 05:35:00', '3', '1'),
-- End of Difference from first table
(NULL, '2014-08-27 10:00:00', '2014-08-27 11:00:00', '2', '1'),
(NULL, '2014-08-27 11:00:00', '2014-08-27 12:00:00', '1', '1');
How can I filter the result set of this query:
SELECT * FROM first_activities UNION ALL SELECT * FROM second_activities
ORDER BY start_time ASC;
to get the final result below?
Final Result:
-- merge rows with the same user_id and activity_type, and
-- split overlapping ranges that differ in user_id or activity_type
-- start_time, end_time, type, user_id
'2014-10-31 01:00:00', '2014-10-31 02:00:00', '3', '1'
'2014-10-31 02:00:00', '2014-10-31 03:00:00', '4', '1'
-- data don't merge; split at the range intersection
'2014-10-31 03:00:00', '2014-10-31 03:30:00', '2', '1' -- from first table
'2014-10-31 03:30:00', '2014-10-31 04:00:00', '1', '1' -- from second table
-- data merged by same user_id and activity_type
'2014-10-31 04:25:00', '2014-10-31 05:35:00', '3', '1'
'2014-10-31 05:30:00', '2014-11-01 06:00:00', '4', '1'
'2014-11-01 06:30:00', '2014-11-01 07:00:00', '2', '1'
'2014-11-01 07:30:00', '2014-11-01 08:00:00', '1', '1'
'2014-11-01 08:00:00', '2014-11-01 09:00:00', '3', '1'
'2014-11-01 09:00:00', '2014-11-02 10:00:00', '4', '1'
'2014-08-27 10:00:00', '2014-08-27 11:00:00', '2', '1'
'2014-08-27 11:00:00', '2014-08-27 12:00:00', '1', '1'
The issue can be reduced to the question of how to combine (compact) a group of adjacent (overlapping) ranges into one. I had to deal with this some time ago and found it a bit complicated in plain SQL. There is a simple solution using a loop in plpgsql code, but I also found a general solution that uses a custom aggregate.
The function compact_ranges(anyrange, anyrange) returns the sum of ranges if they are adjacent (overlapping) or the second range otherwise:
create or replace function compact_ranges(anyrange, anyrange)
returns anyrange language sql as $$
    select case
        when $1 && $2 or $1 -|- $2 then $1 + $2
        else $2
    end
$$;
create aggregate compact_ranges_agg (anyrange) (
sfunc = compact_ranges,
stype = anyrange
);
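For reference, the operators the function relies on are range overlap (&&), range adjacency (-|-) and range union (+); a quick sanity check:
select int4range(3, 7) && int4range(5, 10) as overlaps,    -- true
       int4range(3, 7) -|- int4range(7, 9) as adjacent,    -- true
       int4range(3, 7) + int4range(5, 10) as union_range;  -- [3,10)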
The aggregate has a narrow scope of usage; it should be called as a progressive window function, as in the example:
with test(rng) as (
values
('[ 1, 2)'::int4range),
('[ 3, 7)'), -- group 1
('[ 5, 10)'), -- group 1
('[ 6, 8)'), -- group 1
('[11, 17)'), -- group 2
('[12, 16)'), -- group 2
('[15, 16)'), -- group 2
('[18, 19)')
)
select distinct on (lower(new_rng)) new_rng
from (
select *, compact_ranges_agg(rng) over (order by rng) new_rng
from test
) s
order by lower(new_rng), new_rng desc;
new_rng
---------
[1,2)
[3,10)
[11,17)
[18,19)
(4 rows)
Each window row carries the ranges compacted so far, so all rows of one island end up sharing the same lower bound; DISTINCT ON (lower(new_rng)) with ORDER BY new_rng DESC then keeps only the widest (fully compacted) range per island.
In the same way you can use it for your tables:
with merged as (
select tstzrange(start_time, end_time) rng, activity_type, user_id
from first_activities
union
select tstzrange(start_time, end_time) rng, activity_type, user_id
from second_activities
),
compacted as (
select distinct on (user_id, activity_type, lower(new_rng))
lower(new_rng) start_time,
upper(new_rng) end_time,
activity_type,
user_id
from (
select
user_id, activity_type,
compact_ranges_agg(rng) over (partition by user_id, activity_type order by rng) new_rng
from merged
) s
order by user_id, activity_type, lower(new_rng), new_rng desc
)
select
start_time,
case when end_time > lead(start_time) over w then lead(start_time) over w else end_time end,
activity_type,
user_id
from compacted
window w as (order by start_time)
order by start_time;
The result:
start_time | end_time | activity_type | user_id
------------------------+------------------------+---------------+---------
2014-08-27 10:00:00+02 | 2014-08-27 11:00:00+02 | 2 | 1
2014-08-27 11:00:00+02 | 2014-08-27 12:00:00+02 | 1 | 1
2014-10-31 01:00:00+01 | 2014-10-31 02:00:00+01 | 3 | 1
2014-10-31 02:00:00+01 | 2014-10-31 03:00:00+01 | 4 | 1
2014-10-31 03:00:00+01 | 2014-10-31 03:30:00+01 | 2 | 1
2014-10-31 03:30:00+01 | 2014-10-31 04:00:00+01 | 1 | 1
2014-10-31 04:25:00+01 | 2014-10-31 05:30:00+01 | 3 | 1
2014-10-31 05:30:00+01 | 2014-11-01 06:00:00+01 | 4 | 1
2014-11-01 06:30:00+01 | 2014-11-01 07:00:00+01 | 2 | 1
2014-11-01 07:30:00+01 | 2014-11-01 08:00:00+01 | 1 | 1
2014-11-01 08:00:00+01 | 2014-11-01 09:00:00+01 | 3 | 1
2014-11-01 09:00:00+01 | 2014-11-02 10:00:00+01 | 4 | 1
(12 rows)