How to group and collect data in PostgreSQL by date periods? - sql

Here is a demo data:
-- Demo schema: invoices with a lifecycle state and a posting date.
create table Invoices (
id INT,
name VARCHAR,
customer_id INT,
-- money amount: NUMERIC keeps sums/averages exact; FLOAT would accumulate rounding error
total_amount NUMERIC(12, 2),
state VARCHAR,
invoice_date DATE
);
-- 16 sample invoices, Apr-Nov 2020, customers 1 and 2, states 'posted'/'draft'.
INSERT INTO Invoices
(id, name, customer_id, total_amount, state, invoice_date)
VALUES
(1, 'INV/2020/0001', 2, 100, 'posted', '2020-04-05'),
(2, 'INV/2020/0002', 1, 100, 'draft', '2020-04-05'),
(3, 'INV/2020/0003', 2, 100, 'draft', '2020-05-24'),
(4, 'INV/2020/0004', 1, 100, 'posted', '2020-05-25'),
(5, 'INV/2020/0005', 2, 100, 'posted', '2020-06-05'),
(6, 'INV/2020/0006', 1, 100, 'posted', '2020-07-05'),
(7, 'INV/2020/0007', 1, 100, 'draft', '2020-08-24'),
(8, 'INV/2020/0008', 1, 100, 'posted', '2020-08-25'),
(9, 'INV/2020/0009', 1, 100, 'posted', '2020-09-05'),
(10, 'INV/2020/0010', 1, 100, 'draft', '2020-09-05'),
(11, 'INV/2020/0011', 2, 100, 'draft', '2020-10-24'),
(12, 'INV/2020/0012', 1, 100, 'posted', '2020-10-25'),
(13, 'INV/2020/0013', 2, 100, 'posted', '2020-11-05'),
(14, 'INV/2020/0014', 1, 100, 'posted', '2020-11-05'),
(15, 'INV/2020/0015', 2, 100, 'draft', '2020-11-24'),
(16, 'INV/2020/0016', 1, 100, 'posted', '2020-11-25')
I have a query that computes a sum of all posted invoices for customer with id = 1
-- Total amount of posted invoices for customer 1.
SELECT SUM(total_amount), customer_id
FROM Invoices
WHERE customer_id = 1
  AND state = 'posted'
GROUP BY customer_id
I need to group the data (sum(total_amount)) by 3 time periods of 2 or 3 months each (the 2 or 3 needs to be changeable by editing a number in the query; I want to pass it as a parameter to the query from Python code).
Also I need to get the average sums of the period.
Can you help me please?
Expected output for period = 2 months is:
+--------------+--------------+--------------+--------+
| Period_1_sum | Period_2_sum | Period_3_sum | Avg |
+--------------+--------------+--------------+--------+
| 300 | 300 | 100 | 233.33 |
+--------------+--------------+--------------+--------+

You can use conditional aggregation for that:
-- Conditional aggregation: each period sum only sees rows in its half-open
-- date range [start, end). The question also asks for the average of the
-- three period sums (the "Avg" column in the expected output), so it is
-- computed in an outer query from the period columns; coalesce() treats a
-- period with no posted invoices as 0, and the numeric cast lets round()
-- take a precision argument even when total_amount is a float.
SELECT customer_id,
       total_amount,
       period_1_sum,
       period_2_sum,
       period_3_sum,
       round(((coalesce(period_1_sum, 0)
             + coalesce(period_2_sum, 0)
             + coalesce(period_3_sum, 0)) / 3.0)::numeric, 2) AS avg_period_sum
FROM (
    SELECT customer_id,
           sum(total_amount) AS total_amount,
           sum(total_amount) FILTER (WHERE invoice_date >= date '2020-04-01' AND invoice_date < date '2020-07-01') AS period_1_sum,
           sum(total_amount) FILTER (WHERE invoice_date >= date '2020-07-01' AND invoice_date < date '2020-10-01') AS period_2_sum,
           sum(total_amount) FILTER (WHERE invoice_date >= date '2020-10-01' AND invoice_date < date '2021-01-01') AS period_3_sum
    FROM Invoices
    WHERE state = 'posted'
    GROUP BY customer_id
) AS periods
By changing the filter condition you can control which rows are aggregated for each period.
Online example

Related

SQL: how to get a subset from a subset that is subject to an "only" condition

I have a table named orders in a Postgres database (see Fiddle at http://sqlfiddle.com/#!17/ac4f9).
CREATE TABLE orders
(
user_id INTEGER,
order_id INTEGER,
order_date DATE,
price FLOAT,
product VARCHAR(255)
);
INSERT INTO orders(user_id, order_id, order_date, price, product)
VALUES
(1, 2, '2021-03-05', 15, 'books'),
(1, 13, '2022-03-07', 3, 'music'),
(1, 14, '2022-06-15', 900, 'travel'),
(1, 11, '2021-11-17', 25, 'books'),
(1, 16, '2022-08-03', 32, 'books'),
(2, 4, '2021-04-12', 4, 'music'),
(2, 7, '2021-06-29', 9, 'music'),
(2, 20, '2022-11-03', 8, 'music'),
(2, 22, '2022-11-07', 575, 'travel'),
(2, 24, '2022-11-20', 95, 'food'),
(3, 3, '2021-03-17', 25, 'books'),
(3, 5, '2021-06-01', 650, 'travel'),
(3, 17, '2022-08-17', 1200, 'travel'),
(3, 19, '2022-10-02', 6, 'music'),
(3, 23, '2022-11-08', 7, 'food'),
(4, 9, '2021-08-20', 3200, 'travel'),
(4, 10, '2021-10-29', 2750, 'travel'),
(4, 15, '2022-07-15', 1820, 'travel'),
(4, 21, '2022-11-05', 8000, 'travel'),
(4, 25, '2022-11-29', 2300, 'travel'),
(5, 1, '2021-01-04', 3, 'music'),
(5, 6, '2021-06-09', 820, 'travel'),
(5, 8, '2021-07-30', 19, 'books'),
(5, 12, '2021-12-10', 22, 'music'),
(5, 18, '2022-09-19', 20, 'books'),
(6, 26, '2023-01-09', 700, 'travel'),
(6, 27, '2023-01-23', 1900, 'travel');
From the list of users who have placed an order for either the travel product OR the books product, I would like to get the subset of these users who have placed an order for ONLY the travel product.
The desired result set would be:
user_id count_orders
-----------------------
4 5
6 2
How would I do this?
Thank you.
-- A user qualifies when no order of theirs is for anything but travel.
SELECT o.user_id, COUNT(*) AS count_orders
FROM orders o
WHERE NOT EXISTS (
    SELECT 1
    FROM orders x
    WHERE x.user_id = o.user_id
      AND x.product <> 'travel'
)
GROUP BY o.user_id
http://sqlfiddle.com/#!17/ac4f9/17
Count all orders and travel orders first. Filter records with same count values.
With inline view: http://sqlfiddle.com/#!17/ac4f9/18/0
-- Count travel orders and all orders per user; keep users where the two match.
SELECT user_id, n_orders AS count_orders
FROM (
    SELECT user_id,
           COUNT(*) FILTER (WHERE product = 'travel') AS n_travels,
           COUNT(*) AS n_orders
    FROM orders
    GROUP BY user_id
) v
WHERE v.n_travels = v.n_orders
Using HAVING clause 1: http://sqlfiddle.com/#!17/ac4f9/22/0
SELECT user_id
     , COUNT(*) AS count_orders
FROM orders
GROUP BY user_id
-- every order must be a travel order: the travel count equals the total count
-- (the original compared the NON-travel count to the total, which selects
-- users who never bought travel - the opposite of what is wanted)
HAVING COUNT(CASE WHEN product = 'travel' THEN 1 END) = COUNT(*)
Using HAVING clause 2: http://sqlfiddle.com/#!17/ac4f9/21/0
-- A qualifying user has zero orders for anything other than travel.
SELECT user_id,
       COUNT(*) AS count_orders
FROM orders
GROUP BY user_id
HAVING COUNT(*) FILTER (WHERE product <> 'travel') = 0
Using EXCEPT operation
-- Users who bought travel, minus users who bought anything else.
WITH travel_only AS (
    SELECT user_id FROM orders WHERE product = 'travel'
    EXCEPT
    SELECT user_id FROM orders WHERE product <> 'travel'
)
SELECT o.user_id, COUNT(*)
FROM orders o
WHERE o.user_id IN (SELECT user_id FROM travel_only)
GROUP BY o.user_id
ORDER BY o.user_id

match recognize collect row data into single column

I'm following the tutorial for match_recognize found here:
-- Snowflake demo fixture: ten consecutive daily prices for two tickers.
create or replace temporary table stock_price_history (company text, price_date date, price int);
insert into stock_price_history values
('ABCD', '2020-10-01', 50),
('ABCD', '2020-10-02', 50),
('ABCD', '2020-10-03', 51),
('ABCD', '2020-10-04', 51),
('ABCD', '2020-10-05', 51),
('ABCD', '2020-10-06', 52),
('ABCD', '2020-10-07', 71),
('ABCD', '2020-10-08', 80),
('ABCD', '2020-10-09', 90),
('ABCD', '2020-10-10', 63),
('XYZ' , '2020-10-01', 24),
('XYZ' , '2020-10-02', 24),
('XYZ' , '2020-10-03', 37),
('XYZ' , '2020-10-04', 63),
('XYZ' , '2020-10-05', 65),
('XYZ' , '2020-10-06', 66),
('XYZ' , '2020-10-07', 50),
('XYZ' , '2020-10-08', 54),
('XYZ' , '2020-10-09', 30),
('XYZ' , '2020-10-10', 32);
-- Per company, match stretches of: a seed row, one price increase, zero or
-- more flat days, then one more increase (see PATTERN below).
select * from stock_price_history
match_recognize(
partition by company
order by price_date
measures
match_number() as match_number,
-- NOTE(review): with ONE ROW PER MATCH a bare column here produces a single
-- scalar per match, not the list of all matched prices the asker wants.
price as all_price,
first(price_date) as start_date,
last(price_date) as end_date,
count(*) as rows_in_sequence,
count(row_with_price_stationary.*) as num_stationary,
count(row_with_price_increase.*) as num_increases
one row per match
after match skip to last row_with_price_increase
pattern(row_before_increase row_with_price_increase{1} row_with_price_stationary* row_with_price_increase{1})
define
-- lag(price) compares each row with the previous row in price_date order
row_with_price_increase as price > lag(price),
row_with_price_stationary as price = lag(price)
)
order by company, match_number;
The code above is my version of the tutorial code. Everything works fine except the price as all_price part in the measures clause. What I want to do is collect all prices in the pattern and return it as an array into a single column. I know I can do all rows per match to get all rows but that's not what I want.
How would I go about doing that?
You have to specify all rows per match or lose that information out of the match_recognize function. You can use array_agg within group to get the prices in a single array. Since this aggregates row counts down you may want to do the same for the dates of each of these prices - something like this:
-- Emit every matched row (ALL ROWS PER MATCH), then collapse them back to one
-- row per company, collecting the prices - and their dates - into ordered arrays.
select COMPANY
,array_agg(PRICE) within group (order by PRICE_DATE) as ALL_PRICE
,array_agg(PRICE_DATE) within group (order by PRICE_DATE) as ALL_PRICE_DATE
from stock_price_history
match_recognize(
partition by company
order by price_date
measures
match_number() as match_number,
price as all_price,
first(price_date) as start_date,
last(price_date) as end_date,
count(*) as rows_in_sequence,
count(row_with_price_stationary.*) as num_stationary,
count(row_with_price_increase.*) as num_increases
-- changed from ONE ROW PER MATCH so the per-row prices survive to the outer query
all rows per match
after match skip to last row_with_price_increase
pattern(row_before_increase row_with_price_increase{1} row_with_price_stationary* row_with_price_increase{1})
define
row_with_price_increase as price > lag(price),
row_with_price_stationary as price = lag(price)
)
group by company
order by company
;
COMPANY
ALL_PRICE
ALL_PRICE_DATE
ABCD
[ 50, 51, 51, 51, 52, 52, 71, 80 ]
[ "2020-10-02", "2020-10-03", "2020-10-04", "2020-10-05", "2020-10-06", "2020-10-06", "2020-10-07", "2020-10-08" ]
XYZ
[ 24, 37, 63, 63, 65, 66 ]
[ "2020-10-02", "2020-10-03", "2020-10-04", "2020-10-04", "2020-10-05", "2020-10-06" ]
If you want to keep all rows, you can use the window function version of array_agg:
-- Variant that keeps all matched rows: attach the per-company price array to
-- every row via the window form of array_agg. EXCLUDE (Snowflake SELECT-list
-- syntax) drops the scalar all_price measure so its name can be reused.
select * exclude ALL_PRICE
,array_agg(PRICE) within group (order by PRICE_DATE)
over (partition by COMPANY) as ALL_PRICE
from stock_price_history
match_recognize(
partition by company
order by price_date
measures
match_number() as match_number,
price as all_price,
first(price_date) as start_date,
last(price_date) as end_date,
count(*) as rows_in_sequence,
count(row_with_price_stationary.*) as num_stationary,
count(row_with_price_increase.*) as num_increases
all rows per match
after match skip to last row_with_price_increase
pattern(row_before_increase row_with_price_increase{1} row_with_price_stationary* row_with_price_increase{1})
define
row_with_price_increase as price > lag(price),
row_with_price_stationary as price = lag(price)
)
order by company
;

SQL: how to get sum of grouped items for a given cohort

I have the following orders table:
(1, 2, '2021-03-05', 15, 'books'),
(1, 13, '2022-03-07', 3, 'music'),
(1, 14, '2022-06-15', 900, 'travel'),
(1, 11, '2021-11-17', 25, 'books'),
(1, 16, '2022-08-03', 32, 'books'),
(2, 4, '2021-04-12', 4, 'music'),
(2, 7, '2021-06-29', 9, 'music'),
(2, 20, '2022-11-03', 8, 'music'),
(2, 22, '2022-11-07', 575, 'food'),
(2, 24, '2022-11-20', 95, 'food'),
(3, 3, '2021-03-17', 25, 'books'),
(3, 5, '2021-06-01', 650, 'travel'),
(3, 17, '2022-08-17', 1200, 'travel'),
(3, 19, '2022-10-02', 6, 'music'),
(3, 23, '2022-11-08', 7, 'food'),
(4, 9, '2021-08-20', 3200, 'travel'),
(4, 10, '2021-10-29', 2750, 'travel'),
(4, 15, '2022-07-15', 1820, 'travel'),
(4, 21, '2022-11-05', 8000, 'travel'),
(4, 25, '2022-11-29', 2300, 'travel'),
(5, 1, '2021-01-04', 3, 'music'),
(5, 6, '2021-06-09', 820, 'travel'),
(5, 8, '2021-07-30', 19, 'books'),
(5, 12, '2021-12-10', 22, 'music'),
(5, 18, '2022-09-19', 20, 'books'),
(6, 26, '2023-01-09', 700, 'travel'),
(6, 27, '2023-01-23', 1900, 'travel')
Here's a Fiddle: http://sqlfiddle.com/#!17/71698/3
I would like to get the sum of revenue by product among those customers who have ever purchased a travel product.
In this case, customers 1, 3, 4, 5, and 6 have purchased the travel product. Therefore, the desired result set would look like this:
customer_id
revenue_books
revenue_music
revenue_food
1
72
3
0
3
25
6
7
4
0
0
0
5
39
25
0
6
0
0
0
How would I do this? Thank you!
In my answer I show how to think about the problem to get the result -- break it down into parts and then combine them. Some answers give a less verbose query, but I don't think they will be faster. This should be easier to understand for someone new to SQL.
First the people who have purchased a travel product
-- Step 1: the distinct users who ever bought the travel product.
SELECT DISTINCT user_id
FROM orders
WHERE product = 'travel'
You care about books, music and food, you can get the totals for those like this:
-- Step 2: revenue per (user, product) for the categories of interest.
-- Fixed from the original: WHERE must come before GROUP BY, and the stray
-- trailing comma is removed.
SELECT user_id, product, SUM(revenue) AS TOT
FROM orders
WHERE product IN ('books', 'music', 'food')
GROUP BY user_id, product
Now join them together
-- Combine the two steps. Fixes to the original: WHERE moved before GROUP BY
-- (and trailing comma removed); u.userid corrected to u.user_id in the join
-- conditions; COALESCE added so users with no purchases in a category show 0,
-- matching the expected output.
WITH sums AS (
    SELECT user_id, product, SUM(revenue) AS tot
    FROM orders
    WHERE product IN ('books', 'music', 'food')
    GROUP BY user_id, product
)
SELECT u.user_id,
       COALESCE(books.tot, 0) AS book_total,
       COALESCE(music.tot, 0) AS music_total,
       COALESCE(food.tot, 0) AS food_total
FROM (
    SELECT DISTINCT user_id
    FROM orders
    WHERE product = 'travel'
) AS u
LEFT JOIN sums AS books ON u.user_id = books.user_id AND books.product = 'books'
LEFT JOIN sums AS music ON u.user_id = music.user_id AND music.product = 'music'
LEFT JOIN sums AS food  ON u.user_id = food.user_id  AND food.product = 'food'
-- Pivot revenue into one column per product for the travel-buyer cohort;
-- the ELSE 0 arm means cohort members with no purchases in a category get 0.
SELECT user_id,
       SUM(CASE product WHEN 'books' THEN revenue ELSE 0 END) AS revenue_books,
       SUM(CASE product WHEN 'music' THEN revenue ELSE 0 END) AS revenue_music,
       SUM(CASE product WHEN 'food'  THEN revenue ELSE 0 END) AS revenue_food
FROM orders
WHERE user_id IN (SELECT user_id FROM orders WHERE product = 'travel')
GROUP BY user_id
http://sqlfiddle.com/#!17/71698/5
EDIT
As suggested, this is another option. Without coalesce(), a filtered sum over no rows would show null instead of zero, so coalesce() is applied here where zero is needed.
-- Conditional aggregation with FILTER: each sum only sees one product's rows;
-- an empty filtered sum yields NULL, which coalesce() maps to 0.
SELECT
user_id
,coalesce(sum(revenue) filter (where product = 'books'),0) as revenue_books
,coalesce(sum(revenue) filter (where product = 'music'),0) as revenue_music
,coalesce(sum(revenue) filter (where product = 'food'),0) as revenue_food
FROM orders
where user_id in (select user_id from orders where product='travel')
group by user_id
http://sqlfiddle.com/#!17/71698/7

Get userwise balance and first transaction date of users in SQL

I have created a Transaction table with columns card_id, amount, created_at. There may be more than 1 row of one user so I want to return the value card_id, sum(amount), first created_at date of all users.
-- Sample card transactions; the VALUES row constructor replaces the original
-- SELECT ... UNION ALL chain and inserts the identical rows.
CREATE TABLE Transactions (card_id int, amount money, created_at date);

INSERT INTO Transactions (card_id, amount, created_at)
VALUES (1, 500, '2016-01-01'),
       (1, 100, '2016-01-01'),
       (1, 100, '2016-01-01'),
       (1, 200, '2016-01-02'),
       (1, 300, '2016-01-03'),
       (2, 100, '2016-01-04'),
       (2, 200, '2016-01-05'),
       (3, 700, '2016-01-06'),
       (1, 100, '2016-01-07'),
       (2, 100, '2016-01-07'),
       (3, 100, '2016-01-07');
I have created function for that but one of my client says I need query not function. Can anyone here suggest what query to use?
-- NOTE(review): this is the asker's multi-statement function that the client
-- wants replaced with a plain query (see the answers below). Problems visible
-- in this listing:
--   * RETURNS #t TABLE - T-SQL table-valued functions return a @variable
--     (RETURNS @t TABLE ...), not a #temp-table name.
--   * card_id is declared text here, but the Transactions table declares it int.
--   * Both UPDATE statements filter WHERE amount < 0, yet every sample amount
--     above is positive, so nothing would be updated for this data.
--   * SUM(MIN(AMOUNT)) nests aggregate functions, which is not valid.
CREATE FUNCTION [dbo].[card_id_data]()
RETURNS #t TABLE
(
card_id text,
amount money,
dateOfFirstTransaction date
)
AS
BEGIN
INSERT INTO #t(card_id)
SELECT DISTINCT(card_id) FROM Transactions;
UPDATE #t
SET dateOfFirstTransaction = b.createdat
FROM
(SELECT DISTINCT(card_id) cardid,
MIN(created_at) createdat
FROM Transactions
WHERE amount < 0
GROUP BY card_id) b
WHERE card_id = b.cardid;
UPDATE #t
SET amount = T.AMOUNT
FROM
(SELECT
card_id AS cardid, SUM(MIN(AMOUNT)) AMOUNT, created_at
FROM Transactions
WHERE amount < 0
GROUP BY card_id, created_at) T
WHERE card_id = cardid
AND dateOfFirstTransaction = created_at;
RETURN
END
I want a result as shown in this screenshot:
You can use DENSE_RANK for this. It will number the rows, taking into account tied places (same dates)
-- Keep only each card's earliest-date rows (DENSE_RANK gives every row of a
-- tied first date rank 1), then total those rows per card.
SELECT t.card_id,
       SUM(t.amount) AS SumAmount,
       MIN(t.created_at) AS FirstDate
FROM (
    SELECT tr.*,
           DENSE_RANK() OVER (PARTITION BY tr.card_id ORDER BY tr.created_at) AS rn
    FROM dbo.Transactions tr
) AS t
WHERE t.rn = 1
GROUP BY t.card_id;
If the dates are actually dates and times, and you want to sum the whole day, change t.created_at to CAST(t.created_at AS date)
Try this:
/*
CREATE TABLE dbo.Transactions
(
card_id INT,
amount MONEY,
created_at DATE
);
INSERT INTO dbo.Transactions (card_id, amount, created_at)
VALUES (1, 500, '2016-01-01'),
(1, 100, '2016-01-01'),
(1, 100, '2016-01-01'),
(1, 200, '2016-01-02'),
(1, 300, '2016-01-03'),
(2, 100, '2016-01-04'),
(2, 200, '2016-01-05'),
(3, 700, '2016-01-06'),
(1, 100, '2016-01-07'),
(2, 100, '2016-01-07'),
(3, 100, '2016-01-07');
*/
WITH FirstDatePerCard AS
(
    -- earliest transaction date for each card
    SELECT card_id,
           MIN(created_at) AS FirstDate
    FROM dbo.Transactions
    GROUP BY card_id
)
-- Join back to the transactions on that first date and total the amounts;
-- grouping replaces the original DISTINCT-over-window formulation and
-- produces the same one row per card.
SELECT t.card_id,
       SUM(t.amount) AS SumAmount,
       f.FirstDate
FROM FirstDatePerCard AS f
INNER JOIN dbo.Transactions AS t
    ON t.card_id = f.card_id
   AND t.created_at = f.FirstDate
GROUP BY t.card_id, f.FirstDate
You'll get an output something like this:
card_id SumAmount FirstDate
--------------------------------
1 700.00 2016-01-01
2 100.00 2016-01-04
3 700.00 2016-01-06
Is that what you're looking for??
UPDATE: OK, so you want to sum the amount only for the first_date, for every card_id - is that correct? (wasn't clear from the original question)
Updated my solution accordingly

Unique Count (Print out 0) within Time Range

I have a table TB contains columns Time, User_id, State with data as below.
-- Sample sightings: (Time, User_id, State). Table TB's DDL is not shown;
-- per the question, Time is in seconds. Only Time 1 has a 'DC' row.
INSERT INTO TB
(Time, User_id, State)
VALUES
(1, 1, 'VA'),
(1, 2, 'VA'),
(1, 2, 'DC'),
(2, 1, 'VA'),
(2, 2, 'MD'),
(3, 1, 'MD'),
(3, 112, 'MD'),
(3, 134, 'VA'),
(3, 111, 'MD'),
(4, 12, 'VA'),
(4, 22, 'MD')
;
I would like a unique count of User_id given State = 'DC' every 2 seconds. The result should be looking like this:
NewTime | Count
1 | 1
2 | 0
I got answer to print out 0 when the count is 0 using
-- Per-second count of 'DC' rows; seconds with no 'DC' rows still show 0
-- because the CASE yields no countable value for other states.
SELECT Time,
       COUNT(CASE WHEN State = 'DC' THEN 1 END) AS Count
FROM TB
GROUP BY Time
And get the unique count within a time range using
SELECT
((Time - 1) / 2) + 1 as NewTime,
COUNT(DISTINCT User_id) as Count
FROM ...
GROUP BY ((Time - 1) / 2) + 1
But I'm not sure how to combine these two to get the result I want.
Thanks in advance.
Are you looking for this:
-- Combine bucketing and the conditional count: the CASE maps non-'DC' rows to
-- NULL, which COUNT(DISTINCT ...) ignores, so a 2-second bucket with no 'DC'
-- rows still appears with Count = 0 (as long as the bucket has any row at all).
SELECT
((Time - 1) / 2) + 1 as NewTime,
COUNT(DISTINCT case when state = 'DC' then User_id end) as Count
FROM ...
GROUP BY ((Time - 1) / 2) + 1
This assumes that you have some record (though not necessarily DC) every two seconds.