Compare values between two tables with over partition criteria - sql

DB-Fiddle
-- Table campaigns: quantity recorded per campaign and insert date.
CREATE TABLE campaigns (
    id          SERIAL PRIMARY KEY,
    insert_time DATE,
    campaign    VARCHAR,
    tranches    VARCHAR,
    quantity    DECIMAL
);

INSERT INTO campaigns (insert_time, campaign, tranches, quantity)
VALUES
    ('2021-01-01', 'C001', 't', '500'),
    ('2021-01-01', 'C002', 't', '600'),
    ('2021-01-02', 'C001', 't', '500'),
    ('2021-01-02', 'C002', 't', '600');

-- Table tranches: quantity recorded per tranche (t1, t2, ...) of a campaign
-- and insert date.
CREATE TABLE tranches (
    id          SERIAL PRIMARY KEY,
    insert_time DATE,
    campaign    VARCHAR,
    tranches    VARCHAR,
    quantity    DECIMAL
);

INSERT INTO tranches (insert_time, campaign, tranches, quantity)
VALUES
    ('2021-01-01', 'C001', 't1', '200'),
    ('2021-01-01', 'C001', 't2', '120'),
    ('2021-01-01', 'C001', 't3', '180'),
    ('2021-01-01', 'C002', 't1', '350'),
    ('2021-01-01', 'C002', 't2', '250'),
    ('2021-01-02', 'C001', 't1', '400'),
    ('2021-01-02', 'C001', 't2', '120'),
    ('2021-01-02', 'C001', 't3', '180'),
    ('2021-01-02', 'C002', 't1', '350'),
    ('2021-01-02', 'C002', 't2', '250');
Expected Result:
insert_time | campaign | tranches | quantity_campaigns | quantity_tranches | check
--------------|------------|------------|---------------------|---------------------|-----------
2021-01-01 | C001 | t | 500 | 500 | ok
2021-01-01 | C002 | t | 600 | 600 | ok
--------------|------------|------------|---------------------|---------------------|------------
2021-01-02 | C001 | t | 500 | 700 | error
2021-01-02 | C002 | t | 600 | 600 | ok
I want to compare the total quantity per campaign in table campaigns with the total quantity per campaign in table tranches.
So far I have been able to develop this query:
-- Attempt from the question: compare the campaign quantity with the summed
-- tranche quantity using window functions evaluated over the joined rows.
SELECT
c.insert_time AS insert_time,
c.campaign AS campaign,
c.tranches AS tranches,
c.quantity AS quantity_campaigns,
-- Raw quantity of the single joined tranche row, not a per-campaign total.
t.quantity AS quantity_tranches,
(CASE WHEN
-- Both windows run over the join result, so duplicated rows are included.
MAX(c.quantity) OVER(PARTITION BY c.insert_time, c.campaign) = SUM(t.quantity) OVER(PARTITION BY t.insert_time, t.campaign)
THEN 'ok' ELSE 'error' END) AS check
FROM campaigns c
-- The join condition uses campaign only (insert_time is not part of it), so
-- each campaign row is paired with tranche rows from every insert date.
LEFT JOIN tranches t ON c.campaign = t.campaign
ORDER BY 1,2,3,4,5;
However, it does not give me the expected result?
What do I need to change to make it work?

I think the result you're looking for should be something like this. The problem is that you're trying to aggregate over two different groupings after the join, which either yields too many rows or produces incorrect totals. By aggregating each table in its own CTE, and joining the CTEs only after the aggregation has occurred, you can achieve the results you are looking for. See my example below:
-- Compare the campaign quantity with the total tranche quantity per
-- (insert_time, campaign). Each table is aggregated on its own BEFORE the
-- join, so the join cannot multiply rows and distort the totals.
WITH campaign_agg AS (
    -- Campaign-side quantity per insert date, campaign and tranche label.
    SELECT c.insert_time, c.campaign, c.tranches, MAX(c.quantity) AS c_quantity
    FROM campaigns c
    GROUP BY c.insert_time, c.campaign, c.tranches
), tranch_agg AS (
    -- Tranche-side total per insert date and campaign.
    SELECT t.insert_time, t.campaign, SUM(t.quantity) AS t_sum
    FROM tranches t
    GROUP BY t.insert_time, t.campaign
)
SELECT c.insert_time, c.campaign, c.tranches, c.c_quantity, t.t_sum,
       -- A campaign without any tranche rows has t_sum = NULL; the comparison
       -- is then not true and the row is reported as 'error'.
       CASE WHEN c.c_quantity = t.t_sum THEN 'ok' ELSE 'error' END AS check
FROM campaign_agg c
-- LEFT JOIN keeps campaigns that have no tranche rows at all; an inner join
-- would silently drop them from the check.
LEFT JOIN tranch_agg t
       ON t.insert_time = c.insert_time
      AND t.campaign = c.campaign
ORDER BY c.insert_time, c.campaign;
I have a db-fiddle for this as well: https://www.db-fiddle.com/f/33x4upVEcgTMNehiHCKzfN/1

DB-Fiddle
-- Compare the campaign quantity with the total tranche quantity per
-- insert date and campaign; tranches are pre-aggregated in a derived table.
SELECT
    c.insert_time    AS insert_time,
    c.campaign       AS campaign,
    c.tranches       AS tranches,
    SUM(c.quantity)  AS quantity_campaigns,
    -- t1 holds at most one row per (insert_time, campaign), so every joined
    -- row of a group carries the same total; MAX returns it once instead of
    -- multiplying it by the number of campaign rows in the group.
    MAX(t1.quantity) AS quantity_tranches,
    -- IS DISTINCT FROM treats NULL as a comparable value: a campaign without
    -- tranche rows is reported as 'error' (with <> it fell through to 'ok').
    CASE
        WHEN SUM(c.quantity) IS DISTINCT FROM MAX(t1.quantity) THEN 'error'
        ELSE 'ok'
    END              AS check
FROM campaigns c
LEFT JOIN (
    SELECT
        t.insert_time   AS insert_time,
        t.campaign      AS campaign,
        SUM(t.quantity) AS quantity
    FROM tranches t
    GROUP BY t.insert_time, t.campaign
) t1
    ON  t1.insert_time = c.insert_time
    AND t1.campaign    = c.campaign
GROUP BY c.insert_time, c.campaign, c.tranches
ORDER BY c.insert_time, c.campaign, c.tranches;

Related

risk_score result for each of month

I want to generate the highest risk_score for each month (Jan, Feb & Mar),
displaying the following columns: firm_id_1, risk_score_Jan, risk_score_Feb, risk_score_Mar
-- Risk assessments: one row per firm and assessment date.
CREATE TABLE firm_risk (
    firm_id_1       INT,
    assessment_date DATE,
    risk_score      FLOAT
);

-- Date literals are written month/day/year.
INSERT INTO firm_risk (firm_id_1, assessment_date, risk_score)
VALUES
    (123, '1/01/2018', 0.43),
    (123, '1/28/2018', 0.80),
    (123, '2/11/2018', 0.28),
    (123, '2/23/2018', 0.91),
    (123, '3/11/2018', 0.08),
    (123, '3/31/2018', 0.60),
    (456, '1/4/2018', 0.87),
    (456, '1/6/2018', 0.02),
    (456, '1/20/2018', 0.39),
    (456, '2/3/2018', 0.10),
    (456, '3/1/2018', 0.12),
    (789, '1/1/2018', 0.20),
    (789, '3/1/2018', 0.17);

-- Inspect the loaded rows.
SELECT * FROM firm_risk;
-- Attempt 1: keeps only rows whose assessment_date equals the single latest
-- date of the whole table (the subquery is not correlated per firm or month).
SELECT firm_id_1, date_part('month', assessment_date) AS AD
FROM firm_risk
WHERE assessment_date = (SELECT MAX (assessment_date) FROM firm_risk)
GROUP BY firm_id_1, risk_score, assessment_date;
-- Target table intended to hold one score column per month (Jan-Mar) for
-- each firm. Note the key column is named firm_id_2 here.
CREATE table latest_risk_score (
firm_id_2 integer,
latest_risk_score_Jan float,
latest_risk_score_Feb float,
latest_risk_score_Mar float
);
-- The table is still empty at this point; the firm ids are inserted below.
SELECT * FROM latest_risk_score;
INSERT INTO latest_risk_score (firm_id_2)
VALUES (123),
(456),
(789);
-- Attempt 2: the join only matches firm ids, and the GROUP BY includes
-- risk_score and assessment_date, so no per-month aggregation takes place.
SELECT firm_risk.firm_id_1, date_part('month', assessment_date), firm_risk.risk_score
FROM firm_risk
INNER JOIN latest_risk_score
ON firm_risk.firm_id_1 = latest_risk_score.firm_id_2
GROUP BY firm_risk.firm_id_1, firm_risk.risk_score, assessment_date;
-- Attempt 3: applies LIKE directly to a DATE column.
-- NOTE(review): PostgreSQL presumably rejects LIKE on a date without a cast
-- to text; the pattern also assumes a day-month-year rendering -- confirm.
SELECT firm_risk.firm_id_1, date_part('month', assessment_date), firm_risk.risk_score
FROM firm_risk
WHERE assessment_date = (SELECT MAX (assessment_date) FROM firm_risk)
AND assessment_date LIKE '_%-01-2018%';
-- Attempt 4: compares a DATE with the numeric result of date_part and
-- subtracts an untyped string from a date.
-- NOTE(review): this likely fails to type-check. In the GROUP BY,
-- ('month', assessment_date) is a row constructor, not a date_part call.
SELECT firm_risk.firm_id_1, date_part('month', assessment_date)
FROM firm_risk
WHERE assessment_date >= date_part('month', assessment_date - '3 months')
GROUP BY firm_risk.firm_id_1, ('month', assessment_date);
-- Attempt 5 (three UPDATEs, one per month column, firm 123 only):
--   * "FROM firm_risk.firm_id_1" is parsed as schema.table, not table.column.
--   * "2018-01-%" in double quotes is an identifier in standard SQL and
--     PostgreSQL, not a string literal.
--   * The outer WHERE filters on firm_id_1, but latest_risk_score declares
--     its key column as firm_id_2.
UPDATE latest_risk_score SET latest_risk_score_Jan = (SELECT Risk_Score FROM firm_risk.firm_id_1 WHERE Assessment_Date = (SELECT MAX(Assessment_Date)
FROM firm_risk.firm_id_1 WHERE firm_id_1 = 123 AND Assessment_Date LIKE "2018-01-%" ORDER BY Assessment_Date))
WHERE firm_id_1 = 123;
update latest_risk_score
set latest_risk_score_Feb = (select Risk_Score from firm_risk.firm_id_1 where Assessment_Date = (select max(Assessment_Date)
from firm_risk.firm_id_1 where firm_id_1 = 123 and Assessment_Date like "2018-02-%" order by Assessment_Date))
where firm_id_1 = 123;
update latest_risk_score
set latest_risk_score_Mar = (select Risk_Score from firm_risk.firm_id_1 where Assessment_Date = (select max(Assessment_Date)
from firm_risk.firm_id_1 where firm_id_1 = 123 and Assessment_Date like "2018-03-%" order by Assessment_Date))
where firm_id_1 = 123;
-- Inspect the result of the updates.
select * from latest_risk_score;
Assuming postgres is relevant (due to existence of "date_part" in question)
-- Same sample data as in the question, with ISO-8601 (YYYY-MM-DD) date
-- literals.
CREATE TABLE firm_risk (
    firm_id_1       INT,
    assessment_date DATE,
    risk_score      FLOAT
);

INSERT INTO firm_risk (firm_id_1, assessment_date, risk_score)
VALUES
    (123, '2018-01-01', 0.43),
    (123, '2018-01-28', 0.80),
    (123, '2018-02-11', 0.28),
    (123, '2018-02-23', 0.91),
    (123, '2018-03-11', 0.08),
    (123, '2018-03-31', 0.60),
    (456, '2018-01-04', 0.87),
    (456, '2018-01-06', 0.02),
    (456, '2018-01-20', 0.39),
    (456, '2018-02-03', 0.10),
    (456, '2018-03-01', 0.12),
    (789, '2018-01-01', 0.20),
    (789, '2018-03-01', 0.17);
-- Highest risk_score per firm for January, February and March 2018,
-- pivoted into one column per month via conditional aggregation.
-- A month without any assessment yields NULL for that firm.
SELECT
    firm_risk.firm_id_1
  , max(case when date_part('month', assessment_date) = 1 then firm_risk.risk_score end) AS jan_risk
  , max(case when date_part('month', assessment_date) = 2 then firm_risk.risk_score end) AS feb_risk
  , max(case when date_part('month', assessment_date) = 3 then firm_risk.risk_score end) AS mar_risk
FROM firm_risk
-- Half-open date range instead of a month-number filter: it keeps the same
-- months of other years out of the result and leaves the column untouched by
-- functions, so an index on assessment_date can be used.
WHERE assessment_date >= DATE '2018-01-01'
  AND assessment_date <  DATE '2018-04-01'
GROUP BY
    firm_risk.firm_id_1;
firm_id_1 | jan_risk | feb_risk | mar_risk
--------: | :------- | :------- | :-------
789 | 0.2 | null | 0.17
456 | 0.87 | 0.1 | 0.12
123 | 0.8 | 0.91 | 0.6
db<>fiddle here

What SQL query can be used to limit continuous periods by parameter value, and then to calculate datediff inside them?

I have a table of phone calls consisting of user_id, call_date, city,
where city can be either A or B.
It looks like this:
user_id
call_date
city
1
2021-01-01
A
1
2021-01-02
B
1
2021-01-03
B
1
2021-01-05
B
1
2021-01-10
A
1
2021-01-12
B
1
2021-01-16
A
2
2021-01-17
A
2
2021-01-20
B
2
2021-01-22
B
2
2021-01-23
A
2
2021-01-24
B
2
2021-01-26
B
2
2021-01-30
A
For this table, we need to select for each user all the periods when he was in city B.
These periods are counted in days and start when the first call is made from city B, and end as soon as the next call is made from city A.
So for user_id = 1 the first period starts on 2021-01-02 and ends on 2021-01-10. There can be several such periods for each user.
The result should be the following table:
user_id
period_1
period_2
1
8
4
2
3
6
Can you please tell me how I can limit the periods according to the condition of the problem, and then calculate the datediff within each period?
Thank you
This is a typical gaps and islands problem. You need to group consecutive rows first, then find the first call_date of the next group. Sample code for Postgres is below, the same may be adapted to another DBMS by applying appropriate function to calculate the difference in days.
with calls (user_id, call_date, city) as (
    /* Inline sample data standing in for the phone-call table. */
    select *
    from ( values
        ('1', date '2021-01-01', 'A'),
        ('1', date '2021-01-02', 'B'),
        ('1', date '2021-01-03', 'B'),
        ('1', date '2021-01-05', 'B'),
        ('1', date '2021-01-10', 'A'),
        ('1', date '2021-01-12', 'B'),
        ('1', date '2021-01-16', 'A'),
        ('2', date '2021-01-17', 'A'),
        ('2', date '2021-01-20', 'B'),
        ('2', date '2021-01-22', 'B'),
        ('2', date '2021-01-23', 'A'),
        ('2', date '2021-01-24', 'B'),
        ('2', date '2021-01-26', 'B'),
        ('2', date '2021-01-30', 'A')
    ) as sample_rows
),
islands as (
    select
        calls.*,
        /* Island key: the rank over all of a user's calls minus the rank
           within the same city is the same for consecutive calls from one
           city, and differs once another city interrupts the run. */
        dense_rank() over (partition by user_id order by call_date)
          - dense_rank() over (partition by user_id, city order by call_date) as island_id,
        /* Date of the user's next call; the last call falls back to its
           own date. */
        lead(call_date, 1, call_date) over (partition by user_id order by call_date) as next_dt
    from calls
)
/* One row per run of consecutive city-B calls: it starts at the first B call
   and ends at the call that follows the last B call of the run. */
select
    user_id,
    city,
    min(call_date) as dt_from,
    max(next_dt) as dt_to,
    max(next_dt) - min(call_date) as diff
from islands
where city = 'B'
group by user_id, island_id, city
order by user_id, dt_from;
user_id | city | dt_from | dt_to | diff
:------ | :--- | :--------- | :--------- | ---:
1 | B | 2021-01-02 | 2021-01-10 | 8
1 | B | 2021-01-12 | 2021-01-16 | 4
2 | B | 2021-01-20 | 2021-01-23 | 3
2 | B | 2021-01-24 | 2021-01-30 | 6
db<>fiddle here

postgresql How show most frequent value per day date

I've got a problem with a query that is supposed to return the value which occurs most often per date
+------------+------------------+
| Date | value |
+------------+------------------+
| 2020-01-01 | Programmer |
| 2020-01-02 | Technician |
| 2020-01-03 | Business Analyst |
+------------+------------------+
So far I have done
-- Attempt from the question: counts headlines per publication_date and
-- employer name, newest first.
-- publication_date is grouped and returned as stored; the question reports
-- that it carries a time of day, so groups form per timestamp, not per day.
select count(headline) as asd, publication_date, employer -> 'name' as dsa from jobhunter
group by publication_date,dsa
ORDER BY publication_date DESC
But it shows 2020-12-31 19:06:00 instead of just YYYY-MM-DD
Any idea on how to fix this?
enter image description here
Test data:
-- Sample table: one timestamped value per row.
CREATE TABLE tbl (
    id           SERIAL PRIMARY KEY,
    row_datetime TIMESTAMP,
    row_val      VARCHAR(60)
);

-- Three rows per day, inserted in the same order as before so the generated
-- ids are unchanged.
INSERT INTO tbl (row_datetime, row_val)
VALUES
    ('2021-01-01 00:00:00', 'a'),
    ('2021-01-01 01:00:00', 'a'),
    ('2021-01-01 02:00:00', 'b'),
    ('2021-01-02 00:00:00', 'a'),
    ('2021-01-02 01:00:00', 'b'),
    ('2021-01-02 02:00:00', 'b');
Example query:
-- Most frequent value per calendar day.
WITH daily_values AS (
    -- Reduce each timestamp to its calendar date.
    SELECT DATE(row_datetime) AS dt, row_val AS val
    FROM tbl
),
daily_counts AS (
    -- Occurrences of each value per day (COUNT(val) ignores NULL values).
    SELECT dt, val, COUNT(val) AS cnt
    FROM daily_values
    GROUP BY dt, val
),
ranked AS (
    -- Rank the values of each day by frequency. The secondary sort on val
    -- makes the winner deterministic when two values are equally frequent;
    -- ordering by cnt alone lets the engine pick either one.
    SELECT dt, val, cnt,
           ROW_NUMBER() OVER (PARTITION BY dt ORDER BY cnt DESC, val ASC) AS row_num
    FROM daily_counts
)
SELECT dt, val, cnt
FROM ranked
WHERE row_num = 1
ORDER BY dt ASC;
You can additionally customize your query to optimize the performance, get more fields, etc.

Group list of dates based on values from joined table

DB-Fiddle
-- Table sales: sales amount per event date, country and channel.
CREATE TABLE sales (
    id         SERIAL PRIMARY KEY,
    event_date DATE,
    country    VARCHAR,
    channel    VARCHAR,
    sales      DECIMAL
);

INSERT INTO sales (event_date, country, channel, sales)
VALUES
    ('2020-01-04', 'DE', 'channel_01', '500'),
    ('2020-01-04', 'FR', 'channel_01', '900'),
    ('2020-01-04', 'NL', 'channel_01', '100'),
    ('2020-02-20', 'DE', 'channel_01', '0'),
    ('2020-02-20', 'FR', 'channel_01', '0'),
    ('2020-02-20', 'NL', 'channel_01', '0'),
    ('2020-03-15', 'DE', 'channel_01', '700'),
    ('2020-03-15', 'FR', 'channel_01', '500'),
    ('2020-03-15', 'NL', 'channel_03', '300');

-- Table dates: one row per calendar day of 2020.
CREATE TABLE dates (
    id   SERIAL PRIMARY KEY,
    date DATE
);

INSERT INTO dates (date)
SELECT generate_series('2020-01-01'::date, '2020-12-31'::date, interval '1 day');
Expected Result:
date_list | country
--------------|--------------------------
2020-01-01 | DE
2020-01-01 | FR
2020-01-01 | NL
--------------|---------------------------
2020-01-02 | DE
2020-01-02 | FR
2020-01-02 | NL
--------------|---------------------------
: | :
: | :
: | :
--------------|--------------------------
2020-12-29 | DE
2020-12-30 | NL
2020-12-31 | FR
I want to list all dates from table dates and group them by all countries that are available in table sales no matter if the date exist in both tables. So far I have developed this query:
-- Attempt from the question: list every date together with the countries
-- found in sales.
SELECT
d.date AS date_list,
t2.country
FROM dates d
-- The derived table is matched on event_date = date, so country is only
-- filled in for dates that have a sales row; every other date gets NULL.
LEFT JOIN
(SELECT
s.event_date,
s.country,
s.sales
FROM sales s
GROUP BY 1,2,3
ORDER BY 1,2) t2 ON t2.event_date = d.date
GROUP BY 1,2
ORDER BY 1,2;
However, it only groups the results by country if the s.event_date matches the d.date.
How do I have to modify the query to get the expected result?
I am not sure if I understand your requirements correctly, but this looks like a job for a CROSS JOIN:
-- Pair every calendar date with every country that appears in sales,
-- whether or not a sale exists for that date.
SELECT
    d.date    AS date_list,   -- column name used in the expected result
    x.country
FROM dates AS d
CROSS JOIN (
    -- One row per distinct country present in sales.
    SELECT DISTINCT country
    FROM sales
) AS x
-- Without an ORDER BY the row order is unspecified; the expected result is
-- sorted by date, then country.
ORDER BY d.date, x.country;

How to get latest date of specific record without specifying record?

I have 2 tables: order and transportation.
ORDER
id
TRANSPORTATION
id
order_id
date
status (could be 'ok', 'ca', 'ko')
One order can have more than one transportation. I want all orders whose latest transportation status is 'OK'.
If I do:
-- Attempt from the question.
-- The subquery is not correlated with ord/tr: max(date) is taken over all
-- transportation rows that pass the status filter, not per order.
-- The LIKE pattern has no wildcard, so it behaves as an equality test.
-- NOTE(review): the sample data lists lowercase 'ok'; whether 'OK' matches
-- depends on the database's case sensitivity -- confirm.
select ord.*
from orders ord
join transportation tr
on ord.id = tr.order_id
where tr.date = (select max(date) from transportation where status like 'OK');
I will get the latest date of ALL transportations but I only want the latest date of all transportations of that order in specific.
For example, if I have these orders with these transportations and I want the last transportations of each order which status are 'ok':
order_id, transportation_id, date, status
001, 001, 01/01/19, ok
001, 002, 01/01/20, ca
002, 003, 01/01/19, ca
002, 004, 01/01/18, ok
003, 005, 01/01/17, ok
003, 006, 01/01/16, ca
I would expect these results:
003, 005, 01/01/17, ok
You can do it without an additional sub-query using an analytic query:
-- Rank each order's transportations from newest to oldest, then keep the
-- newest one(s) only when their status is 'ok'.
WITH ranked_transport AS (
    SELECT ord.id AS order_id,
           tr.id  AS transportation_id,
           tr."DATE",
           tr.status,
           -- RANK gives tied newest dates the same rank, so ties are all kept.
           RANK() OVER (PARTITION BY ord.id ORDER BY tr."DATE" DESC) AS date_rank
    FROM orders ord
    INNER JOIN transportation tr
            ON tr.order_id = ord.id
)
SELECT order_id,
       transportation_id,
       "DATE",
       status
FROM ranked_transport
WHERE status = 'ok'
  AND date_rank = 1;
Use RANK (or DENSE_RANK) if you want to return rows tied for the greatest date per order id; or use ROW_NUMBER if you only want a single row per order id.
So for your test data:
-- Test data built with CREATE TABLE ... AS SELECT from DUAL; column types
-- are derived from the selected literals.
-- Three orders with ids 1, 2 and 3.
CREATE TABLE ORDERS ( id ) AS
SELECT 1 FROM DUAL UNION ALL
SELECT 2 FROM DUAL UNION ALL
SELECT 3 FROM DUAL;
-- Two transportations per order. The column is named "DATE" in double
-- quotes, and every query in this answer quotes it the same way.
-- Order 3 is the only order whose newest row has status 'ok'.
CREATE TABLE TRANSPORTATION ( order_id, id, "DATE", status ) AS
SELECT 001, 001, DATE '2001-01-19', 'ok' FROM DUAL UNION ALL
SELECT 001, 002, DATE '2001-01-20', 'ca' FROM DUAL UNION ALL
SELECT 002, 003, DATE '2001-01-19', 'ca' FROM DUAL UNION ALL
SELECT 002, 004, DATE '2001-01-18', 'ok' FROM DUAL UNION ALL
SELECT 003, 005, DATE '2001-01-17', 'ok' FROM DUAL UNION ALL
SELECT 003, 006, DATE '2001-01-16', 'ca' FROM DUAL;
This outputs:
ORDER_ID | TRANSPORTATION_ID | DATE | STATUS
-------: | ----------------: | :------------------ | :-----
3 | 5 | 2001-01-17 00:00:00 | ok
db<>fiddle here
DO it with not exists
-- Orders whose most recent transportation has status 'ok'.
select ord.*
from orders ord
join transportation tr
  on ord.id = tr.order_id
-- Status values in the sample data are lowercase ('ok', 'ca', 'ko').
where tr.status = 'ok'
  -- tr is the latest transportation of its order when no other row of the
  -- SAME order, whatever its status, has a later date.
  and not exists (
      select 1
      from transportation newer
      where newer.order_id = tr.order_id
        and newer.date > tr.date
  );