Retrieve start and last created date from a duplicated record - sql

I have a table called mov_entrys and they have multiple historic records, in other words, duplicated records this is my main SQL statement
SELECT me.mov_entry_id, me.mov_entry_ean13, me.plate, mv.name vehiclename,
ma.name areaname, ml.name locationame, me.value, mov_pay.name paytype ,me.validated, me_usu.name operatorName,
to_char(me.created_at, 'DD/MM/YYYY HH24:MI:SS') created_at,
to_char(me.updated_at, 'DD/MM/YYYY HH24:MI:SS') updated_at
FROM mov_entrys me
LEFT OUTER JOIN mov_payment_type mov_pay on mov_pay.mov_payment_type_id = me.mov_payment_type_id
JOIN mov_vehicles mv ON me.mov_vehicle_id = mv.mov_vehicle_id
JOIN mov_areas ma ON me.mov_area_id = ma.mov_area_id
JOIN mov_locations ml ON ma.mov_location_id = ml.mov_location_id
JOIN mov_users me_usu ON me.mov_user_id = me_usu.mov_user_id
WHERE me.value > 0
AND me.validated <> 'O'
AND me.validated <> 'V'
AND me.validated <> 'I'
AND me.created_at >= date_trunc('month', NOW()) - '1 month'::interval
order by mov_entry_id desc
But the records are like this example:
id ean13 value validated created_at updated_at
1 7800003378198 0 N 2019-10-04 09:00:31 2019-10-04 09:00:31
2 7800003378198 8 S 2019-10-04 13:01:11 2019-10-04 13:01:11
3 7800003378198 10.5 AD 2019-10-04 13:02:13 2019-10-04 13:02:13
3 7800003378198 10.5 I 2019-10-04 13:05:13 2019-10-04 13:05:13
In Laravel, when a record is created, created_at and updated_at receive the same value, but I want the first and last dates across all records that share the same ean13 value 7800003378198.
The main query returns the records I want, but each row carries its own dates; I want the real first-created date and the real last-updated date.
Given the records above, I want a result like the following — if I use GROUP BY I can't get this:
id ean13 value validated created_at updated_at
2 7800003378198 8 S 2019-10-04 09:00:31 2019-10-04 13:05:13
3 7800003378198 10.5 AD 2019-10-04 09:00:31 2019-10-04 13:05:13
I try to make a subquery but I don't know why always make an INFINITE load to return a data I think I'm doing something wrong, this is my query with subquery
SELECT me.mov_entry_id, me.mov_entry_ean13, me.plate, mv.name vehiclename,
ma.name areaname, ml.name locationame, me.value, mov_pay.name paytype ,me.validated, me_usu.name operatorName,
to_char(me.created_at, 'DD/MM/YYYY HH24:MI:SS') created_at,
to_char(me.updated_at, 'DD/MM/YYYY HH24:MI:SS') updated_at,
(
SELECT me_v2.created_at FROM mov_entrys me_v2 WHERE me.mov_entry_ean13 = me_v2.mov_entry_ean13
ORDER BY me_v2.created_at DESC
LIMIT 1
) date_test
FROM mov_entrys me
LEFT OUTER JOIN mov_payment_type mov_pay on mov_pay.mov_payment_type_id = me.mov_payment_type_id
JOIN mov_vehicles mv ON me.mov_vehicle_id = mv.mov_vehicle_id
JOIN mov_areas ma ON me.mov_area_id = ma.mov_area_id
JOIN mov_locations ml ON ma.mov_location_id = ml.mov_location_id
JOIN mov_users me_usu ON me.mov_user_id = me_usu.mov_user_id
WHERE me.value > 0
AND me.validated <> 'O'
AND me.validated <> 'V'
AND me.validated <> 'I'
AND me.created_at >= date_trunc('month', NOW()) - '1 month'::interval
order by mov_entry_id desc
When I execute this query alone it is working fine giving me the last record,
what I'm doing wrong? This is my test query :
SELECT me.created_at FROM mov_entrys me WHERE me.mov_entry_ean13 = '7800003378198'
ORDER BY me.created_at DESC
LIMIT 1
I managed to run this test query on its own without any problem.

Consider aggregating the dates with MIN and MAX. Below uses shorter aliases for readability:
-- For each EAN13 we need the FIRST created_at and LAST updated_at across ALL of
-- its historic rows — including rows the WHERE clause below filters out (e.g.
-- the validated = 'I' row in the example supplies the 13:05:13 max).
-- Grouping the outer query by e.mov_entry_id (the primary key) would make every
-- row its own group, so MIN/MAX would just echo the row's own timestamps.
-- Instead, aggregate the dates once per EAN13 over the whole table in a
-- derived table, then join that back to the filtered rows.
SELECT e.mov_entry_id, e.mov_entry_ean13, e.plate, v.name AS vehiclename,
       a.name AS areaname, l.name AS locationame, e.value, p.name AS paytype,
       e.validated, u.name AS operatorName,
       to_char(agg.first_created_at, 'DD/MM/YYYY HH24:MI:SS') AS created_at,
       to_char(agg.last_updated_at, 'DD/MM/YYYY HH24:MI:SS') AS updated_at
FROM mov_entrys e
-- one row per EAN13 with the true first/last timestamps of its whole history
INNER JOIN (
    SELECT mov_entry_ean13,
           MIN(created_at) AS first_created_at,
           MAX(updated_at) AS last_updated_at
    FROM mov_entrys
    GROUP BY mov_entry_ean13
) agg ON agg.mov_entry_ean13 = e.mov_entry_ean13
LEFT JOIN mov_payment_type p ON p.mov_payment_type_id = e.mov_payment_type_id
INNER JOIN mov_vehicles v ON e.mov_vehicle_id = v.mov_vehicle_id
INNER JOIN mov_areas a ON e.mov_area_id = a.mov_area_id
INNER JOIN mov_locations l ON a.mov_location_id = l.mov_location_id
INNER JOIN mov_users u ON e.mov_user_id = u.mov_user_id
WHERE e.value > 0
  AND e.validated NOT IN ('O', 'V', 'I')
  AND e.created_at >= date_trunc('month', NOW()) - interval '1 month'
ORDER BY e.mov_entry_id DESC

Related

How to subtract two timestamps in SQL and then count?

I want to basically find out how many users paid within 15 mins, 30 mins and 60 mins of my payment_time and trigger_time
I have the following query
with redshift_direct() as conn:
trigger_time_1 = pd.read_sql(f"""
with new_data as
(
select
cycle_end_date
, prime_tagging_by_issuer_and_product
, u.user_id
, settled_status
, delay,
ots_created_at + interval '5:30 hours' as payment_time
,case when to_char(cycle_end_date,'DD') = '15' then 'Odd' else 'Even' end as cycle_order
from
settlement_summary_from_snapshot s
left join (select distinct user_phone_number, user_id from user_events where event_name = 'UserCreatedEvent') u
on u.user_id = s.user_id
and cycle_type = 'end_cycle'
and cycle_end_date > '2021-11-30' and cycle_end_date < '2022-01-15'
)
select
bucket_id
, cycle_end_date, d.cycle_order
, date(cycle_end_date) as t_cycle_end_date
,d.prime_tagging_by_issuer_and_product
,source
,status as cause
,split_part(campaign_name ,'|', 1) as campaign
,split_part(campaign_name ,'|', 2) as sms_cycle_end_date
,split_part(campaign_name ,'|', 3) as day
,split_part(campaign_name ,'|', 4) as type
,to_char(to_date(split_part(campaign_name ,'|', 2) , 'DD/MM/YYYY'), 'YYYY-MM-DD') as campaign_date,
d.payment_time, payload_event_timestamp + interval '5:30 hours' as trigger_time
,count( s.user_id) as count
from sms_callback_events s
inner join new_data d
on s.user_id = d.user_id
where bucket_id > 'date_2021_11_30' and bucket_id < 'date_2022_01_15'
and campaign_name like '%RC%'
and event_name = 'SmsStatusUpdatedEvent'
group by 1,2,3,4,5,6,7,8,9,10,11,12,13,14
""",conn)
How do I add three columns with the number of users who paid within 15, 30 and 60 minutes after trigger_time in this query? I was doing it with pandas, but I want to do it directly in the SQL itself. Can someone help?
I wrote my own DATEDIFF function, which returns an integer value of differencing between two dates, difference by day, by month, by year, by hour, by minute and etc. You can use this function on your queries.
DATEDIFF Function SQL Code on GitHub
Sample Query about using our DATEDIFF function:
-- Example call of the custom DATEDIFF function linked above; computes the
-- whole-minute difference between two timestamps.
select
datediff('minute', mm.start_date, mm.end_date) as diff_minute
from
(
-- fixed sample timestamps 15 min 21 s apart; the result shown is 15,
-- i.e. the difference truncated to whole minutes
select
'2022-02-24 09:00:00.100'::timestamp as start_date,
'2022-02-24 09:15:21.359'::timestamp as end_date
) mm;
Result:
---------------
diff_minute
---------------
15
---------------

SELECT list expression references column integration_start_date which is neither grouped nor aggregated at

I'm facing an issue with the following query. It gave me this error: [SELECT list expression references column integration_start_date which is neither grouped nor aggregated at [34:63]]. In particular, it points to the first 'when' in the result table, which I don't know how to fix. This is on BigQuery, if that helps. As far as I can see everything is written correctly, but I could be wrong. Any help is appreciated.
with plan_data as (
select format_date("%Y-%m-%d",last_day(date(a.basis_date))) as invoice_date,
a.sponsor_id as sponsor_id,
b.company_name as sponsor_name,
REPLACE(SUBSTR(d.meta,STRPOS(d.meta,'merchant_id')+12,13),'"','') as merchant_id,
a.state as plan_state,
date(c.start_date) as plan_start_date,
a.employee_id as square_employee_id,
date(
(select min(date)
from glproductionview.stats_sponsors
where sponsor_id = a.sponsor_id and sponsor_payroll_provider_identifier = 'square' and date >= c.start_date) )
as integration_start_date,
count(distinct a.employee_id) as eligible_pts_count, --pts that are in active plan and have payroll activities (payroll deductions) in the reporting month
from glproductionview.payroll_activities as a
left join glproductionview.sponsors as b
on a.sponsor_id = b.id
left join glproductionview.dc_plans as c
on a.plan_id = c.id
left join glproductionview.payroll_connections as d
on a.sponsor_id = d.sponsor_id and d.provider_identifier = 'rocket' and a.company_id = d.payroll_id
where a.payroll_provider_identifier = 'rocket'
and format_date("%Y-%m",date(a.basis_date)) = '2021-07'
and a.amount_cents > 0
group by 1,2,3,4,5,6,7,8
order by 2 asc
)
select invoice_date,
sponsor_id,
sponsor_name,
eligible_pts_count,
case
when eligible_pts_count <= 5 and date_diff(current_date(),integration_start_date, month) <= 12 then 20
when eligible_pts_count <= 5 and date_diff(current_date(),integration_start_date, month) > 12 then 15
when eligible_pts_count > 5 and date_diff(current_date(),integration_start_date, month) <= 12 then count(distinct square_employee_id)*4
when eligible_pts_count > 5 and date_diff(current_date(),integration_start_date, month) > 12 then count(distinct square_employee_id)*3
else 0
end as fees
from plan_data
group by 1,2,3,4;

Left Join Lateral is Very Slow

I have the following query
WITH time_series AS (
SELECT *
FROM generate_series(now() - interval '1days', now(), INTERVAL '1 hour') AS ts
), recent_instances AS (
SELECT instance_id,
(CASE WHEN last_update_granted_ts IS NOT NULL THEN last_update_granted_ts ELSE created_ts END),
version,
4 status
FROM instance_application
WHERE group_id=$1
AND last_check_for_updates >= now() - interval '1days'
ORDER BY last_update_granted_ts DESC
), instance_versions AS (
SELECT instance_id, created_ts, version, status
FROM instance_status_history
WHERE instance_id IN (SELECT instance_id
FROM recent_instances)
AND status = 4
UNION
(SELECT * FROM recent_instances)
ORDER BY created_ts DESC
)
SELECT ts,
(CASE WHEN version IS NULL THEN '' ELSE version END),
sum(CASE WHEN version IS NOT null THEN 1 ELSE 0 END) total
FROM (
SELECT *
FROM time_series
LEFT JOIN LATERAL (
SELECT distinct ON (instance_id) instance_Id, version, created_ts
FROM instance_versions
WHERE created_ts <= time_series.ts
ORDER BY instance_Id, created_ts DESC
) _ ON true
) AS _
GROUP BY 1,2
ORDER BY ts DESC;
So the instance_versions subquery is executed for every timestamp generated by the time_series query (see the last SELECT statement). But for some reason the lateral join is very slow: the rows returned by the lateral join's subquery number around 12k–15k for a single timestamp from the time_series query, which is not a big number, and the final number of rows returned after the lateral join ranges from 250k–350k. Is there a way I can optimize this?

SQL averages per row from multiple columns and nulls

I have an app that logs data for sensors and I want to be able to produce averages from multiple sensors, could be one, two, three or plenty...
EDIT: These are temperature sensors so 0 is a value that the sensors might store as a value in the database.
My initial starting point was this SQL query:
SELECT grid.t5||'.000000' as ts,
avg(t.sensorvalue) sensorvalue1
, avg(w.sensorvalue)AS sensorvalue2
FROM
(SELECT generate_series(min(date_trunc('hour', ts))
,max(ts), interval '5 min') AS t5 FROM device_history_20865735 where
ts between '2015/05/13 09:00' and '2015/05/14 09:00' ) grid
LEFT JOIN device_history_20865735 t ON t.ts >= grid.t5 AND t.ts < grid.t5 + interval '5 min'
LEFT JOIN device_history_493417852 w ON w.ts >= grid.t5 AND w.ts < grid.t5 + interval '5 min'
--WHERE t.sensorvalue notnull
GROUP BY grid.t5 ORDER BY grid.t5
I get 5 min averages as it is better for my app.
The results as expected have NULL values for either sensorvalue1 or 2:
ts;sensorvalue1;sensorvalue2
"2015-05-13 09:00:00.000000";19.9300003051758;
"2015-05-13 09:05:00.000000";20;
"2015-05-13 09:10:00.000000";;
"2015-05-13 09:15:00.000000";20.0599994659424;
"2015-05-13 09:20:00.000000";;
"2015-05-13 09:25:00.000000";20.1200008392334;
My aim is to calculate an average for each 5 min interval from all the available sensors so as NULLs are a problem I thought of using a CASE statement so if there is a NULL to get the value of the other sensor...
SELECT grid.t5||'.000000' as ts,
CASE
WHEN avg(t.sensorvalue) ISNULL THEN avg(w.sensorvalue)
ELSE avg(t.sensorvalue)
END AS sensorvalue
,
CASE
WHEN avg(w.sensorvalue) ISNULL THEN avg(t.sensorvalue)
ELSE avg(w.sensorvalue)
END AS sensorvalue2
FROM
(SELECT generate_series(min(date_trunc('hour', ts)),max(ts), interval '5 min') AS t5
FROM device_history_20865735 where
ts between '2015/05/13 09:00' and '2015/05/14 09:00' ) grid
LEFT JOIN device_history_20865735 t ON t.ts >= grid.t5 AND t.ts < grid.t5 + interval '5 min'
LEFT JOIN device_history_493417852 w ON w.ts >= grid.t5 AND w.ts < grid.t5 + interval '5 min'
GROUP BY grid.t5 ORDER BY grid.t5
but then to calculate the average I have to do another select on top of this and devide per number of columns (aka sensors) and if they are just two it is OK but if there are 3 or 4 sensors this can get very messy as there could be multiple sensors with NULL values per row...
The SQL is derived grammatically from an app (using Python) using postgres 9.4 so is there a simple way to achieve what is needed as I feel I'm down a rather complex route...?
EDIT #2: With your input I've produce this SQL code, again it seems rather complex but open to your ideas and scrutiny if it is reliable and maintainable:
SELECT ts, sensortotal, sensorcount,
CASE
WHEN sensorcount = 0 THEN -1000
ELSE sensortotal/sensorcount
END AS sensorAvg
FROM (
WITH grid as (
SELECT t5
FROM (SELECT generate_series(min(date_trunc('hour', ts)), max(ts), interval '5 min') as t5
FROM device_history_20865735
) d
WHERE t5 between '2015-05-13 09:00' and '2015-05-14 09:00'
)
SELECT d1.t5 || '.000000' as ts
, Coalesce(avg(d1.sensorvalue), 0) + Coalesce(avg(d2.sensorvalue),0) as sensorTotal
, (CASE
WHEN avg(d1.sensorvalue) ISNULL THEN 0
ELSE 1
END + CASE
WHEN avg(d2.sensorvalue) ISNULL THEN 0
ELSE 1
END) as sensorCount
FROM (SELECT grid.t5, avg(t.sensorvalue) as sensorvalue
FROM grid LEFT JOIN
device_history_20865735 t
ON t.ts >= grid.t5 AND t.ts <grid.t5 + interval '5 min'
GROUP BY grid.t5
) d1 LEFT JOIN
(SELECT grid.t5, avg(t.sensorvalue) as sensorvalue
FROM grid LEFT JOIN
device_history_493417852 t
ON t.ts >= grid.t5 AND t.ts <grid.t5 + interval '5 min'
GROUP BY grid.t5
) d2 on d1.t5 = d2.t5
GROUP BY d1.t5
ORDER BY d1.t5
) tmp;
Thanks!
It sounds like you want to something like this:
-- Per-row average of the non-null sensor values.
-- The parentheses around each IS NOT NULL test are required: in Postgres the
-- cast operator :: binds tighter than IS NOT NULL, so `value1 IS NOT NULL::int`
-- does not parse as intended.
-- NULLIF guards the denominator: when every sensor value is NULL the count is
-- 0, and dividing by NULL yields NULL instead of a division-by-zero error.
(coalesce(value1, 0) + coalesce(value2, 0) + coalesce(value3, 0)) /
NULLIF((value1 IS NOT NULL)::int + (value2 IS NOT NULL)::int + (value3 IS NOT NULL)::int, 0)
AS average
Basically, just do the math you want to do for each row. The only "tricky" part is how to "count" the non-null values--I used a cast, but there are other options such as:
CASE WHEN value1 IS NULL THEN 0 ELSE 1 END
To get accurate averages, you need to calculate each one separately before the join:
-- Each sensor is averaged into its own 5-minute series (d1, d2) BEFORE the
-- final join, so row multiplicity from one sensor's readings cannot multiply
-- the other sensor's rows and skew its average.
WITH grid as (
-- one timestamp per 5-minute bucket covering the requested window
SELECT t5
FROM (SELECT generate_series(min(date_trunc('hour', ts)), max(ts), interval '5 min') as t5
FROM device_history_20865735
) d
WHERE t5 between '2015-05-13 09:00' and '2015-05-14 09:00'
)
SELECT d1.t5 || '.000000' as ts,
avg(d1.sensorvalue) as sensorvalue1
, avg(d2.sensorvalue) as sensorvalue2
-- d1: per-bucket average of the first sensor (LEFT JOIN keeps empty buckets as NULL)
FROM (SELECT grid.t5, avg(t.sensorvalue) as sensorvalue
FROM grid LEFT JOIN
device_history_20865735 t
ON t.ts >= grid.t5 AND t.ts <grid.t5 + interval '5 min'
GROUP BY grid.t5
) d1 LEFT JOIN
-- d2: per-bucket average of the second sensor, same pattern
(SELECT grid.t5, avg(t.sensorvalue) as sensorvalue
FROM grid LEFT JOIN
device_history_493417852 t
ON t.ts >= grid.t5 AND t.ts <grid.t5 + interval '5 min'
GROUP BY grid.t5
) d2 on d1.t5 = d2.t5
GROUP BY d1.t5
ORDER BY d1.t5;

Convert a nested subquery into normal query

I have problem with following query where in which the nested query should be
converted to normal query:
select
count(*) as count,
TO_CHAR(RH.updated_datetime,'DD-MM-YYYY HH:MI:SS') as date,
SUM(
extract (
epoch from (
RH.updated_datetime - PRI.procedure_performed_datetime
)
)/60
)::integer/count(*) as diff
from
procedure_runtime_information PRI,
study S,
report R,
report_history RH
where
RH.report_fk = R.pk AND
R.study_fk = S.pk AND
S.procedure_runtime_fk = PRI.pk AND
RH.old_status_fk = 21 AND
RH.revision = (select max(revision) from report_history where RH.report_fk = RH.report_fk) AND
RH.updated_datetime > TO_DATE('22-01-2013 00:00:00', 'DD-MM-YYYY HH24:MI:SS') AND RH.updated_datetime < TO_DATE('22-01-2014 00:00:00', 'DD-MM-YYYY HH24:MI:SS')
group by date order by date asc;
Assuming this
(select max(revision) from report_history where RH.report_fk = RH.report_fk)
should really be:
(select max(revision) from report_history x where x.report_fk = RH.report_fk)
You could transform the nested (correlated) subquery into a plain subquery like this (one way of many):
-- Same result without a correlated subquery: pre-compute the latest revision
-- per report_fk once in a grouped derived table, then join it back on both
-- columns so only each report's newest history row survives.
SELECT count(*) AS ct
,to_char(rh.updated_datetime,'DD-MM-YYYY HH:MI:SS') AS date -- HH24?
,sum(extract(epoch FROM (RH.updated_datetime
- PRI.procedure_performed_datetime))
/ 60)::int / count(*) AS diff
FROM procedure_runtime_information PRI
JOIN study S ON S.procedure_runtime_fk = PRI.pk
JOIN report R ON R.study_fk = S.pk
JOIN report_history RH ON RH.report_fk = R.pk
-- one row per report_fk: its maximum revision
JOIN (
SELECT report_fk, max(revision) AS revision
FROM report_history RH1
GROUP BY 1
) RH1 ON RH1.report_fk = RH.report_fk
AND RH1.revision = RH.revision
WHERE RH.old_status_fk = 21
AND RH.updated_datetime > to_date('22-01-2013', 'DD-MM-YYYY') -- >= ?
AND RH.updated_datetime < to_date('22-01-2014', 'DD-MM-YYYY') -- to_timestamp?
GROUP BY date -- where does date come from?
ORDER BY date;