Computing session start and end using SQL window functions - sql

I've a table of game logs containing a handDate, like this:
ID
handDate
1
2019-06-30 16:14:02.000
2
2019-07-12 06:18:02.000
3
...
I'd like to compute game sessions from this table (start and end), given that:
A new session starts when there has been no activity for at least 1 hour.
a session can exist across 2 days
So I'd like results like this:
day
session_start
session_end
2019-06-30
2019-06-15 16:14:02.000
2019-06-15 16:54:02.000
2019-07-02
2019-07-02 16:18:02.000
2019-07-02 17:18:02.000
2019-07-02
2019-07-02 23:18:02.000
2019-07-03 03:18:02.000
2019-07-03
2019-07-03 06:18:02.000
2019-07-03 08:28:02.000
Currently I'm playing with the following code, but cannot achieve what I want:
SELECT *
FROM (
SELECT *,
strftime( '%s', handDate) - strftime( '%s', prev_event) AS inactivity
FROM (
SELECT handDate,
date( handDate) as day,
FIRST_VALUE( handDate) OVER (PARTITION BY date( handDate) ORDER BY handDate) AS first_event,
MIN(handDate) OVER (PARTITION BY date( handDate) ORDER BY handDate),
MAX(handDate) OVER (PARTITION BY date( handDate) ORDER BY handDate),
LAG( handDate) OVER (PARTITION BY date( handDate) ORDER BY handDate ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW ) AS prev_event,
LEAD( handDate) OVER (PARTITION BY date( handDate) ORDER BY handDate) AS next_event
FROM hands
) last
) final
I'm using SQLite.

I found the following solution:
-- Groups hands into play sessions and reports each session's start and end.
-- A hand opens a new session when it is the very first hand, or when more
-- than 3600 seconds have passed since the previous hand.
--
-- The previous hand is looked up across the WHOLE table (no PARTITION BY day):
-- partitioning by calendar day would make the first hand of every day start a
-- new session, so a session could never continue past midnight.
SELECT date(MIN(handDate)) AS day,          -- calendar day on which the session started
       sessionId,
       MIN(handDate) AS sessionStart,
       MAX(handDate) AS sessionEnd
FROM (
    -- Running count of session starts = identifier of the session a hand belongs to.
    SELECT handDate,
           SUM(is_new_session) OVER (
               ORDER BY handDate ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
           ) AS sessionId
    FROM (
        SELECT handDate,
               CASE
                   WHEN prev_event IS NULL
                     OR strftime('%s', handDate) - strftime('%s', prev_event) > 3600 THEN 1
                   ELSE 0
               END AS is_new_session
        FROM (
            SELECT handDate,
                   LAG(handDate) OVER (ORDER BY handDate) AS prev_event
            FROM hands
        )
    )
)
GROUP BY sessionId
ORDER BY sessionStart

DROP TABLE IF EXISTS hands;
CREATE TABLE hands(handDate TIMESTAMP);
INSERT INTO hands(handDate)
VALUES ('2021-10-29 10:30:00')
, ('2021-10-29 11:35:00')
, ('2021-10-29 11:36:00')
, ('2021-10-29 11:37:00')
, ('2021-10-29 12:38:00')
, ('2021-10-29 12:39:00')
, ('2021-10-29 12:39:10')
;
-- Pairs every session's first hand with its last hand.
-- A gap of more than 60 minutes (Julian-day difference * 1440) separates sessions.
WITH timeline AS (
    -- each hand with its own, the previous and the next timestamp as Julian day numbers
    SELECT handDate
         , julianday(handDate) AS event
         , julianday(LAG(handDate) OVER (ORDER BY handDate)) AS prev_event
         , julianday(LEAD(handDate) OVER (ORDER BY handDate)) AS next_event
    FROM hands
),
flagged AS (
    -- mark hands that open and/or close a session
    SELECT *
         , CASE WHEN prev_event IS NULL OR (event - prev_event) * 1440.0 > 60 THEN true ELSE false END AS is_start
         , CASE WHEN next_event IS NULL OR (next_event - event) * 1440.0 > 60 THEN true ELSE false END AS is_end
    FROM timeline
),
boundaries AS (
    -- keep only the session boundary rows
    SELECT *
    FROM flagged
    WHERE is_start OR is_end
),
paired AS (
    -- a one-hand session ends where it starts; otherwise the next boundary row is its end
    SELECT is_start
         , handDate AS start_period
         , CASE WHEN is_start AND is_end THEN handDate
                ELSE LEAD(handDate) OVER (ORDER BY handDate)
           END AS end_period
    FROM boundaries
)
SELECT start_period, end_period
FROM paired
WHERE is_start

Related

SQL - Return count of consecutive days where value was unchanged

I have a table like
date
ticker
Action
'2022-03-01'
AAPL
BUY
'2022-03-02'
AAPL
SELL.
'2022-03-03'
AAPL
BUY.
'2022-03-01'
CMG
SELL.
'2022-03-02'
CMG
HOLD.
'2022-03-03'
CMG
HOLD.
'2022-03-01'
GPS
SELL.
'2022-03-02'
GPS
SELL.
'2022-03-03'
GPS
SELL.
I want to do a group by ticker then count all the times that Actions have sequentially been the value that they are as of the last date, here it's 2022-03-03. ie for this example table it'd be like;
ticker
NumSequentialDaysAction
AAPL
0
CMG
1
GPS
2
Fine to pass in 2022-03-03 as a value, don't need to figure that out on the fly.
Tried something like this
---Table Creation---
CREATE TABLE UserTable
([Date] DATETIME2, [Ticker] varchar(5), [Action] varchar(5))
;
INSERT INTO UserTable
([Date], [Ticker], [Action])
VALUES
('2022-03-01' , 'AAPL' , 'BUY'),
('2022-03-02' , 'AAPL' , 'SELL'),
('2022-03-03' , 'AAPL' , 'BUY'),
('2022-03-01' , 'CMG' , 'SELL'),
('2022-03-02' , 'CMG' , 'HOLD'),
('2022-03-03' , 'CMG' , 'HOLD'),
('2022-03-01' , 'GPS' , 'SELL'),
('2022-03-02' , 'GPS' , 'SELL'),
('2022-03-03' , 'GPS' , 'SELL')
;
---Attempted Solution---
I'm thinking that I need a subquery to get the last value and then join the table to itself to get the matching values. Then I would apply a window function, ordered by date, to check that the preceding values are sequential.
WITH CTE AS (SELECT Date, Ticker, Action,
ROW_NUMBER() OVER (PARTITION BY Ticker, Action ORDER BY Date) as row_num
FROM UserTable)
SELECT Ticker, COUNT(DISTINCT Date) as count_of_days
FROM CTE
WHERE row_num = 1
GROUP BY Ticker;
WITH CTE AS (SELECT Date, Ticker, Action,
DENSE_RANK() OVER (PARTITION BY Ticker ORDER BY Action,Date) as rank
FROM table)
SELECT Ticker, COUNT(DISTINCT Date) as count_of_days
FROM CTE
WHERE rank = 1
GROUP BY Ticker;
You can do this with the help of the LEAD function like so. You didn't specify which RDBMS you're using. This solution works in PostgreSQL:
-- For every ticker, sums the rows whose NEXT row (by date) has the same "Action".
-- NOTE(review): the sum runs over the ticker's whole history, so it counts every
-- adjacent pair of equal actions, not only the unbroken run that ends on the
-- last date (e.g. SELL, SELL, BUY yields 1 even though the final action BUY has
-- no repeats). It matches the expected output only when there is no earlier run.
WITH "withSequential" AS (
SELECT
ticker,
-- TRUE when the following day's action equals this day's; NULL on the last row
(LEAD("Action") OVER (PARTITION BY ticker ORDER BY date ASC) = "Action") AS "nextDayIsSameAction"
FROM UserTable
)
SELECT
ticker,
SUM(
CASE
WHEN "nextDayIsSameAction" IS TRUE THEN 1
ELSE 0
END
) AS "NumSequentialDaysAction"
FROM "withSequential"
GROUP BY ticker
Here is a way to do this using a gaps-and-islands approach.
Thanks for sharing the create and insert scripts, which helps to build the solution quickly.
dbfiddle link.
https://dbfiddle.uk/rZLDTrNR
-- Gaps-and-islands: for each ticker, how many earlier consecutive days carried
-- the same action as the as-of date (2022-03-03).
with data
as (
select date
,ticker
,action
-- marker = 1 on rows whose action differs from the ticker's previous row
,case when lag(action) over(partition by ticker order by date) <> action then
1
else 0
end as marker
from usertable
-- rows after the as-of date must not extend the final island
where date <= '2022-03-03'
)
,interim_data
as (
-- running sum of markers labels each run of identical actions
select *
,sum(marker) over(partition by ticker order by date) as grp_val
from data
)
,interim_data2
as (
-- island size minus one: the as-of day itself is not a "sequential" repeat,
-- so a lone final action yields 0 as the question expects
select *
,count(*) over(partition by ticker,grp_val) - 1 as NumSequentialDaysAction
from interim_data
)
select ticker,NumSequentialDaysAction
from interim_data2
where date='2022-03-03'
Another option, you could use the difference between two row_numbers approach as the following:
-- Difference-of-row-numbers islands: rows of one ticker with the same action and
-- the same grp value form one unbroken run. Returns, per ticker, the length of
-- the run that ends on the latest date <= '2022-03-03', minus one.
select [Ticker], count(*)-1 NumSequentialDaysAction -- you could use (distinct) to remove duplicate rows
from
(
select *,
row_number() over (partition by [Ticker] order by [Date]) -
row_number() over (partition by [Ticker], [Action] order by [Date]) grp
from UserTable
where [date] <= '2022-03-03'
) RN_Groups
/* get only rows where [Action] = last date [Action] */
where [Action] = (select top 1 [Action] from UserTable T
where T.[Ticker] = RN_Groups.[Ticker] and [date] <= '2022-03-03'
order by [Date] desc)
group by [Ticker], [Action], grp
/* the final action may also occur in earlier runs (e.g. BUY, SELL, BUY);
   keep only the run that actually contains the ticker's latest date */
having max([Date]) = (select max(T.[Date]) from UserTable T
where T.[Ticker] = RN_Groups.[Ticker] and T.[Date] <= '2022-03-03')
See demo

DBT - use DBT modeling to insert rows in a table like date dimension table in Azure Synapse

I need reference to inserting rows in a table using DBT models. Sample example that can be considered is a date dimension table, where we want to insert rows for next years.
dbt is built to handle the inserts for you since it generally works as a transformation layer on data already in your warehouse.
As an example of how to build a date dimension table, the gitlab data team have a public repo which includes an example of how to build that using the dbt-utils package macro for a date spine
The simplest version would just be:
date_dim.sql
WITH date_spine AS (
{{ dbt_utils.date_spine(
start_date="to_date('01/01/2000', 'mm/dd/yyyy')",
datepart="day",
end_date="to_date('12/01/2050', 'mm/dd/yyyy')"
)
}}
)
select * from date_spine
And the link to the gitlab example:
date_details_source.sql
WITH date_spine AS (
{{ dbt_utils.date_spine(
start_date="to_date('11/01/2009', 'mm/dd/yyyy')",
datepart="day",
end_date="dateadd(year, 40, current_date)"
)
}}
), calculated as (
SELECT
date_day,
date_day AS date_actual,
DAYNAME(date_day) AS day_name,
DATE_PART('month', date_day) AS month_actual,
DATE_PART('year', date_day) AS year_actual,
DATE_PART(quarter, date_day) AS quarter_actual,
DATE_PART(dayofweek, date_day) + 1 AS day_of_week,
CASE WHEN day_name = 'Sun' THEN date_day
ELSE DATEADD('day', -1, DATE_TRUNC('week', date_day)) END AS first_day_of_week,
CASE WHEN day_name = 'Sun' THEN WEEK(date_day) + 1
ELSE WEEK(date_day) END AS week_of_year_temp, --remove this column
CASE WHEN day_name = 'Sun' AND LEAD(week_of_year_temp) OVER (ORDER BY date_day) = '1'
THEN '1'
ELSE week_of_year_temp END AS week_of_year,
DATE_PART('day', date_day) AS day_of_month,
ROW_NUMBER() OVER (PARTITION BY year_actual, quarter_actual ORDER BY date_day) AS day_of_quarter,
ROW_NUMBER() OVER (PARTITION BY year_actual ORDER BY date_day) AS day_of_year,
CASE WHEN month_actual < 2
THEN year_actual
ELSE (year_actual+1) END AS fiscal_year,
CASE WHEN month_actual < 2 THEN '4'
WHEN month_actual < 5 THEN '1'
WHEN month_actual < 8 THEN '2'
WHEN month_actual < 11 THEN '3'
ELSE '4' END AS fiscal_quarter,
ROW_NUMBER() OVER (PARTITION BY fiscal_year, fiscal_quarter ORDER BY date_day) AS day_of_fiscal_quarter,
ROW_NUMBER() OVER (PARTITION BY fiscal_year ORDER BY date_day) AS day_of_fiscal_year,
TO_CHAR(date_day, 'MMMM') AS month_name,
TRUNC(date_day, 'Month') AS first_day_of_month,
LAST_VALUE(date_day) OVER (PARTITION BY year_actual, month_actual ORDER BY date_day) AS last_day_of_month,
FIRST_VALUE(date_day) OVER (PARTITION BY year_actual ORDER BY date_day) AS first_day_of_year,
LAST_VALUE(date_day) OVER (PARTITION BY year_actual ORDER BY date_day) AS last_day_of_year,
FIRST_VALUE(date_day) OVER (PARTITION BY year_actual, quarter_actual ORDER BY date_day) AS first_day_of_quarter,
LAST_VALUE(date_day) OVER (PARTITION BY year_actual, quarter_actual ORDER BY date_day) AS last_day_of_quarter,
FIRST_VALUE(date_day) OVER (PARTITION BY fiscal_year, fiscal_quarter ORDER BY date_day) AS first_day_of_fiscal_quarter,
LAST_VALUE(date_day) OVER (PARTITION BY fiscal_year, fiscal_quarter ORDER BY date_day) AS last_day_of_fiscal_quarter,
FIRST_VALUE(date_day) OVER (PARTITION BY fiscal_year ORDER BY date_day) AS first_day_of_fiscal_year,
LAST_VALUE(date_day) OVER (PARTITION BY fiscal_year ORDER BY date_day) AS last_day_of_fiscal_year,
DATEDIFF('week', first_day_of_fiscal_year, date_actual) +1 AS week_of_fiscal_year,
CASE WHEN EXTRACT('month', date_day) = 1 THEN 12
ELSE EXTRACT('month', date_day) - 1 END AS month_of_fiscal_year,
LAST_VALUE(date_day) OVER (PARTITION BY first_day_of_week ORDER BY date_day) AS last_day_of_week,
(year_actual || '-Q' || EXTRACT(QUARTER FROM date_day)) AS quarter_name,
(fiscal_year || '-' || DECODE(fiscal_quarter,
1, 'Q1',
2, 'Q2',
3, 'Q3',
4, 'Q4')) AS fiscal_quarter_name,
('FY' || SUBSTR(fiscal_quarter_name, 3, 7)) AS fiscal_quarter_name_fy,
DENSE_RANK() OVER (ORDER BY fiscal_quarter_name) AS fiscal_quarter_number_absolute,
fiscal_year || '-' || MONTHNAME(date_day) AS fiscal_month_name,
('FY' || SUBSTR(fiscal_month_name, 3, 8)) AS fiscal_month_name_fy,
(CASE WHEN MONTH(date_day) = 1 AND DAYOFMONTH(date_day) = 1 THEN 'New Year''s Day'
WHEN MONTH(date_day) = 12 AND DAYOFMONTH(date_day) = 25 THEN 'Christmas Day'
WHEN MONTH(date_day) = 12 AND DAYOFMONTH(date_day) = 26 THEN 'Boxing Day'
ELSE NULL END)::VARCHAR AS holiday_desc,
(CASE WHEN HOLIDAY_DESC IS NULL THEN 0
ELSE 1 END)::BOOLEAN AS is_holiday,
DATE_TRUNC('month', last_day_of_fiscal_quarter) AS last_month_of_fiscal_quarter,
IFF(DATE_TRUNC('month', last_day_of_fiscal_quarter) = date_actual, TRUE, FALSE) AS is_first_day_of_last_month_of_fiscal_quarter,
DATE_TRUNC('month', last_day_of_fiscal_year) AS last_month_of_fiscal_year,
IFF(DATE_TRUNC('month', last_day_of_fiscal_year) = date_actual, TRUE, FALSE) AS is_first_day_of_last_month_of_fiscal_year,
DATEADD('day',7,DATEADD('month',1,first_day_of_month)) AS snapshot_date_fpa,
DATEADD('day',44,DATEADD('month',1,first_day_of_month)) AS snapshot_date_billings
FROM date_spine
), final AS (
-- Publishes the date-dimension columns from "calculated", dropping the
-- intermediate helper column week_of_year_temp.
SELECT
date_day,
date_actual,
day_name,
month_actual,
year_actual,
quarter_actual,
day_of_week,
first_day_of_week,
week_of_year,
day_of_month,
day_of_quarter,
day_of_year,
fiscal_year,
fiscal_quarter,
day_of_fiscal_quarter,
day_of_fiscal_year,
month_name,
first_day_of_month,
last_day_of_month,
first_day_of_year,
last_day_of_year,
first_day_of_quarter,
last_day_of_quarter,
first_day_of_fiscal_quarter,
last_day_of_fiscal_quarter,
first_day_of_fiscal_year,
last_day_of_fiscal_year,
week_of_fiscal_year,
month_of_fiscal_year,
last_day_of_week,
quarter_name,
fiscal_quarter_name,
fiscal_quarter_name_fy,
fiscal_quarter_number_absolute,
fiscal_month_name,
fiscal_month_name_fy,
holiday_desc,
is_holiday,
last_month_of_fiscal_quarter,
is_first_day_of_last_month_of_fiscal_quarter,
last_month_of_fiscal_year,
is_first_day_of_last_month_of_fiscal_year,
snapshot_date_fpa,
snapshot_date_billings
FROM calculated
)
-- a WITH chain must end in a statement that consumes it; this is the model's output
SELECT *
FROM final
** I believe the gitlab team uses Snowflake so if you're using another platform, you may need to change a few functions **

Identify date range and merge into max and min dates

I have data ( int, date , date types)
-- Sample data: one row per (id1, start date, end date) range.
SELECT * FROM
(
VALUES
(1700171048,'2020-12-21','2021-01-03'),
(1700171048,'2021-01-05','2021-01-12'),
(1700171048,'2021-01-13','2021-01-17'),
(1700171048,'2021-01-18','2021-01-19'),
(1700171048,'2021-01-22','2021-01-27'),
(1700171048,'2021-01-28','2021-02-17'),
(1700171049,'2020-12-21','2021-01-03'),
(1700171049,'2021-01-04','2021-01-05'),
(1700171049,'2021-01-06','2021-01-17'),
(1700171049,'2021-01-18','2021-01-19'),
(1700171049,'2021-01-20','2021-01-27'),
(1700171049,'2021-01-28','2021-02-17')
) AS c (id1, st, endt )
I need output( i.e. if start and end dates are continuous then make it part of group )
id1 st endt
1700171048 '2020-12-21' , '2021-01-03'
1700171048 '2021-01-05' , '2021-01-19'
1700171048 '2021-01-22' , '2021-02-17'
1700171049 '2020-12-21' to '2021-02-17'
I tried this, won't work.
select id, case when min(b.st) = max(b.endt) + 1 then min(b.st) end,
case when min(b.endt) = min(b.st) + 1 then max(b.st) end
from c a join c b
group by id
This is a type of gaps-and-islands problem. Use lag() to check whether each row continues the previous range. Then take a cumulative sum of the rows that do not continue it to number the groups, and aggregate per group:
-- Merges ranges of the same id1 that touch or overlap into one (min start, max end) row.
with with_prev as (
    -- attach the end date of the previous range of the same id1
    select t.*,
           lag(endt) over (partition by id1 order by st) as prev_endt
    from t
),
islands as (
    -- a range starts a new island unless the previous range ends no earlier than
    -- the day before it starts; the running sum numbers the islands
    select p.*,
           sum(case when prev_endt >= st - interval '1 day' then 0 else 1 end)
               over (partition by id1 order by st) as grp
    from with_prev p
)
select id1, min(st), max(endt)
from islands
group by id1, grp;
Here is a db<>fiddle.

Calculating average by using the previous row's value and following row's value

I have calculated average values for each month. Some months are NULL and my manager wants me to use the previous row's value and following month's value and fill the months which are having NULL values.
Current result (see below pic):
Expected Result
-- Average duration (in days) per start month, with one output row for every
-- month since @DATE, including months that have no data (AVG1 is NULL there).
-- T-SQL local variables are prefixed with @; a # prefix denotes a temp table.
DECLARE @DATE DATE = '2017-01-01';
-- The month-list CTE must not share the name of the DATEDIM table it reads:
-- inside its own definition the name would resolve to the CTE itself and be
-- rejected as a recursive reference without UNION ALL.
WITH MonthList AS
(
SELECT DISTINCT DTM.FirstDayOfMonth
FROM DATEDIM DTM
WHERE DTM.Date >= @DATE
AND DTM.Date <= DATEADD(mm,-1,Getdate())
),
Tab1 AS
(
SELECT
T1.FirstDayOfMonth AS MONTH_START,
AVG1,
-- RNK = 1 is the most recent month
ROW_NUMBER() OVER (
ORDER BY DATEADD(MM,DATEDIFF(MM, 0, T1.FirstDayOfMonth),0) DESC
) AS RNK
FROM MonthList T1
LEFT OUTER JOIN (
-- average StartDate-to-EndDate duration, bucketed by the month of StartDate
SELECT
DATEADD(MM,DATEDIFF(MM, 0, StartDate),0) MONTH_START,
AVG(CAST(DATEDIFF(dd, StartDate, EndDate) AS FLOAT)) AS AVG1
FROM DATATable
WHERE EndDate >= StartDate
AND StartDate >= @DATE
AND EndDate >= @DATE
GROUP BY DATEADD(MM,DATEDIFF(MM, 0, StartDate),0)
) T2 ON T1.FirstDayOfMonth = T2.MONTH_START
)
SELECT *
FROM Tab1
Using your CTEs
select MONTH_START,
case when AVG1 is null then
(select top(1) t2.AVG1
from Tab1 t2
where t1.RNK > t2.RNK and t2.AVG1 is not null
order by t2.RNK desc)
else AVG1 end AVG1,
RNK
from Tab1 t1
Edit
Version that averages the nearest preceding and the nearest following non-NULL values. Both must exist; otherwise NULL is returned.
select MONTH_START,
case when AVG1 is null then
( (select top(1) t2.AVG1
from Tab1 t2
where t1.RNK > t2.RNK and t2.AVG1 is not null
order by t2.RNK desc)
+(select top(1) t2.AVG1
from Tab1 t2
where t1.RNK < t2.RNK and t2.AVG1 is not null
order by t2.RNK)
) / 2
else AVG1 end AVG1,
RNK
from Tab1 t1
I can't quite tell what you are trying to calculate the average of, but this is quite simple with window functions:
-- average of the previous, current and next row's val, in month order
select t.*,
avg(val) over (order by month_start rows between 1 preceding and 1 following)
from t;
In your case, I think this translates as:
select datefromparts(year(startdate), month(startdate), 1) as float,
avg(val) as monthaverage,
avg(avg(val)) over (order by min(startdate) rows between 1 preceding and 1 following)
from datatable d
where . . .
group by datefromparts(year(startdate), month(startdate), 1)
You can manipulate previous and following row values using window functions:
SELECT MAX(row_value) OVER(
ORDER BY ... ROWS BETWEEN 1 PRECEDING AND 1 PRECEDING) AS Previous_Value,
MAX(row_value) OVER(
ORDER BY ... ROWS BETWEEN 1 FOLLOWING AND 1 FOLLOWING) AS Next_Value
Alternatively you can use LAG/LEAD functions and modify your sub-query where you get the AVG:
-- For each month present in DATATable, combines the monthly averages of the
-- neighbouring months: their mean when both exist, otherwise whichever exists.
-- @DATE is the start-date variable of the surrounding script.
SELECT
src.MONTH_START,
CASE
WHEN src.prev_val IS NULL OR src.next_val IS NULL
THEN COALESCE(src.prev_val, src.next_val) -- Return non-NULL value (if exists)
ELSE (src.prev_val + src.next_val ) / 2
END AS AVG_new
FROM (
SELECT
DATEADD(MM,DATEDIFF(MM, 0, StartDate),0) MONTH_START,
-- LAG looks back (previous month), LEAD looks ahead (next month); both are
-- applied to the per-month AVG because the query is grouped by month
LAG(AVG(CAST(DATEDIFF(dd, StartDate, EndDate) AS FLOAT))) OVER(ORDER BY DATEADD(MM,DATEDIFF(MM, 0, StartDate),0)) AS prev_val,
LEAD(AVG(CAST(DATEDIFF(dd, StartDate, EndDate) AS FLOAT))) OVER(ORDER BY DATEADD(MM,DATEDIFF(MM, 0, StartDate),0)) AS next_val
FROM DATATable
WHERE EndDate >= StartDate
AND StartDate >= @DATE
AND EndDate >= @DATE
GROUP BY DATEADD(MM,DATEDIFF(MM, 0, StartDate),0)
) AS src
I haven't tested it, but give it a shot and see how it works. You may need to put at least one column in the ORDER BY portion of the window function.
You could try this query (I just reflected in my sample data relevant parts, I omitted date column):
-- Demo: replace each NULL value with the mean of the nearest non-NULL value
-- before and after its run of NULLs.
-- Table variables are declared with an @ prefix (# denotes a temp table).
declare @tbl table (rank int, value int);
insert into @tbl values
(1, null),
(2, 20),
(3, 30),
(4, null),
(5, null),
(6, null),
(7, 40),
(8, null),
(9, null),
(10, 36),
(11, 22);
;with cte as (
select *,
DENSE_RANK() over (order by case when value is null then rank else value end) drank,
-- only NULL rows carry their immediate neighbours' values
case when value is null then lag(value) over (order by rank) end lag,
case when value is null then lead(value) over (order by rank) end lead
from @tbl
)
-- value is int, so each "/ 2" is integer division
select rank, value, case when value is null then
max(lag) over (partition by grp) / 2 +
max(lead) over (partition by grp) / 2
else value end valueWithAvg
from (
select *,
rank - drank grp from cte
) a order by rank

postgresql return 0 if returned value is null

I have a query that returns avg(price)
select avg(price)
from(
select *, cume_dist() OVER (ORDER BY price desc) from web_price_scan
where listing_Type='AARM'
and u_kbalikepartnumbers_id = 1000307
and (EXTRACT(Day FROM (Now()-dateEnded)))*24 < 48
and price>( select avg(price)* 0.50
from(select *, cume_dist() OVER (ORDER BY price desc)
from web_price_scan
where listing_Type='AARM'
and u_kbalikepartnumbers_id = 1000307
and (EXTRACT(Day FROM (Now()-dateEnded)))*24 < 48
)g
where cume_dist < 0.50
)
and price<( select avg(price)*2
from( select *, cume_dist() OVER (ORDER BY price desc)
from web_price_scan
where listing_Type='AARM'
and u_kbalikepartnumbers_id = 1000307
and (EXTRACT(Day FROM (Now()-dateEnded)))*24 < 48
)d
where cume_dist < 0.50)
)s
having count(*) > 5
how to make it return 0 if no value is available?
use coalesce
COALESCE(value [, ...])
The COALESCE function returns the first of its arguments that is not null.
Null is returned only if all arguments are null. It is often
used to substitute a default value for null values when data is
retrieved for display.
Edit
Here's an example of COALESCE with your query:
-- Returns the filtered average price, or 0 when there is nothing to report.
-- The original query is used as a scalar subquery: it yields NULL both when
-- AVG has no rows to average and when HAVING removes the only result row, and
-- the outer COALESCE turns that NULL into 0. COALESCE is deliberately NOT
-- applied to price itself, which would count unknown prices as 0 and skew
-- the averages.
SELECT COALESCE(
    ( SELECT AVG( price )
      FROM(
          SELECT *, cume_dist() OVER ( ORDER BY price DESC ) FROM web_price_scan
          WHERE listing_Type = 'AARM'
            AND u_kbalikepartnumbers_id = 1000307
            AND ( EXTRACT( DAY FROM ( NOW() - dateEnded ) ) ) * 24 < 48
            AND price > ( SELECT AVG( price ) * 0.50
                          FROM ( SELECT *, cume_dist() OVER ( ORDER BY price DESC )
                                 FROM web_price_scan
                                 WHERE listing_Type = 'AARM'
                                   AND u_kbalikepartnumbers_id = 1000307
                                   AND ( EXTRACT( DAY FROM ( NOW() - dateEnded ) ) ) * 24 < 48
                               ) g
                          WHERE cume_dist < 0.50
                        )
            AND price < ( SELECT AVG( price ) * 2
                          FROM ( SELECT *, cume_dist() OVER ( ORDER BY price DESC )
                                 FROM web_price_scan
                                 WHERE listing_Type = 'AARM'
                                   AND u_kbalikepartnumbers_id = 1000307
                                   AND ( EXTRACT( DAY FROM ( NOW() - dateEnded ) ) ) * 24 < 48
                               ) d
                          WHERE cume_dist < 0.50
                        )
      ) s
      HAVING COUNT(*) > 5
    ),
    0
) AS avg_price
IMHO COALESCE should not be used inside AVG because it modifies the result. NULL means unknown and nothing else. It's not like using it in SUM: if we replaced AVG with SUM in this example, the result would not be distorted, because adding 0 to a sum doesn't hurt anyone. But if you calculate an average with 0 substituted for the unknown values, you don't get the real average.
In that case, I would add price IS NOT NULL in WHERE clause to avoid these unknown values.
(this answer was added to provide shorter and more generic examples to the question - without including all the case-specific details in the original question).
There are two distinct "problems" here, the first is if a table or subquery has no rows, the second is if there are NULL values in the query.
For all versions I've tested, postgres and mysql will ignore all NULL values when averaging, and it will return NULL if there is nothing to average over. This generally makes sense, as NULL is to be considered "unknown". If you want to override this you can use coalesce (as suggested by Luc M).
$ create table foo (bar int);
CREATE TABLE
$ select avg(bar) from foo;
avg
-----
(1 row)
$ select coalesce(avg(bar), 0) from foo;
coalesce
----------
0
(1 row)
$ insert into foo values (3);
INSERT 0 1
$ insert into foo values (9);
INSERT 0 1
$ insert into foo values (NULL);
INSERT 0 1
$ select coalesce(avg(bar), 0) from foo;
coalesce
--------------------
6.0000000000000000
(1 row)
of course, "from foo" can be replaced by "from (... any complicated logic here ...) as foo"
Now, should the NULL row in the table be counted as 0? Then coalesce has to be used inside the avg call.
$ select coalesce(avg(coalesce(bar, 0)), 0) from foo;
coalesce
--------------------
4.0000000000000000
(1 row)
I can think of 2 ways to achieve this:
IFNULL():
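The IFNULL() function returns a specified value if the expression is NULL. If the expression is NOT NULL, this function returns the expression. Note that IFNULL() is available in MySQL and SQLite but not in PostgreSQL, where COALESCE() is the equivalent.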
Syntax:
IFNULL(expression, alt_value)
Example of IFNULL() with your query:
SELECT AVG( price )
FROM(
SELECT *, cume_dist() OVER ( ORDER BY price DESC ) FROM web_price_scan
WHERE listing_Type = 'AARM'
AND u_kbalikepartnumbers_id = 1000307
AND ( EXTRACT( DAY FROM ( NOW() - dateEnded ) ) ) * 24 < 48
AND IFNULL( price, 0 ) > ( SELECT AVG( IFNULL( price, 0 ) )* 0.50
FROM ( SELECT *, cume_dist() OVER ( ORDER BY price DESC )
FROM web_price_scan
WHERE listing_Type='AARM'
AND u_kbalikepartnumbers_id = 1000307
AND ( EXTRACT( DAY FROM ( NOW() - dateEnded ) ) ) * 24 < 48
) g
WHERE cume_dist < 0.50
)
AND IFNULL( price, 0 ) < ( SELECT AVG( IFNULL( price, 0 ) ) *2
FROM( SELECT *, cume_dist() OVER ( ORDER BY price desc )
FROM web_price_scan
WHERE listing_Type='AARM'
AND u_kbalikepartnumbers_id = 1000307
AND ( EXTRACT( DAY FROM ( NOW() - dateEnded ) ) ) * 24 < 48
) d
WHERE cume_dist < 0.50)
)s
HAVING COUNT(*) > 5
COALESCE()
The COALESCE() function returns the first non-null value in a list.
Syntax:
COALESCE(val1, val2, ...., val_n)
Example of COALESCE() with your query:
SELECT AVG( price )
FROM(
SELECT *, cume_dist() OVER ( ORDER BY price DESC ) FROM web_price_scan
WHERE listing_Type = 'AARM'
AND u_kbalikepartnumbers_id = 1000307
AND ( EXTRACT( DAY FROM ( NOW() - dateEnded ) ) ) * 24 < 48
AND COALESCE( price, 0 ) > ( SELECT AVG( COALESCE( price, 0 ) )* 0.50
FROM ( SELECT *, cume_dist() OVER ( ORDER BY price DESC )
FROM web_price_scan
WHERE listing_Type='AARM'
AND u_kbalikepartnumbers_id = 1000307
AND ( EXTRACT( DAY FROM ( NOW() - dateEnded ) ) ) * 24 < 48
) g
WHERE cume_dist < 0.50
)
AND COALESCE( price, 0 ) < ( SELECT AVG( COALESCE( price, 0 ) ) *2
FROM( SELECT *, cume_dist() OVER ( ORDER BY price desc )
FROM web_price_scan
WHERE listing_Type='AARM'
AND u_kbalikepartnumbers_id = 1000307
AND ( EXTRACT( DAY FROM ( NOW() - dateEnded ) ) ) * 24 < 48
) d
WHERE cume_dist < 0.50)
)s
HAVING COUNT(*) > 5