PostGIS recursive intersection between polygons - sql

I'm trying to perform a recursive intersection between all the Polygons in a spatial Table, and obtain the resulting (multi)pololygons and the information about every intersection for each of them.
An image (not really in scale) to explain it:
Let's say there are A, B, C squares in a table. I'd like to have A, B, C, A+B, A+C, B+C, A+B+C polygons in output, and I need to know that A+B is the intersection of A and B and so on.
So far I have a query which performs the intersections, but it does not "cut off" the intersected part of the original polygons. For example:
Polygon A should be A - (A+B) - (A+C) - (A+B+C)
Polygon A+C should be A+C - (A+B+C)
An image of the result I get now for the A and A+C polygons:
Here is a test script, using the squares in the images as data. Looking at the area column, it is clear some recursive ST_Difference is missing, I just can't figure out how. Any idea is welcomed.
-- Create a test table
CREATE TABLE test (
name text PRIMARY KEY,
geom geometry(POLYGON)
);
-- Insert test data
INSERT INTO test (name, geom) VALUES
('A', ST_GeomFromText('POLYGON((1 2, 1 6, 5 6, 5 2, 1 2))')),
('B', ST_GeomFromText('POLYGON((0 0, 0 4, 4 4, 4 0, 0 0))')),
('C', ST_GeomFromText('POLYGON((2 0, 2 4, 6 4, 6 0, 2 0))'));
-- Query
WITH RECURSIVE
source (rownum, geom, ret) AS (
SELECT row_number() OVER (ORDER BY name ASC), ST_Multi(geom), ARRAY[name] FROM test
),
r (rownum, geom, ret, incroci) AS (
SELECT rownum, geom, ret, 0 FROM source
UNION ALL
SELECT s.rownum, ST_CollectionExtract(ST_Intersection(s.geom, r.geom), 3), (r.ret || s.ret), (r.incroci + 1)
FROM source AS s INNER JOIN r ON s.rownum > r.rownum AND ST_Intersects(s.geom, r.geom) AND ST_Area(ST_Intersection(s.geom, r.geom)) > 0.5
),
result (geom, ret) AS (
SELECT ST_Union(geom) AS geom, ret FROM r GROUP BY ret
)
SELECT geom, ST_Area(geom) AS area, ret FROM result ORDER BY ret
The window function isn't strictly necessary in this particular example of course, but this code is a simplified version of my real case, which does a few more things on the side.
I'm using PostgreSQL 9.2 and PostGIS 2.0

ST_DIFFRENCE doesn't have to be recursive, you already have all the polygons so from every geom you have to substract the union of other geoms which contain that ret but ain't equal to it. This works so you should do it kinda like that:
WITH RECURSIVE
source (rownum, geom, ret) AS (
SELECT row_number() OVER (ORDER BY name ASC), ST_Multi(geom), ARRAY[name] FROM test
),
r (rownum, geom, ret, incroci) AS (
SELECT rownum, geom, ret, 0 FROM source
UNION ALL
SELECT s.rownum, ST_CollectionExtract(ST_Intersection(s.geom, r.geom), 3), (r.ret || s.ret), (r.incroci + 1)
FROM source AS s INNER JOIN r ON s.rownum > r.rownum AND ST_Intersects(s.geom, r.geom) AND ST_Area(ST_Intersection(s.geom, r.geom)) > 0.5
),
result (geom, ret) AS (
SELECT ST_Difference(ST_Union(r.geom),q.geom) AS geom, r.ret FROM r JOIN (SELECT r.ret,ST_UNION(COALESCE(r2.geom,ST_GeomFromText('POLYGON EMPTY'))) as geom FROM r LEFT JOIN r AS r2 ON r.ret<#r2.ret AND r.ret!=r2.ret GROUP BY r.ret) AS q on r.ret=q.ret GROUP BY r.ret,q.geom
)
SELECT geom, ST_Area(geom) AS area, ret FROM result ORDER BY ret

Related

Common point in multiple polygons in SQL

i have two tables that contains a list geometry data
Ex. (0xE6100000010CFB24190B88E44A40AADDAB69817F3740)
i did the intersection of shapes between the two tables , now i'm trying to find a common point in all the intersected shapes
i tried to find the STCentroid() of each shape , but i can't find out how to find the common point in all of them
select p1.shape_data.STIntersection(p2.shape_data).STCentroid() as inter_geometry
from map_shapes p1
inner join areas_map_shapes p2 on p2.shape_data.STIntersects(p1.shape_data) = 1
where p2.shape_data.STIntersects(p1.shape_data) = 1
and p2.shape_id = 206
i tried also to aggregate all the intersected shapes
SELECT
geometry::UnionAggregate(ss.shape_data),
geometry::STGeomFromText( geometry::UnionAggregate(ss.shape_data).STCentroid().ToString(), 0).STY as lat,
geometry::STGeomFromText( geometry::UnionAggregate(ss.shape_data).STCentroid().ToString(), 0).STX as lon
FROM areas_map_shapes T
inner join map_shapes SS on SS.shape_data.STIntersects(T.shape_data) = 1
WHERE SS.shape_data.STIntersects(T.shape_data) = 1
AND T.shape_id = 206
and T.status = 1
and SS.status = 1
and T.country_id = 4
my problem is that i need to find the only one common point in all the shapes that intersects
adding image to represent what i got so far , this shows all the shapes the intersects with the main shape , i need to find a common point in all of them
Its hard to tell from your example because (as #nbk pointed out) its difficult to reproduce what you're asking for. That said, it looks like you're looking for the STIntersection function.
DECLARE #GeometryTable TABLE(
ID INT,
geom GEOMETRY
)
INSERT INTO #GeometryTable (ID, Geom) VALUES (1, GEOMETRY::STGeomFromText('POLYGON((0 0, 0 2, 2 2, 2 0, 0 0))', 0))
INSERT INTO #GeometryTable (ID, Geom) VALUES (2, GEOMETRY::STGeomFromText('POLYGON((1 1, 1 3, 3 3, 3 1, 1 1))', 0))
INSERT INTO #GeometryTable (ID, Geom) VALUES (3, GEOMETRY::STGeomFromText('POLYGON((0 1, 0 3, 2 3, 2 1, 0 1))', 0))
SELECT
G1.geom.STIntersection(G2.geom).STIntersection(G3.geom)
FROM
#GeometryTable G1
INNER JOIN
#GeometryTable G2
ON
G1.geom.STIntersects(G2.geom) = 1
INNER JOIN
#GeometryTable G3
ON
G1.geom.STIntersects(G3.geom) = 1
AND G2.geom.STIntersects(G3.geom) = 1
WHERE
G1.ID = 1
AND G2.ID = 2
AND G3.ID = 3
Not sure there's an easy/fast way to do it. One idea is to use STIntersection to create a intersection polygon of all your areas in a recursive CTE:
drop table #t_geoms
create table #t_geoms (geom geometry, row_id int identity)
-- create some random data
insert into #t_geoms
select top 30 GEOMETRY::Point(ROW_NUMBER() OVER(ORDER BY object_id) * 0.01 + 10,ROW_NUMBER() OVER(ORDER BY object_id) * 0.01 + 10, 4326).STBuffer(3) x
from sys.objects
;with cte as (
select geom, row_id
from #t_geoms
where row_id = 1
union all
select g.geom.STIntersection(c.geom), g.row_id
from cte c
inner join #t_geoms g
ON g.row_id = c.row_id + 1
)
select top 1 geom, geom.STCentroid() AS centerPointOfIntersection
from cte
order by row_id desc
option(MAXRECURSION 0)
Note that if not all polygons actually intersect, you get an emptry geom

Snowflake table and generator functions does not give expected result

I tried to create a simple SQL to track query_history usage, but got into trouble when creating my timeslots using the table and generator functions (the CTE named x below).
I got no results at all when limiting the query_history using my timeslots, so after a while I hardcoded an SQL to give the same result (the CTE named y below) and this works fine.
Why does not x work? As far as I can see x and y produce identical result?
To test the example first run the code as it is, this produces no result.
Then comment the line x as timeslots and un-comment the line y as timeslots, this will give the desired result.
with
x as (
select
dateadd('min',seq4()*10,dateadd('min',-60,current_timestamp())) f,
dateadd('min',(seq4()+1)*10,dateadd('min',-60,current_timestamp())) t
from table(generator(rowcount => 6))
),
y as (
select
dateadd('min',n*10,dateadd('min',-60,current_timestamp())) f,
dateadd('min',(n+1)*10,dateadd('min',-60,current_timestamp())) t
from (select 0 n union all select 1 n union all select 2 union all select 3
union all select 4 union all select 5)
)
--select * from x;
--select * from y;
select distinct
user_name,
timeslots.f
from snowflake.account_usage.query_history,
x as timeslots
--y as timeslots
where start_time >= timeslots.f
and start_time < timeslots.t
order by timeslots.f desc;
(I know the code is not optimal, this is only meant to illustrate the problem)
SEQ:
Returns a sequence of monotonically increasing integers, with wrap-around. Wrap-around occurs after the largest representable integer of the integer width (1, 2, 4, or 8 byte).
If a fully ordered, gap-free sequence is required, consider using the ROW_NUMBER window function.
For:
with x as (
select
dateadd('min',seq4()*10,dateadd('min',-60,current_timestamp())) f,
dateadd('min',(seq4()+1)*10,dateadd('min',-60,current_timestamp())) t
from table(generator(rowcount => 6))
)
SELECT * FROM x;
Should be:
with x as (
select
(ROW_NUMBER() OVER(ORDER BY seq4())) - 1 AS n,
dateadd('min',n*10,dateadd('min',-60,current_timestamp())) f,
dateadd('min',(n+1)*10,dateadd('min',-60,current_timestamp())) t
from table(generator(rowcount => 6))
)
SELECT * FROM x;

Recursive Formula based on previous Row's Result

Let's consider the following query:
with
init as (
select 0.1 as y0
),
cte as (
select 1 as i, 1 as x -- x_1
union all
select 2 as i, 10 as x -- x_2
union all
select 3 as i, 100 as x -- x_3
order by i asc
)
select cte.x, init.y0 -- <- ?
from cte
join init
on true
There is a CTE init specifying an inital value y_0 and a CTE cte specifying rows with a value x and an index i.
My question is whether I can write a select which realizes the following simple, recursive formula.
y_n+1 = y_n + x_n+1
So, the result should be 3 rows with values: 1.1, 11.1, 111.1 (for y_1, y_2, y_3).
Would that be possible?
write a select which realizes the following simple, recursive formula.
y_n+1 = y_n + x_n+1
Consider below
select x, y0 + sum(x) over(order by i) as y
from cte, init
if applied to sample data in your question - output is
Note: the expected result you shown in your question - does not match the formula you provided - so obviously above output is different from one in your question :o)
You need to use the “OVER” statement. You can see more documentation about the syntax.
with
init as (
select 0.1 as y0
),
cte as (
select 1 as ts, 1 as i, 1 as x -- x_1
union all
select 2, 2, 10 as x -- x_2
union all
select 3, 3, 100 as x
union all
select 4, 4, 109 as x -- x_3
union all
select 5, 5, 149 as x
order by i asc
)
SELECT *,init.y0 + SUM(i) OVER(
ORDER BY (ts)
) AS res
FROM cte join init
on true

How to build "Star Rating" report in BigQuery (or sparklines, or color gradients)

Suppose I have the followng sample input:
WITH Ratings AS (
(SELECT 'A' name, 2 score) UNION ALL
(SELECT 'B' name, 0 score) UNION ALL
(SELECT 'C' name, 5 score) UNION ALL
(SELECT 'D' name, 1 score))
Where score is number between 0 and 5.
How can I produce a report showing names and corresponding number of stars ?
We can build star rating as a string using two Unicode characters:
★ - Unicode code point 9733
☆ - Unicode code point 9734
We can use CODE_POINTS_TO_STRING function to build the stars, and REPEAT function to produce the right number of stars
Combined together the solution for sample input will be:
WITH Ratings AS (
(SELECT 'A' name, 2 score) UNION ALL
(SELECT 'B' name, 0 score) UNION ALL
(SELECT 'C' name, 5 score) UNION ALL
(SELECT 'D' name, 1 score))
SELECT
name,
CONCAT(
REPEAT(CODE_POINTS_TO_STRING([9733]), score),
REPEAT(CODE_POINTS_TO_STRING([9734]), 5-score)) score
FROM Ratings
It will produce the following result:
name score
A ★★☆☆☆
B ☆☆☆☆☆
C ★★★★★
D ★☆☆☆☆
My entry does a color gradient, because sparklines only look good with certain fonts - and that's not a font that the BigQuery web UI uses.
During a day, when is Stack Overflow the most active per tag:
#standardSQL
CREATE TEMP FUNCTION barchart(v ARRAY<FLOAT64>, mm STRUCT<min FLOAT64, max FLOAT64>) AS ((
SELECT STRING_AGG(SUBSTR('🏿🏾🏽🏼🏻', 1+CAST(ROUND(y) AS INT64), 1), '')
FROM (SELECT IFNULL(SAFE_DIVIDE((e-mm.min),(mm.max-mm.min))*4, 0) y FROM UNNEST(v) e)));
CREATE TEMP FUNCTION vbar(v ARRAY<FLOAT64>) AS (
barchart(v, (SELECT AS STRUCT MIN(a), MAX(a) FROM UNNEST(v) a))
);
WITH top_tags AS (
(SELECT x.value FROM (SELECT APPROX_TOP_COUNT(tag, 24) x FROM `bigquery-public-data.stackoverflow.posts_questions`, UNNEST(SPLIT(tags,'|')) tag WHERE EXTRACT(YEAR FROM creation_date)>=2016), UNNEST(x) x)
)
SELECT tag, vbar(ARRAY_AGG(1.0*hhh.count ORDER BY hhh.value)) gradient, SUM(hhh.count) c
FROM (
SELECT tag, APPROX_TOP_COUNT(EXTRACT(HOUR FROM creation_date), 24) h_h
FROM `bigquery-public-data.stackoverflow.posts_questions`, UNNEST(SPLIT(tags,'|')) tag
WHERE tag IN (SELECT * FROM top_tags) AND EXTRACT(YEAR FROM creation_date)>=2016
GROUP BY 1
), UNNEST(h_h) hhh
GROUP BY tag
ORDER BY STRPOS(gradient, '🏼')
Row gradient c tag
1 🏿🏿🏿🏿🏾🏽🏼🏼🏼🏻🏻🏻🏻🏼🏼🏼🏼🏽🏽🏽🏽🏾🏾🏿 317538 android
2 🏿🏿🏿🏿🏾🏽🏼🏼🏼🏻🏻🏻🏻🏻🏻🏻🏼🏼🏽🏽🏽🏾🏾🏿 59445 asp.net
3 🏿🏿🏿🏿🏾🏽🏼🏼🏼🏻🏻🏻🏼🏼🏼🏼🏽🏽🏽🏽🏾🏾🏾🏿 159134 ios
4 🏿🏿🏿🏿🏾🏽🏼🏼🏼🏻🏻🏻🏻🏻🏻🏼🏼🏽🏽🏽🏽🏾🏾🏿 111988 angularjs
5 🏿🏿🏿🏿🏾🏾🏽🏼🏼🏻🏻🏻🏻🏻🏻🏼🏼🏼🏽🏽🏽🏽🏾🏿 212843 jquery
6 🏿🏿🏿🏾🏾🏾🏽🏼🏼🏻🏻🏻🏻🏻🏻🏻🏼🏼🏼🏽🏽🏽🏾🏿 138143 mysql
7 🏿🏿🏿🏿🏿🏾🏽🏼🏼🏻🏻🏻🏼🏻🏻🏻🏻🏼🏼🏼🏼🏽🏾🏾 107586 swift
8 🏿🏿🏿🏿🏾🏾🏽🏼🏼🏻🏻🏻🏼🏻🏼🏼🏼🏽🏽🏽🏽🏾🏾🏿 318294 php
9 🏿🏿🏿🏿🏾🏾🏽🏼🏼🏻🏻🏻🏻🏻🏻🏻🏼🏼🏼🏽🏽🏽🏾🏾 84723 json
10 🏿🏿🏿🏿🏿🏾🏽🏼🏼🏻🏻🏻🏻🏻🏻🏻🏼🏼🏼🏼🏽🏽🏾🏾 233100 html
11 🏿🏿🏿🏿🏿🏾🏽🏼🏼🏻🏻🏻🏻🏻🏻🏻🏼🏼🏼🏽🏽🏽🏾🏿 390245 java
12 🏿🏿🏿🏿🏿🏾🏽🏽🏼🏻🏻🏼🏻🏻🏻🏻🏼🏽🏽🏽🏽🏽🏾🏿 83787 angular
13 🏿🏿🏿🏿🏾🏾🏽🏽🏼🏼🏼🏼🏼🏻🏻🏻🏼🏼🏽🏽🏽🏽🏾🏿 70150 sql-server
14 🏿🏿🏿🏿🏿🏾🏽🏽🏼🏻🏻🏻🏻🏻🏻🏻🏼🏼🏼🏼🏽🏽🏾🏾 534663 javascript
15 🏿🏿🏿🏿🏿🏾🏽🏽🏼🏻🏻🏼🏼🏻🏻🏻🏼🏼🏽🏽🏽🏾🏾🏿 291541 c#
16 🏿🏿🏿🏿🏿🏿🏾🏾🏽🏼🏼🏽🏼🏼🏻🏻🏻🏻🏻🏼🏼🏽🏽🏾 65668 c
17 🏿🏿🏿🏿🏿🏾🏽🏽🏽🏼🏼🏼🏼🏻🏻🏻🏼🏼🏼🏼🏽🏽🏾🏿 111792 sql
18 🏿🏿🏿🏿🏿🏾🏾🏽🏽🏼🏻🏼🏼🏻🏻🏻🏻🏼🏼🏼🏼🏽🏾🏾 158999 css
19 🏿🏿🏿🏿🏿🏿🏾🏽🏽🏼🏼🏼🏼🏻🏻🏻🏻🏼🏼🏼🏼🏽🏽🏾 88146 arrays
20 🏿🏿🏿🏿🏿🏿🏾🏾🏽🏼🏼🏽🏼🏼🏻🏻🏻🏼🏼🏼🏼🏼🏽🏾 61840 ruby-on-rails
21 🏿🏿🏿🏿🏿🏿🏾🏾🏽🏼🏼🏼🏼🏻🏻🏻🏼🏼🏼🏼🏼🏽🏾🏾 136265 c++
22 🏿🏿🏿🏿🏿🏾🏽🏽🏽🏻🏻🏼🏼🏻🏻🏻🏻🏼🏼🏼🏽🏽🏾🏾 104218 node.js
23 🏿🏿🏿🏿🏿🏿🏿🏾🏾🏽🏽🏽🏼🏼🏻🏻🏻🏼🏼🏼🏼🏽🏾🏾 360396 python
24 🏿🏿🏿🏿🏿🏿🏿🏾🏾🏽🏽🏽🏽🏼🏻🏻🏻🏼🏼🏼🏼🏽🏾🏾 98690 r
And a more compact shaded gradient, but with only 3 values:
#standardSQL
CREATE TEMP FUNCTION barchart(v ARRAY<FLOAT64>, mm STRUCT<min FLOAT64, max FLOAT64>) AS ((
SELECT STRING_AGG(SUBSTR('▓▒░', 1+CAST(ROUND(y) AS INT64), 1), '')
FROM (SELECT IFNULL(SAFE_DIVIDE((e-mm.min),(mm.max-mm.min))*2, 0) y FROM UNNEST(v) e)));
CREATE TEMP FUNCTION vbar(v ARRAY<FLOAT64>) AS (
barchart(v, (SELECT AS STRUCT MIN(a), MAX(a) FROM UNNEST(v) a))
);
WITH top_countries AS (
(SELECT x.value FROM (SELECT APPROX_TOP_COUNT(country_code, 12) x FROM `ghtorrent-bq.ght_2017_09_01.users`), UNNEST(x) x)
)
SELECT vbar(ARRAY_AGG(1.0*hhh.count ORDER BY hhh.value)) gradient, SUM(hhh.count) c, country_code
FROM (
SELECT country_code, APPROX_TOP_COUNT(EXTRACT(HOUR FROM a.created_at), 24) h_h
FROM `githubarchive.year.2017` a
JOIN `ghtorrent-bq.ght_2017_09_01.users` b
ON a.actor.login=b.login
WHERE country_code IN (SELECT * FROM top_countries)
AND actor.login NOT IN (SELECT value FROM (SELECT APPROX_TOP_COUNT(actor.login, 1000) x FROM `githubarchive.year.2017` WHERE type='WatchEvent'), UNNEST(x))
AND a.type='WatchEvent'
GROUP BY 1
), UNNEST(h_h) hhh
GROUP BY country_code
ORDER BY STRPOS(gradient, '░')
Row gradient c country_code
1 ░░░░░░░▒▒▒▒▒▒▒▒▓▓▓▓▓▓▒▒░ 204023 au
2 ▒░░░░░░░░░▒▒▒▒▒▒▒▓▓▓▓▓▓▒ 293589 jp
3 ▓▒░░▒▒░░░░▒▒▒▒▒▒▒▓▓▓▓▓▓▓ 2125724 cn
4 ▓▓▓▒▒░░░░░░░░▒▒▒▒▒▒▒▒▓▓▓ 447092 in
5 ▓▓▓▓▓▓▒▒░░░░░░░░▒▒▒▒▒▒▒▓ 381510 ru
6 ▓▓▓▓▓▓▒▒░░░░░░░░▒▒▒▒▒▒▒▒ 545906 de
7 ▓▓▓▓▓▓▓▒░░░▒░░░░▒▒▒▒▒▒▒▒ 395949 fr
8 ▓▓▓▓▓▓▓▒▒░░░░░░░░▒▒▒▒▒▒▒ 491068 gb
9 ▒▒▒▒▓▓▓▓▓▓▓▒░░░▒░░░░░▒▒▒ 419608 br
10 ▒▒▒▒▒▒▒▓▓▓▓▓▓▒▒░░░░░░░░▒ 2443381 us
11 ▒▒▒▒▒▒▒▓▓▓▓▓▓▒▒░░░░░░░▒▒ 294793 ca
And a short code for sparklines - works great with Data Studio:
#standardSQL
CREATE TEMP FUNCTION barchart(v ARRAY<FLOAT64>, mm STRUCT<min FLOAT64, max FLOAT64>) AS ((
SELECT STRING_AGG(SUBSTR('▁▂▃▄▅▆▇█', 1+CAST(ROUND(y) AS INT64), 1), '')
FROM (SELECT IFNULL(SAFE_DIVIDE((e-mm.min),(mm.max-mm.min))*7, 0) y FROM UNNEST(v) e)));
CREATE TEMP FUNCTION vbar(v ARRAY<FLOAT64>) AS (
barchart(v, (SELECT AS STRUCT MIN(a), MAX(a) FROM UNNEST(v) a))
);
Adding more-less generic option for producing time-series/sparklines type of report
#standardSQL
CREATE TEMP FUNCTION sparklines(arr ARRAY<INT64>) AS ((
SELECT STRING_AGG(CODE_POINTS_TO_STRING([code]), '')
FROM UNNEST(arr) el,
UNNEST([(SELECT MAX(el) FROM UNNEST(arr) el)]) mx,
UNNEST([(SELECT MIN(el) FROM UNNEST(arr) el)]) mn
JOIN UNNEST([9602, 9603, 9605, 9606, 9607]) code WITH OFFSET pos
ON pos = CAST(IF(mx = mn, 1, (el - mn) / (mx - mn)) * 4 AS INT64)
));
WITH series AS (
SELECT 1 id, [3453564, 5343333, 2876345, 3465234] arr UNION ALL
SELECT 2, [5743231, 3276438, 1645738, 2453657] UNION ALL
SELECT 3, [1,2,3,4,5,6,7,8,9,0] UNION ALL
SELECT 4, [3245876, 2342879, 5876324, 7342564]
)
SELECT
id, TO_JSON_STRING(arr) arr, sparklines(arr) sparklines
FROM series
with result as below
Row id arr sparklines
1 1 [3453564,5343333,2876345,3465234] ▃▇▂▃
2 2 [5743231,3276438,1645738,2453657] ▇▅▂▃
3 3 [1,2,3,4,5,6,7,8,9,0] ▂▃▃▅▅▆▆▇▇▂
4 4 [3245876,2342879,5876324,7342564] ▃▂▆▇
Adding Mosha's version (taken from his comments below)
#standardSQL
CREATE TEMP FUNCTION barchart(v ARRAY<FLOAT64>, MIN FLOAT64, MAX FLOAT64) AS (
IF(
MIN = MAX,
REPEAT(CODE_POINTS_TO_STRING([9603]), ARRAY_LENGTH(v)),
(
SELECT STRING_AGG(CODE_POINTS_TO_STRING([9601 + CAST(ROUND(y) AS INT64)]), '')
FROM (
SELECT SAFE_DIVIDE(e-min, MAX - MIN) * 7 y
FROM UNNEST(v) e)
)
)
);
CREATE TEMP FUNCTION vbar(v ARRAY<FLOAT64>) AS (
barchart(v, (SELECT MIN(a) FROM UNNEST(v) a), (SELECT MAX(a) FROM UNNEST(v) a))
);
WITH numbers AS (
SELECT 1 id, [3453564., 5343333., 2876345., 3465234.] arr UNION ALL
SELECT 2, [5743231., 3276438., 1645738., 2453657.] UNION ALL
SELECT 3, [1.,2,3,4,5,6,7,8,9,0] UNION ALL
SELECT 4, [3245876., 2342879, 5876324, 7342564]
)
SELECT
id, TO_JSON_STRING(arr) arr, vbar(arr) sparklines
FROM numbers
if applied to same dummy data as above versions - produces below
Row id arr sparklines
1 1 [3453564,5343333,2876345,3465234] ▃█▁▃
2 2 [5743231,3276438,1645738,2453657] █▄▁▂
3 3 [1,2,3,4,5,6,7,8,9,0] ▂▃▃▄▅▆▆▇█▁
4 4 [3245876,2342879,5876324,7342564] ▂▁▆█
More craziness here 😊
Totally useless - but fun to play with
Applying all different options presented in this post for image processing and drawing (using profile pictures of those contribute into this post) + some new
1st and 2nd result (for Felipe's picture) produced using Felipe's Color Gradient approach with different scaling options
3rd result - using Felipe's Shaded Gradient approach
4th result - using Mikhail's(mine)/Mosha's Spark-line approach
Finally 5th and 6th results - using ASCII characters sets representing ASCII Shades of Gray - respectively:
Short set - " .:-=+*#%#"
Full (long) set - "$#B%8&WM#*oahkbdpqwmZO0QLCJUYXzcvunxrjft/\|()1{}[]?-_+~<>i!lI;:,"^``'. "
Code is trivial and literally same as in respective answers - the only difference is that data used in above exercises is image's pixels data that is simply acquired using HTML canvas getImageData() Method - obviously outside of BigQuery - with just simple html page
Options for getting crazy here and having fun playing with image transformation / processing - limitless! but probably useless outside of just learning scope 😜
Fitting vertical bar chart into single character is challenging because there are only 8 different heights we could use. But horizontal bar charts don't have this limitation, we can scale horizontal chart by arbitrary length. Example below uses 30, and it shows number of births per day of week as horizontal bar chart. Data is based on public dataset:
create temp function hbar(value int64, max int64) as (
repeat('█', cast(30 * value / max as int64))
);
select
['sunday', 'monday', 'tuesday', 'wednesday',
'thursday', 'friday', 'saturday'][ordinal(wday)] wday, bar from (
select wday, hbar(count(*), max(count(*)) over()) bar
from `bigquery-public-data.samples.natality`
where wday is not null
group by 1
order by 1 asc)
Results in
wday bar
---------------------------------------------
sunday ███████████████████
monday ███████████████████████████
tuesday ██████████████████████████████
wednesday ██████████████████████████████
thursday █████████████████████████████
friday █████████████████████████████
saturday █████████████████████

Simple way to calculate median with MySQL

What's the simplest (and hopefully not too slow) way to calculate the median with MySQL? I've used AVG(x) for finding the mean, but I'm having a hard time finding a simple way of calculating the median. For now, I'm returning all the rows to PHP, doing a sort, and then picking the middle row, but surely there must be some simple way of doing it in a single MySQL query.
Example data:
id | val
--------
1 4
2 7
3 2
4 2
5 9
6 8
7 3
Sorting on val gives 2 2 3 4 7 8 9, so the median should be 4, versus SELECT AVG(val) which == 5.
In MariaDB / MySQL:
SELECT AVG(dd.val) as median_val
FROM (
SELECT d.val, #rownum:=#rownum+1 as `row_number`, #total_rows:=#rownum
FROM data d, (SELECT #rownum:=0) r
WHERE d.val is NOT NULL
-- put some where clause here
ORDER BY d.val
) as dd
WHERE dd.row_number IN ( FLOOR((#total_rows+1)/2), FLOOR((#total_rows+2)/2) );
Steve Cohen points out, that after the first pass, #rownum will contain the total number of rows. This can be used to determine the median, so no second pass or join is needed.
Also AVG(dd.val) and dd.row_number IN(...) is used to correctly produce a median when there are an even number of records. Reasoning:
SELECT FLOOR((3+1)/2),FLOOR((3+2)/2); -- when total_rows is 3, avg rows 2 and 2
SELECT FLOOR((4+1)/2),FLOOR((4+2)/2); -- when total_rows is 4, avg rows 2 and 3
Finally, MariaDB 10.3.3+ contains a MEDIAN function
I just found another answer online in the comments:
For medians in almost any SQL:
SELECT x.val from data x, data y
GROUP BY x.val
HAVING SUM(SIGN(1-SIGN(y.val-x.val))) = (COUNT(*)+1)/2
Make sure your columns are well indexed and the index is used for filtering and sorting. Verify with the explain plans.
select count(*) from table --find the number of rows
Calculate the "median" row number. Maybe use: median_row = floor(count / 2).
Then pick it out of the list:
select val from table order by val asc limit median_row,1
This should return you one row with just the value you want.
I found the accepted solution didn't work on my MySQL install, returning an empty set, but this query worked for me in all situations that I tested it on:
SELECT x.val from data x, data y
GROUP BY x.val
HAVING SUM(SIGN(1-SIGN(y.val-x.val)))/COUNT(*) > .5
LIMIT 1
Unfortunately, neither TheJacobTaylor's nor velcrow's answers return accurate results for current versions of MySQL.
Velcro's answer from above is close, but it does not calculate correctly for result sets with an even number of rows. Medians are defined as either 1) the middle number on odd numbered sets, or 2) the average of the two middle numbers on even number sets.
So, here's velcro's solution patched to handle both odd and even number sets:
SELECT AVG(middle_values) AS 'median' FROM (
SELECT t1.median_column AS 'middle_values' FROM
(
SELECT #row:=#row+1 as `row`, x.median_column
FROM median_table AS x, (SELECT #row:=0) AS r
WHERE 1
-- put some where clause here
ORDER BY x.median_column
) AS t1,
(
SELECT COUNT(*) as 'count'
FROM median_table x
WHERE 1
-- put same where clause here
) AS t2
-- the following condition will return 1 record for odd number sets, or 2 records for even number sets.
WHERE t1.row >= t2.count/2 and t1.row <= ((t2.count/2) +1)) AS t3;
To use this, follow these 3 easy steps:
Replace "median_table" (2 occurrences) in the above code with the name of your table
Replace "median_column" (3 occurrences) with the column name you'd like to find a median for
If you have a WHERE condition, replace "WHERE 1" (2 occurrences) with your where condition
I propose a faster way.
Get the row count:
SELECT CEIL(COUNT(*)/2) FROM data;
Then take the middle value in a sorted subquery:
SELECT max(val) FROM (SELECT val FROM data ORDER BY val limit #middlevalue) x;
I tested this with a 5x10e6 dataset of random numbers and it will find the median in under 10 seconds.
Install and use this mysql statistical functions: http://www.xarg.org/2012/07/statistical-functions-in-mysql/
After that, calculate median is easy:
SELECT median(val) FROM data;
A comment on this page in the MySQL documentation has the following suggestion:
-- (mostly) High Performance scaling MEDIAN function per group
-- Median defined in http://en.wikipedia.org/wiki/Median
--
-- by Peter Hlavac
-- 06.11.2008
--
-- Example Table:
DROP table if exists table_median;
CREATE TABLE table_median (id INTEGER(11),val INTEGER(11));
COMMIT;
INSERT INTO table_median (id, val) VALUES
(1, 7), (1, 4), (1, 5), (1, 1), (1, 8), (1, 3), (1, 6),
(2, 4),
(3, 5), (3, 2),
(4, 5), (4, 12), (4, 1), (4, 7);
-- Calculating the MEDIAN
SELECT #a := 0;
SELECT
id,
AVG(val) AS MEDIAN
FROM (
SELECT
id,
val
FROM (
SELECT
-- Create an index n for every id
#a := (#a + 1) mod o.c AS shifted_n,
IF(#a mod o.c=0, o.c, #a) AS n,
o.id,
o.val,
-- the number of elements for every id
o.c
FROM (
SELECT
t_o.id,
val,
c
FROM
table_median t_o INNER JOIN
(SELECT
id,
COUNT(1) AS c
FROM
table_median
GROUP BY
id
) t2
ON (t2.id = t_o.id)
ORDER BY
t_o.id,val
) o
) a
WHERE
IF(
-- if there is an even number of elements
-- take the lower and the upper median
-- and use AVG(lower,upper)
c MOD 2 = 0,
n = c DIV 2 OR n = (c DIV 2)+1,
-- if its an odd number of elements
-- take the first if its only one element
-- or take the one in the middle
IF(
c = 1,
n = 1,
n = c DIV 2 + 1
)
)
) a
GROUP BY
id;
-- Explanation:
-- The Statement creates a helper table like
--
-- n id val count
-- ----------------
-- 1, 1, 1, 7
-- 2, 1, 3, 7
-- 3, 1, 4, 7
-- 4, 1, 5, 7
-- 5, 1, 6, 7
-- 6, 1, 7, 7
-- 7, 1, 8, 7
--
-- 1, 2, 4, 1
-- 1, 3, 2, 2
-- 2, 3, 5, 2
--
-- 1, 4, 1, 4
-- 2, 4, 5, 4
-- 3, 4, 7, 4
-- 4, 4, 12, 4
-- from there we can select the n-th element on the position: count div 2 + 1
If MySQL has ROW_NUMBER, then the MEDIAN is (be inspired by this SQL Server query):
WITH Numbered AS
(
SELECT *, COUNT(*) OVER () AS Cnt,
ROW_NUMBER() OVER (ORDER BY val) AS RowNum
FROM yourtable
)
SELECT id, val
FROM Numbered
WHERE RowNum IN ((Cnt+1)/2, (Cnt+2)/2)
;
The IN is used in case you have an even number of entries.
If you want to find the median per group, then just PARTITION BY group in your OVER clauses.
Rob
Most of the solutions above work only for one field of the table, you might need to get the median (50th percentile) for many fields on the query.
I use this:
SELECT CAST(SUBSTRING_INDEX(SUBSTRING_INDEX(
GROUP_CONCAT(field_name ORDER BY field_name SEPARATOR ','),
',', 50/100 * COUNT(*) + 1), ',', -1) AS DECIMAL) AS `Median`
FROM table_name;
You can replace the "50" in example above to any percentile, is very efficient.
Just make sure you have enough memory for the GROUP_CONCAT, you can change it with:
SET group_concat_max_len = 10485760; #10MB max length
More details: http://web.performancerasta.com/metrics-tips-calculating-95th-99th-or-any-percentile-with-single-mysql-query/
I have this below code which I found on HackerRank and it is pretty simple and works in each and every case.
SELECT M.MEDIAN_COL FROM MEDIAN_TABLE M WHERE
(SELECT COUNT(MEDIAN_COL) FROM MEDIAN_TABLE WHERE MEDIAN_COL < M.MEDIAN_COL ) =
(SELECT COUNT(MEDIAN_COL) FROM MEDIAN_TABLE WHERE MEDIAN_COL > M.MEDIAN_COL );
You could use the user-defined function that's found here.
Building off of velcro's answer, for those of you having to do a median off of something that is grouped by another parameter:
SELECT grp_field, t1.val FROM (
SELECT grp_field, #rownum:=IF(#s = grp_field, #rownum + 1, 0) AS row_number,
#s:=IF(#s = grp_field, #s, grp_field) AS sec, d.val
FROM data d, (SELECT #rownum:=0, #s:=0) r
ORDER BY grp_field, d.val
) as t1 JOIN (
SELECT grp_field, count(*) as total_rows
FROM data d
GROUP BY grp_field
) as t2
ON t1.grp_field = t2.grp_field
WHERE t1.row_number=floor(total_rows/2)+1;
Takes care about an odd value count - gives the avg of the two values in the middle in that case.
SELECT AVG(val) FROM
( SELECT x.id, x.val from data x, data y
GROUP BY x.id, x.val
HAVING SUM(SIGN(1-SIGN(IF(y.val-x.val=0 AND x.id != y.id, SIGN(x.id-y.id), y.val-x.val)))) IN (ROUND((COUNT(*))/2), ROUND((COUNT(*)+1)/2))
) sq
My code, efficient without tables or additional variables:
SELECT
((SUBSTRING_INDEX(SUBSTRING_INDEX(group_concat(val order by val), ',', floor(1+((count(val)-1) / 2))), ',', -1))
+
(SUBSTRING_INDEX(SUBSTRING_INDEX(group_concat(val order by val), ',', ceiling(1+((count(val)-1) / 2))), ',', -1)))/2
as median
FROM table;
Single query to archive the perfect median:
SELECT
COUNT(*) as total_rows,
IF(count(*)%2 = 1, CAST(SUBSTRING_INDEX(SUBSTRING_INDEX( GROUP_CONCAT(val ORDER BY val SEPARATOR ','), ',', 50/100 * COUNT(*)), ',', -1) AS DECIMAL), ROUND((CAST(SUBSTRING_INDEX(SUBSTRING_INDEX( GROUP_CONCAT(val ORDER BY val SEPARATOR ','), ',', 50/100 * COUNT(*) + 1), ',', -1) AS DECIMAL) + CAST(SUBSTRING_INDEX(SUBSTRING_INDEX( GROUP_CONCAT(val ORDER BY val SEPARATOR ','), ',', 50/100 * COUNT(*)), ',', -1) AS DECIMAL)) / 2)) as median,
AVG(val) as average
FROM
data
Optionally, you could also do this in a stored procedure:
DROP PROCEDURE IF EXISTS median;
DELIMITER //
CREATE PROCEDURE median (table_name VARCHAR(255), column_name VARCHAR(255), where_clause VARCHAR(255))
BEGIN
-- Set default parameters
IF where_clause IS NULL OR where_clause = '' THEN
SET where_clause = 1;
END IF;
-- Prepare statement
SET #sql = CONCAT(
"SELECT AVG(middle_values) AS 'median' FROM (
SELECT t1.", column_name, " AS 'middle_values' FROM
(
SELECT #row:=#row+1 as `row`, x.", column_name, "
FROM ", table_name," AS x, (SELECT #row:=0) AS r
WHERE ", where_clause, " ORDER BY x.", column_name, "
) AS t1,
(
SELECT COUNT(*) as 'count'
FROM ", table_name, " x
WHERE ", where_clause, "
) AS t2
-- the following condition will return 1 record for odd number sets, or 2 records for even number sets.
WHERE t1.row >= t2.count/2
AND t1.row <= ((t2.count/2)+1)) AS t3
");
-- Execute statement
PREPARE stmt FROM #sql;
EXECUTE stmt;
END//
DELIMITER ;
-- Sample usage:
-- median(table_name, column_name, where_condition);
CALL median('products', 'price', NULL);
My solution presented below works in just one query without creation of table, variable or even sub-query.
Plus, it allows you to get median for each group in group-by queries (this is what i needed !):
SELECT `columnA`,
SUBSTRING_INDEX(SUBSTRING_INDEX(GROUP_CONCAT(`columnB` ORDER BY `columnB`), ',', CEILING((COUNT(`columnB`)/2))), ',', -1) medianOfColumnB
FROM `tableC`
-- some where clause if you want
GROUP BY `columnA`;
It works because of a smart use of group_concat and substring_index.
But, to allow big group_concat, you have to set group_concat_max_len to a higher value (1024 char by default).
You can set it like that (for current sql session) :
SET SESSION group_concat_max_len = 10000;
-- up to 4294967295 in 32-bits platform.
More infos for group_concat_max_len: https://dev.mysql.com/doc/refman/5.1/en/server-system-variables.html#sysvar_group_concat_max_len
Another riff on Velcrow's answer, but uses a single intermediate table and takes advantage of the variable used for row numbering to get the count, rather than performing an extra query to calculate it. Also starts the count so that the first row is row 0 to allow simply using Floor and Ceil to select the median row(s).
SELECT Avg(tmp.val) as median_val
FROM (SELECT inTab.val, #rows := #rows + 1 as rowNum
FROM data as inTab, (SELECT #rows := -1) as init
-- Replace with better where clause or delete
WHERE 2 > 1
ORDER BY inTab.val) as tmp
WHERE tmp.rowNum in (Floor(#rows / 2), Ceil(#rows / 2));
Knowing exact row count you can use this query:
SELECT <value> AS VAL FROM <table> ORDER BY VAL LIMIT 1 OFFSET <half>
Where <half> = ceiling(<size> / 2.0) - 1
SELECT
SUBSTRING_INDEX(
SUBSTRING_INDEX(
GROUP_CONCAT(field ORDER BY field),
',',
((
ROUND(
LENGTH(GROUP_CONCAT(field)) -
LENGTH(
REPLACE(
GROUP_CONCAT(field),
',',
''
)
)
) / 2) + 1
)),
',',
-1
)
FROM
table
The above seems to work for me.
I used a two query approach:
first one to get count, min, max and avg
second one (prepared statement) with a "LIMIT #count/2, 1" and "ORDER BY .." clauses to get the median value
These are wrapped in a function defn, so all values can be returned from one call.
If your ranges are static and your data does not change often, it might be more efficient to precompute/store these values and use the stored values instead of querying from scratch every time.
as i just needed a median AND percentile solution, I made a simple and quite flexible function based on the findings in this thread. I know that I am happy myself if I find "readymade" functions that are easy to include in my projects, so I decided to quickly share:
function mysql_percentile($table, $column, $where, $percentile = 0.5) {
$sql = "
SELECT `t1`.`".$column."` as `percentile` FROM (
SELECT #rownum:=#rownum+1 as `row_number`, `d`.`".$column."`
FROM `".$table."` `d`, (SELECT #rownum:=0) `r`
".$where."
ORDER BY `d`.`".$column."`
) as `t1`,
(
SELECT count(*) as `total_rows`
FROM `".$table."` `d`
".$where."
) as `t2`
WHERE 1
AND `t1`.`row_number`=floor(`total_rows` * ".$percentile.")+1;
";
$result = sql($sql, 1);
if (!empty($result)) {
return $result['percentile'];
} else {
return 0;
}
}
Usage is very easy, example from my current project:
...
$table = DBPRE."zip_".$slug;
$column = 'seconds';
$where = "WHERE `reached` = '1' AND `time` >= '".$start_time."'";
$reaching['median'] = mysql_percentile($table, $column, $where, 0.5);
$reaching['percentile25'] = mysql_percentile($table, $column, $where, 0.25);
$reaching['percentile75'] = mysql_percentile($table, $column, $where, 0.75);
...
Here is my way . Of course, you could put it into a procedure :-)
SET #median_counter = (SELECT FLOOR(COUNT(*)/2) - 1 AS `median_counter` FROM `data`);
SET #median = CONCAT('SELECT `val` FROM `data` ORDER BY `val` LIMIT ', #median_counter, ', 1');
PREPARE median FROM #median;
EXECUTE median;
You could avoid the variable #median_counter, if you substitude it:
SET #median = CONCAT( 'SELECT `val` FROM `data` ORDER BY `val` LIMIT ',
(SELECT FLOOR(COUNT(*)/2) - 1 AS `median_counter` FROM `data`),
', 1'
);
PREPARE median FROM #median;
EXECUTE median;
After reading all previous ones they didn't match with my actual requirement so I implemented my own one which doesn't need any procedure or complicate statements, just I GROUP_CONCAT all values from the column I wanted to obtain the MEDIAN and applying a COUNT DIV BY 2 I extract the value in from the middle of the list like the following query does :
(POS is the name of the column I want to get its median)
(query) SELECT
SUBSTRING_INDEX (
SUBSTRING_INDEX (
GROUP_CONCAT(pos ORDER BY CAST(pos AS SIGNED INTEGER) desc SEPARATOR ';')
, ';', COUNT(*)/2 )
, ';', -1 ) AS `pos_med`
FROM table_name
GROUP BY any_criterial
I hope this could be useful for someone in the way many of other comments were for me from this website.
Based on #bob's answer, this generalizes the query to have the ability to return multiple medians, grouped by some criteria.
Think, e.g., median sale price for used cars in a car lot, grouped by year-month.
SELECT
period,
AVG(middle_values) AS 'median'
FROM (
SELECT t1.sale_price AS 'middle_values', t1.row_num, t1.period, t2.count
FROM (
SELECT
#last_period:=#period AS 'last_period',
#period:=DATE_FORMAT(sale_date, '%Y-%m') AS 'period',
IF (#period<>#last_period, #row:=1, #row:=#row+1) as `row_num`,
x.sale_price
FROM listings AS x, (SELECT #row:=0) AS r
WHERE 1
-- where criteria goes here
ORDER BY DATE_FORMAT(sale_date, '%Y%m'), x.sale_price
) AS t1
LEFT JOIN (
SELECT COUNT(*) as 'count', DATE_FORMAT(sale_date, '%Y-%m') AS 'period'
FROM listings x
WHERE 1
-- same where criteria goes here
GROUP BY DATE_FORMAT(sale_date, '%Y%m')
) AS t2
ON t1.period = t2.period
) AS t3
WHERE
row_num >= (count/2)
AND row_num <= ((count/2) + 1)
GROUP BY t3.period
ORDER BY t3.period;
create table med(id integer);
insert into med(id) values(1);
insert into med(id) values(2);
insert into med(id) values(3);
insert into med(id) values(4);
insert into med(id) values(5);
insert into med(id) values(6);
select (MIN(count)+MAX(count))/2 from
(select case when (select count(*) from
med A where A.id<B.id)=(select count(*)/2 from med) OR
(select count(*) from med A where A.id>B.id)=(select count(*)/2
from med) then cast(B.id as float)end as count from med B) C;
?column?
----------
3.5
(1 row)
OR
select cast(avg(id) as float) from
(select t1.id from med t1 JOIN med t2 on t1.id!= t2.id
group by t1.id having ABS(SUM(SIGN(t1.id-t2.id)))=1) A;
Often, we may need to calculate Median not just for the whole table, but for aggregates with respect to our ID. In other words, calculate median for each ID in our table, where each ID has many records. (good performance and works in many SQL + fixes problem of even and odds, more about performance of different Median-methods https://sqlperformance.com/2012/08/t-sql-queries/median )
SELECT our_id, AVG(1.0 * our_val) as Median
FROM
( SELECT our_id, our_val,
COUNT(*) OVER (PARTITION BY our_id) AS cnt,
ROW_NUMBER() OVER (PARTITION BY our_id ORDER BY our_val) AS rn
FROM our_table
) AS x
WHERE rn IN ((cnt + 1)/2, (cnt + 2)/2) GROUP BY our_id;
Hope it helps
MySQL has supported window functions since version 8.0, you can use ROW_NUMBER or DENSE_RANK (DO NOT use RANK as it assigns the same rank to same values, like in sports ranking):
SELECT AVG(t1.val) AS median_val
FROM (SELECT val,
ROW_NUMBER() OVER(ORDER BY val) AS rownum
FROM data) t1,
(SELECT COUNT(*) AS num_records FROM data) t2
WHERE t1.row_num IN
(FLOOR((t2.num_records + 1) / 2),
FLOOR((t2.num_records + 2) / 2));
A simple way to calculate Median in MySQL
set #ct := (select count(1) from station);
set #row := 0;
select avg(a.val) as median from
(select * from table order by val) a
where (select #row := #row + 1)
between #ct/2.0 and #ct/2.0 +1;
The most simple and fast way to calculate median in mysql.
select x.col
from (select lat_n,
count(1) over (partition by 'A') as total_rows,
row_number() over (order by col asc) as rank_Order
from station ft) x
where x.rank_Order = round(x.total_rows / 2.0, 0)