How to build "Star Rating" report in BigQuery (or sparklines, or color gradients)

How to build "Star Rating" report in BigQuery (or sparklines, or color gradients) - google-bigquery

Suppose I have the followng sample input:
WITH Ratings AS (
(SELECT 'A' name, 2 score) UNION ALL
(SELECT 'B' name, 0 score) UNION ALL
(SELECT 'C' name, 5 score) UNION ALL
(SELECT 'D' name, 1 score))
Where score is number between 0 and 5.
How can I produce a report showing names and corresponding number of stars ?

We can build star rating as a string using two Unicode characters:
★ - Unicode code point 9733
☆ - Unicode code point 9734
We can use CODE_POINTS_TO_STRING function to build the stars, and REPEAT function to produce the right number of stars
Combined together the solution for sample input will be:
WITH Ratings AS (
(SELECT 'A' name, 2 score) UNION ALL
(SELECT 'B' name, 0 score) UNION ALL
(SELECT 'C' name, 5 score) UNION ALL
(SELECT 'D' name, 1 score))
SELECT
name,
CONCAT(
REPEAT(CODE_POINTS_TO_STRING([9733]), score),
REPEAT(CODE_POINTS_TO_STRING([9734]), 5-score)) score
FROM Ratings
It will produce the following result:
name score
A ★★☆☆☆
B ☆☆☆☆☆
C ★★★★★
D ★☆☆☆☆

My entry does a color gradient, because sparklines only look good with certain fonts - and that's not a font that the BigQuery web UI uses.
During a day, when is Stack Overflow the most active per tag:
#standardSQL
CREATE TEMP FUNCTION barchart(v ARRAY<FLOAT64>, mm STRUCT<min FLOAT64, max FLOAT64>) AS ((
SELECT STRING_AGG(SUBSTR('🏿🏾🏽🏼🏻', 1+CAST(ROUND(y) AS INT64), 1), '')
FROM (SELECT IFNULL(SAFE_DIVIDE((e-mm.min),(mm.max-mm.min))*4, 0) y FROM UNNEST(v) e)));
CREATE TEMP FUNCTION vbar(v ARRAY<FLOAT64>) AS (
barchart(v, (SELECT AS STRUCT MIN(a), MAX(a) FROM UNNEST(v) a))
);
WITH top_tags AS (
(SELECT x.value FROM (SELECT APPROX_TOP_COUNT(tag, 24) x FROM `bigquery-public-data.stackoverflow.posts_questions`, UNNEST(SPLIT(tags,'|')) tag WHERE EXTRACT(YEAR FROM creation_date)>=2016), UNNEST(x) x)
)
SELECT tag, vbar(ARRAY_AGG(1.0*hhh.count ORDER BY hhh.value)) gradient, SUM(hhh.count) c
FROM (
SELECT tag, APPROX_TOP_COUNT(EXTRACT(HOUR FROM creation_date), 24) h_h
FROM `bigquery-public-data.stackoverflow.posts_questions`, UNNEST(SPLIT(tags,'|')) tag
WHERE tag IN (SELECT * FROM top_tags) AND EXTRACT(YEAR FROM creation_date)>=2016
GROUP BY 1
), UNNEST(h_h) hhh
GROUP BY tag
ORDER BY STRPOS(gradient, '🏼')
Row gradient c tag
1 🏿🏿🏿🏿🏾🏽🏼🏼🏼🏻🏻🏻🏻🏼🏼🏼🏼🏽🏽🏽🏽🏾🏾🏿 317538 android
2 🏿🏿🏿🏿🏾🏽🏼🏼🏼🏻🏻🏻🏻🏻🏻🏻🏼🏼🏽🏽🏽🏾🏾🏿 59445 asp.net
3 🏿🏿🏿🏿🏾🏽🏼🏼🏼🏻🏻🏻🏼🏼🏼🏼🏽🏽🏽🏽🏾🏾🏾🏿 159134 ios
4 🏿🏿🏿🏿🏾🏽🏼🏼🏼🏻🏻🏻🏻🏻🏻🏼🏼🏽🏽🏽🏽🏾🏾🏿 111988 angularjs
5 🏿🏿🏿🏿🏾🏾🏽🏼🏼🏻🏻🏻🏻🏻🏻🏼🏼🏼🏽🏽🏽🏽🏾🏿 212843 jquery
6 🏿🏿🏿🏾🏾🏾🏽🏼🏼🏻🏻🏻🏻🏻🏻🏻🏼🏼🏼🏽🏽🏽🏾🏿 138143 mysql
7 🏿🏿🏿🏿🏿🏾🏽🏼🏼🏻🏻🏻🏼🏻🏻🏻🏻🏼🏼🏼🏼🏽🏾🏾 107586 swift
8 🏿🏿🏿🏿🏾🏾🏽🏼🏼🏻🏻🏻🏼🏻🏼🏼🏼🏽🏽🏽🏽🏾🏾🏿 318294 php
9 🏿🏿🏿🏿🏾🏾🏽🏼🏼🏻🏻🏻🏻🏻🏻🏻🏼🏼🏼🏽🏽🏽🏾🏾 84723 json
10 🏿🏿🏿🏿🏿🏾🏽🏼🏼🏻🏻🏻🏻🏻🏻🏻🏼🏼🏼🏼🏽🏽🏾🏾 233100 html
11 🏿🏿🏿🏿🏿🏾🏽🏼🏼🏻🏻🏻🏻🏻🏻🏻🏼🏼🏼🏽🏽🏽🏾🏿 390245 java
12 🏿🏿🏿🏿🏿🏾🏽🏽🏼🏻🏻🏼🏻🏻🏻🏻🏼🏽🏽🏽🏽🏽🏾🏿 83787 angular
13 🏿🏿🏿🏿🏾🏾🏽🏽🏼🏼🏼🏼🏼🏻🏻🏻🏼🏼🏽🏽🏽🏽🏾🏿 70150 sql-server
14 🏿🏿🏿🏿🏿🏾🏽🏽🏼🏻🏻🏻🏻🏻🏻🏻🏼🏼🏼🏼🏽🏽🏾🏾 534663 javascript
15 🏿🏿🏿🏿🏿🏾🏽🏽🏼🏻🏻🏼🏼🏻🏻🏻🏼🏼🏽🏽🏽🏾🏾🏿 291541 c#
16 🏿🏿🏿🏿🏿🏿🏾🏾🏽🏼🏼🏽🏼🏼🏻🏻🏻🏻🏻🏼🏼🏽🏽🏾 65668 c
17 🏿🏿🏿🏿🏿🏾🏽🏽🏽🏼🏼🏼🏼🏻🏻🏻🏼🏼🏼🏼🏽🏽🏾🏿 111792 sql
18 🏿🏿🏿🏿🏿🏾🏾🏽🏽🏼🏻🏼🏼🏻🏻🏻🏻🏼🏼🏼🏼🏽🏾🏾 158999 css
19 🏿🏿🏿🏿🏿🏿🏾🏽🏽🏼🏼🏼🏼🏻🏻🏻🏻🏼🏼🏼🏼🏽🏽🏾 88146 arrays
20 🏿🏿🏿🏿🏿🏿🏾🏾🏽🏼🏼🏽🏼🏼🏻🏻🏻🏼🏼🏼🏼🏼🏽🏾 61840 ruby-on-rails
21 🏿🏿🏿🏿🏿🏿🏾🏾🏽🏼🏼🏼🏼🏻🏻🏻🏼🏼🏼🏼🏼🏽🏾🏾 136265 c++
22 🏿🏿🏿🏿🏿🏾🏽🏽🏽🏻🏻🏼🏼🏻🏻🏻🏻🏼🏼🏼🏽🏽🏾🏾 104218 node.js
23 🏿🏿🏿🏿🏿🏿🏿🏾🏾🏽🏽🏽🏼🏼🏻🏻🏻🏼🏼🏼🏼🏽🏾🏾 360396 python
24 🏿🏿🏿🏿🏿🏿🏿🏾🏾🏽🏽🏽🏽🏼🏻🏻🏻🏼🏼🏼🏼🏽🏾🏾 98690 r
And a more compact shaded gradient, but with only 3 values:
#standardSQL
CREATE TEMP FUNCTION barchart(v ARRAY<FLOAT64>, mm STRUCT<min FLOAT64, max FLOAT64>) AS ((
SELECT STRING_AGG(SUBSTR('▓▒░', 1+CAST(ROUND(y) AS INT64), 1), '')
FROM (SELECT IFNULL(SAFE_DIVIDE((e-mm.min),(mm.max-mm.min))*2, 0) y FROM UNNEST(v) e)));
CREATE TEMP FUNCTION vbar(v ARRAY<FLOAT64>) AS (
barchart(v, (SELECT AS STRUCT MIN(a), MAX(a) FROM UNNEST(v) a))
);
WITH top_countries AS (
(SELECT x.value FROM (SELECT APPROX_TOP_COUNT(country_code, 12) x FROM `ghtorrent-bq.ght_2017_09_01.users`), UNNEST(x) x)
)
SELECT vbar(ARRAY_AGG(1.0*hhh.count ORDER BY hhh.value)) gradient, SUM(hhh.count) c, country_code
FROM (
SELECT country_code, APPROX_TOP_COUNT(EXTRACT(HOUR FROM a.created_at), 24) h_h
FROM `githubarchive.year.2017` a
JOIN `ghtorrent-bq.ght_2017_09_01.users` b
ON a.actor.login=b.login
WHERE country_code IN (SELECT * FROM top_countries)
AND actor.login NOT IN (SELECT value FROM (SELECT APPROX_TOP_COUNT(actor.login, 1000) x FROM `githubarchive.year.2017` WHERE type='WatchEvent'), UNNEST(x))
AND a.type='WatchEvent'
GROUP BY 1
), UNNEST(h_h) hhh
GROUP BY country_code
ORDER BY STRPOS(gradient, '░')
Row gradient c country_code
1 ░░░░░░░▒▒▒▒▒▒▒▒▓▓▓▓▓▓▒▒░ 204023 au
2 ▒░░░░░░░░░▒▒▒▒▒▒▒▓▓▓▓▓▓▒ 293589 jp
3 ▓▒░░▒▒░░░░▒▒▒▒▒▒▒▓▓▓▓▓▓▓ 2125724 cn
4 ▓▓▓▒▒░░░░░░░░▒▒▒▒▒▒▒▒▓▓▓ 447092 in
5 ▓▓▓▓▓▓▒▒░░░░░░░░▒▒▒▒▒▒▒▓ 381510 ru
6 ▓▓▓▓▓▓▒▒░░░░░░░░▒▒▒▒▒▒▒▒ 545906 de
7 ▓▓▓▓▓▓▓▒░░░▒░░░░▒▒▒▒▒▒▒▒ 395949 fr
8 ▓▓▓▓▓▓▓▒▒░░░░░░░░▒▒▒▒▒▒▒ 491068 gb
9 ▒▒▒▒▓▓▓▓▓▓▓▒░░░▒░░░░░▒▒▒ 419608 br
10 ▒▒▒▒▒▒▒▓▓▓▓▓▓▒▒░░░░░░░░▒ 2443381 us
11 ▒▒▒▒▒▒▒▓▓▓▓▓▓▒▒░░░░░░░▒▒ 294793 ca
And a short code for sparklines - works great with Data Studio:
#standardSQL
CREATE TEMP FUNCTION barchart(v ARRAY<FLOAT64>, mm STRUCT<min FLOAT64, max FLOAT64>) AS ((
SELECT STRING_AGG(SUBSTR('▁▂▃▄▅▆▇█', 1+CAST(ROUND(y) AS INT64), 1), '')
FROM (SELECT IFNULL(SAFE_DIVIDE((e-mm.min),(mm.max-mm.min))*7, 0) y FROM UNNEST(v) e)));
CREATE TEMP FUNCTION vbar(v ARRAY<FLOAT64>) AS (
barchart(v, (SELECT AS STRUCT MIN(a), MAX(a) FROM UNNEST(v) a))
);

Adding more-less generic option for producing time-series/sparklines type of report
#standardSQL
CREATE TEMP FUNCTION sparklines(arr ARRAY<INT64>) AS ((
SELECT STRING_AGG(CODE_POINTS_TO_STRING([code]), '')
FROM UNNEST(arr) el,
UNNEST([(SELECT MAX(el) FROM UNNEST(arr) el)]) mx,
UNNEST([(SELECT MIN(el) FROM UNNEST(arr) el)]) mn
JOIN UNNEST([9602, 9603, 9605, 9606, 9607]) code WITH OFFSET pos
ON pos = CAST(IF(mx = mn, 1, (el - mn) / (mx - mn)) * 4 AS INT64)
));
WITH series AS (
SELECT 1 id, [3453564, 5343333, 2876345, 3465234] arr UNION ALL
SELECT 2, [5743231, 3276438, 1645738, 2453657] UNION ALL
SELECT 3, [1,2,3,4,5,6,7,8,9,0] UNION ALL
SELECT 4, [3245876, 2342879, 5876324, 7342564]
)
SELECT
id, TO_JSON_STRING(arr) arr, sparklines(arr) sparklines
FROM series
with result as below
Row id arr sparklines
1 1 [3453564,5343333,2876345,3465234] ▃▇▂▃
2 2 [5743231,3276438,1645738,2453657] ▇▅▂▃
3 3 [1,2,3,4,5,6,7,8,9,0] ▂▃▃▅▅▆▆▇▇▂
4 4 [3245876,2342879,5876324,7342564] ▃▂▆▇
Adding Mosha's version (taken from his comments below)
#standardSQL
CREATE TEMP FUNCTION barchart(v ARRAY<FLOAT64>, MIN FLOAT64, MAX FLOAT64) AS (
IF(
MIN = MAX,
REPEAT(CODE_POINTS_TO_STRING([9603]), ARRAY_LENGTH(v)),
(
SELECT STRING_AGG(CODE_POINTS_TO_STRING([9601 + CAST(ROUND(y) AS INT64)]), '')
FROM (
SELECT SAFE_DIVIDE(e-min, MAX - MIN) * 7 y
FROM UNNEST(v) e)
)
)
);
CREATE TEMP FUNCTION vbar(v ARRAY<FLOAT64>) AS (
barchart(v, (SELECT MIN(a) FROM UNNEST(v) a), (SELECT MAX(a) FROM UNNEST(v) a))
);
WITH numbers AS (
SELECT 1 id, [3453564., 5343333., 2876345., 3465234.] arr UNION ALL
SELECT 2, [5743231., 3276438., 1645738., 2453657.] UNION ALL
SELECT 3, [1.,2,3,4,5,6,7,8,9,0] UNION ALL
SELECT 4, [3245876., 2342879, 5876324, 7342564]
)
SELECT
id, TO_JSON_STRING(arr) arr, vbar(arr) sparklines
FROM numbers
if applied to same dummy data as above versions - produces below
Row id arr sparklines
1 1 [3453564,5343333,2876345,3465234] ▃█▁▃
2 2 [5743231,3276438,1645738,2453657] █▄▁▂
3 3 [1,2,3,4,5,6,7,8,9,0] ▂▃▃▄▅▆▆▇█▁
4 4 [3245876,2342879,5876324,7342564] ▂▁▆█

More craziness here 😊
Totally useless - but fun to play with
Applying all different options presented in this post for image processing and drawing (using profile pictures of those contribute into this post) + some new
1st and 2nd result (for Felipe's picture) produced using Felipe's Color Gradient approach with different scaling options
3rd result - using Felipe's Shaded Gradient approach
4th result - using Mikhail's(mine)/Mosha's Spark-line approach
Finally 5th and 6th results - using ASCII characters sets representing ASCII Shades of Gray - respectively:
Short set - " .:-=+*#%#"
Full (long) set - "$#B%8&WM#*oahkbdpqwmZO0QLCJUYXzcvunxrjft/\|()1{}[]?-_+~<>i!lI;:,"^``'. "
Code is trivial and literally same as in respective answers - the only difference is that data used in above exercises is image's pixels data that is simply acquired using HTML canvas getImageData() Method - obviously outside of BigQuery - with just simple html page
Options for getting crazy here and having fun playing with image transformation / processing - limitless! but probably useless outside of just learning scope 😜

Fitting vertical bar chart into single character is challenging because there are only 8 different heights we could use. But horizontal bar charts don't have this limitation, we can scale horizontal chart by arbitrary length. Example below uses 30, and it shows number of births per day of week as horizontal bar chart. Data is based on public dataset:
create temp function hbar(value int64, max int64) as (
repeat('█', cast(30 * value / max as int64))
);
select
['sunday', 'monday', 'tuesday', 'wednesday',
'thursday', 'friday', 'saturday'][ordinal(wday)] wday, bar from (
select wday, hbar(count(*), max(count(*)) over()) bar
from `bigquery-public-data.samples.natality`
where wday is not null
group by 1
order by 1 asc)
Results in
wday bar
---------------------------------------------
sunday ███████████████████
monday ███████████████████████████
tuesday ██████████████████████████████
wednesday ██████████████████████████████
thursday █████████████████████████████
friday █████████████████████████████
saturday █████████████████████

Related

Oracle SQL : Calculating weighted probability

I'm struggling to retrieve a "weighted probability" from a database table in my SQL statement.
What do I need to do:
I have tabular information of probable financial values like:
Table my_table
ID
P [%]
Value [$]
1
50
200
2
50
200
3
60
100
I need to calculate the weighted probability of reasonable worst case financial value to occur.
The formula is:
P_weighted = 1 - (1 - P_1 * Value_1/Max(Value_1-n) * (1 - P_2 * Value_2/Max(Value_1-n) * ...
i.e.
P_weighted = 1 - Product(1 - P_i * Value_i / Max(Value_1-n)
P_weighted = 1 - (1 - 50% * 200 / 200) * (1 - 50% * 200 / 200) * (1 - 60% * 100 / 200) = 82.5%
I know the is not product function in (Oracle) SQL, and this can be substituted by EXP( SUM LN(x))) ensuring x is always positive.
Hence, if I were only to calculate the combined probability I could (regardless of the value I could do like:
SELECT EXP(SUM(LN(1 - t.P))) FROM FROM my_table t WHERE condition
When I need to include the Max(t.Value) I've got the following problem:
A SELECT list cannot include both a group function, such as AVG, COUNT, MAX, MIN, SUM, STDDEV, or VARIANCE, and an individual column expression, unless the individual column expression is included in a GROUP BY clause.
So I tried the following:
SELECT ROUND(1-EXP(SUM(LN(1 - t.P*t.Value/max(t.Value)))),1) FROM FROM my_table t WHERE condition GROUP BY t.P, t.Value
But this does obviously group the output by probability rather than multiplying it and just returns 0.5 or 50% instead of the product which should be 0.825 or 82.5%.
How do I get the weighted probability from by table above using (Oracle) SQL?

Does this do it:
with da as (select .50 as p, 200 as v from dual union all select .50 , 200 from dual union all select .60,100 from dual),
mx as (select max(v) mx from da)
select exp(sum(ln(1-da.p*da.v/mx))) from da, mx;
EXP(SUM(LN(1-DA.P*DA.V/MX)))
----------------------------
.175

with
test1 as(
select max(value) v_max from my_table
),
test2 as(
select 1-(my.p/100* value/t1.v_max) rez
from my_table my, test1 t1
)
select to_char(round((1-(EXP (SUM (LN (rez)))))*100,2))||'%' "Weighted probability"
from test2
RESULT:
Weighted probability
--------------------
82,5%

If you want the calculation per-row then you can use an analytic SUM:
SELECT id,
ROUND(1 - EXP(SUM(LN(1 - wp)) OVER (ORDER BY id)), 3) AS cwp
FROM (
SELECT id,
p * value / MAX(value) OVER () AS wp
FROM table_name
)
Which, for the sample data:
CREATE TABLE table_name (ID, P, Value) AS
SELECT 1, .50, 200 FROM DUAL UNION ALL
SELECT 2, .50, 200 FROM DUAL UNION ALL
SELECT 3, .60, 100 FROM DUAL;
Outputs the cumulative weighted probabilities:
ID
CWP
1
.5
2
.75
3
.825
If you just want the total weighted probability then:
SELECT ROUND(1 - EXP(SUM(LN(1 - wp))), 3) AS twp
FROM (
SELECT id,
p * value / MAX(value) OVER () AS wp
FROM table_name
)
Which, for the sample data, outputs:
TWP
.825
db<>fiddle here

Snowflake table and generator functions does not give expected result

I tried to create a simple SQL to track query_history usage, but got into trouble when creating my timeslots using the table and generator functions (the CTE named x below).
I got no results at all when limiting the query_history using my timeslots, so after a while I hardcoded an SQL to give the same result (the CTE named y below) and this works fine.
Why does not x work? As far as I can see x and y produce identical result?
To test the example first run the code as it is, this produces no result.
Then comment the line x as timeslots and un-comment the line y as timeslots, this will give the desired result.
with
x as (
select
dateadd('min',seq4()*10,dateadd('min',-60,current_timestamp())) f,
dateadd('min',(seq4()+1)*10,dateadd('min',-60,current_timestamp())) t
from table(generator(rowcount => 6))
),
y as (
select
dateadd('min',n*10,dateadd('min',-60,current_timestamp())) f,
dateadd('min',(n+1)*10,dateadd('min',-60,current_timestamp())) t
from (select 0 n union all select 1 n union all select 2 union all select 3
union all select 4 union all select 5)
)
--select * from x;
--select * from y;
select distinct
user_name,
timeslots.f
from snowflake.account_usage.query_history,
x as timeslots
--y as timeslots
where start_time >= timeslots.f
and start_time < timeslots.t
order by timeslots.f desc;
(I know the code is not optimal, this is only meant to illustrate the problem)

SEQ:
Returns a sequence of monotonically increasing integers, with wrap-around. Wrap-around occurs after the largest representable integer of the integer width (1, 2, 4, or 8 byte).
If a fully ordered, gap-free sequence is required, consider using the ROW_NUMBER window function.
For:
with x as (
select
dateadd('min',seq4()*10,dateadd('min',-60,current_timestamp())) f,
dateadd('min',(seq4()+1)*10,dateadd('min',-60,current_timestamp())) t
from table(generator(rowcount => 6))
)
SELECT * FROM x;
Should be:
with x as (
select
(ROW_NUMBER() OVER(ORDER BY seq4())) - 1 AS n,
dateadd('min',n*10,dateadd('min',-60,current_timestamp())) f,
dateadd('min',(n+1)*10,dateadd('min',-60,current_timestamp())) t
from table(generator(rowcount => 6))
)
SELECT * FROM x;

union of columns in standard sql [duplicate]

We have a table of bid prices and sizes of two buyers. Bid price p with size s means that the buyer is open to buy s number of product at price p. We have a table that contains a few columns (like timestamp, validity flag) together with these four columns:
bid prices offered by the two buyers, pA and pB.
bid sizes, sA and sB.
Our job is to add a new best size column (bS) to the table, that returns the size at the best price. If the two buyers have the same price then bS is equal to sA + sB, otherwise, we need to take the bid size of the buyer that offers the higher price.
An example table (ignoring columns that are neither prices nor sizes) with the desired output is below.
A simple solution to the problem:
SELECT *,
CASE
WHEN pA = pB THEN sA + sB
WHEN pA > pB THEN sA
ELSE sB
END AS bS
FROM t
Now let us generalize the problem to four buyers. A standard SQL solution is
WITH t_ext AS (
SELECT *, GREATEST(pA, pB, pC, pD) as bP
FROM `t`
)
SELECT *, (sA * CAST(pA = bP AS INT64) +
sB * CAST(pB = bP AS INT64) +
sC * CAST(pC = bP AS INT64) +
sD * CAST(pD = bP AS INT64))
AS bS FROM t_ext
Question:
Is there a simplified query that
uses function SUM instead of adding four items manually
avoids repeated casting?
Note that we cannot identify the price and size columns by indices but only by name. Otherwise, we could use the solution proposed at
Weighted sum of a column vector and a derived bit vector
Btw. I wrote a blog post about this problem that focuses on solutions in Python and Q and I am wondering how the best solution in standard sql looks like.

Below is for BigQuery Standard SQL
Note that we cannot identify the price and size columns by indices but only by name
#standardSQL
WITH t_ext AS (
SELECT * EXCEPT(arr),
ARRAY(SELECT CAST(val AS INT64) FROM UNNEST(arr) val WITH OFFSET WHERE OFFSET < ARRAY_LENGTH(arr) / 2) AS prices,
ARRAY(SELECT CAST(val AS INT64) FROM UNNEST(arr) val WITH OFFSET WHERE OFFSET >= ARRAY_LENGTH(arr) / 2) AS sizes,
(SELECT MAX(CAST(val AS INT64)) FROM UNNEST(arr) val WITH OFFSET WHERE OFFSET < ARRAY_LENGTH(arr) / 2) AS bestPrice
FROM (
SELECT *, REGEXP_EXTRACT_ALL(TO_JSON_STRING(T), r'(?:"(?:pA|pB|pC|pD|sA|sB|sC|sD)"):(\d+)') AS arr
FROM `project.dataset.table` t
)
)
SELECT * EXCEPT(prices, sizes),
(SELECT SUM(size)
FROM UNNEST(prices) price WITH OFFSET
JOIN UNNEST(sizes) size WITH OFFSET
USING(OFFSET)
WHERE price = bestPrice
) AS bS
FROM t_ext
As you can see - the only what you should supply is the list of price and size column names as in below example
pA|pB|pC|pD|sA|sB|sC|sD
If to apply to dummy data as below
#standardSQL
WITH `project.dataset.table` AS (
SELECT 'a' id, 1 pA, 2 pB, 3 pC, 4 pD, 'x' extra_col1, 1 sA, 1 sB, 1 sC, 5 sD UNION ALL
SELECT 'b', 1, 4, 2, 4, 'y', 1, 6, 1, 5 UNION ALL
SELECT 'c', 5, 4, 2, 1, 'z', 7, 1, 1, 1
), t_ext AS (
SELECT * EXCEPT(arr),
ARRAY(SELECT CAST(val AS INT64) FROM UNNEST(arr) val WITH OFFSET WHERE OFFSET < ARRAY_LENGTH(arr) / 2) AS prices,
ARRAY(SELECT CAST(val AS INT64) FROM UNNEST(arr) val WITH OFFSET WHERE OFFSET >= ARRAY_LENGTH(arr) / 2) AS sizes,
(SELECT MAX(CAST(val AS INT64)) FROM UNNEST(arr) val WITH OFFSET WHERE OFFSET < ARRAY_LENGTH(arr) / 2) AS bestPrice
FROM (
SELECT *, REGEXP_EXTRACT_ALL(TO_JSON_STRING(T), r'(?:"(?:pA|pB|pC|pD|sA|sB|sC|sD)"):(\d+)') AS arr
FROM `project.dataset.table` t
)
)
SELECT * EXCEPT(prices, sizes),
(SELECT SUM(size)
FROM UNNEST(prices) price WITH OFFSET
JOIN UNNEST(sizes) size WITH OFFSET
USING(OFFSET)
WHERE price = bestPrice
) AS bS
FROM t_ext
result is
Row id pA pB pC pD extra_col1 sA sB sC sD bestPrice bS
1 a 1 2 3 4 x 1 1 1 5 4 5
2 b 1 4 2 4 y 1 6 1 5 4 11
3 c 5 4 2 1 z 7 1 1 1 5 7
Hope, this is what you are looking for

Weighted sum of a column vector and a derived bit vector - Version 2

We have a table of bid prices and sizes of two buyers. Bid price p with size s means that the buyer is open to buy s number of product at price p. We have a table that contains a few columns (like timestamp, validity flag) together with these four columns:
bid prices offered by the two buyers, pA and pB.
bid sizes, sA and sB.
Our job is to add a new best size column (bS) to the table, that returns the size at the best price. If the two buyers have the same price then bS is equal to sA + sB, otherwise, we need to take the bid size of the buyer that offers the higher price.
An example table (ignoring columns that are neither prices nor sizes) with the desired output is below.
A simple solution to the problem:
SELECT *,
CASE
WHEN pA = pB THEN sA + sB
WHEN pA > pB THEN sA
ELSE sB
END AS bS
FROM t
Now let us generalize the problem to four buyers. A standard SQL solution is
WITH t_ext AS (
SELECT *, GREATEST(pA, pB, pC, pD) as bP
FROM `t`
)
SELECT *, (sA * CAST(pA = bP AS INT64) +
sB * CAST(pB = bP AS INT64) +
sC * CAST(pC = bP AS INT64) +
sD * CAST(pD = bP AS INT64))
AS bS FROM t_ext
Question:
Is there a simplified query that
uses function SUM instead of adding four items manually
avoids repeated casting?
Note that we cannot identify the price and size columns by indices but only by name. Otherwise, we could use the solution proposed at
Weighted sum of a column vector and a derived bit vector
Btw. I wrote a blog post about this problem that focuses on solutions in Python and Q and I am wondering how the best solution in standard sql looks like.

Below is for BigQuery Standard SQL
Note that we cannot identify the price and size columns by indices but only by name
#standardSQL
WITH t_ext AS (
SELECT * EXCEPT(arr),
ARRAY(SELECT CAST(val AS INT64) FROM UNNEST(arr) val WITH OFFSET WHERE OFFSET < ARRAY_LENGTH(arr) / 2) AS prices,
ARRAY(SELECT CAST(val AS INT64) FROM UNNEST(arr) val WITH OFFSET WHERE OFFSET >= ARRAY_LENGTH(arr) / 2) AS sizes,
(SELECT MAX(CAST(val AS INT64)) FROM UNNEST(arr) val WITH OFFSET WHERE OFFSET < ARRAY_LENGTH(arr) / 2) AS bestPrice
FROM (
SELECT *, REGEXP_EXTRACT_ALL(TO_JSON_STRING(T), r'(?:"(?:pA|pB|pC|pD|sA|sB|sC|sD)"):(\d+)') AS arr
FROM `project.dataset.table` t
)
)
SELECT * EXCEPT(prices, sizes),
(SELECT SUM(size)
FROM UNNEST(prices) price WITH OFFSET
JOIN UNNEST(sizes) size WITH OFFSET
USING(OFFSET)
WHERE price = bestPrice
) AS bS
FROM t_ext
As you can see - the only what you should supply is the list of price and size column names as in below example
pA|pB|pC|pD|sA|sB|sC|sD
If to apply to dummy data as below
#standardSQL
WITH `project.dataset.table` AS (
SELECT 'a' id, 1 pA, 2 pB, 3 pC, 4 pD, 'x' extra_col1, 1 sA, 1 sB, 1 sC, 5 sD UNION ALL
SELECT 'b', 1, 4, 2, 4, 'y', 1, 6, 1, 5 UNION ALL
SELECT 'c', 5, 4, 2, 1, 'z', 7, 1, 1, 1
), t_ext AS (
SELECT * EXCEPT(arr),
ARRAY(SELECT CAST(val AS INT64) FROM UNNEST(arr) val WITH OFFSET WHERE OFFSET < ARRAY_LENGTH(arr) / 2) AS prices,
ARRAY(SELECT CAST(val AS INT64) FROM UNNEST(arr) val WITH OFFSET WHERE OFFSET >= ARRAY_LENGTH(arr) / 2) AS sizes,
(SELECT MAX(CAST(val AS INT64)) FROM UNNEST(arr) val WITH OFFSET WHERE OFFSET < ARRAY_LENGTH(arr) / 2) AS bestPrice
FROM (
SELECT *, REGEXP_EXTRACT_ALL(TO_JSON_STRING(T), r'(?:"(?:pA|pB|pC|pD|sA|sB|sC|sD)"):(\d+)') AS arr
FROM `project.dataset.table` t
)
)
SELECT * EXCEPT(prices, sizes),
(SELECT SUM(size)
FROM UNNEST(prices) price WITH OFFSET
JOIN UNNEST(sizes) size WITH OFFSET
USING(OFFSET)
WHERE price = bestPrice
) AS bS
FROM t_ext
result is
Row id pA pB pC pD extra_col1 sA sB sC sD bestPrice bS
1 a 1 2 3 4 x 1 1 1 5 4 5
2 b 1 4 2 4 y 1 6 1 5 4 11
3 c 5 4 2 1 z 7 1 1 1 5 7
Hope, this is what you are looking for

Weighted sum of a column vector and a derived bit vector

We have a table of bid prices and sizes of two buyers. Bid price p with size s means that the buyer is open to buy s number of product at price p. We have a table of four columns:
bid prices offered by the two buyers, pA and pB.
bid sizes, sA and sB.
Our job is to add a new best size column (bS) to the table, that returns the size at the best price. If the two buyers have the same price then bS is equal to sA + sB, otherwise, we need to take the bid size of the buyer that offers the higher price.
An example table with the desired output is below.
A simple solution to the problem:
SELECT pA, pB, sA, sB,
CASE
WHEN pA = pB THEN sA + sB
WHEN pA > pB THEN sA
ELSE sB
END AS bS
FROM t
Now let us generalize the problem to four buyers. A standard SQL solution is
WITH t_ext AS (
SELECT *, GREATEST(pA, pB, pC, pD) as bestPrice
FROM `t`
)
SELECT *, (sA * CAST(pA = bestPrice AS INT64) +
sB * CAST(pB = bestPrice AS INT64) +
sC * CAST(pC = bestPrice AS INT64) +
sD * CAST(pD = bestPrice AS INT64))
AS bS FROM t_ext
Question 1)
Is there a simplified query that
uses function SUM instead of adding four items manually
avoids repeated casting?
Question 2)
Is there a way in Google BigQuery ecosystem to reuse this query for another table that has column name e.g. priceA, priceB instead of pA, pB?
Btw. I wrote a blog post about this problem that focuses on solutions in Python and Q and I am wondering how the best solution in standard sql looks like.

Below is for BigQuery Standard SQL an dis generic enough to not depend on number of buyers as well as naming for price and size fields. The only expectation is for all prices go first and then all respective sizes as it is in your example. Also i assume all numbers are integers (as in example in question) but this can be adjust to deal with FLOATs
#standardSQL
WITH t_ext AS (
SELECT * EXCEPT(arr),
ARRAY(SELECT CAST(val AS INT64) FROM UNNEST(arr) val WITH OFFSET WHERE OFFSET < 4) AS prices,
ARRAY(SELECT CAST(val AS INT64) FROM UNNEST(arr) val WITH OFFSET WHERE OFFSET >= 4) AS sizes,
(SELECT MAX(CAST(val AS INT64)) FROM UNNEST(arr) val WITH OFFSET WHERE OFFSET < 4) AS bestPrice
FROM (
SELECT *, REGEXP_EXTRACT_ALL(TO_JSON_STRING(T), r':(\d+)') AS arr
FROM `project.dataset.table` t
)
)
SELECT * EXCEPT(prices, sizes),
(SELECT SUM(size)
FROM UNNEST(prices) price WITH OFFSET
JOIN UNNEST(sizes) size WITH OFFSET
USING(OFFSET)
WHERE price = bestPrice
) AS bS
FROM t_ext
The only what you need to change in above query is number of buyers - in below expressions (in those below - 4 can be replaced with ARRAY_LENGTH(arr) / 2
WHERE OFFSET < 4
WHERE OFFSET >= 4
WHERE OFFSET < 4
For example, for below dummy data (4 buyers)
#standardSQL
WITH `project.dataset.table` AS (
SELECT 1 pA, 2 pB, 3 pC, 4 pD, 1 sA, 1 sB, 1 sC, 5 sD UNION ALL
SELECT 1, 4, 2, 4, 1, 6, 1, 5 UNION ALL
SELECT 4, 4, 2, 1, 7, 1, 1, 1
), t_ext AS (
SELECT * EXCEPT(arr),
ARRAY(SELECT CAST(val AS INT64) FROM UNNEST(arr) val WITH OFFSET WHERE OFFSET < 4) AS prices,
ARRAY(SELECT CAST(val AS INT64) FROM UNNEST(arr) val WITH OFFSET WHERE OFFSET >= 4) AS sizes,
(SELECT MAX(CAST(val AS INT64)) FROM UNNEST(arr) val WITH OFFSET WHERE OFFSET < 4) AS bestPrice
FROM (
SELECT *, REGEXP_EXTRACT_ALL(TO_JSON_STRING(T), r':(\d+)') AS arr
FROM `project.dataset.table` t
)
)
SELECT * EXCEPT(prices, sizes),
(SELECT SUM(size)
FROM UNNEST(prices) price WITH OFFSET
JOIN UNNEST(sizes) size WITH OFFSET
USING(OFFSET)
WHERE price = bestPrice
) AS bS
FROM t_ext
result is
Row pA pB pC pD sA sB sC sD bestPrice bS
1 1 2 3 4 1 1 1 5 4 5
2 1 4 2 4 1 6 1 5 4 11
3 4 4 2 1 7 1 1 1 4 8

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

How to build "Star Rating" report in BigQuery (or sparklines, or color gradients) - google-bigquery

Related

Oracle SQL : Calculating weighted probability

Snowflake table and generator functions does not give expected result

union of columns in standard sql [duplicate]

Weighted sum of a column vector and a derived bit vector - Version 2

Weighted sum of a column vector and a derived bit vector

Categories

Resources