Convolution with BigQuery Window functions? - google-bigquery

Is this something I can do? I'm looking to try and cross-correlate two time series.
I think the window function should do it, but not 100% sure how to construct it in SQL.

Just as an idea to play with (more suited for comments but easier to present in answer):
-- Correlation between series a and series b at lags 0 through 5.
-- The two series are first aligned on identical timestamps; LEAD then pairs
-- each a value with the b value found k rows later (ordered by b's timestamp).
WITH aligned AS (
  SELECT
    s1.ts AS ts,
    s1.val AS a,
    s2.val AS b,
    LEAD(s2.val, 1) OVER by_time AS b1,
    LEAD(s2.val, 2) OVER by_time AS b2,
    LEAD(s2.val, 3) OVER by_time AS b3,
    LEAD(s2.val, 4) OVER by_time AS b4,
    LEAD(s2.val, 5) OVER by_time AS b5
  FROM time_series1 AS s1
  INNER JOIN time_series2 AS s2
    ON s1.ts = s2.ts
  WINDOW by_time AS (ORDER BY s2.ts)
)
SELECT
  CORR(a, b) AS correlation,
  CORR(a, b1) AS cross_correlation_lag_1,
  CORR(a, b2) AS cross_correlation_lag_2,
  CORR(a, b3) AS cross_correlation_lag_3,
  CORR(a, b4) AS cross_correlation_lag_4,
  CORR(a, b5) AS cross_correlation_lag_5
FROM aligned

Related

Is there a way to make this Wordle-like SQL query using advanced BigQuery features?

I'm trying to do some word/letter analysis using BigQuery using rules similar to Wordle.
You know the drill:
🟨 letter present but in other position
🟩 letter in correct position
⬜ letter not present.
I have a working query but I'm not really satisfied for three reasons:
It doesn't look too BigQuery-y
For large sets of words, the query doesn't scale well
Extending to, say, 7 letter words would look really, REALLY, REALLY horrible, messy and any number of nasty adjectives could easily describe it
The way I'm doing this vaguely reminds me of recursive CTE, but sadly those are totally elusive to me... 😅
Making it faster is not really a concern, but I'm wondering if there's a more advanced/elegant way (for lack of a better term) to accomplish the same result: recursive CTEs, TVFs, and so on. I guess using JavaScript UDFs is an option, but I'd like to stay away from them and instead explore an SQL-only alternative.
This is the ugly query:
-- Scores every word against every other word using Wordle rules.
-- The check_first .. check_fifth steps hard-code positions 1-5, so this
-- only handles 5-letter words.
with words as (
  -- inline word list; every word is tagged as a possible solution
  select normalized_word as word, 'solution' word_type
  from unnest(["SCAPE","PETER","SMACK","MAMMA"]) as normalized_word
),
split_word as (
  -- one row per letter; pos is the 1-based position of the letter in the word.
  -- It is derived from the array offset because row_number() without an
  -- ORDER BY gives no guarantee that numbering follows the letter order.
  select word, letter, word_type,
    letter_offset + 1 as pos
  from words, unnest(split(word,'')) as letter with offset as letter_offset
),
check_correct as (
  -- compare each word (a) with each different solution word (b), position by position:
  --   w1 = guess letters with exact matches replaced by 🟩
  --   w2 = solution letters with exact matches "consumed" (replaced by ❌)
  --   c  = number of exact matches, p = running count of present letters (starts at 0)
  select a.word word1, any_value(a.word_type) as word_type, b.word word2,
    string_agg(if(a.letter=b.letter,'🟩',a.letter),'' order by a.pos) w1,
    string_agg(if(a.letter=b.letter,'❌',b.letter),'' order by a.pos) w2,
    sum(if(a.letter=b.letter,1,0)) c, 0 p
  from split_word as a, split_word as b
  where a.word <> b.word
    and b.word_type = 'solution'
    and a.pos = b.pos
  group by a.word, b.word
),
check_first as (
  -- if the 1st character of w1 still occurs in w2: count it as present,
  -- mark it 🟨 in w1 and overwrite its first occurrence in w2 with ❌
  -- so the same solution letter cannot be matched twice
  select word1, word_type, word2, c,
    if(instr(w2,substring(w1,1,1))>0,p+1,p) p,
    if(instr(w2,substring(w1,1,1))>0,concat('🟨',right(w1,4)),w1) w1,
    if(instr(w2,substring(w1,1,1))>0,
      concat(
        left(w2,instr(w2,substring(w1,1,1))-1),
        '❌',
        right(w2,length(w2)-instr(w2,substring(w1,1,1)))),
      w2) w2,
  from check_correct
),
check_second as (
  -- same step for the 2nd character of w1
  select word1, word_type, word2, c,
    if(instr(w2,substring(w1,2,1))>0,p+1,p) p,
    if(instr(w2,substring(w1,2,1))>0,concat(left(w1,1),'🟨',right(w1,3)),w1) w1,
    if(instr(w2,substring(w1,2,1))>0,
      concat(
        left(w2,instr(w2,substring(w1,2,1))-1),
        '❌',
        right(w2,length(w2)-instr(w2,substring(w1,2,1)))),
      w2) w2,
  from check_first
),
check_third as (
  -- same step for the 3rd character of w1
  select word1, word_type, word2, c,
    if(instr(w2,substring(w1,3,1))>0,p+1,p) p,
    if(instr(w2,substring(w1,3,1))>0,concat(left(w1,2),'🟨',right(w1,2)),w1) w1,
    if(instr(w2,substring(w1,3,1))>0,
      concat(
        left(w2,instr(w2,substring(w1,3,1))-1),
        '❌',
        right(w2,length(w2)-instr(w2,substring(w1,3,1)))),
      w2) w2,
  from check_second
),
check_fourth as (
  -- same step for the 4th character of w1
  select word1, word_type, word2, c,
    if(instr(w2,substring(w1,4,1))>0,p+1,p) p,
    if(instr(w2,substring(w1,4,1))>0,concat(left(w1,3),'🟨',right(w1,1)),w1) w1,
    if(instr(w2,substring(w1,4,1))>0,
      concat(
        left(w2,instr(w2,substring(w1,4,1))-1),
        '❌',
        right(w2,length(w2)-instr(w2,substring(w1,4,1)))),
      w2) w2,
  from check_third
),
check_fifth as (
  -- same step for the 5th character of w1
  select word1, word_type, word2, c,
    if(instr(w2,substring(w1,5,1))>0,p+1,p) p,
    if(instr(w2,substring(w1,5,1))>0,concat(left(w1,4),'🟨'),w1) w1,
    if(instr(w2,substring(w1,5,1))>0,
      concat(
        left(w2,instr(w2,substring(w1,5,1)) - 1),
        '❌',
        right(w2,length(w2)-instr(w2,substring(w1,5,1)))),
      w2) w2, -- for completeness
  from check_fourth
),
final_result as (
  -- letters left in w1 after both passes are absent: count them and show them as ⬜
  select word1 as guess, word_type as guess_type, word2 as solution, c as correct, p as present,
    length(w1)-c-p as absent,
    regexp_replace(w1, r'([A-Z])', '⬜') as wordle, w1, w2
  from check_fifth
)
select *
from final_result
order by guess
The result looks like this:
Note some interesting edge cases with repeated letters in the guess word: 2, 3, 5. My ugly query manages this by "consuming" matched letters in solution and replacing them with ❌ to avoid further matches. That's the sole purpose of w2: avoid more than one match.
Update
Mikhail's solution almost works, but fails when guess word has repeated letters:
Consider below approach - will work for any number of letters
-- Set-based Wordle scoring that does not hard-code the word length.
-- words: inline test list; every word is tagged 'solution'.
with words as (
select normalized_word as word, 'solution' word_type
from unnest(["SHAPE","PETER","TAPES","JUMBO","NINJA","MAMMA"]) as normalized_word
-- pairs: every (guess, solution) combination of the word list, including
-- guess = solution (those pairs are filtered out in the later steps).
), pairs as (
select t1.word as guess, t2.word as solution
from words t1, words t2
-- greens: one row per guess letter (x) with its offset; the letter is green
-- when the solution has the same letter at the same offset, otherwise white.
), greens as (
select guess, solution, x, offset, color
from pairs t, unnest(array(
select as struct x, offset, if(x=y, '🟩', '⬜') color
from unnest(split(guess, '')) x with offset
left join unnest(split(solution, '')) y with offset
using(offset)
))
where guess != solution
-- yellows_temp: the distinct guess letters that also occur in the solution at
-- a different position (group by collapses repeated occurrences to one row).
), yellows_temp as (
select guess, solution, x, color
from pairs t, unnest(array(
select as struct x, '🟨' color
from unnest(split(guess, '')) x with offset as pos1
join unnest(split(solution, '')) y with offset as pos2
on x = y and pos1 != pos2
group by x, color
))
where guess != solution
-- yellows: anti-join that drops any letter already green somewhere in the
-- same guess/solution pair.
-- NOTE(review): yellows are tracked per distinct letter, not per occurrence,
-- so repeated letters in a guess are all treated alike - confirm this matches
-- the intended Wordle rules.
), yellows as (
select guess, solution, x, '🟨' color
from yellows_temp y
left join (
select guess, solution, x
from greens
where color = '🟩'
) g
using (guess, solution, x)
where g.x is null
)
-- Final scoring: a white letter that has a row in yellows is shown as yellow.
-- correct = number of green positions, present = number of distinct yellow letters,
-- wordle = colour string, w1 = same string with the absent letters left visible.
select guess, solution,
countif(g.color = '🟩') correct,
count(distinct if(g.color = '⬜' and y.color = '🟨', y.x, null)) present,
string_agg(if(g.color = '⬜' and y.color = '🟨', '🟨', g.color), '' order by offset) as wordle,
string_agg(if(g.color = '⬜' and y.color = '🟨', '🟨', if(g.color = '🟩', '🟩', g.x)), '' order by offset) as w1
from greens g
left join yellows y
using(guess, solution, x)
group by guess, solution
-- order by guess, solution
with output
Leaving w2 for you - should be relatively simple using above as a starting point :o)

How can I use decode to merge IF ELSE in Oracle. Required for code re-usability

It is something like this:
-- Pseudocode from the question: pick one of two queries depending on xyz.
-- w1, w2, w3 appear to be placeholders for the real filter predicates.
if xyz = a then
-- plain column, filtered by w1 and w2
select col1 from tab1
where w1, w2;
else
-- aggregated column, with the extra filter w3
select sum (col1) from tab1
where w1, w2, w3;
endif;
You can't really put this logic into a single query. You can use union all though:
-- Emulates the if/else with two mutually exclusive branches glued by UNION ALL.
-- w1, w2, w3 appear to be placeholders for the real filter predicates.
-- Branch 1: returns rows only when xyz = a.
select col1
from tab1
where w1, w2 and (xyz = a)
union all
-- Branch 2: returns the aggregate only when xyz <> a.
select sum(col1)
from tab1
where w1, w2, w3 and (xyz <> a)
-- NOTE(review): the HAVING repeats the condition, presumably because an
-- aggregate without GROUP BY still yields one row when WHERE removes every
-- input row - confirm on the target database.
having (xyz <> a);
-- Alternative: a single CASE that evaluates one of two scalar subqueries.
-- w1, w2, w3 appear to be placeholders for the real filter predicates, and
-- [as <column_alias>] marks an optional alias rather than literal syntax.
select case when xyz = a then (select col1 from tab1 where w1, w2)
else (select sum(col1) from tab1 where w1, w2, w3)
end [as <column_alias>]
from dual
;

Prolog - Formatting crossword

I've written a predicate called solve_crossword that looks like this:
% solve_crossword(X, C): question version of the solver.
% X is the list of candidate words; C is a flat list of 25 cells laid out
% row by row, where the four inner cells that belong to no word are the atom ' '.
solve_crossword(X,C):-
C= [A1,A2,A3,A4,A5,
B1,' ',B3, ' ', B5,
C1, C2,C3,C4,C5,
D1,' ',D3,' ', D5,
E1,E2,E3,E4,E5],
% rows 1, 3 and 5 must each be a word from X
member([A1, A2, A3, A4, A5], X),
member([C1, C2, C3, C4, C5], X),
member([E1, E2, E3, E4, E5], X),
% columns 1, 3 and 5 must each be a word from X
member([A1, B1, C1, D1, E1], X),
member([A3, B3, C3, D3, E3], X),
member([A5, B5, C5, D5, E5], X).
Now, I want to write a predicate called write_crossword that formats the crossword. If I have a list of words I want it to look like this:
| ?- words(X), solve_crossword(X, C), write_crossword(C).
DITCH
O U O
DITTO
G O E
EARLY
C = [[68,73,84,67,72],[79,32,85,32,79],...
X = [[68,73,83,84,82],[68,73,84,67,72],...
With
% words(-Words): the candidate five-letter words for the crossword.
words([
    "DISTR", "DITCH", "DITTO", "DITTY",
    "DODGE", "EARED", "EARLY", "EARTH",
    "EASEL", "HONOR", "HOOEY", "HORDE",
    "TUQUE", "TURPS", "TUTOR", "TWAIN"
]).
Rows 1, 3, 5 and columns 1, 3, 5 are supposed to be words.
You can try something like this (note I corrected your code for solve_crossword):
% solve_crossword(+Words, -Grid)
% Grid is a list of five rows, each a list of five character codes.
% Rows 1, 3, 5 and columns 1, 3, 5 must each be an element of Words;
% the four inner cells that belong to no word hold the code of a space.
solve_crossword(Words, Grid) :-
    atom_codes(' ', [Blank]),
    Grid = [[R1C1, R1C2,  R1C3, R1C4,  R1C5],
            [R2C1, Blank, R2C3, Blank, R2C5],
            [R3C1, R3C2,  R3C3, R3C4,  R3C5],
            [R4C1, Blank, R4C3, Blank, R4C5],
            [R5C1, R5C2,  R5C3, R5C4,  R5C5]],
    % horizontal words
    member([R1C1, R1C2, R1C3, R1C4, R1C5], Words),
    member([R3C1, R3C2, R3C3, R3C4, R3C5], Words),
    member([R5C1, R5C2, R5C3, R5C4, R5C5], Words),
    % vertical words
    member([R1C1, R2C1, R3C1, R4C1, R5C1], Words),
    member([R1C3, R2C3, R3C3, R4C3, R5C3], Words),
    member([R1C5, R2C5, R3C5, R4C5, R5C5], Words).
% write_crossword(+Rows)
% Prints every row (a list of character codes) as text on its own line.
write_crossword([]).
write_crossword([Codes|Rest]) :-
    write_crossword_row(Codes),
    write_crossword(Rest).

% write_crossword_row(+Codes): writes one row followed by a newline.
write_crossword_row(Codes) :-
    atom_codes(Text, Codes),
    write(Text),
    nl.
atom_codes/2 converts between an atom and a list of character codes.

How can I find duplicate consecutive values in this table?

Say I have a table which I query like so:
-- every reading, ordered by its date column
select
  date,
  value
from mytable
order by date
and this gives me results:
date value
02/26/2009 14:03:39 1
02/26/2009 14:10:52 2 (a)
02/26/2009 14:27:49 2 (b)
02/26/2009 14:34:33 3
02/26/2009 14:48:29 2 (c)
02/26/2009 14:55:17 3
02/26/2009 14:59:28 4
I'm interested in the rows of this result set where the value is the same as the one in the previous or next row, like row b which has value=2 the same as row (a). I don't care about rows like row (c) which has value=2 but does not come directly after a row with value=2. How can I query the table to give me all rows like (a) and (b) only? This is on Oracle, if it matters.
Use the lead and lag analytic functions.
-- Sample data: d is the ordering key, v the value compared between neighbours.
create table t3 (
  d number,
  v number
);

-- load all seven sample rows in one multi-table insert
insert all
  into t3 (d, v) values (1, 1)
  into t3 (d, v) values (2, 2)
  into t3 (d, v) values (3, 2)
  into t3 (d, v) values (4, 3)
  into t3 (d, v) values (5, 2)
  into t3 (d, v) values (6, 3)
  into t3 (d, v) values (7, 4)
select * from dual;
-- Flags with '*' every row whose value equals the previous or the next value
-- when the rows are ordered by d.
with neighbours as (
  select
    d,
    v,
    lag(v) over (order by d) as prev,
    lead(v) over (order by d) as next
  from t3
)
select
  d,
  v,
  case when v = prev or v = next then '*' end as match,
  prev,
  next
from neighbours
order by d
;
Matching neighbours are marked with * in the match column.
This is a simplified version of @Bob Jarvis' answer, the main difference being the use of just one subquery instead of four:
-- Same neighbour check without LAG/LEAD: number the rows by d once, then
-- join each row to the rows numbered one lower and one higher.
with numbered as (
  select
    row_number() over (order by d) as rn,
    d,
    v
  from t3
)
select
  cur.d,
  cur.v,
  case when cur.v = prv.v or cur.v = nxt.v then '*' end as match
from numbered cur
left join numbered prv
  on prv.rn = cur.rn - 1
left join numbered nxt
  on nxt.rn = cur.rn + 1
order by cur.d
;
As @Janek Bogucki has pointed out, LEAD and LAG are probably the easiest way to accomplish this - but just for fun let's try to do it by using only basic join operations:
-- Returns every row whose value equals the value of the row directly before
-- or directly after it in mydate order, using only ROWNUM and outer joins.
WITH numbered AS (
  -- ROWNUM is assigned before the ORDER BY of the same query block is
  -- applied, so the rows are sorted in an inline view first and numbered
  -- afterwards; otherwise rn would not follow mydate order.
  SELECT ROWNUM AS rn, mydate, value
  FROM (SELECT mydate, value
        FROM mytable
        ORDER BY mydate)
)
SELECT cur.mydate, cur.value
FROM numbered cur
LEFT OUTER JOIN numbered nxt
  ON nxt.rn = cur.rn + 1
LEFT OUTER JOIN numbered prv
  ON prv.rn = cur.rn - 1
-- a NULL neighbour (first/last row) never compares equal, so it is ignored
WHERE cur.value = nxt.value
   OR cur.value = prv.value
-- sort the final result itself; an ORDER BY inside an inline view does not
-- guarantee the order of the outer query's output
ORDER BY cur.mydate;
Share and enjoy.

MS-SQL Average Columns with NULL

So I've got 3 different columns (basket 1, 2, and 3). Sometimes these columns have all the information and sometimes one or two of them are null. I have another column that I'm going to average these values into and save.
Is there a sleek/easy way to get the average of these three columns even if one of them is null? Or do I have to have a special check for each one being null?
Example data( ~~ is null)
- B1 - B2 - B3 - Avg
------------------------------
- 10 - 20 - 30 - 20
- 10 - ~~ - 30 - 20
- ~~ - 20 - ~~ - 20
How would I write the T-SQL to update my temp table?
-- Question placeholder: ??? stands for the averaging expression being asked for.
UPDATE #MyTable
SET Avg = ???
Answer:
Thanks to Aaronaught for the method I used. I'm going to put my code here just in case someone else has the same thing.
-- Averages the three basket prices of each result row and stores the
-- average back on that row.
WITH AverageView AS
(
    -- turn the three price columns into rows so the row-based AVG can be used
    SELECT
        Results_Key       AS xxx_Results_Key,
        AVG(AverageValue) AS xxx_Results_Average
    FROM #MyResults
    UNPIVOT
    (
        AverageValue FOR B IN
        (
            Results_Basket_1_Price,
            Results_Basket_2_Price,
            Results_Basket_3_Price
        )
    ) AS UnpivotTable
    GROUP BY Results_Key
)
UPDATE r
SET Results_Baskets_Average_Price = av.xxx_Results_Average
FROM #MyResults AS r
INNER JOIN AverageView AS av
    ON r.Results_Key = av.xxx_Results_Key;
Assuming you have some sort of ID column, the most effective way is probably to use UNPIVOT so you can use the normal row-based AVG operator (which ignores NULL values):
-- Demo: per-row average of B1..B3 that skips NULL values.
-- #Tbl is created as a local temporary table: a name starting with # cannot
-- be declared as a table variable (table variable names start with @).
CREATE TABLE #Tbl
(
    ID int,
    B1 int,
    B2 int,
    B3 int
);

-- sample rows with zero, one and two NULL columns
INSERT INTO #Tbl (ID, B1, B2, B3) VALUES (1, 10, 20, 30);
INSERT INTO #Tbl (ID, B1, B2, B3) VALUES (2, 10, NULL, 30);
INSERT INTO #Tbl (ID, B1, B2, B3) VALUES (3, 10, NULL, NULL);

-- UNPIVOT turns B1..B3 into one row per value, so the ordinary row-based
-- AVG (which ignores NULL values) can be grouped by ID.
SELECT ID, AVG(Value) AS Average
FROM #Tbl
UNPIVOT (Value FOR B IN (B1, B2, B3)) AS u
GROUP BY ID;
If you don't have the ID column, you can generate a surrogate ID using ROW_NUMBER:
-- Per-row average when the table has no ID column: ROW_NUMBER supplies a
-- surrogate ID so the unpivoted values can be grouped back per row.
;WITH Numbered AS
(
    SELECT
        ROW_NUMBER() OVER (ORDER BY (SELECT 1)) AS ID,
        B1,
        B2,
        B3
    FROM #Tbl
)
SELECT
    u.ID,
    AVG(u.Value)
FROM Numbered
UNPIVOT
(
    Value FOR B IN (B1, B2, B3)
) AS u
GROUP BY u.ID
-- Row-wise average of b1, b2 and b3: the three columns of the current row
-- are stacked into a one-column derived table and averaged per row.
SELECT
    (
        SELECT AVG(q.b)
        FROM (VALUES (b1), (b2), (b3)) AS q (b)
    )
FROM mytable
-- Average of the non-NULL baskets: sum of the present values divided by the
-- number of present values.
-- NULLIF turns a count of zero into NULL, so a row where B1, B2 and B3 are
-- all NULL yields NULL instead of raising a divide-by-zero error.
SELECT (ISNULL(B1,0) + ISNULL(B2,0) + ISNULL(B3,0))
/ NULLIF(
    CASE WHEN B1 IS NULL THEN 0 ELSE 1 END
  + CASE WHEN B2 IS NULL THEN 0 ELSE 1 END
  + CASE WHEN B3 IS NULL THEN 0 ELSE 1 END,
  0)
and put logic in there to exclude cases where all three are null if you need to.