BIGQUERY - Query Exceeded resource limit - sql

I am running the below query to join the two tables and get certain records based on Fuzzy logic (Levenshtein distance)
WITH main_table as (
select *
from
`project.data.Roof_Address`
), reference_table as (
select *
from `project.data.DATA_TREE_Address`
)
select
DR_NBR,
ARRAY_AGG(
STRUCT(n.LotSizeSqFt)
ORDER BY EDIT_DISTANCE(l.ordered_fullname, n.ordered_fullname) LIMIT 1
)[OFFSET(0)].*,
ARRAY_AGG(
EDIT_DISTANCE(l.ordered_fullname, n.ordered_fullname) LIMIT 1
)[OFFSET(0)] distance_score
FROM main_table l
CROSS JOIN reference_table n
GROUP BY 1
having ARRAY_AGG(
EDIT_DISTANCE(l.ordered_fullname, n.ordered_fullname) LIMIT 1
)[OFFSET(0)] < 10
This query will return the
Project_Id(Dr_NBR)
from first table and
Project_area(LotSizeSqFt)
from second table based on the Levenshtein Score filter at the end.
This query is resulting in the below error
Any suggestions how to optimize the above query?
The distance I am using is from the below function
#standardSQL
CREATE TEMPORARY FUNCTION EDIT_DISTANCE(string1 STRING, string2 STRING)
RETURNS INT64
LANGUAGE js AS """
var _extend = function(dst) {
var sources = Array.prototype.slice.call(arguments, 1);
for (var i=0; i<sources.length; ++i) {
var src = sources[i];
for (var p in src) {
if (src.hasOwnProperty(p)) dst[p] = src[p];
}
}
return dst;
};
var Levenshtein = {
/**
* Calculate levenshtein distance of the two strings.
*
* #param str1 String the first string.
* #param str2 String the second string.
* #return Integer the levenshtein distance (0 and above).
*/
get: function(str1, str2) {
// base cases
if (str1 === str2) return 0;
if (str1.length === 0) return str2.length;
if (str2.length === 0) return str1.length;
// two rows
var prevRow = new Array(str2.length + 1),
curCol, nextCol, i, j, tmp;
// initialise previous row
for (i=0; i<prevRow.length; ++i) {
prevRow[i] = i;
}
// calculate current row distance from previous row
for (i=0; i<str1.length; ++i) {
nextCol = i + 1;
for (j=0; j<str2.length; ++j) {
curCol = nextCol;
// substution
nextCol = prevRow[j] + ( (str1.charAt(i) === str2.charAt(j)) ? 0 : 1 );
// insertion
tmp = curCol + 1;
if (nextCol > tmp) {
nextCol = tmp;
}
// deletion
tmp = prevRow[j + 1] + 1;
if (nextCol > tmp) {
nextCol = tmp;
}
// copy current col value into previous (in preparation for next iteration)
prevRow[j] = curCol;
}
// copy last col value into previous (in preparation for next iteration)
prevRow[j] = nextCol;
}
return nextCol;
}
};
var the_string1;
try {
the_string1 = decodeURI(string1).toLowerCase();
} catch (ex) {
the_string1 = string1.toLowerCase();
}
try {
the_string2 = decodeURI(string2).toLowerCase();
} catch (ex) {
the_string2 = string2.toLowerCase();
}
return Levenshtein.get(the_string1, the_string2)
""";
Snapshot for Roof_Address table
Snapshot for DATA_TREE_Address

The main query cost would most likely be the ORDER by in the :
ARRAY_AGG(
STRUCT(n.LotSizeSqFt)
ORDER BY EDIT_DISTANCE(l.ordered_fullname, n.ordered_fullname) LIMIT 1
)[OFFSET(0)].*,
I see you're only returning a single record for each array_agg.
I'd recommend removing the ARRAY_AGG and do a MAX or MIN on the results from the EDIT_DISTANCE. A MAX or MIN is much much cheaper than ORDERING ALL records and taking the first or last one.

Related

Cannot use bigquery udf (bqutil) in processing location: us-west-2

We are trying to use these in us-west2 - https://github.com/GoogleCloudPlatform/bigquery-utils/tree/master/udfs/community.
this first query processes just fine, in US
this second query wont run
Our dataset models is in us West 2. It seems all queries from the 2nd query editor are then processed in us-west 2 where, it seems bqutil does not exist? How can we find the function bqutil.fn.levenshtein when processing in us-west2 (where our datasets all exist)?
To use the levenshtein UDF in your BigQuery table, you need to create a UDF in the location where your dataset resides.
You can refer to the below UDF and the screenshot where the data resides in us-west2 location.
UDF :
CREATE OR REPLACE FUNCTION
`stackdemo.fn_LevenshteinDistance`(in_a STRING, in_b STRING) RETURNS INT64 LANGUAGE js AS R"""
var a = in_a.toLowerCase();
var b = in_b.toLowerCase();
if(a.length == 0) return b.length;
if(b.length == 0) return a.length;
var matrix = [];
// increment along the first column of each row
var i;
for(i = 0; i <= b.length; i++){
matrix[i] = [i];
}
// increment each column in the first row
var j;
for(j = 0; j <= a.length; j++){
matrix[0][j] = j;
}
// Fill in the rest of the matrix
for(i = 1; i <= b.length; i++){
for(j = 1; j <= a.length; j++){
if(b.charAt(i-1) == a.charAt(j-1)){
matrix[i][j] = matrix[i-1][j-1];
} else {
matrix[i][j] =
Math.min(matrix[i-1][j-1] + 1, // substitution
Math.min(matrix[i][j-1] + 1, // insertion
matrix[i-1][j] + 1)); // deletion
}
}
}
return matrix[b.length][a.length];
""";
Query :
SELECT
source,
target,
`stackdemo.fn_LevenshteinDistance`(source, target) distance,
FROM UNNEST([
STRUCT('analyze' AS source, 'analyse' AS target),
STRUCT('opossum', 'possum'),
STRUCT('potatoe', 'potatoe'),
STRUCT('while', 'whilst'),
STRUCT('aluminum', 'alumininium'),
STRUCT('Connecticut', 'CT')
]);
Output :

how to join two tables with condition may contains regex condition or array condition

I have two tables tab1 and tab2 and the data like as follows
tab1:
tab2:
Here item description in tab1 and tab2 is not matching is there any way to join these two tables to fetch the customer ids
Thanks
Try below
#standardSQL
CREATE TEMPORARY FUNCTION similarity(Text1 STRING, Text2 STRING)
RETURNS FLOAT64
LANGUAGE js AS """
var _extend = function(dst) {
var sources = Array.prototype.slice.call(arguments, 1);
for (var i=0; i<sources.length; ++i) {
var src = sources[i];
for (var p in src) {
if (src.hasOwnProperty(p)) dst[p] = src[p];
}
}
return dst;
};
var Levenshtein = {
get: function(str1, str2) {
// base cases
if (str1 === str2) return 0;
if (str1.length === 0) return str2.length;
if (str2.length === 0) return str1.length;
// two rows
var prevRow = new Array(str2.length + 1),
curCol, nextCol, i, j, tmp;
// initialise previous row
for (i=0; i<prevRow.length; ++i) {
prevRow[i] = i;
}
// calculate current row distance from previous row
for (i=0; i<str1.length; ++i) {
nextCol = i + 1;
for (j=0; j<str2.length; ++j) {
curCol = nextCol;
// substution
nextCol = prevRow[j] + ( (str1.charAt(i) === str2.charAt(j)) ? 0 : 1 );
// insertion
tmp = curCol + 1;
if (nextCol > tmp) {
nextCol = tmp;
}
// deletion
tmp = prevRow[j + 1] + 1;
if (nextCol > tmp) {
nextCol = tmp;
}
// copy current col value into previous (in preparation for next iteration)
prevRow[j] = curCol;
}
// copy last col value into previous (in preparation for next iteration)
prevRow[j] = nextCol;
}
return nextCol;
}
};
var the_Text1;
try {the_Text1 = decodeURI(Text1).toLowerCase();} catch (ex) {the_Text1 = Text1.toLowerCase();}
try {the_Text2 = decodeURI(Text2).toLowerCase();} catch (ex) {the_Text2 = Text2.toLowerCase();}
return Levenshtein.get(the_Text1, the_Text2) / the_Text1.length;
""";
SELECT *, (
SELECT t1.Item_description
FROM `project.dataset.tab1` t1
ORDER BY similarity(t2.Item_description, REPLACE(t1.Item_description, '|', ', '))
LIMIT 1
) matched_description
FROM `project.dataset.tab2` t2
If to apply to sample data from your question - result will be
Row Customer_ld Item_description matched_description
1 1001 Item Lenovo x1 Yoga, i7 14" is delivered Lenovo x1 Yoga|i7 14"
2 1002 Lenovo x1 Yoga, i5 13" is delivered to customer Lenovo x1 Yoga|i5 13"
3 1003 Lenovo Yoga, i7 14" is delivered to customer#1003 Lenovo Yoga|i7 14"
4 1004 Item lenovo x1 yoga, i7 14" is delivered successfully Lenovo x1 Yoga|i7 14"
5 1005 Item Lenovo x1 Yoga, i7 14" is delivered#1005 Lenovo x1 Yoga|i7 14"
I would use regex to tokenize the features that make each description unique.
with Tab1x as (
select
Item_description,
ifnull(regexp_extract(Item_description,r'([x][0-9])'),'none') as xspec,
ifnull(regexp_extract(Item_description,r'([i][0-9])'), 'none') as ispec,
ifnull(regexp_extract(Item_description,r'([0-9]{2}\")'), 'none') as size
from Tab1
),
Tab2x as (
select
Customer_id,
Item_description,
ifnull(regexp_extract(Item_description,r'([x][0-9])'),'none') as xspec,
ifnull(regexp_extract(Item_description,r'([i][0-9])'), 'none') as ispec,
ifnull(regexp_extract(Item_description,r'([0-9]{2}\")'), 'none') as size
from Tab2
)
select
Tab1x.Item_description as Tab1_Item_description,
Tab2x.Item_description as Tab2_Item_description,
Tab2x.Customer_id
from Tab1x
left join Tab2x using(xspec,ispec,size)
Note, I didn't touch Lenovo or Yoga, but if your real data set has multiple brands/models, you will need to take care of that in a similar fashion.

How to count letter differences of two strings in bigquery?

For example i have:
1: 6c71d997ba39
2: 6c71d997d269
I need to get 4.
You can consider using Levenshtein distance for your use-case
the Levenshtein distance is a string metric for measuring the difference between two sequences. Informally, the Levenshtein distance between two words is the minimum number of single-character edits (insertions, deletions or substitutions) required to change one word into the other
Below example is for BigQuery Standard SQL
#standardSQL
CREATE TEMPORARY FUNCTION EDIT_DISTANCE(string1 STRING, string2 STRING)
RETURNS INT64
LANGUAGE js AS """
var _extend = function(dst) {
var sources = Array.prototype.slice.call(arguments, 1);
for (var i=0; i<sources.length; ++i) {
var src = sources[i];
for (var p in src) {
if (src.hasOwnProperty(p)) dst[p] = src[p];
}
}
return dst;
};
var Levenshtein = {
/**
* Calculate levenshtein distance of the two strings.
*
* #param str1 String the first string.
* #param str2 String the second string.
* #return Integer the levenshtein distance (0 and above).
*/
get: function(str1, str2) {
// base cases
if (str1 === str2) return 0;
if (str1.length === 0) return str2.length;
if (str2.length === 0) return str1.length;
// two rows
var prevRow = new Array(str2.length + 1),
curCol, nextCol, i, j, tmp;
// initialise previous row
for (i=0; i<prevRow.length; ++i) {
prevRow[i] = i;
}
// calculate current row distance from previous row
for (i=0; i<str1.length; ++i) {
nextCol = i + 1;
for (j=0; j<str2.length; ++j) {
curCol = nextCol;
// substution
nextCol = prevRow[j] + ( (str1.charAt(i) === str2.charAt(j)) ? 0 : 1 );
// insertion
tmp = curCol + 1;
if (nextCol > tmp) {
nextCol = tmp;
}
// deletion
tmp = prevRow[j + 1] + 1;
if (nextCol > tmp) {
nextCol = tmp;
}
// copy current col value into previous (in preparation for next iteration)
prevRow[j] = curCol;
}
// copy last col value into previous (in preparation for next iteration)
prevRow[j] = nextCol;
}
return nextCol;
}
};
var the_string1;
try {
the_string1 = decodeURI(string1).toLowerCase();
} catch (ex) {
the_string1 = string1.toLowerCase();
}
try {
the_string2 = decodeURI(string2).toLowerCase();
} catch (ex) {
the_string2 = string2.toLowerCase();
}
return Levenshtein.get(the_string1, the_string2)
""";
WITH strings AS (
SELECT '1: 6c71d997ba39' string1, '2: 6c71d997d269' string2
)
SELECT string1, string2, EDIT_DISTANCE(string1, string2) changes
FROM strings
with result
Row string1 string2 changes
1 1: 6c71d997ba39 2: 6c71d997d269 4
SELECT
(SELECT COUNTIF(c != s2[OFFSET(off)])
FROM UNNEST(SPLIT(s1, '')) AS c WITH OFFSET off) AS count
FROM dataset.table
Source: https://stackoverflow.com/a/57499387/11059644
Ready to use shared UDFs - Levenshtein distance:
SELECT fhoffa.x.levenshtein('felipe', 'hoffa'), fhoffa.x.levenshtein('googgle', 'goggles'), fhoffa.x.levenshtein('is this the', 'Is This The')

Matching partial words in two different columns

I am working on trying to weed out a certain customer from our database. I've noticed a trend where people fill out their first name with the same name that is partial to how they fill out their company name. So an example would look like:
business_name first_name
------------- ----------
locksmith taylorsville locksmith
locksmith roy locksmi
locksmith clinton locks
locksmith farmington locksmith
These are people I do not want being pulled in a query. They are bad eggs. I'm trying to put together a query with a WHERE statement (presumably) that isolates anyone who has a first name that contains at least a partial match to their business name, but I'm stumped and could use some help.
You can use LIKE operator:
SELECT * FROM table WHERE business_name NOT LIKE CONCAT(first_name, '%')
% stands for anything.
You can employ similarity based approach
Try code at bottom of answer
It produces result like below
business_name partial_business_name first_name similarity
locksmith taylorsville locksmith locksmith 1.0
locksmith farmington locksmith locksmith 1.0
locksmith roy locksmith locksmi 0.7777777777777778
locksmith clinton locksmith locks 0.5555555555555556
So, you will be able to control what to filter out based on similarity value
** Code **
SELECT business_name, partial_business_name, first_name, similarity FROM
JS( // input table
(
SELECT business_name, REGEXP_EXTRACT(business_name, r'^(\w+)') AS partial_business_name, first_name AS first_name FROM
(SELECT 'locksmith taylorsville' AS business_name, 'locksmith' AS first_name),
(SELECT 'locksmith roy' AS business_name, 'locksmi' AS first_name),
(SELECT 'locksmith clinton' AS business_name, 'locks' AS first_name),
(SELECT 'locksmith farmington' AS business_name, 'locksmith' AS first_name),
) ,
// input columns
business_name, partial_business_name, first_name,
// output schema
"[{name: 'business_name', type:'string'},
{name: 'partial_business_name', type:'string'},
{name: 'first_name', type:'string'},
{name: 'similarity', type:'float'}]
",
// function
"function(r, emit) {
var _extend = function(dst) {
var sources = Array.prototype.slice.call(arguments, 1);
for (var i=0; i<sources.length; ++i) {
var src = sources[i];
for (var p in src) {
if (src.hasOwnProperty(p)) dst[p] = src[p];
}
}
return dst;
};
var Levenshtein = {
/**
* Calculate levenshtein distance of the two strings.
*
* #param str1 String the first string.
* #param str2 String the second string.
* #return Integer the levenshtein distance (0 and above).
*/
get: function(str1, str2) {
// base cases
if (str1 === str2) return 0;
if (str1.length === 0) return str2.length;
if (str2.length === 0) return str1.length;
// two rows
var prevRow = new Array(str2.length + 1),
curCol, nextCol, i, j, tmp;
// initialise previous row
for (i=0; i<prevRow.length; ++i) {
prevRow[i] = i;
}
// calculate current row distance from previous row
for (i=0; i<str1.length; ++i) {
nextCol = i + 1;
for (j=0; j<str2.length; ++j) {
curCol = nextCol;
// substution
nextCol = prevRow[j] + ( (str1.charAt(i) === str2.charAt(j)) ? 0 : 1 );
// insertion
tmp = curCol + 1;
if (nextCol > tmp) {
nextCol = tmp;
}
// deletion
tmp = prevRow[j + 1] + 1;
if (nextCol > tmp) {
nextCol = tmp;
}
// copy current col value into previous (in preparation for next iteration)
prevRow[j] = curCol;
}
// copy last col value into previous (in preparation for next iteration)
prevRow[j] = nextCol;
}
return nextCol;
}
};
var the_partial_business_name;
try {
the_partial_business_name = decodeURI(r.partial_business_name).toLowerCase();
} catch (ex) {
the_partial_business_name = r.partial_business_name.toLowerCase();
}
try {
the_first_name = decodeURI(r.first_name).toLowerCase();
} catch (ex) {
the_first_name = r.first_name.toLowerCase();
}
emit({business_name: r.business_name, partial_business_name: the_partial_business_name, first_name: the_first_name,
similarity: 1 - Levenshtein.get(the_partial_business_name, the_first_name) / the_partial_business_name.length});
}"
)
ORDER BY similarity DESC
Was used in How to perform trigram operations in Google BigQuery? and based on https://storage.googleapis.com/thomaspark-sandbox/udf-examples/pataky.js by #thomaspark where Levenshtein's distance is used to measure similarity
this will do the trick,
select * from TableName where lower(business_name) contains lower(first_name)
use lower() just in case they have upper case letters. Hope it helps.

REGEXP_REPLACE pattern has to be const? Comparing strings in BigQuery

I'm trying to measure similarity between strings using Dice's Coefficient (aka Pair Similarity) in BigQuery. For a second I thought that I can do that using just standard functions.
Suppose I need to compare "gana" and "gano". Then I would "cook" these two strings upfront into 'ga|an|na' and 'ga|an|no' (lists of 2-grams) and do this:
REGEXP_REPLACE('ga|an|na', 'ga|an|no', '')
Then based on change in length I can calculate my coeff.
But once applied to the table I get:
REGEXP_REPLACE second argument must be const and non-null
Is there any workaround for that? With simple REPLACE() second argument can be a field.
Maybe there is a better way to do it? I know, I can do UDF instead. But I wanted to avoid them here. We are running big tasks and UDFs are generally slower (at least in my experience) and are subject to different concurrency limit.
You can have JavaScript code inside for BigQuery SQL queries.
To measure similarity you could use Levenshtein's distance with a query like this (from https://stackoverflow.com/a/33443564/132438):
SELECT *
FROM js(
(
SELECT title,target FROM
(SELECT 'hola' title, 'hello' target), (SELECT 'this is beautiful' title, 'that is fantastic' target)
),
title, target,
// Output schema.
"[{name: 'title', type:'string'},
{name: 'target', type:'string'},
{name: 'distance', type:'integer'}]",
// The function
"function(r, emit) {
var _extend = function(dst) {
var sources = Array.prototype.slice.call(arguments, 1);
for (var i=0; i<sources.length; ++i) {
var src = sources[i];
for (var p in src) {
if (src.hasOwnProperty(p)) dst[p] = src[p];
}
}
return dst;
};
var Levenshtein = {
/**
* Calculate levenshtein distance of the two strings.
*
* #param str1 String the first string.
* #param str2 String the second string.
* #return Integer the levenshtein distance (0 and above).
*/
get: function(str1, str2) {
// base cases
if (str1 === str2) return 0;
if (str1.length === 0) return str2.length;
if (str2.length === 0) return str1.length;
// two rows
var prevRow = new Array(str2.length + 1),
curCol, nextCol, i, j, tmp;
// initialise previous row
for (i=0; i<prevRow.length; ++i) {
prevRow[i] = i;
}
// calculate current row distance from previous row
for (i=0; i<str1.length; ++i) {
nextCol = i + 1;
for (j=0; j<str2.length; ++j) {
curCol = nextCol;
// substution
nextCol = prevRow[j] + ( (str1.charAt(i) === str2.charAt(j)) ? 0 : 1 );
// insertion
tmp = curCol + 1;
if (nextCol > tmp) {
nextCol = tmp;
}
// deletion
tmp = prevRow[j + 1] + 1;
if (nextCol > tmp) {
nextCol = tmp;
}
// copy current col value into previous (in preparation for next iteration)
prevRow[j] = curCol;
}
// copy last col value into previous (in preparation for next iteration)
prevRow[j] = nextCol;
}
return nextCol;
}
};
var the_title;
try {
the_title = decodeURI(r.title).toLowerCase();
} catch (ex) {
the_title = r.title.toLowerCase();
}
emit({title: the_title, target: r.target,
distance: Levenshtein.get(the_title, r.target)});
}")
Below is tailored for similarity
Was used in How to perform trigram operations in Google BigQuery? and based on https://storage.googleapis.com/thomaspark-sandbox/udf-examples/pataky.js by #thomaspark
SELECT text1, text2, similarity FROM
JS(
// input table
(
SELECT * FROM
(SELECT 'mikhail' AS text1, 'mikhail' AS text2),
(SELECT 'mikhail' AS text1, 'mike' AS text2),
(SELECT 'mikhail' AS text1, 'michael' AS text2),
(SELECT 'mikhail' AS text1, 'javier' AS text2),
(SELECT 'mikhail' AS text1, 'thomas' AS text2)
) ,
// input columns
text1, text2,
// output schema
"[{name: 'text1', type:'string'},
{name: 'text2', type:'string'},
{name: 'similarity', type:'float'}]
",
// function
"function(r, emit) {
var _extend = function(dst) {
var sources = Array.prototype.slice.call(arguments, 1);
for (var i=0; i<sources.length; ++i) {
var src = sources[i];
for (var p in src) {
if (src.hasOwnProperty(p)) dst[p] = src[p];
}
}
return dst;
};
var Levenshtein = {
/**
* Calculate levenshtein distance of the two strings.
*
* #param str1 String the first string.
* #param str2 String the second string.
* #return Integer the levenshtein distance (0 and above).
*/
get: function(str1, str2) {
// base cases
if (str1 === str2) return 0;
if (str1.length === 0) return str2.length;
if (str2.length === 0) return str1.length;
// two rows
var prevRow = new Array(str2.length + 1),
curCol, nextCol, i, j, tmp;
// initialise previous row
for (i=0; i<prevRow.length; ++i) {
prevRow[i] = i;
}
// calculate current row distance from previous row
for (i=0; i<str1.length; ++i) {
nextCol = i + 1;
for (j=0; j<str2.length; ++j) {
curCol = nextCol;
// substution
nextCol = prevRow[j] + ( (str1.charAt(i) === str2.charAt(j)) ? 0 : 1 );
// insertion
tmp = curCol + 1;
if (nextCol > tmp) {
nextCol = tmp;
}
// deletion
tmp = prevRow[j + 1] + 1;
if (nextCol > tmp) {
nextCol = tmp;
}
// copy current col value into previous (in preparation for next iteration)
prevRow[j] = curCol;
}
// copy last col value into previous (in preparation for next iteration)
prevRow[j] = nextCol;
}
return nextCol;
}
};
var the_text1;
try {
the_text1 = decodeURI(r.text1).toLowerCase();
} catch (ex) {
the_text1 = r.text1.toLowerCase();
}
try {
the_text2 = decodeURI(r.text2).toLowerCase();
} catch (ex) {
the_text2 = r.text2.toLowerCase();
}
emit({text1: the_text1, text2: the_text2,
similarity: 1 - Levenshtein.get(the_text1, the_text2) / the_text1.length});
}"
)
ORDER BY similarity DESC
REGEXP_REPLACE second argument must be const and non-null
Is there any
workaround for that?
Below is just an idea/direction to address above question applied to logic you described:
I would "cook" these two strings upfront into 'ga|an|na' and
'ga|an|no' (lists of 2-grams) and do this: REGEXP_REPLACE('ga|an|na',
'ga|an|no', ''). Then based on change in length I can calculate my
coeff.
The "workaround" is:
SELECT a.w AS w1, b.w AS w2, SUM(a.x = b.x) / COUNT(1) AS c
FROM (
SELECT w, SPLIT(p, '|') AS x, ROW_NUMBER() OVER(PARTITION BY w) AS pos
FROM
(SELECT 'gana' AS w, 'ga|an|na' AS p)
) AS a
JOIN (
SELECT w, SPLIT(p, '|') AS x, ROW_NUMBER() OVER(PARTITION BY w) AS pos
FROM
(SELECT 'gano' AS w, 'ga|an|no' AS p),
(SELECT 'gamo' AS w, 'ga|am|mo' AS p),
(SELECT 'kana' AS w, 'ka|an|na' AS p)
) AS b
ON a.pos = b.pos
GROUP BY w1, w2
Maybe there is a better way to do it?
Below is the simple example of how Pair Similarity can be approached here (including building bigrams sets and calculation of coefficient:
SELECT
a.word AS word1, b.word AS word2,
2 * SUM(a.bigram = b.bigram) /
(EXACT_COUNT_DISTINCT(a.bigram) + EXACT_COUNT_DISTINCT(b.bigram) ) AS c
FROM (
SELECT word, char + next_char AS bigram
FROM (
SELECT word, char, LEAD(char, 1) OVER(PARTITION BY word ORDER BY pos) AS next_char
FROM (
SELECT word, SPLIT(word, '') AS char, ROW_NUMBER() OVER(PARTITION BY word) AS pos
FROM
(SELECT 'gana' AS word)
)
)
WHERE next_char IS NOT NULL
GROUP BY 1, 2
) a
CROSS JOIN (
SELECT word, char + next_char AS bigram
FROM (
SELECT word, char, LEAD(char, 1) OVER(PARTITION BY word ORDER BY pos) AS next_char
FROM (
SELECT word, SPLIT(word, '') AS char, ROW_NUMBER() OVER(PARTITION BY word) AS pos
FROM
(SELECT 'gano' AS word)
)
)
WHERE next_char IS NOT NULL
GROUP BY 1, 2
) b
GROUP BY 1, 2