bigquery: find following row matching condition - sql

I'm looking at text sequences in BigQuery and trying to identify word completions over a number of rows (sharing an ID). The data looks like:
ID, Text
1, t
1, th
1, the
1, the
1, the c
1, the ca
1, the cat
1, the cat
1, the cat s
...
1, the cat sat on the mat
2, r
...
For each given ID and sequence I'm trying to find the next word boundary. So the ideal output would be:
ID, Text, Boundary
1, t, the
1, th, the
1, the c, the cat
1, the ca, the cat
1, the cat s, the cat sat
In the above, for each row, the next subsequent row that shares its ID and ends in a space gives the next word-completion boundary (there can be several boundaries per ID).

Below is for BigQuery Standard SQL
Note: this is a brute-force approach, so the query is not as elegant as it could be, but I hope it gives you a good start.
#standardSQL
-- For every id, pairs each partial text with the space-terminated text
-- (the "boundary") that completes the word currently being typed.
WITH tagged AS (
  -- grp = number of spaces in text, not counting one trailing space, so a
  -- boundary row lands in the same group as the partial texts it completes.
  SELECT
    id,
    text,
    SUBSTR(text, -1) = ' ' AS boundary,
    LENGTH(text) - LENGTH(REPLACE(text, ' ', '')) - IF(SUBSTR(text, -1) = ' ', 1, 0) AS grp
  FROM `project.dataset.table`
),
grouped AS (
  -- One row per (id, grp): the boundary text plus the partial texts, shortest first.
  SELECT
    id,
    grp,
    STRING_AGG(IF(boundary, text, ''), '') AS boundary,
    ARRAY_AGG(CASE WHEN NOT boundary THEN text END IGNORE NULLS ORDER BY LENGTH(text)) AS items
  FROM tagged
  GROUP BY id, grp
)
SELECT
  id,
  item,
  boundary
FROM grouped, UNNEST(items) AS item WITH OFFSET AS pos
-- Skip a partial text that already equals its boundary apart from trailing spaces.
WHERE RTRIM(item) != RTRIM(boundary)
If you apply it to the dummy data from your question, as below
#standardSQL
-- Same boundary lookup, run against inline sample rows.
WITH `project.dataset.table` AS (
  SELECT 1 id, 't' text UNION ALL
  SELECT 1, 'th' UNION ALL
  SELECT 1, 'the' UNION ALL
  SELECT 1, 'the ' UNION ALL
  SELECT 1, 'the c' UNION ALL
  SELECT 1, 'the ca' UNION ALL
  SELECT 1, 'the cat' UNION ALL
  SELECT 1, 'the cat ' UNION ALL
  SELECT 1, 'the cat s' UNION ALL
  SELECT 1, 'the cat sat '
),
tagged AS (
  -- grp = number of spaces in text, not counting one trailing space, so a
  -- boundary row lands in the same group as the partial texts it completes.
  SELECT
    id,
    text,
    SUBSTR(text, -1) = ' ' AS boundary,
    LENGTH(text) - LENGTH(REPLACE(text, ' ', '')) - IF(SUBSTR(text, -1) = ' ', 1, 0) AS grp
  FROM `project.dataset.table`
),
grouped AS (
  -- One row per (id, grp): the boundary text plus the partial texts, shortest first.
  SELECT
    id,
    grp,
    STRING_AGG(IF(boundary, text, ''), '') AS boundary,
    ARRAY_AGG(CASE WHEN NOT boundary THEN text END IGNORE NULLS ORDER BY LENGTH(text)) AS items
  FROM tagged
  GROUP BY id, grp
)
SELECT
  id,
  item,
  boundary
FROM grouped, UNNEST(items) AS item WITH OFFSET AS pos
-- Skip a partial text that already equals its boundary apart from trailing spaces.
WHERE RTRIM(item) != RTRIM(boundary)
ORDER BY id, grp, pos
result is
Row id item boundary
1 1 t the
2 1 th the
3 1 the c the cat
4 1 the ca the cat
5 1 the cat s the cat sat

BigQuery UDFs come in handy in these situations. Here is a working solution:
#standardSQL
/*
  boundaryf(text, sentence)
  Returns the leading words of `sentence` up to and including the word that
  `text` is currently in the middle of (or has just finished).

  text     - the partial text typed so far
  sentence - the complete sentence the partial text belongs to

  The number of words kept is the number of space-separated pieces in `text`
  minus the number of trailing spaces, so a text that ends in a space does not
  pull in the following word.

  WITH OFFSET + ORDER BY keep the words in sentence order; UNNEST on its own
  does not guarantee element order.
*/
create temp function boundaryf (text string, sentence string) as (
  array_to_string(array(
    select w
    from unnest(split(sentence, ' ')) as w with offset as i
    -- offsets are 0-based, so "< n" keeps the first n words
    where i < array_length(split(text, ' ')) - (length(text) - length(rtrim(text)))
    order by i
  ), ' ')
);
-- Applies boundaryf to every partial text, using the longest text of each id
-- as that id's complete sentence.
WITH items AS (
  -- Sample data; replace with your own table.
  SELECT 1 AS id, 't' AS text UNION ALL
  SELECT 1, 'th' UNION ALL
  SELECT 1, 'the' UNION ALL
  SELECT 1, 'the ' UNION ALL
  SELECT 1, 'the c' UNION ALL
  SELECT 1, 'the ca' UNION ALL
  SELECT 1, 'the cat' UNION ALL
  SELECT 1, 'the cat ' UNION ALL
  SELECT 1, 'the cat s' UNION ALL
  SELECT 1, 'the cat sa' UNION ALL
  SELECT 1, 'the cat sat' UNION ALL
  SELECT 1, 'the cat sat ' UNION ALL
  SELECT 1, 'the cat sat o' UNION ALL
  SELECT 1, 'the cat sat on' UNION ALL
  SELECT 1, 'the cat sat on ' UNION ALL
  SELECT 1, 'the cat sat on a' UNION ALL
  SELECT 1, 'the cat sat on a ' UNION ALL
  SELECT 1, 'the cat sat on a m' UNION ALL
  SELECT 1, 'the cat sat on a ma' UNION ALL
  SELECT 1, 'the cat sat on a mat' UNION ALL
  SELECT 2, 'i' UNION ALL
  SELECT 2, 'i a' UNION ALL
  SELECT 2, 'i am' UNION ALL
  SELECT 2, 'i am f' UNION ALL
  SELECT 2, 'i am fr' UNION ALL
  SELECT 2, 'i am fre' UNION ALL
  SELECT 2, 'i am free'
),
sentences AS (
  -- The complete sentence of an id is its longest text. It is picked with an
  -- explicit ORDER BY because ARRAY_AGG without one returns elements in no
  -- guaranteed order, so "take the last element" is not reliable.
  SELECT
    id,
    ARRAY_AGG(text ORDER BY LENGTH(text) DESC LIMIT 1)[OFFSET(0)] AS sentence
  FROM items
  GROUP BY id
),
control AS (
  SELECT
    i.id,
    i.text,
    boundaryf(i.text, s.sentence) AS boundary
  FROM items AS i
  LEFT JOIN sentences AS s
    ON s.id = i.id
)
SELECT
  id,
  text,
  boundary
FROM control
ORDER BY id, LENGTH(text)

Related

How to count data based on specific condition in Oracle

Let's say I have a table of persons with ids and names as follows
[Person]
ID NAME
================
1 Michael
2 Michelle
3 Emma
4 Evan
5 Ellen
6 Gary
I want to count the number of persons based on the first characters of their names.
Here's the output I expect
NUMBER_OF_PERSONS
=================
2 //M = Michael and Michelle
3 //E = Emma, Evan and Ellen
1 //G = Gary
How do I achieve this in Oracle?
And here's my query
-- Asker's attempt, kept as posted: substr is called with a single argument in
-- the WHERE clause, and nothing groups the rows by first letter.
select count(id) as number_of_person
from person
where substr(name) in (select distinct substr(name,1,1) from person);
You can achieve that using the solution below.
-- One output line per first letter, formatted as
--   <count> //<letter> = <names>
-- Names are listed in ID order; two names are joined with ' and ', three or
-- more are comma-separated with ' and ' before the last one.
with Person (ID, NAME) as (
  select 1, 'Michael'  from dual union all
  select 2, 'Michelle' from dual union all
  select 3, 'Emma'     from dual union all
  select 4, 'Evan'     from dual union all
  select 5, 'Ellen'    from dual union all
  select 6, 'Gary'     from dual
)
select count(*) || ' //' || substr(NAME, 1, 1) || ' = ' ||
       case
         -- Decide on the row count itself rather than counting ' and ' in the
         -- aggregated text, which a name containing ' and ' would distort.
         when count(*) > 2
           -- Swap only the final ', ' for ' and ' (no trailing blank is appended).
           then regexp_replace( listagg(NAME, ', ') within group ( order by ID ), ', ([^,]+)$', ' and \1', 1, 1 )
         else listagg(NAME, ' and ') within group ( order by ID )
       end NUMBER_OF_PERSONS
from Person
group by substr(NAME, 1, 1)
order by substr(NAME, 1, 1)
;
db<>fiddle
If you just want the count, you would use group by:
-- Number of persons per first letter of the name.
select first_letter,
       count(*) as number_of_person
from (
  select substr(name, 1, 1) as first_letter
  from person
)
group by first_letter ;
If, in addition, you actually wanted the list of names, you could put that in another column, assuming there are not too many:
-- Number of persons per first letter, plus the alphabetical list of their names.
select first_letter,
       count(*) as number_of_person,
       listagg(name, ', ') within group (order by name) as names
from (
  select substr(name, 1, 1) as first_letter,
         name
  from person
)
group by first_letter ;
This is my solution to it:
-- Per first letter: the number of persons and their names in alphabetical
-- order, comma-separated, with ' and ' before the last name.
WITH tbl AS (
  -- UNION ALL: the sample rows are distinct, so no de-duplication is needed.
  SELECT 1 AS ID, 'Michael' AS NAME FROM dual UNION ALL
  SELECT 2, 'Michelle' FROM dual UNION ALL
  SELECT 3, 'Emma' FROM dual UNION ALL
  SELECT 4, 'Evan' FROM dual UNION ALL
  SELECT 5, 'Ellen' FROM dual UNION ALL
  SELECT 6, 'Gary' FROM dual
)
SELECT COUNT(1) AS number_of_person
     , SUBSTR(names.name, 1, 1) AS first_letter
       -- The pattern consumes the blank after the last comma as well, so the
       -- result reads 'Emma and Evan' rather than 'Emma and  Evan'.
     , REGEXP_REPLACE(
         LISTAGG(names.name, ', ') WITHIN GROUP (ORDER BY names.name),
         ', ([^,]*)$',
         ' and \1'
       ) AS names
FROM tbl names
GROUP BY SUBSTR(names.name, 1, 1);

Masking each word in sentence except first and last letter with a fixed amount of asterisks

I want to mask except first and last letter of word in sentence in oracle.
My query is:
-- Asker's attempt, kept as posted: the pattern is built from look-behind and
-- look-ahead assertions; the reported result is the unchanged input.
select regexp_replace('Hello World', '(?<!^.?).(?!.?$)','*') as str2
from dual;
result: Hello World
Expect Result: H***o W***d
You may use
-- Keep the first and last word character of each word and put a fixed '***'
-- between them.
SELECT REGEXP_REPLACE(
         'Hello World',
         '(\w)\w*(\w)',
         '\1***\2'
       ) AS str2
FROM dual
See an online demo
The regex will only match at least two-letter words, so any one-letter word will remain unaffected.
Pattern details
(\w) - Group 1 (later, referred to via \1): any word (letter, digit or _) char
\w* - any 0+ word chars as many as possible
(\w) - Group 2 (later, referred to via \2): any word char.
The \1***\2 replacement pattern replaces the match with the contents of Group 1, then three asterisks (adjust as you see fit), and then the contents of Group 2.
If you want words to have the same length and asterisk out each character except for the first and last then you cannot do it with a simple regular expression (as Oracle does not support look-ahead/behind in its regular expressions). But you can do it with a recursive sub-query factoring clause that recursively finds each word and performs the masking on that word and then considers the next word in the string:
-- Masks every alphanumeric word in test_data.value: the first and last
-- character are kept and the characters in between become '*'; words shorter
-- than 3 characters are left as they are. One word is handled per recursion
-- step, with these working columns:
--   before - text finished so far (masked words plus the separators around them)
--   word   - the next word, not yet masked
--   pos    - position just after that word, or 0 when there is no further word
WITH replacements ( value, before, word, pos ) AS (
-- Anchor: everything in front of the first word goes to `before`
-- (the whole value when it contains no word at all).
SELECT value,
CASE REGEXP_INSTR( value, '[[:alnum:]]+', 1, 1, 0 )
WHEN 0
THEN value
ELSE SUBSTR(
value,
1,
REGEXP_INSTR( value, '[[:alnum:]]+', 1, 1, 0 ) - 1
)
END,
-- The first word: from its start position to the position just after it.
SUBSTR(
value,
REGEXP_INSTR( value, '[[:alnum:]]+', 1, 1, 0 ),
REGEXP_INSTR( value, '[[:alnum:]]+', 1, 1, 1 )
- REGEXP_INSTR( value, '[[:alnum:]]+', 1, 1, 0 )
),
REGEXP_INSTR( value, '[[:alnum:]]+', 1, 1, 1 )
FROM test_data
UNION ALL
-- Recursive step: append the pending word (masked) and the separator text that
-- follows it, then pick up the next word starting the search at `pos`.
SELECT value,
before
|| CASE
WHEN LENGTH(word) < 3
THEN word
-- First character, then the last character left-padded with '*' to the
-- remaining length of the word.
ELSE SUBSTR(word,1,1) || LPAD( SUBSTR(word,-1), LENGTH(word)-1, '*' )
END
-- Separator text up to the next word, or the rest of the value if none follows.
|| CASE REGEXP_INSTR( value, '[[:alnum:]]+', pos, 1, 0 )
WHEN 0
THEN SUBSTR( value, pos )
ELSE SUBSTR(
value,
pos,
REGEXP_INSTR( value, '[[:alnum:]]+', pos, 1, 0 ) - pos
)
END,
SUBSTR(
value,
REGEXP_INSTR( value, '[[:alnum:]]+', pos, 1, 0 ),
REGEXP_INSTR( value, '[[:alnum:]]+', pos, 1, 1 )
- REGEXP_INSTR( value, '[[:alnum:]]+', pos, 1, 0 )
),
REGEXP_INSTR( value, '[[:alnum:]]+', pos, 1, 1 )
FROM replacements
WHERE pos > 0
)
-- Rows with pos = 0 have no word left to process, so `before` is the final text.
SELECT value,
before AS replaced_value
FROM replacements
WHERE pos = 0
So for the test data:
-- Sample sentences for the masking query.
create table test_data ( value ) as
            select 'Hello World' from dual
  union all select 'A short sentence, with a "quote".' from dual
  union all select 'Some numbers 1000, 2000' from dual
  union all select '==Hello==World==' from dual
  union all select 'Be in at 10' from dual
  union all select '"!!!!"' from dual
This outputs:
VALUE | REPLACED_VALUE
:-------------------------------- | :--------------------------------
"!!!!" | "!!!!"
Hello World | H***o W***d
==Hello==World== | ==H***o==W***d==
Some numbers 1000, 2000 | S**e n*****s 1**0, 2**0
Be in at 10 | Be in at 10
A short sentence, with a "quote". | A s***t s******e, w**h a "q***e".
db<>fiddle here

Add numbers within a string in an SQL statement

I have the query below, which I would like to modify.
I want to sum all the numbers that occur within a string with the condition that it is joined with the text GB or MB .
If it is in GB it first has to be converted to MB. (This i have done simply by multiplying by 1024)
-- Asker's attempt, kept as posted: every REGEXP_SUBSTR call asks for the first
-- occurrence only, so a second size term in the same string is never read.
SELECT /*+ PARALLEL */
'SOME TEXT 20GB+2GB+SOMETEXT' SOMETEXT,
CASE
-- GB branch: taken whenever 'GB' appears anywhere in the string.
WHEN REGEXP_SUBSTR('SOME TEXT 20GB+2GB+SOMETEXT','GB',1,1) = 'GB'
THEN 1024*to_number(regexp_replace(REGEXP_SUBSTR('SOME TEXT 20GB+2GB+SOMETEXT','(\d+)GB',1,1), '[^0-9]', ''))
ELSE to_number(regexp_replace(REGEXP_SUBSTR('SOME TEXT 20GB+2GB+SOMETEXT','(\d+)MB',1,1), '[^0-9]', ''))
END TOTAL_MBs
FROM DUAL;
TEST STRINGS
TEXT TEXT_35MB+ MORETEXT
OTHERTEXT 480MB + 3MB AND_TEXT
SOMETEXT 7MB + 7NUMBER
TEXT 1GB AND SOME_TEXT
SOME TEXT 20GB+2GB+SOMETEXT
Here is where I am stuck: adding up the numbers when more than one occurs in the same text.
For example:-
For this text OTHERTEXT 480MB + 3MB AND_TEXT I want my result to have 483 as TOTAL_MBS and not 480
Think you are searching for something like:
-- Sums every <digits>MB / <digits>GB term of each sample string, in MB.
with samples as (
  select 1 id, 'TEXT TEXT_35MB+ MORETEXT' tcase from dual union all
  select 2, 'OTHERTEXT 480MB + 3MB AND_TEXT' from dual union all
  select 3, 'SOMETEXT 7MB + 7NUMBER' from dual union all
  select 4, 'TEXT 1GB AND SOME_TEXT' from dual union all
  select 5, 'SOME TEXT 20GB+2GB+SOMETEXT' from dual union all
  select 6, 'SOME TEXT 20MB+2GB+SOMETEXT' from dual
),
size_terms as (
  -- One row per size term: LEVEL selects the LEVEL-th match within the same id.
  select id,
         tcase,
         REGEXP_SUBSTR(tcase,'(\d+)(M|G)B',1,level) term
  from samples
  connect by REGEXP_SUBSTR(tcase,'(\d+)(M|G)B',1,level) is not null
         and prior id = id
         and PRIOR DBMS_RANDOM.VALUE IS NOT NULL
)
select id,
       tcase,
       -- GB terms count 1024 times their number, all other terms once.
       sum( case when term like '%GB%' then 1024 else 1 end * regexp_substr(term,'\d+') ) v
from size_terms
group by id, tcase
order by id;
Result:
1 TEXT TEXT_35MB+ MORETEXT 35
2 OTHERTEXT 480MB + 3MB AND_TEXT 483
3 SOMETEXT 7MB + 7NUMBER 7
4 TEXT 1GB AND SOME_TEXT 1024
5 SOME TEXT 20GB+2GB+SOMETEXT 22528
6 SOME TEXT 20MB+2GB+SOMETEXT 2068
You can use a recursive sub-query factoring clause:
-- Adds up all <digits>MB / <digits>GB terms of test_data.sometext in MB, one
-- term per recursion step:
--   total_mb  - running total after the first i terms
--   i         - number of terms processed so far
--   num_terms - number of size terms found in sometext
-- The WITH header is required: it names the recursive query and the columns
-- that the recursive branch and the final SELECT refer to.
WITH terms ( sometext, total_mb, i, num_terms ) AS (
  -- Anchor: first term (0 when the text contains no size term).
  SELECT sometext,
         COALESCE(
           REGEXP_SUBSTR( sometext, '(\d+)([MG])B', 1, 1, NULL, 1 )
           * CASE REGEXP_SUBSTR( sometext, '(\d+)([MG])B', 1, 1, NULL, 2 )
               WHEN 'M' THEN 1
               WHEN 'G' THEN 1024
             END,
           0
         ),
         1,
         REGEXP_COUNT( sometext, '(\d+)([MG])B' )
  FROM   test_data
UNION ALL
  -- Recursive step: add term number i + 1.
  SELECT sometext,
         total_mb
         + REGEXP_SUBSTR( sometext, '(\d+)([MG])B', 1, i + 1, NULL, 1 )
           * CASE REGEXP_SUBSTR( sometext, '(\d+)([MG])B', 1, i + 1, NULL, 2 )
               WHEN 'M' THEN 1
               WHEN 'G' THEN 1024
             END,
         i + 1,
         num_terms
  FROM   terms
  WHERE  i < num_terms
)
-- Keep only the final row of each text, where every term has been added.
SELECT sometext,
       total_mb
FROM   terms
WHERE  i >= num_terms;
which for the test data:
-- Sample strings for the size-summing query.
create table test_data ( sometext ) as
            select 'SOME TEXT 20GB+2GB+SOMETEXT' from dual
  union all select '1MB+1GB+10MB+10GB' from dual;
outputs:
SOMETEXT | TOTAL_MB
:-------------------------- | -------:
SOME TEXT 20GB+2GB+SOMETEXT | 22528
1MB+1GB+10MB+10GB | 11275
db<>fiddle here
Below, I used a WITH clause (an in-memory view) to apply the regex function to the specific string, and it worked for me:
-- Sums every <digits>MB / <digits>GB term of the sample string, in MB.
with tbl1 as (
  select 1 pd, ' 20GB+2GB sometext +7500 + 45sometext' string from dual
),
tbl2 as (
  -- One row per size term: LEVEL selects the LEVEL-th match within the same pd.
  select pd
       , string
       , REGEXP_SUBSTR(string,'(\d+)(M|G)B',1,level) string2
  from tbl1
  connect by REGEXP_SUBSTR(string,'(\d+)(M|G)B',1,level) is not NULL
         and prior pd = pd
         and PRIOR DBMS_RANDOM.VALUE IS NOT NULL
)
select pd
     , string
       -- ELSE 1 keeps MB terms in the total; without it the CASE yields NULL
       -- for them and SUM silently ignores those rows.
     , sum( case when string2 like '%GB%' then 1024 else 1 end * regexp_substr(string2,'\d+') ) string3
from tbl2
group by pd
       , string
order by pd;

Split comma separated values in Oracle 9i

In Oracle, I have columns called orderids
orderids
111,222,333
444,55,66
77,77
How can get the output as
Orderid
111
222
333
444
55
66
77
77
Try this:
-- Splits each comma-separated value into one output row per element.
-- TT exposes the source column as COL1; the inline view numbers the rows (r)
-- and wraps the value in commas (STR), so that element LEVEL is the text
-- between the LEVEL-th and the (LEVEL + 1)-th comma.
WITH TT AS
(SELECT orderid COL1 FROM orders)
SELECT substr(str,
instr(str, ',', 1, LEVEL) + 1,
instr(str, ',', 1, LEVEL + 1) -
instr(str, ',', 1, LEVEL) - 1) COL1
FROM (SELECT rownum AS r,
','|| COL1||',' AS STR
FROM TT )
-- Stay on the same source row and continue while a closing comma still exists.
CONNECT BY PRIOR r = r
AND instr(str, ',', 1, LEVEL + 1) > 0
-- NOTE(review): the PRIOR dbms_random predicate is presumably there to stop
-- CONNECT BY from reporting a loop on the same row - confirm.
AND PRIOR dbms_random.STRING('p', 10) IS NOT NULL
;
See this SQLFiddle
This is one approach:
-- Splits each comma-separated orderids value into one row per element by
-- pairing it with the numbers 1..100 and keeping one number per element.
with order_table as (
  select '111,222,333' as orderids from dual union all
  select '444,55,66' from dual union all
  select '77,77' from dual
)
select substr(
         padded.orderids,
         instr(padded.orderids, ',', 1, nums.lvl) + 1,
         instr(padded.orderids, ',', 1, nums.lvl + 1) - instr(padded.orderids, ',', 1, nums.lvl) - 1
       ) orderid
from
  -- value wrapped in commas, so every element sits between two commas
  ( select ',' || orderids || ',' as orderids from order_table ) padded
  cross join
  -- candidate element positions 1..100
  ( select level as lvl from dual connect by level <= 100 ) nums
-- number of commas in the wrapped value minus one = number of elements
where nums.lvl <= length(padded.orderids) - length(replace(padded.orderids, ',')) - 1;
Just remove the WITH clause and replace the order_table with your real table.
This too might help you,
-- Splits each comma-separated orderid into rows by rewriting it as XML:
-- every comma becomes an </e><e> boundary inside a wrapping <e> element, each
-- inner e/e node is returned as one row, and its text is trimmed.
-- Note: the sample rows are combined with UNION, not UNION ALL.
with t(orderid) as
(
SELECT '111,222,333' FROM dual
UNION
SELECT '444,55,66' FROM dual
UNION
SELECT '177,77' FROM dual
)
SELECT trim(x.COLUMN_VALUE.EXTRACT('e/text()')) cols
FROM t t, TABLE (xmlsequence(XMLTYPE('<e><e>' || REPLACE(t.orderid,',','</e><e>')|| '</e></e>').EXTRACT('e/e'))) x;
instr(','||NVL('972414AQ,972414AQ',I.CUSIP)||',', ','||I.CUSIP||',') > 0
This is the actual query I was looking for.

how to separate text from integer

I have addresses:
ALKOŅU 3-20;
M.LUBŠNAS 16V-9;
STIEBRU 6-22;
ANDREJA UPĪĀA IELA 16-2;
MISNKAS 4 -115;
CISKADI,BAZNICAS 4;
How is it possible in SQL to separate the first text part (district) from the numeric part (house and flat number)?
Assuming the break-point is ALWAYS the first digit, then
-- District     = the text in front of the first digit, trailing blanks removed.
-- HouseAndFlat = the rest of the value, starting at that first digit.
-- '0' is appended before searching so that PATINDEX finds a digit even when
-- col contains none; in that case the whole value becomes the District.
-- "FROM ..." is a placeholder for the real table.
SELECT RTRIM(LEFT(col, PATINDEX('%[0-9]%', col + '0') -1)) as District,
STUFF(col, 1, PATINDEX('%[0-9]%', col + '0') -1, '') as HouseAndFlat
FROM ...
e.g.
-- Splits each sample address at its first digit into District and HouseAndFlat.
with t(col) as (
  select 'ALKOŅU 3-20'
  union all select 'M.LUBŠNAS 16V-9'
  union all select 'STIEBRU 6-22'
  union all select 'ANDREJA UPĪĀA IELA 16-2'
  union all select 'MISNKAS 4 -115'
  union all select 'CISKADI,BAZNICAS 4'
)
SELECT RTRIM(LEFT(t.col, p.first_digit - 1)) as District,
       STUFF(t.col, 1, p.first_digit - 1, '') as HouseAndFlat
FROM t
-- Position of the first digit; '0' is appended so a match always exists.
CROSS APPLY (SELECT PATINDEX('%[0-9]%', t.col + '0') AS first_digit) AS p