I need a query that every time the indicator column turns into zero and there are 3 zeros in a row, I would like to assign them a unique group number.
Here is a sample data:
select 0 as offset, 1 as indicator, -1 as grp union all
select 1, 1, -1 union all
select 2, 1, -1 union all
select 3, 1, -1 union all
select 4, 1, -1 union all
select 5, 1, -1 union all
select 6, 1, -1 union all
select 7, 0, 1 union all
select 8, 0, 1 union all
select 9, 0, 1 union all
select 10, 1, -1 union all
select 11, 0, 2 union all
select 12, 0, 2 union all
select 13, 0, 2 union all
select 14, 1, -1 union all
select 15, 1, -1 union all
select 16, 1, -1
In this example there are two sequences of 3 zeros, indicated as grp=1 and grp=2.
Consider below approach
-- Label every run of exactly three zeros (rows ordered by offset) with a
-- sequential group number 1, 2, ...; every other row gets grp = -1.
select
  offset,
  indicator,
  -- Qualifying rows are ranked only among themselves, so the numbering no
  -- longer depends on at least one non-qualifying (-1) row being present.
  if(
    is_zero_triple,
    dense_rank() over (partition by if(is_zero_triple, 0, 1) order by pregroup),
    -1
  ) as grp
from (
  select
    offset,
    indicator,
    pregroup,
    -- True only for zero rows whose run contains exactly three zeros.
    indicator = 0 and countif(indicator = 0) over (partition by pregroup) = 3 as is_zero_triple
  from (
    select
      offset,
      indicator,
      -- Running count of rows that are not zero: it stays constant across a
      -- run of zeros, so it identifies the run.
      count(*) over win - countif(indicator = 0) over win as pregroup
    from your_table
    window win as (order by offset)
  )
)
If applied to slightly modified sample data from your question (with a sequence of 4 zeros, just for test purposes), the output is
The below query solves this.
Firstly it assigns all of the desired groups a tag.
Secondly, we get the row number for them and use integer casting on row_number to assign them a unique group number.
-- Tags zero rows that sit inside a window of three consecutive zeros and
-- gives every such run its own sequential group number (NULL for other rows).
with data as (
  select 0 as offset, 1 as indicator, -1 as grp union all
  select 1, 1, -1 union all
  select 2, 1, -1 union all
  select 3, 1, -1 union all
  select 4, 1, -1 union all
  select 5, 1, -1 union all
  select 6, 1, -1 union all
  select 7, 0, 1 union all
  select 8, 0, 1 union all
  select 9, 0, 1 union all
  select 10, 1, -1 union all
  select 11, 0, 2 union all
  select 12, 0, 2 union all
  select 13, 0, 2 union all
  select 14, 1, -1 union all
  select 15, 1, -1 union all
  select 16, 1, -1
),
tagged as (
  select
    *,
    -- A zero row is part of a group when it is the first, middle, or last
    -- element of three consecutive zeros.
    case
      when indicator = 0 and lead(indicator) over (order by offset) = 0 and lead(indicator, 2) over (order by offset) = 0 then true
      when indicator = 0 and lead(indicator) over (order by offset) = 0 and lag(indicator) over (order by offset) = 0 then true
      when indicator = 0 and lag(indicator) over (order by offset) = 0 and lag(indicator, 2) over (order by offset) = 0 then true
      else false
    end as part_of_group,
    -- Running count of rows that are not zero. It is constant within a run of
    -- zeros, so it identifies which run a tagged row belongs to.
    countif(indicator is null or indicator != 0) over (order by offset) as run_id
  from data
),
group_tags as (
  select
    *,
    -- One number per run. Dividing row_number() by 3 only worked when every
    -- tagged run had exactly three rows; a longer run shifted later numbers.
    dense_rank() over (order by run_id) as group_tag
  from tagged
  where part_of_group = true
)
-- rejoin this data back together
select
  d.*,
  gt.group_tag
from data as d
left join group_tags as gt
  on d.offset = gt.offset
You may consider below approach as well,
-- Numbers each run of exactly three zero rows; all other rows get grp = -1.
WITH running AS (
  -- Running total of indicator in offset order.
  SELECT *, SUM(indicator) OVER (ORDER BY offset) AS div
  FROM sample_data
),
partitions AS (
  -- flag: a zero row whose (div, indicator) partition holds exactly 3 rows.
  SELECT *, indicator = 0 AND COUNT(div) OVER (PARTITION BY div, indicator) = 3 AS flag
  FROM running
)
SELECT
  offset,
  indicator,
  CASE WHEN flag THEN DENSE_RANK() OVER w ELSE -1 END AS grp
FROM partitions
-- Flagged rows are ranked in their own partition, ordered by the running total.
WINDOW w AS (PARTITION BY IF(flag, 0, 1) ORDER BY div)
ORDER BY offset;
Query results
I need to validate the number of repeated characters in an email.
I tried the following code, which gives me the percentage of repeated characters, but it only works if the characters are next to each other. So one possibility is to order the email by character to get my result.
-- Percentage of adjacent repeated characters in the part of the email before '#'.
-- The email is stated once; the original repeated the literal five times and
-- one copy ('999824123...') differed from the others ('989824123...').
SELECT
    ROUND(
        (
            -- number of runs of a repeated character (each run becomes one '&')
            REGEXP_COUNT(REGEXP_REPLACE(e.local_part, '(.)\1+', '&'), '&')
            -- plus the characters removed when each run is collapsed to one
            + LENGTH(e.local_part)
            - LENGTH(REGEXP_REPLACE(e.local_part, '(.)\1+', '\1'))
        ) * 100 / LENGTH(e.local_part),
        2
    ) AS PORCENTAJE_IGUAL
FROM (
    SELECT SUBSTR(src.email, 1, INSTR(src.email, '#', 1) - 1) AS local_part
    FROM (SELECT '989824123#HOTMAIL.COM' AS email FROM DUAL) src
) e;
I expect 60% of repeated characters for this email 989824123#HOTMAIL.COM, not including the domain.
please Help.
PD: sorry for the bad english
Numbers 9, 8, 2 repeats in email, so we have 6 characters (9, 9, 8, 8, 2, 2) which repeats and 3 unique (1, 3, 4). 6/9 gives us 66,67%.
You can use this query to count this:
-- Share of characters in the part before '#' that occur more than once there.
with
  t(email) as (select '989824123#hotmail.com' from dual),
  -- keep only the part before '#'
  a(email) as (select substr(email, 1, instr(email, '#', 1) - 1) from t),
  -- one row per character
  l as (select substr(email, level, 1) ltr from a connect by level <= length(email))
-- "else 0" makes an email with no repeated character yield 0 rather than NULL
select sum(case when cnt <> 1 then cnt else 0 end) / sum(cnt)
from (select ltr, count(1) cnt from l group by ltr)
I cut domain, then in subquery l I divided string into one-letter rows, rest was only to count non-unique chars and divide by number of all chars.
edit:
how do you apply something like this in a update or select for a large
scale data base with many email?
You can create function:
create or replace function rpt_similarity(i_email in varchar2) return number is
  -- Returns the fraction (0..1) of characters in the part of i_email before
  -- '#' that occur more than once in that part.
  -- Returns NULL when there is nothing before '#' (or no '#' at all).
  v_email varchar2(4000);  -- was varchar2(100): longer local parts raised ORA-06502
  v_ret number;
begin
  v_email := substr(i_email, 1, instr(i_email, '#', 1) - 1);

  -- Nothing to measure: keep the NULL result this case always produced.
  if v_email is null then
    return null;
  end if;

  with l as (
    -- one row per character of the local part
    select substr(v_email, level, 1) ltr
    from dual
    connect by level <= length(v_email))
  -- "else 0": an email without any repeated character yields 0, not NULL
  select sum(case when cnt <> 1 then cnt else 0 end) / sum(cnt)
    into v_ret
    from (select ltr, count(1) cnt from l group by ltr);

  return v_ret;
end;
and use it like here:
select rpt_similarity('abxabc#pqr.com') from dual;
or:
select rpt_similarity(email) from your_table;
Also you can use above solution in select directly, without function, here is the example:
create table test(id, email) as (
select 101, '989824123#hotmail.com' from dual union all
select 102, 'hsimpson#gmail.com' from dual union all
select 103, 'msimpson#gmail.com' from dual union all
select 104, 'bsimpson121314#hotmail.com' from dual union all
select 105, 'abxabx#hotmail.com' from dual );
-- For every row of test: share of characters before '#' that occur more than once.
with
  -- keep only the part before '#'
  a(id, email) as (select id, substr(email, 1, instr(email, '#', 1) - 1) from test),
  l as (
    -- one row per character; the PRIOR conditions keep each id's expansion separate
    select id, email, substr(email, level, 1) ltr from a
    connect by level <= length(email)
      and prior id = id and prior sys_guid() is not null)
select id, email,
       -- Unique characters now contribute 0, so an email without repeats gives
       -- 0 instead of NULL. A NULL ltr (nothing before '#') still contributes
       -- NULL, which keeps the result NULL for such rows.
       sum(case when cnt <> 1 then cnt when ltr is not null then 0 end) / sum(cnt)
from (select id, email, ltr, count(1) cnt from l group by id, ltr, email)
group by id, email;
connect by queries tend to be slow for large sets of data. Maybe you can adapt your regexp functions and it will be faster. I tried to do it, but your regexp_replace changes 99 into $ and 999 also into one $.
I need to generate some sample data from a population. I want to do this with an SQL query on an Oracle 11g database.
Here is a simple working example with population size 4 and sample size 2:
with population as (
select 1 as val from dual union all
select 2 from dual union all
select 3 from dual union all
select 4 from dual)
select val from (
select val, dbms_random.value(0,10) AS RANDORDER
from population
order by randorder)
where rownum <= 2
(the Oracle sample() function didn't work in connection with the WITH clause for me)
But now I want to "upscale" or multiply my sample data, so that I can get something like 150% sample data of the population data (population size 4 and sample size 6, for example).
Is there a good way to achieve this with an SQL query?
You could use CONNECT BY:
-- Generates six rows numbered 1..6 (CONNECT BY LEVEL), gives each a random
-- sort key, orders by that key inside the CTE, and keeps four rows via ROWNUM.
-- NOTE(review): the question asks for a sample of 6 from a population of 4;
-- the 6 and 4 here look swapped - confirm the intended sizes.
with population(val, RANDOMORDER) as (
select level, dbms_random.value(0,10) AS RANDORDER
from dual
connect by level <= 6
-- RANDORDER is the select-list alias; the CTE column list exposes it as RANDOMORDER
ORDER BY RANDORDER
)
select val
FROM population
WHERE rownum <= 4;
db<>fiddle demo
The solution depends, if you want all rows from first initial set(s) and random additional rows from last one then use:
-- Draws sample_ values from 1..size_ where every complete pass over the
-- population is kept and only the last, partial pass is chosen at random.
-- params: population size 4, sample size 6
with params(size_, sample_) as (select 4, 6 from dual)
select val
from (
-- cycle val through 1..size_ for ceil(sample_ / size_) passes
select mod(level - 1, size_) + 1 val, sample_,
-- rows of the first floor(sample_ / size_) passes get sort key 0 so they
-- sort first (assumes dbms_random.value() is never negative - confirm)
case when level <= size_ * floor(sample_ / size_) then 0
else dbms_random.value()
end rand
from params
connect by level <= size_ * ceil(sample_ / size_)
order by rand)
-- keep the first sample_ rows of the ordered set
where rownum <= sample_
But if you allow possibility of result like (1, 1, 2, 2, 3, 3), where some values may not appear at all in output (here 4) then use this:
-- Draws sample_ values from ceil(sample_ / size_) copies of 1..size_, with
-- every row given a random sort key.
-- params: population size 4, sample size 6
with params(size_, sample_) as (select 4, 6 from dual)
select val
from (
-- cycle val through 1..size_; each row gets its own random key
select mod(level - 1, size_) + 1 val, sample_, dbms_random.value() rand
from params
connect by level <= size_ * ceil(sample_ / size_)
order by rand)
-- keep the first sample_ rows of the ordered set
where rownum <= sample_
How does it work? We build the set (1, 2, 3, 4) as many times as the division sample / size requires (rounded up). Then we assign random values. In the first case I assign 0 to the first set(s), so they will be in the output for sure, and random values to the last set. In the second case random values are assigned to all rows.
I have a table with two columns:
OLD_REVISIONS |NEW_REVISIONS
-----------------------------------
1,25,26,24 |1,26,24,25
1,56,55,54 |1,55,54
1 |1
1,2 |1
1,96,95,94 |1,96,94,95
1 |1
1 |1
1 |1
1 |1
1,2 |1,2
1 |1
1 |1
1 |1
1 |1
For each row there will be a list of revisions for a document (comma separated)
The comma separated list might be the same in both columns but the order/sort might be different - e.g.
2,1 |1,2
I would like to find all the instances where the highest revision in the OLD_REVISIONS column is lower than the highest revision in NEW_REVISIONS
The following would fit that criteria
OLD_REVISIONS |NEW_REVISIONS
-----------------------------------
1,2 |1
1,56,55,54 |1,55,54
I tried a solution using the MINUS option (joining the table to itself) but it returns differences even for when the list is the same but in the wrong order
I tried the function GREATEST (i.e where greatest(new_Revisions) < greatest(old_revisions)) but i am not sure why greatest(OLD_REVISIONS) always just returns the comma separated value. It does not return the max value. I suspect it is comparing strings because the columns are VARCHAR.
Also, MAX function expects a single number.
Is there another way i can achieve the above? I am looking for a pure SQL option so i can print out the results (or a PL/SQL option that can print out the results)
Edit
Apologies for not mentioning this but for the NEW_REVISIONS i do actually have the data in a table where each revision is in a separate row:
"DOCNUMBER" "REVISIONNUMBER"
67 1
67 24
67 25
67 26
75 1
75 54
75 55
75 56
78 1
79 1
79 2
83 1
83 96
83 94
Just to give some context, a few weeks ago I suspected that revisions were disappearing.
To investigate this, i decided to take a count of all revisions for all documents and take a snapshot to compare later to see if revisions are indeed missing.
The snapshot that i took contained the following columns:
docnumber, count, revisions
The revisions were stored in a comma separated list using the listagg function.
The trouble I have now is that new revisions have been added to the live table, so when I compare the main table and the snapshot using a MINUS I get a difference because
of the new revisions in the main table.
Even though in the actual table the revisions are individual rows, in the snapshot table i dont have the individual rows.
I am thinking the only way is to recreate the snapshot in the same format and compare the two to find out whether the maximum revision in the main table is lower than the maximum revision in the snapshot table (which is why I am trying to find the max in a comma-separated string).
Enjoy.
-- Wraps each comma-separated list in an XQuery max((...)) call and casts the
-- result to int, giving the largest value of each list as a column.
-- NOTE(review): the column text is concatenated straight into the XQuery
-- expression, so this presumably requires the lists to contain nothing but
-- comma-separated numbers - confirm before running on untrusted data.
select xmlcast(xmlquery(('max((' || OLD_REVISIONS || '))') RETURNING CONTENT) as int) as OLD_REVISIONS_max
,xmlcast(xmlquery(('max((' || NEW_REVISIONS || '))') RETURNING CONTENT) as int) as NEW_REVISIONS_max
from t
;
Assuming your base table has an id column (versions of what?) - here is a solution based on splitting the rows.
Edit: If you like this solution, check out vkp's solution, which is better than mine. I explain why his solution is better in a Comment to his Answer.
with
t ( id, old_revisions, new_revisions ) as (
select 101, '1,25,26,24', '1,26,24,25' from dual union all
select 102, '1,56,55,54', '1,55,54' from dual union all
select 103, '1' , '1' from dual union all
select 104, '1,2' , '1' from dual union all
select 105, '1,96,95,94', '1,96,94,95' from dual union all
select 106, '1' , '1' from dual union all
select 107, '1' , '1' from dual union all
select 108, '1' , '1' from dual union all
select 109, '1' , '1' from dual union all
select 110, '1,2' , '1,2' from dual union all
select 111, '1' , '1' from dual union all
select 112, '1' , '1' from dual union all
select 113, '1' , '1' from dual union all
select 114, '1' , '1' from dual
)
-- END of TEST DATA; the actual solution (SQL query) begins below.
-- Returns the rows of t whose largest old revision differs from their largest
-- new revision. Both lists are split into one row per number, tagged
-- 'old'/'new', and the two maxima are compared per id.
select id, old_revisions, new_revisions
from (
-- one row per number in old_revisions
select id, old_revisions, new_revisions, 'old' as flag,
to_number(regexp_substr(old_revisions, '\d+', 1, level)) as rev_no
from t
-- number of elements = number of commas + 1
connect by level <= regexp_count(old_revisions, ',') + 1
-- restrict the expansion to the same id; the sys_guid() condition is the
-- usual companion that lets the same row be revisited without a cycle error
and prior id = id
and prior sys_guid() is not null
union all
-- one row per number in new_revisions
select id, old_revisions, new_revisions, 'new' as flag,
to_number(regexp_substr(new_revisions, '\d+', 1, level)) as rev_no
from t
connect by level <= regexp_count(new_revisions, ',') + 1
and prior id = id
and prior sys_guid() is not null
)
group by id, old_revisions, new_revisions
-- != reports a difference in either direction, not only old > new
having max(case when flag = 'old' then rev_no end) !=
max(case when flag = 'new' then rev_no end)
order by id -- ORDER BY is optional
;
ID OLD_REVISION NEW_REVISION
--- ------------ ------------
102 1,56,55,54 1,55,54
104 1,2 1
You can compare every value by putting together the revisions in the same order using listagg function.
-- Rebuilds both revision lists in sorted order per row and returns the rows
-- whose sorted lists differ. "table" is a placeholder for the real table name.
SELECT listagg(o,',') WITHIN GROUP (ORDER BY o) old_revisions,
       listagg(n,',') WITHIN GROUP (ORDER BY n) new_revisions
FROM (
    -- one row per list position, carrying the old and new element at that position
    SELECT DISTINCT rowid r,
           regexp_substr(old_revisions, '[^,]+', 1, LEVEL) o,
           regexp_substr(new_revisions, '[^,]+', 1, LEVEL) n
    FROM table
    -- Keep a position when either list has an element there. Filtering on the
    -- old list alone dropped the extra elements of a longer new list, so such
    -- rows were wrongly reported as unchanged.
    WHERE regexp_substr(old_revisions, '[^,]+', 1, LEVEL) IS NOT NULL
       OR regexp_substr(new_revisions, '[^,]+', 1, LEVEL) IS NOT NULL
    -- expand up to the longest list found in either column
    CONNECT BY LEVEL<=(SELECT greatest(MAX(regexp_count(old_revisions,',')),MAX(regexp_count(new_revisions,',')))+1 c FROM table)
)
GROUP BY r
HAVING listagg(o,',') WITHIN GROUP (ORDER BY o)<>listagg(n,',') WITHIN GROUP (ORDER BY n);
This could be a way:
-- Returns the rows of REVISIONS whose largest old revision is greater than
-- their largest new revision. Each list is split by joining the row to a
-- generated collection of position numbers.
select
OLD_REVISIONS,
NEW_REVISIONS
from
REVISIONS t,
-- positions for OLD_REVISIONS: removing every non-comma run leaves only the
-- commas, so length(...) + 1 is the upper bound used for LEVEL
table(cast(multiset(
select level
from dual
connect by level <= length (regexp_replace(t.OLD_REVISIONS, '[^,]+')) + 1
) as sys.OdciNumberList
)
) levels_old,
-- positions for NEW_REVISIONS, built the same way
table(cast(multiset(
select level
from dual
connect by level <= length (regexp_replace(t.NEW_REVISIONS, '[^,]+')) + 1
)as sys.OdciNumberList
)
) levels_new
-- ROWID keeps rows with identical list values apart
group by t.ROWID,
OLD_REVISIONS,
NEW_REVISIONS
-- compare the numeric maxima of the two lists
having max(to_number(trim(regexp_substr(t.OLD_REVISIONS, '[^,]+', 1, levels_old.column_value)))) >
max(to_number(trim(regexp_substr(t.new_REVISIONS, '[^,]+', 1, levels_new.column_value))))
This uses a double string split to pick the values from every field, and then simply finds the rows where the max values among the two collections match your requirement.
You should edit this by adding some unique key to the GROUP BY clause, or a rowid if you don't have any unique key on your table.
One way to do it is to split the columns on the comma separator using regexp_substr and check whether the max values of the two columns differ.
Sample Demo
-- Returns the rows of t whose largest old revision differs from their largest
-- new revision.
-- rownums: numbers every row of t (ordered by old_revisions) so rows can be
-- identified by rn.
with rownums as (select t.*,row_number() over(order by old_revisions) rn from t)
select old_revisions,new_revisions
from rownums
where rn in (select rn
from rownums
group by rn
-- expand while either list still has an element at position LEVEL
connect by regexp_substr(old_revisions, '[^,]+', 1, level) is not null
or regexp_substr(new_revisions, '[^,]+', 1, level) is not null
-- per rn, compare the numeric maxima of the two lists
having max(cast(regexp_substr(old_revisions,'[^,]+', 1, level) as int))
<> max(cast(regexp_substr(new_revisions,'[^,]+', 1, level) as int))
)
Comments say normalise data. I agree but also I understand it may be not possible. I would try something like query below:
-- For every row of tab1, returns the larger of the two list maxima (v1, v2)
-- together with the row's rowid.
select greatest(val1, val2), t1.r from (
  -- Largest element of v1 per row. Elements are converted with to_number:
  -- max() over the raw strings compares text, which ranks '9' above '10'.
  select max(to_number(val)) val1, r from (
    select regexp_substr(v1,'[^,]+', 1, level) val, rowid r from tab1
    connect by regexp_substr(v1, '[^,]+', 1, level) is not null
  ) group by r) t1
inner join (
  -- largest element of v2 per row, also compared numerically
  select max(to_number(val)) val2, r from (
    select regexp_substr(v2,'[^,]+', 1, level) val, rowid r from tab1
    connect by regexp_substr(v2, '[^,]+', 1, level) is not null
  ) group by r) t2
on (t1.r = t2.r);
Tested on:
create table tab1 (v1 varchar2(100), v2 varchar2(100));
insert into tab1 values ('1,3,5','1,4,7');
insert into tab1 values ('1,3,5','1,2,9');
insert into tab1 values ('1,3,5','1,3,5');
insert into tab1 values ('1,3,5','1,4');
and seems to work fine. I left rowid for reference. I guess you have some id in table.
After your edit I would change query to:
-- Per document, returns the larger of: the largest revision in the snapshot
-- list tab1.v1, and the largest REVISIONNUMBER stored in NEW_REVISIONS.
select greatest(val1, val2), t1.r from (
  -- Largest snapshot revision per document, compared numerically. The view
  -- only exposes the alias r, so the grouping uses r.
  select max(to_number(val)) val1, r from (
    select regexp_substr(v1,'[^,]+', 1, level) val, DOCNUMBER r from tab1
    connect by regexp_substr(v1, '[^,]+', 1, level) is not null
  ) group by r) t1
inner join (
  -- Largest live revision per document. The original took max(DOCNUMBER) and
  -- had no GROUP BY, which fails to compile and reads the wrong column.
  select max(REVISIONNUMBER) val2, DOCNUMBER r from NEW_REVISIONS group by DOCNUMBER) t2
on (t1.r = t2.r);
You may write a PL/SQL function parsing the string and returning the maximal number
select max_num( '1,26,24,25') max_num from dual;
MAX_NUM
----------
26
The query is then very simple:
-- Rows whose largest old revision is lower than their largest new revision.
-- The comma between the columns was missing, which made NEW_REVISIONS a mere
-- alias of OLD_REVISIONS and returned a single column.
-- NOTE(review): flip the comparison if the rows wanted are those where the
-- old maximum is the higher one.
select OLD_REVISIONS, NEW_REVISIONS
from revs
where max_num(OLD_REVISIONS) < max_num(NEW_REVISIONS);
A prototype function without validation and error handling:
create or replace function max_num(str_in VARCHAR2) return NUMBER as
  -- Returns the largest number in a comma-separated list such as '1,26,24,25'.
  -- Prototype without validation: a non-numeric element raises from TO_NUMBER.
  -- The running maximum starts at 0, so a NULL or empty list returns 0 and
  -- negative values are never returned.
  --
  -- The earlier right-to-left digit loop only compared a number when it met a
  -- comma, so the leftmost number was never considered ('26,1' returned 1 and
  -- '5' returned 0). Each element is now extracted and compared explicitly.
  elem  varchar2(32767);
  idx   pls_integer := 1;
  n     number;
  max_n number := 0;
begin
  loop
    -- idx-th non-empty element; NULL once the list is exhausted
    elem := regexp_substr(str_in, '[^,]+', 1, idx);
    exit when elem is null;

    n := to_number(trim(elem));
    if n > max_n then
      max_n := n;
    end if;

    idx := idx + 1;
  end loop;
  return(max_n);
end;
/