Select equal number of random rows with respect to a column - sql

Consider I have a table like this
Col1 || Col2
-------------
a || 0
b || 0
c || 1
d || 1
e || 0
How can I select rows from it so that I have an equal number of 1s and 0s? For example, the following would be a valid result:
Col1 || Col2
-------------
a || 0
c || 1
d || 1
e || 0
The rows removed/left out are at random and deleting from an existing table would work as well.

For each col2 partition, you can give each row a row number and then find those rows where there is only one instance of the row number and delete them:
-- Balance table_name in place by deleting the surplus rows of the more frequent col2 value.
-- Assumes col2 holds exactly two distinct values (0 and 1 in the example) -- TODO confirm.
-- NOTE(review): ROWID is referenced through the inline view without being selected
-- explicitly; this presumably relies on Oracle resolving it to table_name -- verify.
DELETE FROM table_name
WHERE ROWID IN (
-- One ROWID per row number that is held by a single row only.
SELECT MIN(ROWID)
FROM (
-- Number the rows of each col2 value independently, in random order.
SELECT ROW_NUMBER() OVER (PARTITION BY col2 ORDER BY DBMS_RANDOM.VALUE)
AS rn
FROM table_name
)
GROUP BY rn
-- Fewer than two rows share this row number, so the row has no partner in another partition.
HAVING COUNT(*) < 2
);
If you just want to SELECT the rows then you can use a similar technique:
-- Return a random, balanced subset: only rows whose per-col2 random position
-- is shared by exactly two rows are kept.
WITH shuffled AS (
    -- Number the rows of each col2 value independently, in random order.
    SELECT
        col1,
        col2,
        ROW_NUMBER() OVER (PARTITION BY col2 ORDER BY DBMS_RANDOM.VALUE) AS rn
    FROM table_name
),
paired AS (
    -- Count how many rows share each position.
    SELECT
        col1,
        col2,
        COUNT(*) OVER (PARTITION BY rn) AS cnt
    FROM shuffled
)
SELECT
    col1,
    col2
FROM paired
WHERE cnt = 2;
db<>fiddle here

How can I select rows from it so that I have equal number of 1s and 0s?
Yet another option might be to count COL2 values and use least of those two (as the final result has to have equal number of 0s and 1s) in a UNION set operation. Something like this:
Sample data:
SQL> select * from test;
COL1 COL2
---- ----------
a 0
b 0
c 1
d 1
e 0
Query & result:
SQL> with cnts as
2 -- count rows by COL2 value
3 (select sum(case when col2 = 0 then 1 else 0 end) cnt_0,
4 sum(case when col2 = 1 then 1 else 0 end) cnt_1
5 from test
6 )
7 select t.* from test t cross join cnts c
8 where t.col2 = 0 and rownum <= least(c.cnt_0, c.cnt_1)
9 union all
10 select t.* from test t cross join cnts c
11 where t.col2 = 1 and rownum <= least(c.cnt_0, c.cnt_1);
COL1 COL2
---- ----------
a 0
b 0
c 1
d 1
SQL>

You can do this with only one subquery/CTE. The following expression returns the smaller of the two counts (the number of 0s and the number of 1s), which determines how many rows of each value are returned:
least( sum(col2), sum(1 - col2) ) as num_rows
Then, you can incorporate this into a window function with row_number():
-- Keep, for each col2 value, as many randomly chosen rows as the rarer value has.
-- SUM(col2) counts the 1s and SUM(1 - col2) counts the 0s, so col2 is assumed
-- to contain only 0 and 1 -- TODO confirm.
WITH ranked AS (
    SELECT
        src.col1,
        src.col2,
        LEAST(SUM(src.col2) OVER (), SUM(1 - src.col2) OVER ()) AS keep_per_value,
        ROW_NUMBER() OVER (PARTITION BY src.col2 ORDER BY DBMS_RANDOM.VALUE) AS pick_order
    FROM t src
)
SELECT
    col1,
    col2
FROM ranked
WHERE pick_order <= keep_per_value;

Use window functions to compute, for every row, the frequency of its col2 value and a row number within that col2 value. Then take the minimum frequency, and finally keep only the rows whose row number is less than or equal to that minimum frequency.
-- Return an equal number of randomly chosen rows for each col2 value.
-- Fixes relative to the original snippet:
--   * "SELECT *, <expr>" is not valid in Oracle; the columns are listed explicitly.
--   * the alias "rownum" clashes with Oracle's ROWNUM pseudocolumn; it is now "rn".
--   * the implicit comma join is written as an explicit CROSS JOIN.
WITH data AS
(
    -- rn: random position of the row inside its col2 value; freq: size of that col2 value.
    SELECT
        t.col1,
        t.col2,
        ROW_NUMBER() OVER (PARTITION BY t.col2 ORDER BY DBMS_RANDOM.VALUE) AS rn,
        COUNT(*) OVER (PARTITION BY t.col2) AS freq
    FROM test t
),
data2 AS
(
    -- Size of the rarest col2 value that is present in the table.
    SELECT MIN(freq) AS cnt
    FROM data
)
SELECT
    d.col1,
    d.col2
FROM data d
CROSS JOIN data2 m
WHERE d.rn <= m.cnt

This analytic function checks whether there are more zeroes or ones in the table (the result is negative when zeroes are in the majority and positive when ones are):
sum(decode(col2,0,-1,col2)) over()
Depending on the result, compute a cumulative sum that starts with the value of col2 that appears less often: that value is mapped (using decode) to -1 and the other value is mapped to 1.
The filter is done on cum_sum <= 0 i.e. you get the same number of 0 and 1.
-- Balance 0s and 1s with a running total: the rarer value counts as -1 and is
-- visited first, the other value counts as +1; rows are kept while the total is <= 0.
WITH running AS (
    SELECT
        col1,
        col2,
        CASE
            -- Overall total <= 0: zeroes are at least as frequent as ones,
            -- so walk the ones first (col2 descending) and count them as -1.
            WHEN SUM(CASE WHEN col2 = 0 THEN -1 ELSE col2 END) OVER () <= 0
                THEN SUM(CASE col2 WHEN 0 THEN 1 WHEN 1 THEN -1 END)
                         OVER (ORDER BY col2 DESC, col1)
            -- Otherwise ones dominate: walk the zeroes first and count them as -1.
            ELSE SUM(CASE WHEN col2 = 0 THEN -1 ELSE col2 END)
                     OVER (ORDER BY col2, col1)
        END AS cum_sum
    FROM tab
)
SELECT
    col1,
    col2
FROM running
WHERE cum_sum <= 0;

Related

SQL - Two Columns into One Distinct Ordered Column

If I have a table like this:
Col 1 | Col 2
-------------
A | 1
A | 2
B | 1
C | 1
C | 2
C | 3
How can I write a query to pull one column that looks like this --
Col 1
------
A
1
2
B
1
C
1
2
3
-- Emit every col1 value and every col2 value as rows of one output column.
-- There is no ORDER BY, so the order of the combined rows is not guaranteed.
SELECT src.col1
FROM Some_Table_You_Did_Not_Name src
UNION ALL
SELECT src.col2
FROM Some_Table_You_Did_Not_Name src
If the order matters in your example then you want this:
-- Produce one column listing each col1 value once, followed by its col2 values.
-- Fix: the original numbered rows with ROW_NUMBER(), which is unique per row, so
-- DISTINCT never collapsed col1 and the header repeated before every col2 value
-- (A,1,A,2,...). DENSE_RANK() over col1 gives all rows of one col1 the same RN,
-- so the header appears once per group (A,1,2,B,1,...).
-- NOTE(review): if col1 is a character column and col2 is numeric, col2 may need
-- an explicit CAST for the UNION ALL -- confirm against the real schema.
WITH data AS
(
    SELECT col1, col2, DENSE_RANK() OVER (ORDER BY col1) AS RN
    FROM Some_Table_You_Did_Not_Name
)
SELECT col
FROM (
    -- One header row per col1 group.
    SELECT DISTINCT col1 AS col, RN, 1 AS O
    FROM data
    UNION ALL
    -- Every col2 value of the group; not de-duplicated, one output row per source row.
    SELECT col2 AS col, RN, 2 AS O
    FROM data
) stacked
-- Group first, then header before values, then values in ascending order.
ORDER BY RN ASC, O ASC, col ASC
You can use a query like the following:
-- List each Col1 value once (grp 0) followed by its Col2 values (grp 1).
SELECT stacked.Col1
FROM (
    -- Header rows: one per distinct Col1, sorted under its own key.
    SELECT DISTINCT
        Col1,
        Col1 AS parent_key,
        0 AS grp
    FROM mytable
    UNION ALL
    -- Value rows: Col2 shown in the output column, keyed by the Col1 it belongs to.
    SELECT
        Col2 AS Col1,
        Col1 AS parent_key,
        1 AS grp
    FROM mytable
) AS stacked
ORDER BY
    stacked.parent_key,
    stacked.grp,
    stacked.Col1
Demo here
There is absolutely no need to do a UNION, UNION ALL or reference the table more than once to unpivot data...
-- If Col2 is always a well-ordered sequence starting at 1, like the test data:
-- emit Col1 only on the row where Col2 = 1, then Col2 itself on every row.
SELECT
    x.Value AS Col1
FROM #TestData td
CROSS APPLY (
    VALUES
        (CASE WHEN td.Col2 = 1 THEN td.Col1 ELSE NULL END),
        (CAST(td.Col2 AS CHAR(1)))
) x (Value)
-- Drop the placeholder NULLs produced for rows that are not the first of their group.
WHERE x.Value IS NOT NULL;
-- If Col2 is not a clean sequence: number the rows of each Col1 by Col2 and
-- emit Col1 only on the first row of each group, then Col2 on every row.
WITH numbered AS (
    SELECT
        td.Col1,
        td.Col2,
        ROW_NUMBER() OVER (PARTITION BY td.Col1 ORDER BY td.Col2) AS RN
    FROM #TestData td
)
SELECT
    x.Value
FROM numbered n
CROSS APPLY (
    VALUES
        (CASE WHEN n.RN = 1 THEN n.Col1 ELSE NULL END),
        (CAST(n.Col2 AS CHAR(1)))
) x (Value)
-- Drop the placeholder NULLs produced for rows that are not the first of their group.
WHERE x.Value IS NOT NULL;
HTH,
Jason

Oracle Group by based on next row value

We are trying to get a group by result by checking the next rows value.
Sample Data:
Table A
COL1 COL2 COL3
---- ---- ----
B BUY 1
B SELL 1.2
B SELL 2
C BUY 3
C SELL 4
C BUY 5
Result:
COL1 COL2 COUNT(1)
---- ---- --------
B BUY 1
B SELL 2
C BUY 1
C SELL 1
C BUY 1
You appear to have ordered by COL3; if this is the case then:
-- Count the length of each run of consecutive equal col2 values per col1,
-- with rows ordered by col3.
WITH run_ends AS (
    -- run_end_pos is the row's position within its col1 when the NEXT row has a
    -- different col2 (or there is no next row); otherwise it is NULL.
    SELECT
        col1,
        col2,
        CASE
            WHEN LEAD(col2) OVER (PARTITION BY col1 ORDER BY col3) = col2 THEN NULL
            ELSE ROW_NUMBER() OVER (PARTITION BY col1 ORDER BY col3)
        END AS run_end_pos
    FROM a
)
SELECT
    col1,
    col2,
    -- Run length = this run's end position minus the previous run's end position.
    -- LAG is evaluated after the WHERE filter, so it only sees run-ending rows.
    run_end_pos
        - COALESCE(LAG(run_end_pos) OVER (PARTITION BY col1 ORDER BY run_end_pos), 0)
        AS cnt
FROM run_ends
WHERE run_end_pos IS NOT NULL;
If I understand correctly, you can do this with a difference of row numbers approach:
-- Count runs of adjacent equal col2 values per col1 (ordered by col3) using the
-- difference-of-row-numbers technique: within one run, seqnum - seqnum_2 is constant.
-- Fix: removed the trailing comma after "seqnum_2", which made the statement unparsable.
select col1, col2, count(*)
from (select t.*,
             -- position of the row within its col1
             row_number() over (partition by col1 order by col3) as seqnum,
             -- position of the row within its (col1, col2) pair
             row_number() over (partition by col1, col2 order by col3) as seqnum_2
      from t
     ) t
group by col1, col2, (seqnum - seqnum_2);
This identifies groups of adjacent col2 values based on the ordering in col3.

SQL Server : get max of the column2 and column3 value must be 1

I have the following output from part of my stored procedure:
col1 col2 col3 col4
--------------------------
2016-05-05 1 2 2
2016-05-05 1 3 32
2016-05-12 2 1 11
2016-05-12 3 1 31
Now I need to get result based on this condition
col2 = 1 and col3 = max or col3 = 1
and col2 = max
The final result should be
col1 col2 col3 col4
-------------------------
2016-05-05 1 3 32
2016-05-12 3 1 31
I am not sure whether this is the most efficient way, but you can use ROW_NUMBER():
-- Per col1, return the row with the highest col3 among rows where col2 = 1, plus
-- the row with the highest col2 among rows where col3 = 1.
-- Fixes: each branch had a trailing comma after "rnk" and no FROM clause, so the
-- statement could not parse. "your_table" is a placeholder for the real source.
-- NOTE(review): a row with col2 = 1 AND col3 = 1 can be returned by both branches,
-- and the helper column rnk is part of the output because of SELECT * -- confirm
-- whether either matters for the caller.
SELECT * FROM (
    SELECT t.*,
           ROW_NUMBER() OVER (PARTITION BY t.col1 ORDER BY t.col3 DESC) AS rnk
    FROM your_table t
    WHERE t.col2 = 1
    UNION ALL
    SELECT t.*,
           ROW_NUMBER() OVER (PARTITION BY t.col1 ORDER BY t.col2 DESC) AS rnk
    FROM your_table t
    WHERE t.col3 = 1) tt
WHERE rnk = 1
This will give you all the records with
(col2=1 and col3=max) or (col3=1 and col2=max)
This is a bit tricky. Your data has no ambiguities, such as duplicate maximum values in col4 or "1" values in both col2 and col3.
The following is a direct translation of the logic in your question:
-- Return the rows whose col4 equals the largest col4 found, for the same col1,
-- among rows having col2 = 1 or col3 = 1.
SELECT outer_row.*
FROM t outer_row
WHERE outer_row.col4 = (
    SELECT MAX(candidate.col4)
    FROM t candidate
    WHERE (candidate.col2 = 1 OR candidate.col3 = 1)
      AND candidate.col1 = outer_row.col1
);
Try this. Note if there are more than 1 same max value, then you need all of those in output. And it will work for all scenarios, even when col1 is not in sync with col2 and col3.
I am first finding highest values of col2 and col3 and assigning them value as 1. Then in outer query, I am using your join condition. Demo created for Postgres DB as SQLServer wasn't available.
SQLFiddle Demo
-- Return rows where col2 = 1 and col3 is the table-wide maximum, or where
-- col3 = 1 and col2 is the table-wide maximum. RANK() keeps ties, so every row
-- sharing a maximum is returned.
WITH ranked AS (
    SELECT
        src.*,
        RANK() OVER (ORDER BY src.col3 DESC) AS col3_rank,
        RANK() OVER (ORDER BY src.col2 DESC) AS col2_rank
    FROM your_table src
)
SELECT
    col1,
    col2,
    col3,
    col4
FROM ranked
WHERE (col2 = 1 AND col3_rank = 1)
   OR (col3 = 1 AND col2_rank = 1)
Alternative way:
-- Per col1, keep the single row with the largest "other" value: col3 when
-- col2 = 1, otherwise col2.
SELECT *
FROM (
    SELECT
        *,
        ROW_NUMBER() OVER (
            PARTITION BY col1
            ORDER BY CASE WHEN col2 = 1 THEN col3 ELSE col2 END DESC
        ) AS r
    FROM tbl
) t
WHERE r = 1

select query to fetch rows corresponding to all values in a column

Consider this example table "Table1".
Col1 Col2
A 1
B 1
A 4
A 5
A 3
A 2
D 1
B 2
C 3
B 4
I am trying to fetch those values from Col1 which corresponds to all values (in this case, 1,2,3,4,5). Here the result of the query should return 'A' as none of the others have all values 1,2,3,4,5 in Col2.
Note that the values in Col2 are decided by other parameters in the query and they will always return some numeric values. Out of those values the query needs to fetch values from Col1 corresponding to all in Col2. The values in Col2 could be 11,12,1,2,3,4 for instance (meaning not necessarily in sequence).
I have tried the following select query:
-- Attempt 1 (from the question, reported as not working): filters Col1 -- not Col2 --
-- against the numeric list, and IN matches any one listed value rather than all of them.
select distinct Col1 from Table1 where Col1 in (1,2,3,4,5);
-- Attempt 2 (from the question, reported as not working).
-- NOTE(review): "Col1 exists (...)" uses EXISTS as an infix operator, which presumably
-- does not parse; EXISTS normally takes only a subquery.
select distinct Col1 from Table1 where Col1 exists (select distinct Col2 from Table1);
and its different variations. But the problem is that I need to apply an 'and' for Col2 not an 'or'.
like Return a value from Col1 where Col2 'contains' all values between 1 and 5.
Appreciate any suggestion.
You could use analytic ROW_NUMBER() function.
SQL FIddle for a setup and working demonstration.
-- Return each col1 that reaches a fifth row among the wanted col2 values.
-- NOTE(review): this equals "has all five values" only if (col1, col2) pairs are
-- unique; duplicates would also be numbered -- confirm against the real data.
WITH numbered AS (
    SELECT
        col1,
        col2,
        ROW_NUMBER() OVER (PARTITION BY col1 ORDER BY col2) AS rn
    FROM your_table
    WHERE col2 IN (1, 2, 3, 4, 5)
)
SELECT col1
FROM numbered
WHERE rn = 5;
UPDATE As requested by OP, some explanation about how the query works.
The inner sub-query gives you the following resultset:
SQL> SELECT col1,
2 col2,
3 row_number() OVER(PARTITION BY col1 ORDER BY col2) rn
4 FROM t
5 WHERE col2 IN (1,2,3,4,5);
C COL2 RN
- ---------- ----------
A 1 1
A 2 2
A 3 3
A 4 4
A 5 5
B 1 1
B 2 2
B 4 3
C 3 1
D 1 1
10 rows selected.
The PARTITION BY clause groups the rows by col1, and ORDER BY sorts col2 within each col1 group. Thus the sub-query gives you the row_number for each row in an ordered way. Now you know that you only need those rows where row_number is at least 5. So, in the outer query all you need to do is add WHERE rn = 5 to filter the rows.
You can use listagg function, like
-- Build the ordered, comma-separated list of Col2 values per Col1 and keep the
-- Col1 values whose list is exactly '1,2,3,4,5'.
WITH per_col1 AS (
    SELECT
        Col1,
        LISTAGG(Col2, ',') WITHIN GROUP (ORDER BY Col2) AS Col2List
    FROM Table1
    GROUP BY Col1
)
SELECT Col1
FROM per_col1
WHERE Col2List = '1,2,3,4,5'
You can also use below
-- Keep each COL1 that has exactly five rows, all of whose COL2 values lie in 1..5.
-- The five separate 0/1 CASE terms of the original are mutually exclusive, so
-- their sum per row is 1 exactly when COL2 is one of the five values.
-- NOTE(review): duplicate COL2 values within a COL1 would still satisfy both
-- conditions -- confirm that (COL1, COL2) is unique.
SELECT COL1
FROM TABLE_NAME
GROUP BY COL1
HAVING COUNT(COL1) = 5
   AND SUM(CASE WHEN COL2 IN (1, 2, 3, 4, 5) THEN 1 ELSE 0 END) = 5

Group rows Keeping the Order of values

How can I group following set of data:
DATENO COL1 COL2
1 A 1
2 B 1
3 C 1
4 C 1
5 D 1
6 C 1
7 D 1
8 D 1
9 E 1
To get something like this:
DATENO COL1 COL2
1 A 1
2 B 1
3 C 2
5 D 1
6 C 1
7 D 2
9 E 1
Sum for C and D are grouped keeping the order intact. Any ideas?
Updated: answer corrected according to comments.
Rows can be grouped as required in the following way:
-- Collapse each run of consecutive rows (ordered by dateno) that share the same
-- col1 into the run's first row, with col2 replaced by the run's total.
with with_prev as (
    -- attach to every row the col1 value of the preceding row
    select
        dateno,
        col1,
        col2,
        lag(col1) over (order by dateno) as prev_col1
    from t
),
flagged as (
    -- is_start = 1 when col1 differs from the preceding row; the very first row has
    -- no predecessor, so it is compared against col1||'X', which never equals col1
    select
        dateno,
        col1,
        col2,
        decode(nvl(prev_col1, col1 || 'X'), col1, 0, 1) as is_start
    from with_prev
),
bucketed as (
    -- the running count of run starts gives every run its own bucket number
    select
        dateno,
        col1,
        col2,
        is_start,
        sum(is_start) over (order by dateno rows unbounded preceding) as bucket_number
    from flagged
),
summed as (
    -- total col2 of each bucket, repeated on every row of the bucket
    select
        dateno,
        col1,
        is_start,
        sum(col2) over (partition by bucket_number) as group_sum
    from bucketed
)
select
    dateno,
    col1,
    group_sum as col2
from summed
where is_start = 1
order by dateno
Example at SQLFiddle