SQL query to calculate totals from table that has overlapping/duplicate data - sql

I need to calculate the total number of items from a table that has smaller building units. For example, let's say I have
Unit_Table
UnitId StartUnit EndUnit QtyUnit
---------------------------------------
1 0 2 5
1 0 15 20
1 2 15 15
1 15 30 8
2 0 2 2
2 0 15 12
And let's say I have
Final_Table
UnitId StartFinal EndFinal QtyFinal
----------------------------------------
1 0 30
1 0 15
1 2 30
2 0 15
How do I populate the QtyFinal column with the correct numbers (28, 20, 23 and 12 in this example)? Let's assume that all building blocks necessary for Final_Table are in Unit_Table
The following query does not work:
-- Aggr: for each Final_Table row, sum QtyUnit over every Unit_Table row of the
-- same UnitId whose StartUnit lies in [StartFinal, EndFinal).
-- Overlapping unit rows (e.g. 0-2, 2-15 and 0-15) all pass this filter, so
-- their quantities are all added together.
;With Aggr As (
Select F.UnitId
, F.StartFinal
, F.EndFinal
, Sum(U.QtyUnit) As QtySum
From Final_Table As F
Inner Join Unit_Table As U
On F.UnitId = U.UnitId
Where F.StartFinal <= U.StartUnit
And F.EndFinal > U.StartUnit
Group By F.UnitId
, F.StartFinal
, F.EndFinal
)
-- Copy each aggregated sum onto the Final_Table row with the same
-- (UnitId, StartFinal, EndFinal).
Update Final_Table
Set QtyFinal = QtySum
From Final_Table As F
Inner Join Aggr
On F.UnitId = Aggr.UnitId
And F.StartFinal = Aggr.StartFinal
And F.EndFinal = Aggr.EndFinal
The problem is that the Unit_Table has overlaps ((0-2) (2-15) and (0-15) for UnitId 1, for example), how do I eliminate rows 0-2 and 2-15 from the aggregate function for calculating 0-30?

I was thinking about processing Final_Table first. Then, for each row, we can look "the path" from Start to the End in the Unit_Table.
-- For every Final_Table row, walk Unit_Table from StartFinal to EndFinal by
-- chaining blocks end-to-start, add up their QtyUnit, and store the total in
-- QtyFinal.
DECLARE
    VarStart NUMBER;
    VarEnd   NUMBER;
    VarQty   NUMBER;
    TotalQty NUMBER;
BEGIN
    FOR i IN (SELECT UnitId, StartFinal, EndFinal
              FROM Final_Table
              ORDER BY UnitId, StartFinal) LOOP
        VarStart := i.StartFinal;
        TotalQty := 0;
        LOOP
            -- Take the longest block that starts at the current position,
            -- moves forward, and does not run past EndFinal. ROWNUM is
            -- applied to the ordered inline view so the choice is
            -- deterministic; a bare "ROWNUM = 1" could return any matching
            -- block, including one that overshoots EndFinal.
            -- If no such block exists the path is incomplete and
            -- NO_DATA_FOUND propagates to the caller.
            SELECT EndUnit, QtyUnit
            INTO VarEnd, VarQty
            FROM (SELECT EndUnit, QtyUnit
                  FROM Unit_Table
                  WHERE UnitId = i.UnitId
                    AND StartUnit = VarStart
                    AND EndUnit > VarStart
                    AND EndUnit <= i.EndFinal
                  ORDER BY EndUnit DESC)
            WHERE ROWNUM = 1;
            TotalQty := TotalQty + VarQty;
            VarStart := VarEnd;
            EXIT WHEN VarEnd = i.EndFinal;
        END LOOP;
        UPDATE Final_Table
        SET QtyFinal = TotalQty
        WHERE UnitId = i.UnitId
          AND StartFinal = i.StartFinal
          AND EndFinal = i.EndFinal;
    END LOOP;
END;
/
I tried it and the result is:
UNITID STARTFINAL ENDFINAL QTYFINAL
1 0 15 20
1 0 30 28
1 2 30 23
2 0 15 12
Of course, there's a problem when "the path" is not complete in Unit_Table, like this:
UnitId StartUnit EndUnit QtyUnit
---------------------------------------
3 0 2 7
3 15 30 11
... But I don't know whether this can occur in your application. Maybe it can't happen because a complete path is one of your requirements before inserting into Unit_Table.

I think you need a correlated subquery or cross apply:
-- Set QtyFinal on each Final_Table row to the sum of QtyUnit over the
-- Unit_Table rows of the same UnitId whose StartUnit lies in
-- [StartFinal, EndFinal).
Update F
Set QtyFinal = u.QtyUnit
From Final_Table F cross apply
     -- The aggregate must be given a column alias; otherwise the derived
     -- table has an unnamed column and u.QtyUnit cannot be resolved.
     (select sum(u.QtyUnit) as QtyUnit
      from Unit_Table u
      where F.UnitId = u.UnitId and
            F.StartFinal <= u.StartUnit And F.EndFinal > u.StartUnit
     ) u;
Notes:
The update needs the table alias defined in the from clause. Otherwise, it is a different reference to the table.
If you are not familiar with cross apply, it is a lot like a correlated subquery in the from clause.
A correlated subquery would be similar, but it would update all rows instead of only those that match.

Related

Counting nulls for each column in table

We want to count how many nulls each column in a table has. There are too many columns to do this one by one, so the following PLSQL procedure was created.
In the first part of the procedure, all column names are obtained. This works, as the dbms_output correctly lists them all.
Secondly, a query inserts the count of null values in the variable 'nullscount'. This part does not work, as the output printed for this variable is always 0, even for columns where we know there are nulls.
Does anyone know how to handle the second part correctly?
Many thanks.
-- Loops over the column names of table GP (from all_tab_columns) and prints
-- each name followed by a count taken from gp.
CREATE OR REPLACE PROCEDURE COUNTNULLS AS
nullscount int;
BEGIN
for c in (select column_name from all_tab_columns where table_name = upper('gp'))
loop
-- NOTE: c.column_name is the cursor record's value (the name of a column,
-- as a string), not a reference to that column in gp.
select count(*) into nullscount from gp where c.column_name is null;
dbms_output.put_line(c.column_name||' '||nullscount);
end loop;
END COUNTNULLS;
You can get it with a single query like this, which scans the table just once:
DBFiddle: https://dbfiddle.uk/asgrCezT
-- Builds the text of one SELECT containing a count(*)-count("COL") expression
-- per column of GP, passes it to dbms_xmlgen.getxmltype, and turns each child
-- element of /ROWSET/ROW into one (column_name, cnt_nulls) row.
select *
from xmltable(
'/ROWSET/ROW/*'
passing
dbms_xmlgen.getxmltype(
(
-- Generated text has the shape:
--   select count(*)-count("A") as "A",count(*)-count("B") as "B",... from GP
select
'select '
||listagg('count(*)-count("'||column_name||'") as "'||column_name||'"',',')
||' from '||upper('gp')
from user_tab_columns
where table_name = upper('gp')
)
)
columns
-- element name = column name; element text = that column's null count
column_name varchar2(30) path './name()',
cnt_nulls int path '.'
);
Results:
COLUMN_NAME CNT_NULLS
------------------------------ ----------
A 5
B 4
C 3
Dynamic sql in this query uses (24 chars + column name length) so it should work fine for example for 117 columns with average column name length = 10. If you need more, you can rewrite it a bit, for example:
-- Variant that generates shorter dynamic SQL: count(*) is selected once as
-- CNT and each column contributes only count("COL"). The XQuery then emits
-- one <R name="COL"> element per column holding CNT - count("COL"), i.e. the
-- number of NULLs in that column.
select *
from xmltable(
    'let $cnt := /ROWSET/ROW/CNT
     for $r in /ROWSET/ROW/*[name() != "CNT"]
     return <R name="{$r/name()}"> {$cnt - $r} </R>'
    passing
        dbms_xmlgen.getxmltype(
            (
                -- Generated text has the shape:
                --   select count(*) CNT,count("A") as "A",... from GP
                select
                    'select count(*) CNT,'
                    ||listagg('count("'||column_name||'") as "'||column_name||'"',',')
                    ||' from '||upper('gp')
                from user_tab_columns
                where table_name = upper('gp')
            )
        )
    columns
        -- The column name is carried in the "name" attribute of <R>, so it
        -- is read with the XPath attribute step @name.
        column_name varchar2(30) path '@name',
        cnt_nulls   int          path '.'
);
-- Demo table: identity primary key plus five nullable numeric columns.
create table gp (
id number generated by default on null as identity
constraint gp_pk primary key,
c1 number,
c2 number,
c3 number,
c4 number,
c5 number
)
;
-- add some data with NULLS and numbers
DECLARE
BEGIN
FOR r IN 1 .. 20 LOOP
-- c1..c4 are NULL whenever r is divisible by 2, 3, 4 and 5 respectively;
-- c5 is always 5, so it never contains a NULL.
INSERT INTO gp (c1,c2,c3,c4,c5) VALUES
(CASE WHEN mod(r,2) = 0 THEN NULL ELSE mod(r,2) END
,CASE WHEN mod(r,3) = 0 THEN NULL ELSE mod(r,3) END
,CASE WHEN mod(r,4) = 0 THEN NULL ELSE mod(r,4) END
,CASE WHEN mod(r,5) = 0 THEN NULL ELSE mod(r,5) END
,5);
END LOOP;
END;
/
-- check what is in the table
SELECT * FROM gp;
-- do count of each column
DECLARE
l_colcount NUMBER;
-- Statement template; the $...$ placeholders are substituted per column below.
l_statement VARCHAR2(100) := 'SELECT COUNT(*) FROM $TABLE_NAME$ WHERE $COLUMN_NAME$ IS NULL';
BEGIN
FOR r IN (SELECT column_name,table_name FROM user_tab_columns WHERE table_name = 'GP') LOOP
-- Identifiers cannot be bound, so the table and column names are spliced
-- into the statement text and the result is run as dynamic SQL.
EXECUTE IMMEDIATE REPLACE(REPLACE(l_statement,'$TABLE_NAME$',r.table_name),'$COLUMN_NAME$',r.column_name) INTO l_colcount;
dbms_output.put_line('Table: '||r.table_name||', column'||r.column_name||', COUNT: '||l_colcount);
END LOOP;
END;
/
Table created.
Statement processed.
Result Set 4
ID C1 C2 C3 C4 C5
1 1 1 1 1 5
2 - 2 2 2 5
3 1 - 3 3 5
4 - 1 - 4 5
5 1 2 1 - 5
6 - - 2 1 5
7 1 1 3 2 5
8 - 2 - 3 5
9 1 - 1 4 5
10 - 1 2 - 5
11 1 2 3 1 5
12 - - - 2 5
13 1 1 1 3 5
14 - 2 2 4 5
15 1 - 3 - 5
16 - 1 - 1 5
17 1 2 1 2 5
18 - - 2 3 5
19 1 1 3 4 5
20 - 2 - - 5
20 rows selected.
Statement processed.
Table: GP, columnID, COUNT: 0
Table: GP, columnC1, COUNT: 10
Table: GP, columnC2, COUNT: 6
Table: GP, columnC3, COUNT: 5
Table: GP, columnC4, COUNT: 4
Table: GP, columnC5, COUNT: 0
c.column_name is never null because it's the content of the column "column_name" of the table "all_tab_columns"
not the column of which name is the value of c.column_name, in table gp.
You have to use dynamic query and EXECUTE IMMEDIATE to achieve what you want.

How to write proc sql without windowfunction over partition by sum?

I just started learning SAS and realised that proc sql doesn't support window functions. As I am more at ease with SQL, I was wondering how I can simulate a sum window function in proc sql?
desired result
-- Inner query: number of customers per (active, store_id).
-- Window sum: adds those counts up across all rows of the same store_id.
select a.active, a.store_id, a.nbr, sum(nbr) over (partition by a.store_id)
from(select active, store_id, count(customer_id) as nbr from customer group by active, store_id) as a
;
active
store_id
nbr
sum
0
1
8
326
1
1
318
326
0
2
7
273
1
2
266
273
eg of raw data
-- Sample of the raw rows; there is no ORDER BY, so which 10 rows come back
-- is up to the engine.
select active, store_id, customer_id
from customer
limit 10;
active
store_id
customer_id
1
1
1
1
1
2
1
2
3
1
2
4
1
1
5
1
1
6
0
1
7
1
2
8
1
1
9
1
2
10
current result and query
-- Because a.nbr is also a grouping column, every (active, store_id, nbr) row
-- forms its own group and sum(nbr) simply repeats nbr.
select a.active, a.store_id, a.nbr, sum(nbr)
from(select active, store_id, count(customer_id) as nbr from customer group by active, store_id) as a
group by a.active, a.store_id, a.nbr;
active
store_id
nbr
sum
0
1
8
8
1
1
318
318
0
2
7
7
1
2
266
266
Unlike some SQL implementations, SAS is happy to re-merge summary statistics back onto the detail rows when you select variables that are neither GROUP BY variables nor summary statistics.
Let's convert your print outs of data into an actual dataset. And let's change one value so we have at least two values of ACTIVE to group by.
/* Test dataset HAVE: one row per customer, read from the in-line records   */
/* below as (active, store_id, customer_id).                                */
data have;
input active store_id customer_id;
cards;
1 1 1
1 1 2
1 2 3
1 2 4
1 1 5
1 1 6
0 1 7
1 2 8
1 1 9
1 2 10
;
Now we can count the records by ACTIVE and STORE_ID and then generate the report by appending the store total.
/* Inner query counts rows per (active, store_id). The outer query groups   */
/* by store_id only while still selecting active and nbr, so sum(nbr) is    */
/* attached to every detail row of the store.                               */
proc sql;
select active,store_id,nbr,sum(nbr) as store_nbr
from (select active,store_id,count(*) as nbr
from have
group by active,store_id
)
group by store_id
;
Resulting printout:
active store_id nbr store_nbr
---------------------------------------
0 1 1 6
1 1 5 6
1 2 4 4
You can do the equivalent in proc sql by merging two sub-queries: one for the count of customers by active, store_id, and another for the total customers for each store_id.
/* WANT: customer count per (store_id, active) with the store-wide customer */
/* count attached to each row as SUM.                                       */
proc sql noprint;
    create table want as
    select by_status.active,
           by_status.store_id,
           by_status.nbr,
           by_store.sum
    from
        /* customers per store and active flag */
        (select active,
                store_id,
                count(customer_id) as nbr
         from have
         group by store_id, active
        ) as by_status
    left join
        /* customers per store */
        (select store_id,
                count(customer_id) as sum
         from have
         group by store_id
        ) as by_store
    on by_status.store_id = by_store.store_id
    ;
quit;
If you wanted to do this in a more SASsy way, you can run proc means and merge together the results from a single dataset that holds everything you need. proc means will calculate all possible combinations of your variables by default.
/* Summarise HAVE into WANT_TOTAL: N of customer_id (as TOTAL) for the      */
/* 1-way and 2-way combinations of the class variables store_id and active. */
proc means data=have noprint;
class store_id active;
ways 1 2;
output out=want_total
n(customer_id) = total
;
run;
/* Merge the store_id*active rows (_TYPE_ = 3, total -> nbr) with the       */
/* store_id-only rows (_TYPE_ = 2, total -> sum) by store_id, then drop     */
/* every variable whose name starts with an underscore.                     */
data want;
merge want_total(where=(_TYPE_ = 3) rename=(total = nbr) )
want_total(where=(_TYPE_ = 2) rename=(total = sum) keep=_TYPE_ store_id total)
;
by store_id;
drop _:;
run;
Or, in SQL:
/* WANT: the store_id*active rows of WANT_TOTAL (_TYPE_ = 3) with the       */
/* matching store_id-only total (_TYPE_ = 2) attached as SUM.               */
proc sql;
    create table want as
    select t1.store_id
         , t1.active
         , t1.total as nbr
         , t2.total as sum
    from want_total as t1
    LEFT JOIN
         want_total as t2
    /* The filter on the right-hand table belongs in ON: placed in WHERE it */
    /* would discard unmatched rows and silently turn the LEFT JOIN into an */
    /* inner join.                                                          */
    ON  t1.store_id = t2.store_id
    AND t2._TYPE_ = 2
    where t1._TYPE_ = 3
    ;
quit;
The _TYPE_ variable identifies the level of the analysis. For example, _TYPE_ = 1 is for active only, _TYPE_ = 2 is for store_id only, and _TYPE_ = 3 is for all combinations. You can view this in the output dataset from proc means:
store_id active _TYPE_ _FREQ_ total
. 0 1 3 3
. 1 1 7 7
1 . 2 6 6
2 . 2 4 4
1 0 3 1 1
1 1 3 5 5
2 0 3 2 2
2 1 3 2 2
And if you wanted faster high-performance results, check out its big sibling, proc hpsummary.
Therein lies the cool thing about SAS: You can bounce between PROCs, SQL, the DATA Step, and Python via Pandas/proc python. You can exploit the unique benefits of each of these methods and thought processes for any number of data engineering and statistics problems.

Count rows according to 2 column with Group By

I have a database table of 3 columns
RecordID Deleted CardHolderID
1963 1 9
4601 1 9
6996 0 9
1532 1 11
1529 0 20
I want an SQL query that outputs the number of rows for each value of the Deleted column, grouped by CardHolderID.
However, the query below outputs two rows for a CardHolderID:
-- Grouping by both Deleted and CardHolderID yields one row per
-- (CardHolderID, Deleted) pair rather than one row per card holder.
select c.CardHolderID, c.Deleted, COUNT(*) as Sum
from Card c
group by c.Deleted, c.CardHolderID
CardHolderID Deleted Sum
9 0 1
9 1 2
20 0 1
11 1 1
I want to include 2 columns as Deleted0 (count of rows with Deleted column equal to 0) and Deleted1 (count of rows with Deleted column equal to 1)
CardHolderID Deleted0 Deleted1
9 1 2
20 1 0
11 1 1
What should the SQL query look like to get such a result?
Kind regards
Using conditional count:
-- One row per card holder: deleted0 counts rows with Deleted = 0 and
-- deleted1 counts rows with Deleted > 0. COUNT ignores the NULLs produced
-- when a CASE condition does not match.
select c.CardHolderID,
       count( case when c.deleted = 0 then 1 else null end ) deleted0,
       count( case when c.deleted > 0 then 1 else null end ) deleted1
from Card c
group by c.CardHolderID
GROUP BY CardHolderID alone.
Use SUM(Deleted) to count the 1's.
Use SUM(1-deleted) to count the 0's.
-- Deleted holds 0 or 1, so summing it counts the deleted rows and summing
-- its complement counts the non-deleted rows, one output row per card holder.
select
    holder.CardHolderID,
    sum(1 - holder.deleted) deleted0,
    sum(holder.Deleted) deleted1
from Card holder
group by holder.CardHolderID
if you are using MSSQL
-- Inner query: row count per (CardHolderID, Deleted).
-- PIVOT: turns the Deleted values 0 and 1 into columns [0] and [1];
-- isnull(...) replaces a missing combination with 0.
select DtlPivot.CardHolderID, isnull(DtlPivot.[0],0) as Deleted0, isnull(DtlPivot.[1],0) as Deleted1 from
(
select c.CardHolderID, c.Deleted, COUNT(*) as Total from Card c
group by c.Deleted, c.CardHolderID
) aa
PIVOT
(
sum(Total) FOR Deleted IN([0],[1])
)AS DtlPivot

Nested sum loop until foreign key 'dies out'

I am pulling my hair out over a data retrieval function I'm trying to write. In essence this query is meant to SUM up the count of all voorwerpnummers in the Voorwerp_in_Rubriek table, grouped by their rubrieknummer gathered from Rubriek.
After that I want to keep rolling the sums up until I reach the 'top level parent'. Rubriek has a foreign key reference to itself, 'hoofdrubriek', which is most easily seen as its parent in a category tree.
This also means rubrieken can be nested. A value of NULL in the hoofdrubriek column means that the row is a top-level parent. The idea behind this query is to SUM up the count of voorwerpnummers in Voorwerp_in_rubriek, and add them together until they are at their 'top level parent'.
As the database and testdata is quite massive I've decided not to add direct code to this question but a link to a dbfiddle instead so there's more structure.
https://dbfiddle.uk/?rdbms=sqlserver_2017&fiddle=8068a52da6a29afffe6dc793398f0998
I got it working in some degree using this query:
-- R2 is a rubriek, R1 its parent (R1.rubrieknummer = R2.hoofdrubriek).
-- Counts Voorwerp_in_rubriek rows for rubrieken that have no children
-- (NOT EXISTS) and whose parent is itself not top-level
-- (R1.hoofdrubriek IS NOT NULL). Only the direct parent is reported; nothing
-- is rolled up further.
SELECT R2.hoofdrubriek ,
COUNT(Vr.rubrieknummer) AS aantal
FROM Rubriek R1
RIGHT OUTER JOIN Rubriek R2 ON R1.rubrieknummer = R2.hoofdrubriek
INNER JOIN Voorwerp_in_rubriek Vr ON R2.rubrieknummer = Vr.rubrieknummer
WHERE NOT EXISTS ( SELECT *
FROM Rubriek
WHERE hoofdrubriek = R2.rubrieknummer )
AND R1.hoofdrubriek IS NOT NULL
GROUP BY Vr.rubrieknummer ,
R2.hoofdrubriek
But that doesn't get back all items and flops in general. I hope someone can help me.
If I got it right
-- Roll item counts up the Rubriek tree: each rubriek ends up with its own
-- item count plus the counts of all of its descendants.

-- Direct item count per rubriek, held in a table variable. T-SQL table
-- variables must be named with a leading @ (a # prefix denotes a temporary
-- table and is not valid in DECLARE ... TABLE).
declare @t table (
    rubrieknummer int,
    cnt int);
INSERT @t(rubrieknummer, cnt)
SELECT R.rubrieknummer, COUNT(Vr.voorwerpnummer)
FROM Rubriek R
INNER JOIN voorwerp_in_rubriek Vr ON R.rubrieknummer = Vr.rubrieknummer
GROUP BY Vr.rubrieknummer, R.rubrieknummer;
--select * from @t;
-- Recursive CTE: start from the direct counts, then repeatedly re-attribute
-- each count to the parent (hoofdrubriek). A top-level rubriek passes its
-- count on to a NULL parent, which joins to nothing and ends the recursion;
-- that is why the result also contains one row with a NULL rubrieknummer.
with t as(
    select rubrieknummer, cnt
    from @t
    union all
    select r.hoofdrubriek, cnt
    from t
    join Rubriek r on t.rubrieknummer = r.rubrieknummer
)
select rubrieknummer, sum(cnt) cnt
from t
group by rubrieknummer;
applying to your fiddle data returns
rubrieknummer cnt
<null> 42
100 42
101 26
102 6
103 10
10000 8
10100 4
10101 1
10102 3
10500 4
10501 2
10502 2
15000 18
15100 6
15101 2
15102 2
15103 2
15500 12
15501 4
15502 3
15503 5
20000 6
20001 2
20002 1
20003 1
20004 2
25000 4
25001 1
25002 1
25003 1
25004 1
30001 2
30002 1
30004 3

Identify same amounts over different users

Consider the following table Orders:
OrderID Name Amount
-----------------------
1 A 100
2 A 5
3 B 32
4 C 4000
5 D 701
6 E 32
7 F 200
8 G 100
9 H 12
10 I 17
11 J 100
12 J 100
13 J 11
14 A 5
I need to identify, for each unique 'Amount', if there are 2 or more users that have ordered that exact amount, and then list the details of those orders. So the desired output would be:
OrderID Name Amount
---------------------
1 A 100
8 G 100
11 J 100
12 J 100
3 B 32
6 E 32
Please note that user A has placed two orders of 5 (orders 2 and 14), but these shouldn't be in the output because they belong to the same user. Only if another user had also placed an order of 5 should they be in the output.
Can anyone help me out?
I would just use exists:
-- Keep an order when some other user has an order for the same amount.
select cur.*
from orders cur
where exists (
    select 1
    from orders other
    where other.name <> cur.name
      and other.amount = cur.amount
);
You can do :
-- `table` is a placeholder here: substitute the real table name (Orders in
-- the question). A row is kept when a row with the same amount but a
-- different name exists.
select t.*
from table t
where exists (select 1 from table t1 where t1.amount = t.amount and t1.name <> t.name);
If you want only selected field then
-- Per (Amount, name): how many orders that user placed for an amount that at
-- least one OTHER user also ordered. Grouping by (Amount, name) and keeping
-- only groups with more than one row would instead return a single user
-- repeating an amount, which the question explicitly excludes.
SELECT o.Amount,
       o.name,
       count(*) AS c
FROM orders o
WHERE EXISTS (SELECT 1
              FROM orders o2
              WHERE o2.Amount = o.Amount
                AND o2.name <> o.name)
GROUP BY o.Amount, o.name
ORDER BY c DESC
if you want full row
-- Full order rows whose Amount was ordered by two or more distinct users.
-- The IN subquery must return exactly one column, and the test is on the
-- number of distinct names per amount, not on the number of rows per user.
select * from orders where Amount in (
    select Amount from orders
    group by Amount having count(distinct name) > 1)