How to select and export 5,000 lines of debit and credit transactions at a time and have the debits and credits balance to zero?

The ERP system we're migrating to requires CSV files with 5,000 or fewer rows for the GL. The debit and credit transactions within each file must balance to zero. There are multiple debit and credit transaction rows that share a common transaction ID.
Using OFFSET and FETCH NEXT I've been able to extract 5,000 rows at a time; however, the credits and debits within each file do not balance.
Data Example:
TranID  Credit  Debit  Balance  Account#
1       250     0      250      ABC
1       0       250    0        DEF
2       0       100    -100     GHI
2       50      0      -50      JKL
2       50      0      0        MNO
declare @batchsize INT = 5000,
        @loopcount INT = 0;

while (@loopcount * @batchsize < (select count(*) from [Apps2].[dbo].[GLTrans]))
begin
    SELECT * FROM [Apps2].[dbo].[GLTrans]
    ORDER BY tranID
    offset (@batchsize * @loopcount) rows
    fetch next (@batchsize) rows only;

    set @loopcount = @loopcount + 1;
end

A simple solution is to pre-process all the transactions and assign a batch number (one per CSV file). The temp table below stores the number of rows per TranID; after that you can generate each CSV file from the temp table. It is assumed that the debits and credits balance within each TranID.
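Before batching, that assumption is worth validating with a quick sanity check (a sketch; column names taken from the sample data):
-- List any TranID whose credits and debits do not net to zero.
SELECT TranID, SUM(Credit) - SUM(Debit) AS NetAmount
FROM [Apps2].[dbo].[GLTrans]
GROUP BY TranID
HAVING SUM(Credit) - SUM(Debit) <> 0;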
-- create the temp table
create table #trans
(
    TranID int,  -- TranID is inserted explicitly, so it must not be an identity column
    Cnt    int,
    Batch  int
)

-- populate the temp table with the row count per TranID
insert into #trans (TranID, Cnt)
select TranID, Cnt = count(*)
from [Apps2].[dbo].[GLTrans]
group by TranID
declare @batchsize int = 5000,
        @batch     int = 1

while exists (select * from #trans where Batch is null)
begin
    -- greedily assign whole TranID groups to the current batch, never splitting a group
    -- (note: a single TranID with more than @batchsize rows would never fit and would loop forever)
    update t
    set Batch = @batch
    from
    (
        select *, cumm = sum(Cnt) over (order by TranID)
        from #trans
        where Batch is null
    ) t
    where cumm <= @batchsize

    select @batch = @batch + 1
end
-- Verify: running row count within each batch
select *, sum(Cnt) over (partition by Batch order by TranID) as RunningCnt
from #trans
order by TranID
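To produce the files themselves, each batch can then be pulled by joining back to the source table; a sketch (export each result with bcp or your usual tool):
-- Rows for one output file: every transaction whose TranID was assigned to batch 1.
SELECT g.*
FROM [Apps2].[dbo].[GLTrans] g
JOIN #trans t ON t.TranID = g.TranID
WHERE t.Batch = 1
ORDER BY g.TranID;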

Use a table variable to iterate through your data, kinda like using a cursor in Oracle...
If I'm understanding your sample data correctly, and assuming each TransID set nets to zero, you can change your loop logic to function more like a do...while: grab the next transaction set and decide whether it keeps the batch under 5k.
This should cover populating one batch of 5,000 or fewer rows that nets to $0, assuming that each transaction ID set nets to $0.
Declare @batchCursor TABLE (
    TransID   INT,
    Credit    INT, -- chose INT for expediency
    Debit     INT,
    Balance   INT,
    AccountNo VARCHAR(4)
);

Declare @batchSize  INT = 5000,
        @rowCount   INT = 0,
        @transID    INT = 1,
        @maxTransID INT = (SELECT MAX(TransID) FROM [Apps2].[dbo].[GLTrans]),
        @transSize  INT = 0;

WHILE (@rowCount <= @batchSize AND @transID <= @maxTransID)
BEGIN
    -- measure the next transaction set before copying it
    SELECT @transSize = COUNT(*)
    FROM [Apps2].[dbo].[GLTrans]
    WHERE TransID = @transID;

    IF (@transSize > 0)
    BEGIN
        IF (@transSize + @rowCount > @batchSize)
            BREAK; -- the next set would push the batch past 5,000 rows

        INSERT INTO @batchCursor
        SELECT * FROM [Apps2].[dbo].[GLTrans] -- you might need to enumerate all your column names
        WHERE TransID = @transID;

        SET @rowCount += @transSize;
    END;

    SET @transID += 1;
END;
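For what it's worth, the batch numbers can also be assigned in one set-based pass instead of a loop. A sketch, assuming no single TranID has 5,000 or more rows: take a running total of the per-TranID row counts and divide by a reduced batch size. Dividing by the full 5,000 could overflow a file when a TranID group straddles a boundary, so using 5,000 minus the largest group size as the divisor keeps every file at or under 5,000 rows.
-- Per-TranID row counts (table and column names taken from the question).
SELECT TranID, Cnt = COUNT(*)
INTO #counts
FROM [Apps2].[dbo].[GLTrans]
GROUP BY TranID;

DECLARE @maxGroup int = (SELECT MAX(Cnt) FROM #counts);
DECLARE @divisor  int = 5000 - @maxGroup; -- headroom so a straddling group cannot overflow a file

SELECT TranID, Batch = CEILING(1.0 * SUM(Cnt) OVER (ORDER BY TranID) / @divisor)
FROM #counts
ORDER BY TranID;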

Related

Create equal sized, random buckets, with no repetition to the row

I'm having some difficulty with a scheduling task.
Background: I have 100 members, 10 different sessions, and 10 different activities.
Rules:
Each member must do each activity only once.
Each activity must have the same number of members in each session.
The members must be with (at least mostly) different people in each session.
Each activity must be run in each session with 10 people per activity.
The expected outcome would be something like this:
Person ID   Session ID   Activity ID
1           S1           A
2           S1           B
3           S1           C
1           S2           B
2           S2           C
3           S2           A
In the above example each activity in each session has only 1 participant; in reality I have to cap each activity in each session at 10 members.
I have tried a few different solutions in Excel / SQL, but have not been able to meet all 3 rules. The hardest is keeping each activity/session slot to 10 people.
The closest solution I've had is the following... it's not pretty though:
SET STATISTICS TIME, IO OFF

-- Create list of applicants
IF OBJECT_ID('dbo.Numbers') IS NOT NULL DROP TABLE dbo.Numbers
CREATE TABLE Numbers (ApplicantID INT, SessionID INT, GroupID INT)

DECLARE @i INT,
        @Session INT,
        @Group INT;

SELECT @i = 1;
SET NOCOUNT ON

WHILE @i <= 100
BEGIN
    INSERT INTO Numbers (ApplicantID, SessionID) VALUES (@i, 1);
    SELECT @i = @i + 1;
END;

-- Duplicate ApplicantID list for each different session
SELECT @Session = 1

WHILE @Session <= 10
BEGIN
    IF @Session > 1
    BEGIN
        INSERT INTO Numbers (ApplicantID, SessionID)
        SELECT ApplicantID, @Session FROM Numbers WHERE SessionID = 1
    END

    -- SELECT RANDOM TOP 10 AND SET AS GROUP ID
    SELECT @Group = 1

    WHILE @Group <= 10
    BEGIN
        WITH dups_check AS
        (
            SELECT ApplicantID, GroupID, COUNT(*) AS vol
            FROM Numbers
            GROUP BY ApplicantID, GroupID
        ),
        cte AS
        (
            SELECT TOP 10 *
            FROM Numbers
            WHERE Numbers.GroupID IS NULL
              AND SessionID = @Session
              AND NOT EXISTS (SELECT 1
                              FROM dups_check
                              WHERE Numbers.ApplicantID = dups_check.ApplicantID
                                AND dups_check.GroupID = @Group)
            ORDER BY NEWID()
        )
        UPDATE cte SET GroupID = @Group

        SELECT @Group = @Group + 1
    END

    SELECT @Session = @Session + 1
END

SELECT * FROM Numbers
SET NOCOUNT OFF
This code starts to fall over regularly in the higher session numbers when it tries to set an activity that the individual has already done.
Thanks!
I tried using your code to generate the ApplicantID and SessionID rows, and modified the last part to generate the GroupID column using ranking functions.
Below is the output of what I tried:
SET STATISTICS TIME, IO OFF

-- Create list of applicants
IF OBJECT_ID('dbo.Numbers') IS NOT NULL DROP TABLE dbo.Numbers
CREATE TABLE dbo.Numbers (ApplicantID INT, SessionID INT, GroupID INT)

DECLARE @i INT,
        @Session INT,
        @Group INT;

SELECT @i = 1;
SET NOCOUNT ON

WHILE @i <= 100
BEGIN
    INSERT INTO Numbers (ApplicantID, SessionID) VALUES (@i, 1);
    SELECT @i = @i + 1;
END;

-- Duplicate ApplicantID list for each different session
SELECT @Session = 1

WHILE @Session <= 10
BEGIN
    IF @Session > 1
    BEGIN
        INSERT INTO Numbers (ApplicantID, SessionID)
        SELECT ApplicantID, @Session FROM Numbers WHERE SessionID = 1
    END

    SELECT @Session = @Session + 1
END

SET NOCOUNT OFF
drop table if exists #temp;

select ApplicantID, SessionID,
       row_number() over (partition by ApplicantID order by ApplicantID) as grp_row
into #temp
from Numbers

update a
set a.GroupID = b.grp_row
from Numbers a
join #temp b on a.ApplicantID = b.ApplicantID and a.SessionID = b.SessionID
where a.GroupID is null
Each member must do each activity only once.
There are 100 applicants; taking applicants 1 and 100 as examples, each applicant has each GroupID only once.
Each activity must have the same number of members in each session.
There are 10 GroupIDs, and the number of applicants for each GroupID is the same (100).
The members must be with (at least mostly) different people in each session.
There are 100 applicants, but taking the top 10 as an example, each SessionID has different applicants.
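If you'd rather check the rules than eyeball samples, a couple of verification queries against Numbers (a sketch):
-- Rule 1: each applicant should have each GroupID exactly once; this returns violations.
SELECT ApplicantID, GroupID, COUNT(*) AS Cnt
FROM Numbers
GROUP BY ApplicantID, GroupID
HAVING COUNT(*) <> 1;

-- Rule 2: every (SessionID, GroupID) slot should hold the same number of members (10).
SELECT SessionID, GroupID, COUNT(*) AS Members
FROM Numbers
GROUP BY SessionID, GroupID
ORDER BY SessionID, GroupID;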

SQL Server - loop through table and update based on count

I have a SQL Server database. I need to loop through a table to get the count of each value in the column RevID. Each value should appear in the table only a certain number of times, for example 125 times. If the count of a value is greater or less than 125, I need to update the column so that every RevID value (there are over 25 different values) ends up around 125 (it's OK to be a few numbers off).
For example, if the count of RevID = 'A2' is 45 and the count of RevID = 'B2' is 165, then I need to update RevID values so the count of 45 increases and the 165 decreases until both are within the 125 range.
This is what I have so far:
DECLARE @i INT = 1,
        @RevCnt INT = SELECT RevId, COUNT(RevId) FROM MyTable GROUP BY RevId

WHILE (@RevCnt >= 50)
BEGIN
    UPDATE MyTable
    SET RevID = (SELECT COUNT(RevID) FROM MyTable)
    WHERE RevID < 50

    SET @i = @i + 1
END
I have also played around with a cursor and an INSTEAD OF trigger. Any idea how to achieve this? Thanks for any input.
Okay, I came back to this because I found it interesting, even though clearly there are some business rules/discussion that you and I and others are not seeing. Anyway, if you want to distribute the values evenly and arbitrarily, there are a few ways to do it: building recursive Common Table Expressions [CTE], building temp tables, and more. Here is the way I decided to try. I did use one temp table, because SQL was throwing in a little inconsistency with the main logic table as a CTE about every 10th run, and the temp table seems to have cleared that up. This will evenly spread the RevId values, arbitrarily and randomly assigning any remainder (# of records / # of RevIds) to one of the RevIds. The script also doesn't rely on having a unique ID or anything; it works dynamically over row numbers it creates. Just subtract out the test data and you have what you more than likely want, though rebuilding the table/values would probably be easier.
-- Build some test data
DECLARE @Table AS TABLE (RevId VARCHAR(10))
DECLARE @C AS INT = 1

WHILE @C <= 400
BEGIN
    IF @C <= 200
    BEGIN
        INSERT INTO @Table (RevId) VALUES ('A1')
    END
    IF @C <= 170
    BEGIN
        INSERT INTO @Table (RevId) VALUES ('B2')
    END
    IF @C <= 100
    BEGIN
        INSERT INTO @Table (RevId) VALUES ('C3')
    END
    IF @C <= 400
    BEGIN
        INSERT INTO @Table (RevId) VALUES ('D4')
    END
    IF @C <= 1
    BEGIN
        INSERT INTO @Table (RevId) VALUES ('E5')
    END
    SET @C = @C + 1
END
-- Save starting counts of test data to a temp table to compare with later
IF OBJECT_ID('tempdb..#StartingCounts') IS NOT NULL
BEGIN
    DROP TABLE #StartingCounts
END

SELECT RevId, COUNT(*) AS Occurences
INTO #StartingCounts
FROM @Table
GROUP BY RevId
ORDER BY RevId
/************************ This is the main method **********************************/
--clear temp table that is the main processing logic
IF OBJECT_ID('tempdb..#RowNumsToChange') IS NOT NULL
BEGIN
DROP TABLE #RowNumsToChange
END
--figure out how many records there are and how many there should be for each RevId
-- Figure out how many records there are and how many there should be for each RevId
;WITH cteTargetNumbers AS (
    SELECT
        RevId
        -- target per RevId = total rows / distinct RevIds, +1 for the randomly
        -- chosen RevIds that absorb the remainder; minus the current count this
        -- is how many records each RevId must gain (+) or lose (-)
        ,SUM(COUNT(*)) OVER (PARTITION BY 1) / COUNT(*) OVER (PARTITION BY 1) +
         CASE
             WHEN ROW_NUMBER() OVER (PARTITION BY 1 ORDER BY NEWID()) <=
                  SUM(COUNT(*)) OVER (PARTITION BY 1) % COUNT(*) OVER (PARTITION BY 1)
             THEN 1
             ELSE 0
         END - COUNT(*) AS NumRecordsToUpdate
    FROM
        @Table
    GROUP BY
        RevId
)
, cteEndRowNumsToChange AS (
    SELECT *
        ,SUM(CASE WHEN NumRecordsToUpdate > 0 THEN NumRecordsToUpdate ELSE 0 END)
            OVER (PARTITION BY 1 ORDER BY RevId) AS ChangeEndRowNum
    FROM
        cteTargetNumbers
)
SELECT
    *
    ,LAG(ChangeEndRowNum,1,0) OVER (PARTITION BY 1 ORDER BY RevId) AS ChangeStartRowNum
INTO #RowNumsToChange
FROM
    cteEndRowNumsToChange
;WITH cteOriginalTableRowNum AS (
    SELECT
        RevId
        ,ROW_NUMBER() OVER (PARTITION BY RevId ORDER BY (SELECT 0)) AS RowNumByRevId
    FROM
        @Table t
)
, cteRecordsAllowedToChange AS (
    SELECT
        o.RevId
        ,o.RowNumByRevId
        ,ROW_NUMBER() OVER (PARTITION BY 1 ORDER BY (SELECT 0)) AS ChangeRowNum
    FROM
        cteOriginalTableRowNum o
        INNER JOIN #RowNumsToChange t
            ON o.RevId = t.RevId
            AND t.NumRecordsToUpdate < 0
            AND o.RowNumByRevId <= ABS(t.NumRecordsToUpdate)
)
UPDATE o
SET RevId = u.RevId
FROM
    cteOriginalTableRowNum o
    INNER JOIN cteRecordsAllowedToChange c
        ON o.RevId = c.RevId
        AND o.RowNumByRevId = c.RowNumByRevId
    INNER JOIN #RowNumsToChange u
        ON c.ChangeRowNum > u.ChangeStartRowNum
        AND c.ChangeRowNum <= u.ChangeEndRowNum
        AND u.NumRecordsToUpdate > 0

IF OBJECT_ID('tempdb..#RowNumsToChange') IS NOT NULL
BEGIN
    DROP TABLE #RowNumsToChange
END
/***************************** End of Main Method *******************************/
-- Compare the results and clean up
;WITH ctePostUpdateResults AS (
    SELECT
        RevId
        ,COUNT(*) AS AfterChangeOccurences
    FROM
        @Table
    GROUP BY
        RevId
)
SELECT *
FROM
    #StartingCounts s
    INNER JOIN ctePostUpdateResults r
        ON s.RevId = r.RevId
ORDER BY
    s.RevId

IF OBJECT_ID('tempdb..#StartingCounts') IS NOT NULL
BEGIN
    DROP TABLE #StartingCounts
END
Since you've given no rules for how you'd like the balancing to operate, we're left to speculate. Here's an approach that finds the most overrepresented value and then finds an underrepresented value that can take on the entire overage.
I have no idea how optimal this is, and without more logic it may run in an infinite loop.
declare @balance int = 125;
declare @cnt_over int;
declare @cnt_under int;
declare @revID_overrepresented varchar(32);
declare @revID_underrepresented varchar(32);
declare @rowcount int = 1;

while @rowcount > 0
begin
    select top 1 @revID_overrepresented = RevID, @cnt_over = count(*)
    from T
    group by RevID
    having count(*) > @balance
    order by count(*) desc;

    -- a value can take the whole overage if, after receiving it, it still
    -- stays at or below the target
    select top 1 @revID_underrepresented = RevID, @cnt_under = count(*)
    from T
    group by RevID
    having count(*) <= 2 * @balance - @cnt_over
    order by count(*) desc;

    update top (@cnt_over - @balance) T
    set RevId = @revID_underrepresented
    where RevId = @revID_overrepresented;

    set @rowcount = @@rowcount;
end
The problem is I don't even know what you mean by balance... You say it needs to be evenly represented, but it seems like you want each count to be 125. 125 is not "even"; it is just 125.
I can't tell what you are trying to do, but I'm guessing this is not really an SQL problem. You can use SQL to help, though. Here is some helpful SQL for you; you can use it in your language of choice to solve the problem.
Find the rev values and their counts:
SELECT RevID, COUNT(*)
FROM MyTable
GROUP BY RevID
Update the top @X rows (those with a RevID of value @RevID) to a new value @NewValue:
UPDATE TOP (@X) MyTable
SET RevID = @NewValue
WHERE RevID = @RevID
Using these two queries you should be able to apply your business rules (which you never specified) in a loop or whatever to change the data.

How to optimize this t-sql script code by avoiding loop?

I use the following SQL query to update MyTable. The code takes between 5 and 15 minutes to update MyTable as long as it covers 100,000,000 rows or fewer, but when there are more than 100,000,000 rows it takes exponentially longer. How can I change this code to be set-based instead of a while loop?
DECLARE @startTime DATETIME
DECLARE @batchSize INT
DECLARE @iterationCount INT
DECLARE @i INT
DECLARE @from INT
DECLARE @to INT

SET @batchSize = 10000
SET @i = 0

SELECT @iterationCount = COUNT(*) / @batchSize
FROM MyTable
WHERE LitraID = 8175
  AND id BETWEEN 100000000 AND 300000000

WHILE @i <= @iterationCount
BEGIN
    BEGIN TRANSACTION T
    SET @startTime = GETDATE()
    SET @from = @i * @batchSize
    SET @to = (@i + 1) * @batchSize - 1

    ;WITH data AS
    (
        SELECT DoorsReleased, ROW_NUMBER() OVER (ORDER BY id) AS Row
        FROM MyTable
        WHERE LitraID = 8175
          AND id BETWEEN 100000000 AND 300000000
    )
    UPDATE data
    SET DoorsReleased = ~DoorsReleased
    WHERE Row BETWEEN @from AND @to

    SET @i = @i + 1
    COMMIT TRANSACTION T
END
One of your issues is that the SELECT statement in the loop fetches all records for LitraID = 8175, assigns row numbers, and only then filters in the UPDATE. This happens on every iteration.
One way around this would be to get all the ids for the update before entering the loop, storing them in a temporary table. Then you can write a similar query to the one you have, but joining to this table of ids (see the sketch below).
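A sketch of that first approach, keeping the question's names (the #ids temp table and its Row column are illustrative):
-- Materialize row numbers for the affected ids once, before the loop.
SELECT id, ROW_NUMBER() OVER (ORDER BY id) AS Row
INTO #ids
FROM MyTable
WHERE LitraID = 8175
  AND id BETWEEN 100000000 AND 300000000;

-- Then the UPDATE inside the loop joins to the precomputed rows instead of
-- renumbering the whole range on every iteration:
UPDATE m
SET DoorsReleased = ~DoorsReleased
FROM MyTable m
JOIN #ids i ON i.id = m.id
WHERE i.Row BETWEEN @from AND @to;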
However, there is an even easier way if you know approximately how many records have LitraID = 8175 and they are spread throughout the table rather than bunched together with similar ids.
DECLARE @batchSize INT
DECLARE @minId INT
DECLARE @maxId INT

SET @batchSize = 10000 -- adjust according to how frequently LitraID = 8175; larger numbers if infrequent
SET @minId = 100000000

WHILE @minId <= 300000000
BEGIN
    SET @maxId = @minId + @batchSize - 1
    IF @maxId > 300000000
    BEGIN
        SET @maxId = 300000000
    END

    BEGIN TRANSACTION T
    UPDATE MyTable
    SET DoorsReleased = ~DoorsReleased
    WHERE LitraID = 8175 -- without this filter, rows outside the target set would be flipped too
      AND id BETWEEN @minId AND @maxId
    COMMIT TRANSACTION T

    SET @minId = @maxId + 1
END
This uses the value of id to control the loop, meaning you don't need the extra step of calculating @iterationCount. It uses small batches so that the table isn't locked for long periods, has no unnecessary SELECT statements, and the WHERE clause in the update is efficient assuming id has an index.
It won't update exactly the same number of records in every transaction, but there's no reason it needs to.
This will eliminate the loop:
UPDATE MyTable
set DoorsReleased = ~DoorsReleased
WHERE LitraID = 8175
AND id BETWEEN 100000000 AND 300000000
If you are set on looping, note that the version below will NOT work. I originally thought ~ was part of the column name, but it is the bitwise NOT operator, so the DoorsReleased <> ~DoorsReleased filter is always true for non-null values and the loop never terminates:
select 1; -- seed @@ROWCOUNT so the loop body runs at least once
WHILE (@@ROWCOUNT > 0)
BEGIN
    UPDATE top (100000) MyTable
    set DoorsReleased = ~DoorsReleased
    WHERE LitraID = 8175
      AND id BETWEEN 100000000 AND 300000000
      AND ( DoorsReleased <> ~DoorsReleased
            or ( DoorsReleased is null and ~DoorsReleased is not null )
          )
END
Inside a transaction I don't think looping would have value, as the transaction log cannot clear, and a batch size of 10,000 is small.
As stated in a comment, if you want to loop then try using id rather than ROW_NUMBER(); all those loops are expensive.
You might be able to use OFFSET.
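Rather than OFFSET (which re-scans the skipped rows on every pass), here is a sketch that pages on id directly, keeping the question's table and range; the batch size is an assumption to tune:
DECLARE @lastId int = 99999999, -- just below the question's id range
        @batch  int = 100000,
        @nextId int;

WHILE 1 = 1
BEGIN
    -- upper id of the next slice of @batch qualifying rows
    SELECT @nextId = MAX(id)
    FROM (SELECT TOP (@batch) id
          FROM MyTable
          WHERE LitraID = 8175 AND id > @lastId AND id <= 300000000
          ORDER BY id) s;

    IF @nextId IS NULL BREAK; -- no qualifying rows left

    UPDATE MyTable
    SET DoorsReleased = ~DoorsReleased
    WHERE LitraID = 8175
      AND id > @lastId AND id <= @nextId;

    SET @lastId = @nextId;
END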

SQL Query to retrieve the last records till the quantity purchased reaches the total quantity in stock

I have a table that holds the ItemCode and the quantity in stock, and another table that contains the purchases.
I want a query that takes the quantity in stock (e.g. Qty = 5), walks the purchase table in descending invoice order, and picks up the item prices.
The query has to keep retrieving records from the purchase table until the sum of their quantities reaches the quantity in stock (5).
Example:

Purchase No   ItemCode   Qty   Cost Price
2             123        2     100
3             123        10    105
6             123        2     100
8             123        1     90
9             123        2     120

ItemCode   Qty in Stock
123        5
In this example I want the query to retrieve the last 3 invoices (9, 8 and 6), because their quantities sum to the stock quantity (2 + 1 + 2 = 5).
Is there any suggestion?
Thank you in advance.
This script should do the job.
/* SQL SCRIPT BEGIN */
create table #tmp (PurchaseNo int, ItemCode int, Qty int)

-- running total of Qty per item, counted from the latest purchase backwards
insert into #tmp (PurchaseNo, ItemCode, Qty)
select p1.PurchaseNo, p1.ItemCode, sum(t.Qty) as Qty
from Purchases p1
join Purchases t
    on p1.PurchaseNo <= t.PurchaseNo
    and p1.ItemCode = t.ItemCode
group by p1.PurchaseNo, p1.ItemCode

-- invoices strictly below the stock level, plus the one that crosses it
select * From #tmp
where ItemCode = 123
  and Qty < 5
union
select * from
(
    select top 1 *
    From #tmp
    where ItemCode = 123
      and Qty >= 5
    order by PurchaseNo desc
) boundary

drop table #tmp
/* SQL SCRIPT END */
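For what it's worth, on SQL Server 2012 or later the triangular self-join can be replaced with a windowed running total, which scales much better. A sketch under the same assumptions (a Purchases table; the CostPrice column name is assumed from the sample data):
;WITH running AS
(
    SELECT PurchaseNo, ItemCode, Qty, CostPrice,
           SUM(Qty) OVER (PARTITION BY ItemCode
                          ORDER BY PurchaseNo DESC
                          ROWS UNBOUNDED PRECEDING) AS RunningQty
    FROM Purchases
    WHERE ItemCode = 123
)
SELECT *
FROM running
WHERE RunningQty - Qty < 5 -- keep invoices until the running total reaches the stock qty
ORDER BY PurchaseNo DESC;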
Hi, this can be the solution.
Here I have used a Result table which will store the result.
I have used three tables: Purchage(PurchageNo, ItemCode, Qty), Stock(ItemCode, QtyInStock) and Result(PurchageNo).
Full workable code is here:
DECLARE @ItemCode int;
DECLARE @AvailableQty int;

SET @ItemCode = 123;
SET @AvailableQty = (select QtyInStock from Stock where ItemCode = @ItemCode);

SELECT RowNum = ROW_NUMBER() OVER (ORDER BY PurchageNo), *
INTO #PurchageTemp
FROM Purchage

DECLARE @MaxRownum INT;
SET @MaxRownum = (select COUNT(*) from #PurchageTemp);

DECLARE @Iter INT;
SET @Iter = 1;

DECLARE @QtySum int = 0;
DECLARE @QtySumTemp int = 0;
DECLARE @CurrentItem int;

WHILE (@Iter <= @MaxRownum and @QtySum <= @AvailableQty)
BEGIN
    set @QtySumTemp = @QtySum;
    set @QtySumTemp = @QtySumTemp + (SELECT Qty FROM #PurchageTemp WHERE RowNum = @Iter and ItemCode = @ItemCode);

    IF @QtySumTemp <= @AvailableQty
    BEGIN
        set @QtySum = @QtySumTemp;
        set @CurrentItem = (SELECT PurchageNo FROM #PurchageTemp WHERE RowNum = @Iter and ItemCode = @ItemCode);
        insert into [Result] values (@CurrentItem);
    END

    SET @Iter = @Iter + 1
END

DROP TABLE #PurchageTemp

While Loop in TSQL with Sum totals

I have the following T-SQL statement. I am trying to figure out how I can keep getting the results (100 rows at a time), store them in a variable (as I will have to add the totals after each select), and continue to select in a while loop until no more records are found, then return the variable totals to the calling function.
SELECT [OrderUser].OrderUserId, ISNULL(SUM(total.FileSize), 0), ISNULL(SUM(total.CompressedFileSize), 0)
FROM
(
SELECT DISTINCT TOP(100) ProductSize.OrderUserId, ProductSize.FileInfoId,
CAST(ProductSize.FileSize AS BIGINT) AS FileSize,
CAST(ProductSize.CompressedFileSize AS BIGINT) AS CompressedFileSize
FROM ProductSize WITH (NOLOCK)
INNER JOIN [Version] ON ProductSize.VersionId = [Version].VersionId
) AS total RIGHT OUTER JOIN [OrderUser] WITH (NOLOCK) ON total.OrderUserId = [OrderUser].OrderUserId
WHERE NOT ([OrderUser].isCustomer = 1 AND [OrderUser].isEndOrderUser = 0 OR [OrderUser].isLocation = 1)
AND [OrderUser].OrderUserId = 1
GROUP BY [OrderUser].OrderUserId
Depending on the clustered index: if it's by a numeric id, use the code below; if it's by date, go in 10 to 60 minute increments. Keep an eye on the performance of other things, but the lovely part of this code is that you can start and stop at any time if you push the results to a permanent temp table (a real table, just used as temp).
Here's a sample:
declare @count int
declare @batch int
declare @max int

create table #temp (id int identity(1,1) primary key, Batch int, value int)

select @max = max(OrderUserId), @count = 0, @batch = 1000 from table

while (@count < @max)
begin
    insert into #temp (batch, value)
    select @count, Sum(stuffs)
    from table
    where orderId >= @count
      and orderId < @count + @batch

    set @count = @count + @batch

    waitfor delay ('00:00:01')
    raiserror('On Batch %d', 0, 1, @count) with nowait /* will print progress */
end
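When the loop finishes, the totals the question asks for can be read back from the temp table, e.g.:
-- Sum the per-batch values collected above into the final total.
select sum(value) as GrandTotal from #temp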