Speed up performance on UPDATE of temp table

I have a SQL Server 2012 stored procedure. I'm filling a temp table below, and that's fairly straightforward. However, after that I'm doing some UPDATE on it.
Here's my T-SQL for declaring the temp table, #SourceTable, filling it, and then doing some updates on it. After all of this, I simply take this temp table and insert it into a new table we are filling with a MERGE statement that joins on DOI. DOI is the main column here, and you'll see below that my UPDATE statements compute MAX/MIN of several columns per DOI, since the table can have multiple rows with the same DOI.
My question is...how can I speed up filling #SourceTable or doing my updates on it? Are there any indexes I can create? I'm decent at SQL, but not the best at performance issues. I'm dealing with maybe 60,000,000 records here in the temp table. It's been running for almost 4 hours now. This is a one-time deal here for a script I'm running once.
CREATE TABLE #SourceTable
(
DOI VARCHAR(72),
FullName NVARCHAR(128), LastName NVARCHAR(64),
FirstName NVARCHAR(64), FirstInitial NVARCHAR(10),
JournalId INT, JournalVolume VARCHAR(16),
JournalIssue VARCHAR(16), JournalFirstPage VARCHAR(16),
JournalLastPage VARCHAR(16), ArticleTitle NVARCHAR(1024),
PubYear SMALLINT, CreatedDate SMALLDATETIME,
UpdatedDate SMALLDATETIME,
ISSN_e VARCHAR(16), ISSN_p VARCHAR(16),
Citations INT, LastCitationRefresh SMALLDATETIME,
LastCitationRefreshValue SMALLINT, IsInSearch BIT,
BatchUpdatedDate SMALLDATETIME, LastIndexUpdate SMALLDATETIME,
ArticleClassificationId INT, ArticleClassificationUpdatedBy INT,
ArticleClassificationUpdatedDate SMALLDATETIME,
Affiliations VARCHAR(8000),
--Calculated columns for use in importing...
RowNum SMALLINT, MinCreatedDatePerDOI SMALLDATETIME,
MaxUpdatedDatePerDOI SMALLDATETIME,
MaxBatchUpdatedDatePerDOI SMALLDATETIME,
MaxArticleClassificationUpdatedByPerDOI INT,
MaxArticleClassificationUpdatedDatePerDOI SMALLDATETIME,
AffiliationsSameForAllDOI BIT, NewArticleId INT
)
--***************************************
--CROSSREF_ARTICLES
--***************************************
--GET RAW DATA INTO SOURCE TABLE TEMP TABLE..
INSERT INTO #SourceTable
SELECT
DOI, FullName, LastName, FirstName, FirstInitial,
JournalId, LEFT(JournalVolume,16) AS JournalVolume,
LEFT(JournalIssue,16) AS JournalIssue,
LEFT(JournalFirstPage,16) AS JournalFirstPage,
LEFT(JournalLastPage,16) AS JournalLastPage,
ArticleTitle, PubYear, CreatedDate, UpdatedDate,
ISSN_e, ISSN_p,
ISNULL(Citations,0) AS Citations, LastCitationRefresh,
LastCitationRefreshValue, IsInSearch, BatchUpdatedDate,
LastIndexUpdate, ArticleClassificationId,
ArticleClassificationUpdatedBy,
ArticleClassificationUpdatedDate, Affiliations,
ROW_NUMBER() OVER(PARTITION BY DOI ORDER BY UpdatedDate DESC, CreatedDate ASC) AS RowNum,
NULL AS MinCreatedDatePerDOI, NULL AS MaxUpdatedDatePerDOI,
NULL AS MaxBatchUpdatedDatePerDOI,
NULL AS MaxArticleClassificationUpdatedByPerDOI,
NULL AS MaxArticleClassificationUpdatedDatePerDOI,
0 AS AffiliationsSameForAllDOI, NULL AS NewArticleId
FROM
CrossRef_Articles WITH (NOLOCK)
--UPDATE SOURCETABLE WITH MAX/MIN/CALCULATED VALUES PER DOI...
UPDATE S
SET MaxUpdatedDatePerDOI = T.MaxUpdatedDatePerDOI,
    MaxBatchUpdatedDatePerDOI = T.MaxBatchUpdatedDatePerDOI,
    MinCreatedDatePerDOI = T.MinCreatedDatePerDOI,
    MaxArticleClassificationUpdatedByPerDOI = T.MaxArticleClassificationUpdatedByPerDOI,
    MaxArticleClassificationUpdatedDatePerDOI = T.MaxArticleClassificationUpdatedDatePerDOI
FROM #SourceTable S
INNER JOIN (SELECT DOI,
                   MAX(UpdatedDate) AS MaxUpdatedDatePerDOI,
                   MIN(CreatedDate) AS MinCreatedDatePerDOI,
                   MAX(BatchUpdatedDate) AS MaxBatchUpdatedDatePerDOI,
                   MAX(ArticleClassificationUpdatedBy) AS MaxArticleClassificationUpdatedByPerDOI,
                   MAX(ArticleClassificationUpdatedDate) AS MaxArticleClassificationUpdatedDatePerDOI
            FROM #SourceTable
            GROUP BY DOI) AS T ON S.DOI = T.DOI
UPDATE S
SET AffiliationsSameForAllDOI = 1
FROM #SourceTable S
WHERE NOT EXISTS (SELECT 1 FROM #SourceTable S2 WHERE S2.DOI = S.DOI AND S2.Affiliations <> S.Affiliations)

This will probably be a faster way to do the update. It's hard to say without seeing the execution plan, but the original might be running the GROUP BY for every row.
with doigrouped AS
(
SELECT
MAX(UpdatedDate) AS MaxUpdatedDatePerDOI,
MIN(CreatedDate) AS MinCreatedDatePerDOI,
MAX(BatchUpdatedDate) AS MaxBatchUpdatedDatePerDOI,
MAX(ArticleClassificationUpdatedBy) AS MaxArticleClassificationUpdatedByPerDOI,
MAX(ArticleClassificationUpdatedDate) AS MaxArticleClassificationUpdatedDatePerDOI,
DOI
FROM #SourceTable
GROUP BY DOI
)
UPDATE S
SET MaxUpdatedDatePerDOI = T.MaxUpdatedDatePerDOI,
MaxBatchUpdatedDatePerDOI = T.MaxBatchUpdatedDatePerDOI,
MinCreatedDatePerDOI = T.MinCreatedDatePerDOI,
MaxArticleClassificationUpdatedByPerDOI = T.MaxArticleClassificationUpdatedByPerDOI,
MaxArticleClassificationUpdatedDatePerDOI = T.MaxArticleClassificationUpdatedDatePerDOI
FROM #SourceTable S
INNER JOIN doigrouped T ON S.DOI = T.DOI
If it is faster, it will be a couple of orders of magnitude faster, but that does not mean your machine will be able to process 60 million records in any reasonable period of time. If you didn't test on 100k rows first, there is no way to know how long it will take to finish.
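Before you run against all 60 million rows, a cheap way to estimate is to time the same logic against a sample. A sketch (#SourceTable_Sample is a name we made up, and a TOP sample will not mirror the real DOI distribution, so treat the numbers as rough):
SELECT TOP (100000) *
INTO #SourceTable_Sample
FROM CrossRef_Articles WITH (NOLOCK);
SET STATISTICS TIME ON;   -- reports CPU and elapsed time per statement
-- ... run the INSERT and both UPDATEs against #SourceTable_Sample here ...
SET STATISTICS TIME OFF;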

I suppose you can try:
Replace INSERT with SELECT INTO.
You don't have indexes on your #SourceTable anyway, and SELECT INTO is minimally logged, so you should get some speedup here.
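A minimal sketch of that first change, with the column list abbreviated to a few representative columns; note that SELECT INTO infers column types, so the NULL placeholder columns need an explicit CAST:
SELECT
    DOI, FullName, JournalId, CreatedDate, UpdatedDate,
    ROW_NUMBER() OVER (PARTITION BY DOI ORDER BY UpdatedDate DESC, CreatedDate ASC) AS RowNum,
    CAST(NULL AS SMALLDATETIME) AS MinCreatedDatePerDOI,  -- CAST gives the new column a concrete type
    CAST(NULL AS SMALLDATETIME) AS MaxUpdatedDatePerDOI,
    CAST(0 AS BIT) AS AffiliationsSameForAllDOI
INTO #SourceTable
FROM CrossRef_Articles WITH (NOLOCK)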
Replace UPDATE with SELECT INTO another table.
Instead of updating #SourceTable in place, you can create #SourceTable_Updates with SELECT INTO (a modified version of Hogan's query):
with doigrouped AS
(
SELECT
MAX(UpdatedDate) AS MaxUpdatedDatePerDOI,
MIN(CreatedDate) AS MinCreatedDatePerDOI,
MAX(BatchUpdatedDate) AS MaxBatchUpdatedDatePerDOI,
MAX(ArticleClassificationUpdatedBy) AS MaxArticleClassificationUpdatedByPerDOI,
MAX(ArticleClassificationUpdatedDate) AS MaxArticleClassificationUpdatedDatePerDOI,
DOI
FROM #SourceTable
GROUP BY DOI
)
SELECT
S.DOI,
MaxUpdatedDatePerDOI = T.MaxUpdatedDatePerDOI,
MaxBatchUpdatedDatePerDOI = T.MaxBatchUpdatedDatePerDOI,
MinCreatedDatePerDOI = T.MinCreatedDatePerDOI,
MaxArticleClassificationUpdatedByPerDOI = T.MaxArticleClassificationUpdatedByPerDOI,
MaxArticleClassificationUpdatedDatePerDOI = T.MaxArticleClassificationUpdatedDatePerDOI
INTO #SourceTable_Updates
FROM #SourceTable S
INNER JOIN doigrouped T ON S.DOI = T.DOI
Then use #SourceTable joined to #SourceTable_Updates in the rest of your script.
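For example (column list abbreviated; note that #SourceTable_Updates as built above has one row per source row, so deduplicate per DOI when joining):
SELECT S.DOI, S.FullName, S.ArticleTitle,
       U.MinCreatedDatePerDOI, U.MaxUpdatedDatePerDOI
FROM #SourceTable S
INNER JOIN (SELECT DISTINCT DOI, MinCreatedDatePerDOI, MaxUpdatedDatePerDOI
            FROM #SourceTable_Updates) U ON U.DOI = S.DOI
WHERE S.RowNum = 1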
Hope this helps

Here are a couple of things that may help the performance of your insert statement.
Does the CrossRef_Articles table have a primary key? If it does, insert the primary key (make sure it is indexed) into your temp table and include only the fields you need for your calculations. Once the calculations are done, do a select and join your temp table back to the original table on the Id field (see the sketch below). It takes time to write all that data to disk.
Look at your tempdb. If you have run this query multiple times, the database or log file size may be out of control.
Check whether the join fields between the two original tables are indexed.
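A rough sketch of that narrow-temp-table idea, using ArticleId as a stand-in for whatever the real primary key of CrossRef_Articles is:
-- Carry only the key plus the columns the calculations need.
SELECT ArticleId, DOI, CreatedDate, UpdatedDate, BatchUpdatedDate
INTO #Narrow
FROM CrossRef_Articles;
CREATE CLUSTERED INDEX IX_Narrow_DOI ON #Narrow (DOI);
-- Aggregate on the narrow table, then join back to the wide table by key.
SELECT c.*, g.MinCreatedDatePerDOI, g.MaxUpdatedDatePerDOI
FROM CrossRef_Articles c
JOIN #Narrow n ON n.ArticleId = c.ArticleId
JOIN (SELECT DOI,
             MIN(CreatedDate) AS MinCreatedDatePerDOI,
             MAX(UpdatedDate) AS MaxUpdatedDatePerDOI
      FROM #Narrow
      GROUP BY DOI) g ON g.DOI = n.DOI;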

Related

Select from a Temp table is giving slow performance

I need some help with the query below, where the last step,
Select * from #PersonDetail order by ...,
takes very long to execute. Why?
Millions of records are inserted into the temp table #PersonDetail, and the insert takes only a few seconds, but the final SELECT from that same temp table takes very long.
I created a unique clustered index on the columns used in the ORDER BY and tried many other options, but it makes no difference to the performance.
It is a big stored procedure with many temp tables, but it is this last select step that is hurting performance. Here is an example of the last step of the query:
DROP TABLE IF EXISTS #PersonDetail
CREATE TABLE #PersonDetail
(
PersonId INT NOT NULL,
Name NVARCHAR(50) NULL,
Number INT NOT NULL,
Tag NVARCHAR(50) NULL,
UserId INT NOT NULL,
NumberEncrypted VARCHAR(100),
Type NVARCHAR(255),
Status NVARCHAR(50),
CreatedDate DATETIMEOFFSET(7),
AddressDetailId NVARCHAR(50),
Category NVARCHAR(50),
PrimaryId INT,
DailyAmount MONEY,
UNIQUE (PersonId, UserId),
UNIQUE CLUSTERED(CreatedDate, UserId)
)
INSERT INTO #PersonDetail (PersonId, Name, Number, Tag, UserId, NumberEncrypted,
Type, Status, CreatedDate, AddressDetailId, Category, PrimaryId, DailyAmount)
SELECT
PersonId, Name, Number, Tag, UserId, NumberEncrypted,
Type, Status, CreatedDate, AddressDetailId, Category, PrimaryId, DailyAmount
FROM
#User u
JOIN
dbo.DailyAmount da WITH (NOLOCK) ON da.UserId = u.UserId
SELECT *
FROM #PersonDetail pd
ORDER BY CreatedDate, UserId
You should specify which database you are using. In general, you should do these things:
create indexes on the join columns (DailyAmount.UserId, User.UserId); exactly how to create the index varies by database;
create an index on the ORDER BY columns (CreatedDate, UserId); this also varies: in PostgreSQL, for example, a single index on both columns is better than two separate indexes.
If your data are not changing frequently, you could try a materialized view and create the indexes on the materialized view.
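For the SQL Server case in this question, the first two items might look like this (index names are invented):
-- Index both sides of the join.
CREATE NONCLUSTERED INDEX IX_User_UserId ON #User (UserId);
CREATE NONCLUSTERED INDEX IX_DailyAmount_UserId ON dbo.DailyAmount (UserId);
-- For the ORDER BY, the idea is one composite index on (CreatedDate, UserId);
-- the UNIQUE CLUSTERED (CreatedDate, UserId) already declared on #PersonDetail
-- plays exactly that role.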

Inserting multiple rows in temp table without loop

This question has already been asked several times, but the solution is not working for me and I don't know why.
I am trying to create a temp table in a SQL query and insert some records into it using INSERT INTO ... SELECT, but every time it returns an empty result.
Here is what I am trying:
Create Table #TempTable
(
EntityID BIGINT
)
INSERT INTO #TempTable (EntityID)
SELECT pkEntityID FROM Employee WHERE EmpID = 45
Select * from #TempTable
There are 10 rows in the Employee table corresponding to EmpID 45. Do I have to do something else, or add a loop-like structure here? Is it that we can only insert one row into a table at once?
This has been stated in the comments, all of which I up-voted, but to answer your question: there isn't anything else you have to do. There clearly isn't an EmpID = 45 in your source table. Here's a reproducible example:
Declare #Employee Table (pkEntityID bigint, EmpID int)
insert into #Employee (pkEntityID, EmpID)
values
(32168123,45),
(89746541,45),
(55566331,45),
(45649224,12)
Create Table #TempTable
(
EntityID BIGINT
)
INSERT INTO #TempTable (EntityID)
SELECT pkEntityID FROM #Employee WHERE EmpID = 45
Select * from #TempTable
drop table #TempTable
Have you accidentally created the Employee table in the master database as well, while you are currently connected to master?
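A few quick checks can confirm that (using only names already in the question):
SELECT DB_NAME() AS CurrentDatabase;             -- which database is this session in?
SELECT OBJECT_ID('dbo.Employee') AS EmployeeId;  -- NULL means no Employee table here
SELECT COUNT(*) AS MatchingRows
FROM Employee
WHERE EmpID = 45;                                -- 0 would explain the empty temp table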

SQL Server: automatically add a unique identifier to all rows inserted at one time

The SQL Server code below successfully calculates the monthly pay for all employees, along with their StaffID numbers, and inserts it into Tablepayroll.
INSERT INTO Tablepayroll (StaffID,Totalpaid)
(SELECT Tabletimelog.StaffID , Tabletimelog.hoursworked * Tablestaff.hourlypay
FROM Tabletimelog
JOIN Tablestaff ON
Tabletimelog.StaffID = Tablestaff.StaffID)
However, I want to also insert a batchID so that you can identify each time the above insert has been run and the records it inserted at that time, meaning that all staff payroll calculated at the same time would have the same batchID number. Each subsequent batchID should just increase by 1.
I think that SELECT MAX(batch_id) + 1 would work, but I don't know how to include it in the INSERT statement.
You can use a subquery to find the latest batch_id from your current table, using this query:
INSERT INTO TablePayroll (StaffID, TotalPaid, batch_id)
SELECT T1.StaffID
, T1.HoursWorked * T2.HourlyPay
, ISNULL((SELECT MAX(batch_id) FROM TablePayRoll), 0) + 1 AS batch_id
FROM TableTimeLog AS T1
INNER JOIN TableStaff AS T2
ON T1.StaffID = T2.StaffID;
As you can see, I just add 1 to the current MAX(batch_id) and that's it.
By the way, learn to use aliases; it will make your life easier.
Yet another solution would be to make your batch_id a GUID, so you wouldn't have to create sequences or get MAX(batch_id) from the current table:
DECLARE #batch_id UNIQUEIDENTIFIER = NEWID();
INSERT INTO TablePayroll (StaffID, TotalPaid, batch_id)
SELECT T1.StaffID, T1.HoursWorked * T2.HourlyPay, #batch_id
FROM TableTimeLog AS T1
INNER JOIN TableStaff AS T2
ON T1.StaffID = T2.StaffID;
Updated
First of all, obtaining the maximum value from a large table (based on the name of the table, it must be big) can be very expensive, especially if there is no index on the batch_id column.
Secondly, note that your SELECT MAX(batch_id) + 1 solution may behave incorrectly under concurrent inserts. The solution from @EvaldasBuinauskas can likewise produce the same batch_id twice if two inserts run in parallel without an explicit transaction and the right isolation level.
If your SQL Server is version 2012 or higher, you can try a SEQUENCE. This at least ensures that there are no duplicate batch_ids.
Creating SEQUENCE:
CREATE SEQUENCE dbo.BatchID
START WITH 1
INCREMENT BY 1 ;
-- DROP SEQUENCE dbo.BatchID
GO
And using it:
DECLARE #BatchID INT
SET #BatchID = NEXT VALUE FOR dbo.BatchID;
INSERT INTO Tablepayroll (StaffID,Totalpaid, batch_id)
(SELECT Tabletimelog.StaffID , Tabletimelog.hoursworked * Tablestaff.hourlypay, #BatchID
FROM Tabletimelog
JOIN Tablestaff ON Tabletimelog.StaffID = Tablestaff.StaffID)
An alternative to a SEQUENCE is an additional table:
CREATE TABLE dbo.Batch (
ID INT NOT NULL IDENTITY
CONSTRAINT PK_Batch PRIMARY KEY CLUSTERED
,DT DATETIME
CONSTRAINT DF_Batch_DT DEFAULT GETDATE()
);
This solution works even on older versions of the server.
DECLARE #BatchID INT
INSERT INTO dbo.Batch (DT)
VALUES (GETDATE());
SET #BatchID = SCOPE_IDENTITY();
INSERT INTO Tablepayroll (StaffID,Totalpaid, batch_id)
(SELECT Tabletimelog.StaffID , Tabletimelog.hoursworked * Tablestaff.hourlypay, #BatchID
FROM Tabletimelog ...
And yes, none of these solutions guarantees the absence of holes in the numbering. Those can appear when a transaction rolls back (after a deadlock, for example).

Updating Values of a table from same table without using a select query

My requirement: updating values of a table from the same table, without using a SELECT query.
My aim: update val2 of #table where slno = 1 with the value of val2 from slno = 2.
The self-join UPDATE in the script below doesn't affect any rows. Is there any other way to do this without using the following method?
Declare #val2 nvarchar(50)
select #val2=val2 from #table where slno=2
update #table set val2=#val2 where slno=1
create table #table
(
slno int identity(1,1),
val nvarchar(50),
val2 nvarchar(50)
)
insert into #table(val,val2)values('1',newID())
insert into #table(val,val2)values('1',newID())
insert into #table(val,val2)values('1',newID())
select * from #table
update #table set val2=T.val2
from #table T where slno=1 and T.slno=2
drop table #table
I have a lot of records in the table, so if I select and then update, it may hurt performance.
Please provide more info. Do you have only two rows in your table? Why do you need this kind of update? I suspect your DB structure is wrong, but I can't tell exactly until you explain why you need this.
Anyway, I can suggest a way to do this without a separate SELECT: you can self-join the table. It would be better to have an additional column, but if you don't have one, here's how you can do it:
UPDATE T1
SET T1.val2 = T2.val2
FROM #table T1 INNER JOIN #table T2
ON T1.slno = 1 AND T2.slno = 2
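With the sample script from the question (run before its DROP TABLE), a quick check that the self-join UPDATE did what was intended:
SELECT slno, val, val2
FROM #table
ORDER BY slno;  -- rows 1 and 2 should now show the same val2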

How do I do an Upsert Into Table?

I have a view that has a list of jobs in it, with data like who they're assigned to and the stage they are in. I need to write a stored procedure that returns how many jobs each person has at each stage.
So far I have this (simplified):
DECLARE #ResultTable table
(
StaffName nvarchar(100),
Stage1Count int,
Stage2Count int
)
INSERT INTO #ResultTable (StaffName, Stage1Count)
SELECT StaffName, COUNT(*) FROM ViewJob
WHERE InStage1 = 1
GROUP BY StaffName
INSERT INTO #ResultTable (StaffName, Stage2Count)
SELECT StaffName, COUNT(*) FROM ViewJob
WHERE InStage2 = 1
GROUP BY StaffName
The problem with that is that the rows don't combine. So if a staff member has jobs in stage1 and stage2 there's two rows in #ResultTable. What I would really like to do is to update the row if one exists for the staff member and insert a new row if one doesn't exist.
Does anyone know how to do this, or can suggest a different approach?
I would really like to avoid using cursors to iterate on the list of users (but that's my fall back option).
I'm using SQL Server 2005.
Edit: @Lee: Unfortunately the InStage1 = 1 was a simplification. It's really more like WHERE DateStarted IS NOT NULL AND DateFinished IS NULL.
Edit: @BCS: I like the idea of doing an insert of all the staff first so I just have to do an update every time. But I'm struggling to get those UPDATE statements correct.
Actually, I think you're making it much harder than it is. Won't this code work for what you're trying to do?
SELECT StaffName, SUM(InStage1) AS 'JobsAtStage1', SUM(InStage2) AS 'JobsAtStage2'
FROM ViewJob
GROUP BY StaffName
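One caveat the answer does not mention: if InStage1 and InStage2 are BIT columns rather than integers, SQL Server rejects SUM over them, so cast first:
SELECT StaffName,
       SUM(CAST(InStage1 AS INT)) AS JobsAtStage1,
       SUM(CAST(InStage2 AS INT)) AS JobsAtStage2
FROM ViewJob
GROUP BY StaffName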
You could just check for existence and use the appropriate command. I believe this really does use a cursor behind the scenes, but it's the best you'll likely get:
IF (EXISTS (SELECT * FROM MyTable WHERE StaffName = #StaffName))
begin
UPDATE MyTable SET ... WHERE StaffName = #StaffName
end
else
begin
INSERT MyTable ...
end
SQL2008 has a new MERGE capability which is cool, but it's not in 2005.
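For readers on 2008 or later, the upsert collapses into a single MERGE; a sketch, assuming the simplified InStage1/InStage2 flags from the question:
MERGE #ResultTable AS target
USING (SELECT StaffName,
              SUM(CASE WHEN InStage1 = 1 THEN 1 ELSE 0 END) AS Stage1Count,
              SUM(CASE WHEN InStage2 = 1 THEN 1 ELSE 0 END) AS Stage2Count
       FROM ViewJob
       GROUP BY StaffName) AS src
ON target.StaffName = src.StaffName
WHEN MATCHED THEN
    UPDATE SET Stage1Count = src.Stage1Count, Stage2Count = src.Stage2Count
WHEN NOT MATCHED THEN
    INSERT (StaffName, Stage1Count, Stage2Count)
    VALUES (src.StaffName, src.Stage1Count, src.Stage2Count);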
IIRC, MySQL has an INSERT ... ON DUPLICATE KEY UPDATE syntax that lets you update if a row exists.
Alternately some form of:
INSERT INTO #ResultTable (StaffName, Stage1Count, Stage2Count)
SELECT StaffName, 0, 0 FROM ViewJob
GROUP BY StaffName

UPDATE #ResultTable SET Stage1Count = (
    SELECT COUNT(*) FROM ViewJob
    WHERE InStage1 = 1
    AND #ResultTable.StaffName = StaffName)

UPDATE #ResultTable SET Stage2Count = (
    SELECT COUNT(*) FROM ViewJob
    WHERE InStage2 = 1
    AND #ResultTable.StaffName = StaffName)
To get a real "upsert" type of query you need to use an if exists... type of thing, and this unfortunately means using a cursor.
However, you could run two queries, one to do your updates where there is an existing row, then afterwards insert the new one. I'd think this set-based approach would be preferable unless you're dealing exclusively with small numbers of rows.
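A sketch of that two-statement approach for the Stage1 count (Stage2 follows the same pattern):
-- 1. Update staff that already have a row.
UPDATE r
SET Stage1Count = v.Cnt
FROM #ResultTable r
JOIN (SELECT StaffName, COUNT(*) AS Cnt
      FROM ViewJob
      WHERE InStage1 = 1
      GROUP BY StaffName) v ON v.StaffName = r.StaffName;
-- 2. Insert staff that are not there yet.
INSERT INTO #ResultTable (StaffName, Stage1Count)
SELECT StaffName, COUNT(*)
FROM ViewJob
WHERE InStage1 = 1
  AND NOT EXISTS (SELECT 1 FROM #ResultTable r WHERE r.StaffName = ViewJob.StaffName)
GROUP BY StaffName;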
The following query on your result table should combine the rows again. This is assuming that InStage1 and InStage2 are never both '1'.
select distinct(rt1.StaffName), rt2.Stage1Count, rt3.Stage2Count
from #ResultTable rt1
left join #ResultTable rt2 on rt1.StaffName=rt2.StaffName and rt2.Stage1Count is not null
left join #ResultTable rt3 on rt1.StaffName=rt3.StaffName and rt3.Stage2Count is not null
I managed to get it working with a variation of BCS's answer. It wouldn't let me use a table variable though, so I had to make a temp table.
CREATE TABLE #ResultTable
(
StaffName nvarchar(100),
Stage1Count int,
Stage2Count int
)
INSERT INTO #ResultTable (StaffName)
SELECT StaffName FROM ViewJob
GROUP BY StaffName
UPDATE #ResultTable SET
Stage1Count= (
SELECT COUNT(*) FROM ViewJob V
WHERE InStage1 = 1 AND
V.StaffName = #ResultTable.StaffName COLLATE Latin1_General_CI_AS
GROUP BY V.StaffName),
Stage2Count= (
SELECT COUNT(*) FROM ViewJob V
WHERE InStage2 = 1 AND
V.StaffName = #ResultTable.StaffName COLLATE Latin1_General_CI_AS
GROUP BY V.StaffName)
SELECT StaffName, Stage1Count, Stage2Count FROM #ResultTable
DROP TABLE #ResultTable