I'm working on a macro that will go through a spreadsheet and remove duplicate entries (rows) based on two criteria that are provided separately in two columns (columns Q and D).
Here is what I have. I tested it out on a small dataset and it's slow.
Sub RemoveDupesKeepLast()
dim i As Integer
dim criteria1, criteria2 As String
Application.ScreenUpdating = False
Application.Calculation = xlCalculationManual
Application.EnableEvents = False
'start at bottom of sheet, go up
For i = ActiveSheet.UsedRange.Rows.Count to 2 Step -1
'if there is no entry, go to next row
If Cells(i, "Q").Value = "" Then
GoTo gogo:
End If
'set criteria that we will filter for
criteria1 = Cells(i, "D").Value
criteria2 = Cells(i, "Q").Value
'filter for criteria2, then criteria1 to get duplicates
ActiveSheet.Range("A":"CI").AutoFilter field:=17, Criteria1:=criteria2, Operator:=xlFilterValues
ActiveSheet.Range("A":"CI").AutoFilter field:=4, Criteria1:=criteria1, Operator:=xlFilterValues
'if there are duplicates, keep deleting rows until only bottom-most entry is left behind
Do While Range("Q2", Cells(Rows.Count, "Q").End(xlUp)).Cells.SpecialCells(xlCellTypeVisible).Count > 1
ActiveSheet.AutoFilter.Range.Offset(1).SpecialCells(xlCellTypeVisible).Cells(1,17).EntireRow.Delete
Loop
'reset autofilter
If ActiveSheet.FilterMode Then
Cells.AutoFilter
End If
gogo:
Next i
Application.EnableEvents = True
Application.Calculation = xlCalculationAutomatic
Application.ScreenUpdating = True
End Sub
Is there a different way I can approach this problem to speed things up? As it is right now, I'm basically checking each row until I get to the top. The sheets are actually anywhere from 30,000 rows to the max. Seems to me that there should be a faster, cleaner way of achieving what I'm trying to do but I can't seem to think of one.
This procedure deletes all the duplicated rows identified by column D and Q.
Among duplicates, it will keep the row the closest to the bottom of the sheet.
Basically, an indexed column is created on the right to sort and move all the duplicated rows at the bottom so they can be deleted in a single call.
Note that it doesn't alter the cells formula or format if there is any.
Sub DeleteDuplicatedRows()
Dim rgTable As Range, rgIndex As Range, dataColD(), dataColQ()
Set rgTable = ActiveSheet.UsedRange
' load each column representing the identifier in an array
dataColD = rgTable.Columns("D").value ' load values from column D
dataColQ = rgTable.Columns("Q").value ' load values from column Q
' get each unique row number with a dictionary
Dim dict As New VBA.collection, indexes(), r&, rr
On Error Resume Next
For r = UBound(dataColD) To 1 Step -1
dict.Add r, dataColD(r, 1) & vbNullChar & dataColQ(r, 1)
Next
On Error GoTo 0
' index all the unique rows in an array
ReDim indexes(1 To UBound(dataColD), 1 To 1)
For Each rr In dict: indexes(rr, 1) = rr: Next
' insert the indexes in the last column on the right
Set rgIndex = rgTable.Columns(rgTable.Columns.count + 1)
rgIndex.value = indexes
' sort the rows on the indexes, duplicates will move at the end
Union(rgTable, rgIndex).Sort key1:=rgIndex, Orientation:=xlTopToBottom, Header:=xlYes
' delete the index column on the right and the empty rows at the bottom
rgIndex.EntireColumn.Delete
rgTable.Resize(UBound(dataColD) - dict.count + 1).offset(dict.count).EntireRow.Delete
End Sub
100,00 rows × 87 columns in 40.3 seconds.
If your data set(s) start at 30K rows and only get bigger you should be looking to in-memory processing whenever possible¹. I've adapted the methods used in this solution to more closely follow your requirements.
The following bulk loads all values into a variant array and builds a Scripting.Dictionary object from the results. The 'overwrite' method of adding keys to the dictionary is used so that only the last one is kept.
When the collation has been performed, the values are returned to a re-dimensioned variant array and restored to the worksheet en masse.
Module1 (Code)
Option Explicit
Sub removeDupesKeepLast()
Dim d As Long, dDQs As Object, ky As Variant
Dim r As Long, c As Long, vVALs As Variant, vTMP As Variant
'appTGGL bTGGL:=False 'uncomment this when you have finished debugging
Set dDQs = CreateObject("Scripting.Dictionary")
dDQs.comparemode = vbTextCompare
'step 1 - bulk load the values
With Worksheets("Sheet1") 'you should know what worksheet you are on
With .Cells(1, 1).CurrentRegion 'block of data radiating out from A1
With .Resize(.Rows.Count - 1, .Columns.Count).Offset(1, 0) 'step off the header row
vVALs = .Value 'use .Value2 if you do not have dates in unformatted cells
End With
End With
End With
'step 2 - build the dictionary
ReDim vTMP(UBound(vVALs, 2) - 1)
For r = LBound(vVALs, 1) To UBound(vVALs, 1)
For c = LBound(vVALs, 2) To UBound(vVALs, 2)
vTMP(c - 1) = vVALs(r, c)
Next c
dDQs.Item(vVALs(r, 4) & ChrW(8203) & vVALs(r, 17)) = vTMP
Next r
'step 3 - put the de-duplicated values back into the array
r = 0
ReDim vVALs(1 To dDQs.Count, LBound(vVALs, 2) To UBound(vVALs, 2))
For Each ky In dDQs
r = r + 1
vTMP = dDQs.Item(ky)
For c = LBound(vTMP) To UBound(vTMP)
vVALs(r, c + 1) = vTMP(c)
Next c
Next ky
'step 4 - clear the destination; put the de-duplicated values back into the worksheet and reset .UsedRange
With Worksheets("Sheet1") 'you should know what worksheet you are on
With .Cells(1, 1).CurrentRegion 'block of data radiating out from A1
With .Resize(.Rows.Count - 1, .Columns.Count).Offset(1, 0) 'step off the header row
.ClearContents 'retain formatting if it is there
.Cells(1, 1).Resize(UBound(vVALs, 1), UBound(vVALs, 2)) = vVALs
End With
End With
.UsedRange 'assert the UsedRange property (refreshes it)
End With
dDQs.RemoveAll: Set dDQs = Nothing
appTGGL
End Sub
Public Sub appTGGL(Optional bTGGL As Boolean = True)
With Application
.ScreenUpdating = bTGGL
.EnableEvents = bTGGL
.DisplayAlerts = bTGGL
.AutoRecover.Enabled = bTGGL 'no interruptions with an auto-save
.Calculation = IIf(bTGGL, xlCalculationAutomatic, xlCalculationManual)
.CutCopyMode = False
.StatusBar = vbNullString
End With
Debug.Print Timer
End Sub
My sample workbook took 100K rows × 87 column with ~24% duplicates and processed all duplicates (keeping the last entries) in ~40 seconds. The above writes back to Sheet1; my tests were run writing back to Sheet2 in order to retain the original data. If you choose to write back to a different worksheet, make sure that there are some values starting at A1 in order that the Range.CurrentRegion property can be properly identified. The test machine was an older laptop running 32-bit Excel 2010; your own results will likely vary.
¹ See Highlight Duplicates and Filter by color alternative for tip[s on dealing with large data sets in Excel.
Related
Complicated problem... let me explain first, maybe there is a better solution rather than using iterative calculations:
(Link to Workbook)
Image showing example (to show what I'm working with)
PROBLEM:
Have 4,000+ Strings and would like to categorize them into pre-determined Groups (based on the String's content).
Each String should only be assigned to one Group. (ie. "55 gallon drum faucet" will be listed under "faucet" column, since it contains the word "faucet".)
Once categorized into a Group, the string won't be categorized under any other Groups. (ie. "55 gallon drum faucet" won't be categorized under "drum" once it's already been categorized under "faucet").
It really doesn't matter which Group each String does under, as long as it's categorized.
Note: (I've almost found a solution using iterative calculation, but it doesn't quite work).
SOLUTION:
The way I approached the problem was:
Count the number of times the String (Column A) was duplicated in the worksheet using the formula:
Formula: =COUNTIF($E$2:$IA$10000,A3)
This formula goes in Column C.
Created a formula that will categorize a String underneath a Group based on whether the String contains the Group word (ie. "faucet", "beer", "gallon", "kitchen", etc)... AND has not been used before (ie. Column C, which contains the formula from above).
Formula: =IF(C3<1,IF(IFERROR(SEARCH("faucet",A3),0)>0,A3,""),"")
Drag formula down for all 4,000 strings in Column C, and for each individual "Group" column.
The problem with this approach is that it will do an iterative calculation which will either:
Categorize the String under a Group (but WON'T increase the Times Dup'd field from 0 to 1)...
OR
Increase the "Times Dup'd" field from 0 to 1 BUT will keep the String from being categorized under the Groups column.
Any suggestions on how to address the iterative calculation issue?
(I know it keeps calculating back and forth since it's dependent, so will have to settle with 1 "right" solution... I'm wondering if there's any way to create some sort of 'block' so it can only be calculated one way...)
Any help would be greatly appreciated!
Run this procedure through your data. It performs all processing within a pair of variant arrays.
Sub byGroup()
Dim g As Long, s As Long, aSTRs As Variant, aGRPs As Variant
appTGGL bTGGL:=False
With Worksheets("Sheet1")
aSTRs = .Range(.Cells(2, 1), .Cells(Rows.Count, 1).End(xlUp)).Value2
With .Range(.Cells(1, 5), .Cells(Rows.Count, 1).End(xlUp).Offset(0, Application.Match("zzz", .Rows(1)) - 1))
.Resize(.Rows.Count, .Columns.Count).Offset(1, 0).ClearContents
aGRPs = .Cells.Value2
End With
For s = LBound(aSTRs, 1) To UBound(aSTRs, 1)
For g = LBound(aGRPs, 2) To UBound(aGRPs, 2)
If CBool(InStr(1, aSTRs(s, 1), aGRPs(1, g), vbTextCompare)) Then
aGRPs(s + 1, g) = aSTRs(s, 1)
Exit For
End If
Next g
Next s
.Cells(1, 5).Resize(UBound(aGRPs, 1), UBound(aGRPs, 2)) = aGRPs
End With
appTGGL
End Sub
Public Sub appTGGL(Optional bTGGL As Boolean = True)
Debug.Print Timer
Application.ScreenUpdating = bTGGL
Application.EnableEvents = bTGGL
Application.DisplayAlerts = bTGGL
Application.Calculation = IIf(bTGGL, xlCalculationAutomatic, xlCalculationManual)
End Sub
Elapsed time (excluding your worksheet formula recalculation should be in the 1-2 second range.
Priority of the matching groups is left to right. If you think '55 gallon drum' should be grouped as drum and not gallon then make sure that drum comes before gallon in row 1.
Saving the new macro-enabled workbook as an Excel Binary Workbook (.XLSB) reduces the workbook file size by roughly half.
I was working on something and Jeeped beat me to the answer. I tried Jeeped's code, but was getting multiple group entries for some of the strings. Here is the code I was working on if it's of any value at this point:
Sub sikorloa()
Dim r As Integer
Dim c As Integer
Dim LastRow As Integer
Dim LastCol As Integer
Dim strng As String
Dim grp As String
Application.Calculation = xlCalculationManual
Application.ScreenUpdating = False
LastRow = Range("A" & Rows.Count).End(xlUp).Row
LastCol = Cells(1, Cells.Columns.Count).End(xlToLeft).Column
For r = 3 To LastRow
If Cells(r, 1).Value <> "" Then
strng = Cells(r, 1).Value
For c = 5 To LastCol
grp = Cells(1, c).Value
If InStr(strng, grp) > 0 Then
Cells(r, c).Value = Cells(r, 1).Value
Exit For
End If
Next c
End If
Next r
Application.Calculation = xlCalculationAutomatic
Application.ScreenUpdating = True
End Sub
I am a new user of VBA and am trying to do the following (I got stuck towards the end):
I need to locate the first empty cell across every row from column C to P (3 to 16), take this value, and paste it in the column B of the same row.
What I try to do was:
Find non-empty cells in column C, copy those values into column B.
Then search for empty cells in column B, and try to copy the first non-empty cell in that row.
The first part worked out fine, but I am not too sure how to copy the first non-empty cell in the same row. I think if this can be done, I might not need the first step. Would appreciate any advice/help on this. There is the code:
Private Sub Test()
For j = 3 To 16
For i = 2 To 186313
If Not IsEmpty(Cells(i, j)) Then
Cells(i, j - 1) = Cells(i, j)
End If
sourceCol = 2
'column b has a value of 2
RowCount = Cells(Rows.Count, sourceCol).End(xlUp).Row
'for every row, find the first blank cell, copy the first not empty value in that row
For currentRow = 1 To RowCount
currentRowValue = Cells(currentRow, sourceCol).Value
If Not IsEmpty(Cells(i, 3)) Or Not IsEmpty(Cells(i, 4)) Or Not IsEmpty(Cells(i, 5)) Or Not IsEmpty(Cells(i, 6)) Then
Paste
~ got stuck here
Next i
Next j
End Sub
Your loop is really inefficient as it is iterating over millions of cells, most of which don't need looked at. (16-3)*(186313-2)=2,422,043.
I also don't recommend using xlUp or xlDown or xlCellTypeLastCell as these don't always return the results you expect as the meta-data for these cells are created when the file is saved, so any changes you make after the file is saved but before it is re-saved can give you the wrong cells. This can make debugging a nightmare. Instead, I recommend using the Find() method to find the last cell. This is fast and reliable.
Here is how I would probably do it. I'm looping over the minimum amount of cells I can here, which will speed things up.
You may also want to disable the screenupdating property of the application to speed things up and make the whole thing appear more seemless.
Lastly, if you're new to VBA it's good to get in the habit of disabling the enableevents property as well so if you currently have, or add in the future, any event listeners you will not trigger the procedures associated with them to run unnecessarily or even undesirably.
Option Explicit
Private Sub Test()
Dim LastUsed As Range
Dim PasteHere As Range
Dim i As Integer
Application.ScreenUpdating=False
Application.EnableEvents=False
With Range("B:B")
Set PasteHere = .Find("*", .Cells(1, 1), xlFormulas, xlPart, xlByRows, xlPrevious, False, False, False)
If PasteHere Is Nothing Then Set PasteHere = .Cells(1, 1) Else: Set PasteHere = PasteHere.Offset(1)
End With
For i = 3 To 16
Set LastUsed = Cells(1, i).EntireColumn.Find("*", Cells(1, i), xlFormulas, xlPart, xlByRows, xlPrevious, False, False, False)
If Not LastUsed Is Nothing Then
LastUsed.Copy Destination:=PasteHere
Set PasteHere = PasteHere.Offset(1)
End If
Set LastUsed = Nothing
Next
Application.ScreenUpdating=True
Application.EnableEvents=True
End Sub
Sub non_empty()
Dim lstrow As Long
Dim i As Long
Dim sht As Worksheet
Set sht = Worksheets("Sheet1")
lstrow = sht.Cells(sht.Rows.Count, "B").End(xlUp).Row
For i = 1 To lstrow
If IsEmpty(Range("B" & i)) Then
Range("B" & i).Value = Range("B" & i).End(xlToRight).Value
End If
Next i
End Sub
As the title says. Is there any function or VBA code which does the same function as a countif and is a lot faster. Currently in the middle of massive countif and it is just eating up my CPU.
It is just a basic countif inside the worksheet. Not in VBA.
=countif(X:X,Y) However the lists are massive. So both lists are around 100,000~ rows
If you can do without a count of the occurances and simply wish to check if the value x exists in the column of y's, then returning a boolean TRUE or FALSE with the ISNUMBER function evaluating a MATCH function lookup will greatly speed up the process.
=ISNUMBER(MATCH(S1, Y:Y, 0))
Fill down as necessary to catch all returns. Sort and/or filter the returned values to tabulate results.
Addendum:
Apparently there is. The huge improvement in the MATCH function calculation times over the COUNTIF function made me wonder if MATCH couldn't be put into a loop, advancing the first cell in its lookup_array parameter to the previously returned row number plus one until there were no more matches. Additionally, subsequent MATCh calls to lookup the same number (increasing the count) could be made to increasingly smaller lookup_array cell ranges by resizing (shrinking) the height of the column by the returned row number as well. If the processed values and their counts were stored as keys and items in a scripting dictionary, duplicate values could be instantly resolved without processing a count.
Sub formula_countif_test()
Dim tmr As Double
appOFF
tmr = Timer
With Sheet2.Cells(1, 1).CurrentRegion
With .Offset(1, 0).Resize(.Rows.Count - 1, .Columns.Count) 'skip header
.Cells(1, 3).Resize(.Rows.Count, 1).FormulaR1C1 = _
"=countif(c1, rc2)" 'no need for calculate when blocking in formulas like this
End With
End With
Debug.Print "COUNTIF formula: " & Timer - tmr
appON
End Sub
Sub formula_match_test()
Dim rw As Long, mrw As Long, tmr As Double, vKEY As Variant
'the following requires Tools, References, Microsoft Scripting Dictionary
Dim dVALs As New Scripting.dictionary
dVALs.CompareMode = vbBinaryCompare 'vbtextcompare for non-case sensitive
appOFF
tmr = Timer
With Sheet2.Cells(1, 1).CurrentRegion
With .Offset(1, 0).Resize(.Rows.Count - 1, .Columns.Count) 'skip header
For rw = 1 To .Rows.Count
vKEY = .Cells(rw, 2).Value2
If Not dVALs.Exists(vKEY) Then
dVALs.Add Key:=vKEY, _
Item:=Abs(IsNumeric(Application.Match(vKEY, .Columns(1), 0)))
If CBool(dVALs.Item(vKEY)) Then
mrw = 0: dVALs.Item(vKEY) = 0
Do While IsNumeric(Application.Match(vKEY, .Columns(1).Offset(mrw, 0).Resize(.Rows.Count - mrw + 1, 1), 0))
mrw = mrw + Application.Match(vKEY, .Columns(1).Offset(mrw, 0).Resize(.Rows.Count - mrw + 1, 1), 0)
dVALs.Item(vKEY) = CLng(dVALs.Item(vKEY)) + 1
Loop
End If
.Cells(rw, 3) = CLng(dVALs.Item(vKEY))
Else
.Cells(rw, 3) = CLng(dVALs.Item(vKEY))
End If
Next rw
End With
End With
Debug.Print "MATCH formula: " & Timer - tmr
dVALs.RemoveAll: Set dVALs = Nothing
appON
End Sub
Sub appON(Optional ws As Worksheet)
Application.ScreenUpdating = True
Application.EnableEvents = True
Application.Calculation = xlCalculationAutomatic
End Sub
Sub appOFF(Optional ws As Worksheet)
Application.ScreenUpdating = False
Application.EnableEvents = False
Application.Calculation = xlCalculationManual
End Sub
I used 10K rows with columns A and B filled by RANDBETWEEN(1, 999) then copied and pasted as values.
Elapsed times:
Test 1¹ - 10K rows × 2 columns filled with RANDBETWEEN(1, 999)
COUNTIF formula: 15.488 seconds
MATCH formula: 1.592 seconds
Test 2² - 10K rows × 2 columns filled with RANDBETWEEN(1, 99999)
COUNTIF formula: 14.722 seconds
MATCH formula: 3.484 seconds
I also copied the values from the COUNTIF formula into another column and compared them to the ones returned by the coded MATCH function. They were identical across the 10K rows.
¹ More multiples; less zero counts
² More zero counts, less multiples
While the nature of the data clearly makes a significant difference, the coded MATCH function outperformed the native COUNTIF worksheet function every time.
Don't forget the VBE's Tools ► References ► Microsoft Scripting Dictionary.
I use the following technique in place of COUNTIF. I have 115k rows of data and the calculation step was basically instantaneous, but you spend a bit more time setting it up.
Make a copy of the data you want to count and put in column A of a new sheet.
Sort the data you want to count (such that all identical items are next to each other).
Put the following formula in column B =IF(A2=A1,B2+1,1). Populate the column with the formula then paste value.
Put a sequential number in column C (just 1,2,3,4 ... up to the number of rows you have)
Sort everything by column C descending. The result is that in column B, the biggest count comes first.
Select column A and B, then use "Remove Duplicate" function. Now you're left with one entry per distinct row of Data and the biggest count for each.
Back in your real data sheet, use =VLOOKUP(A2,Sheet2!A:B,2,false) to get the count.
If you want to make a macro out of this, simply use Record Macro while performing the above actions.
Try sumproduct(countif(x:x,y:y))
It’s slightly faster but by how much I am not sure.
Also let us know if you have found a better option out there.
There is an easy workaround for COUNTIF, after sorting the data. You may add this to your VB Script, and run. For data with around 1 lakh line items, normal COUNTIF takes almost 10-15 mins. This script will get the counts in <10 secs.
Sub alternateFunctionForCountIF()
Dim DS As Worksheet
Set DS = ThisWorkbook.ActiveSheet
Dim lcol As Integer
lcol = DS.Cells(1, Columns.Count).End(xlToLeft).Column
Dim fieldHeader As String
Dim lrow As Long, i As Long, j As Long
Dim countifCol As Integer, fieldCol As Integer
fieldHeader = InputBox("Enter the column header to apply COUNTIF")
If Len(fieldHeader) = 0 Then
MsgBox ("Invalid input. " & Chr(13) & "Please enter the column header text and try again")
Exit Sub
End If
For i = 1 To lcol
If fieldHeader = DS.Cells(1, i).Value Then
fieldCol = i
Exit For
End If
Next i
If fieldCol = 0 Then
MsgBox (fieldHeader & " could not be found among the headers. Please enter a valid column header")
Exit Sub
End If
countifCol = fieldCol + 1
lrow = DS.Cells(Rows.Count, "A").End(xlUp).Row
DS.Range(DS.Cells(1, countifCol).EntireColumn, DS.Cells(1, countifCol).EntireColumn).Insert
DS.Cells(1, countifCol) = fieldHeader & "_count"
DS.Sort.SortFields.Clear
DS.Sort.SortFields.Add Key:=Range(DS.Cells(2, fieldCol), DS.Cells(lrow, fieldCol)), SortOn:=xlSortOnValues, Order:=xlAscending, DataOption:=xlSortNormal
With DS.Sort
.SetRange Range(DS.Cells(1, 1), DS.Cells(lrow, lcol))
.header = xlYes
.MatchCase = False
.Orientation = xlTopToBottom
.SortMethod = xlPinYin
.Apply
End With
Dim startPos As Long, endPos As Long
Dim checkText As String
For i = 2 To lrow
checkText = LCase(CStr(DS.Cells(i, fieldCol).Value))
If (checkText <> LCase(CStr(DS.Cells(i - 1, fieldCol).Value))) Then
startPos = i
End If
If (checkText <> LCase(CStr(DS.Cells(i + 1, fieldCol).Value))) Then
endPos = i
For j = startPos To endPos
DS.Cells(j, countifCol) = endPos - startPos + 1
Next j
End If
Next i
MsgBox ("Done")
End Sub
I was wondering if there was a way to get all the different autofilter criteria in a list in order to iterate through each criteria, to in the end copy and paste each different table that would appear to a separate sheet as it iterates through.
Ideally this would be run n times:
ActiveSheet.Range(AllRows).AutoFilter Field:=10, Criteria1:=CritVariable
Where n is the number of different CritVariables there are.
I'd like to stress that I know how to copy and paste in the macro itself, but I was curious how to iterate through all the different criteria because the criteria could be different depending on the day. If a list of it isn't available how would I best go about iterating through the criteria?
You can study and adapt the following. Here is an outline of what is going on.
I have a staff-table starting at cell A5, with a list of Offices in
column G;
I'm copying from G5 downwards (assuming there are no blanks in this column's data) to W1;
From range W1 downwards I am removing duplicates;
Then I'm looping through this data, using Advanced Filter to copy the data for each office to an area starting at cell Z1;
This filtered data is then moved (Cut) to a new worksheet, which is named from the current Office name (the criteria);
After each Advanced Filter the cell W2 is deleted, making the value in W3 move up, so that it can be used for the next filter operation.
This does mean that when you press Ctrl-End to go to the last-used cell it goes further than it needs to. You can find a way to resolve this if necessary ;).
Sub SheetsFromFilter()
Dim wsCurrent As Worksheet
Dim wsNew As Worksheet
Dim iLeft As Integer
Set wsCurrent = ActiveSheet
Application.ScreenUpdating = False
Range("G5", Range("G5").End(xlDown)).Copy Range("W1")
Range("W1").CurrentRegion.RemoveDuplicates Columns:=1, Header:=xlYes
iLeft = Range("W1").CurrentRegion.Rows.Count - 1
Do While iLeft > 0
wsCurrent.Range("A5").CurrentRegion.AdvancedFilter xlFilterCopy, _
wsCurrent.Range("W1:W2"), wsCurrent.Range("Z1")
Set wsNew = Worksheets.Add
wsCurrent.Range("Z1").CurrentRegion.Cut wsNew.Range("A1")
wsNew.Name = wsCurrent.Range("W2").Value
wsCurrent.Range("W2").Delete xlShiftUp
iLeft = iLeft - 1
Loop
wsCurrent.Range("W1").Clear
Application.ScreenUpdating = True
End Sub
BTW I don't intend to modify this for your specific file; this is something that you should do (or pay someone to do ;) ).
BTW It could be done using the normal (rather than Advanced) Filter. You would still copy the column and remove duplicates. This would have the benefit of not increasing the apparent size of the worksheet too much. But I decided to do it this way ;).
Added: Well, I felt inspired to achieve this with AutoFilter as well:
Sub SheetsFromAutoFilter()
Dim wsCurrent As Worksheet
Dim wsNew As Worksheet
Dim iLeft As Integer
Set wsCurrent = ActiveSheet
Application.ScreenUpdating = False
Range("G5", Range("G5").End(xlDown)).Copy Range("W1")
Range("W1").CurrentRegion.RemoveDuplicates Columns:=1, Header:=xlYes
iLeft = Range("W1").CurrentRegion.Rows.Count - 1
Do While iLeft > 0
Set wsNew = Worksheets.Add
With wsCurrent.Range("A5").CurrentRegion
.AutoFilter field:=7, _
Criteria1:=wsCurrent.Range("W1").Offset(iLeft).Value
.Copy wsNew.Range("A1")
.AutoFilter
End With
wsNew.Name = wsCurrent.Range("W1").Offset(iLeft).Value
iLeft = iLeft - 1
Loop
wsCurrent.Range("W1").CurrentRegion.Clear
Application.ScreenUpdating = True
End Sub
[Both procedures could be improved using Defined Names and some error handling/checking.]
if you want you can build a new collection which will have an array of only unique values and then loop over them. you will know that each
I know it's late and you've already selected an answer, but I'm working on a similar project involving a pivot table and decided to do it this way:
'Here I'm Filtering a column of Week numbers to get rid of non-numbers
'From a pivot table
'I select sheet where my underlying pivot data is located and establish the range
'My data is in column 2 and it ends after "DSLastRow" Rows starting at Row 2
Sheets("DataSheet").Select
DSLastRow = Cells.Find("*", SearchOrder:=xlByRows, SearchDirection:=xlPrevious).Row
'I create and redim an array that is large enough to hold all of the data in the range
Dim FilterCriteria(): RedimFilterCriteria(1 To DSLastRow)
For r = 2 To DSLastRow 'r for row / my data has a header in row 1
If Cells(r, 2).Value <> "" Then 'again, starting in column B (2)
'Check if it's already in the FilterCriteria Array
For CheckFCA = 1 To r
'Jumps to next row if it finds a match
If FilterCriteria(CheckFCA) = Cells(r, 2).Value Then GoTo Nextr
'Saves the value and jumps to next row if it reaches an empty value in the array
If IsEmpty(FilterCriteria(CheckFCA)) Then
FilterCriteria(CheckFCA) = Cells(r, 2)
GoTo Nextr
End If
Next CheckFCA
End if
Nextr:
Next r
'At this point FilterCriteria() is filled with all of the unique values
'I'm filtering a pivot table which is why I created the unique array from
'the source data, but you should be able to just loop through the table
Sheets("Pivot").Select
ActiveSheet.PivotTables("ReportPivot").PivotFields("Week").ClearAllFilters
With ActiveSheet.PivotTables("ReportPivot").PivotFields("Week")
For FilterPivot = 1 To DSLastRow
'I'm filtering out all non-numeric items
If IsEmpty(FilterCriteria(FilterPivot)) Then Exit For
If Not IsNumeric(FilterCriteria(FilterPivot)) Then
.PivotItems(FilterCriteria(FilterPivot)).Visible = False
End If
Next FilterPivot
End With
Need a much Optimized code.Well I Got a Project and I have Succefully made it work with the vba (Mostly helped by the stackoverflow programmers Thanks for that)
But Today I got a Feedback. Its deleting 2 more unique entries in the record But I dont know why its deleting Them.
The Algorithm I have applied
I have Used the COUNTIF function Which I found on google
="countif(A$1:A2,A3)=0" A3 is the active cell, Checks A2,A1 for dupes
It Throws False if there is a duplicate in The A column and True If it is a unique.What I have understood about Countif is that
It checks all the above columns values from that cell I mean let us take A4. SO it checks A2,A1,A3 for the duplicate. Similarly A10 checks for A1 to A9 and throws either TRue or False.Well It was working But I dont know what went wrong The code is not working for some entries.Its even showing False for the Unique entries sometimes.
And its taking more time to applye these formula as I have more amount of data. Im trying to make it cleaner and more Optimizing Way.People told me its not a c or some other Language to make it optimize but Im need of code that makes my code more optimized
I need code for these condtions can anyone help me as my countif failed.Im little helpless in doing so.
1)I have a column and I should check for duplicates in that column and delete that row if it is a duplicate
2) I have 35000 Old entries in the column and I have new entries 2000 everyweek these are appended. I need to check these 2000 entries from the total 37000 ( as we appened we get 35000+2000) and these delete operation need to be performed only on the newly appended 2000 entries but it should check the duplicates for entire column
Let me explain you clearly I have 2000 entries newly added,so Only these entries are to be checked for the duplicates from the 35000 entries and also from itself (2000 entries) and delete it if it is a duplicate and no duplicating operation should be performed on the 35000 entries old data.
I have found some codes but they are deleting even the duplicates of the 35000 entries. I have set the range but even though its not working.
Can anyone help me with the best code that takes less time?please thank you
Updating my question with the sample code I have
A B F G H I Y
PTY 39868.5 4 2 540 3 PTY39868.5425403
GTY 34446.1234 2 1 230 1 GTY34446.1234212301
PTY 3945.678 2 2 PTY3945.67822
GTY 34446.1234 2 1 230 1 GTY34446.1234212301
let us say these are old 35000 entries
Explaination to the above example.
The above are the 35000 entries. I have to check A,B,F,G,H,I columns for the dupes, if they are same I have to delete the row, I should not bother about the other columns c,d etc. so what I did is I have used one unused column Y and concatenated these 6 columns values into 1 at Y column using these
= A2 & B2 & F2 & G2 & H2 &I2 with the respective columns
Now checking the Y column for dupes and delete the entire row. as 2003 supports only for one column as far to my knowledge.
Notice that even the 35000 entries may have duplicates in it but I should not delete them. Example you can see the 2 and last row in my example code are dupes but I should not delete
as it is the old data.
A B F G H I Y
PTY 39868.5 4 2 540 3 PTY39868.5425403 'old
GTY 34446.1234 2 1 230 1 GTY34446.1234212301 'old
PTY 3945.678 2 2 PTY3945.67822 'old
GTY 34446.1234 2 1 230 1 GTY34446.1234212301 'old
PTY 3945.678 1 1 230 2 PTY3945.678112302 'new
PTY 39868.5 4 2 540 3 PTY39868.5425403 'new
PTY 3945.678 1 1 230 2 PTY3945.678112302 'new
Now note that New entry PTY (from last 2nd) is a duplicate of the original record(PTY at first) So I hava to delete it.And the last new entry is a duplicate of the new entry itself so I should delete it even that . SO in the above code I have to delete only the last 2 rows which are dupes of original record and also from it . But should not delete the GTY which is the dupe but which is in orginal record.
I think I gave a clear view now. Is concatenating them into one cell . Is it better way to approach? as conactenatin for 40000 entries taking just 2 seconds i think that doesnt matter but any more algorithms to these is much aprreciated
I heard counif treats 45.00 and 45.00000 as different is that right may be that was the problem with it? since I have decimal points in my data. I think I should do
= I2 & H2 & G2 & F2 & A2 & B2
which is better to concatenate? is this or the other i posted before?
BIG UPDATE:
It think the original questions threw me off - there may be a problem with the logic in the question. The following assumes you want to delete the cell, not entire row, for the duplicate entries.
If the 35000 old records do not include duplicates, then all you need to do is remove all duplicates from the entire column - so long as you start from row 1, you run no risk of deleting any of the 'old' rows since no duplicates exist in them.
Here is one way:
Sub UniqueList()
Application.ScreenUpdating = False
Dim vArray As Variant
Dim i As Long, j As Long, lastrow As Long
Dim dictionary As Object
Set dictionary = CreateObject("scripting.dictionary")
lastrow = Range("A" & Rows.Count).End(xlUp).Row
vArray = Range("A1:A" & lastrow).Value
On Error Resume Next
For i = 1 To UBound(vArray, 1)
For j = 1 To UBound(vArray, 2)
If Len(vArray(i, j)) <> 0 Then
dictionary(vArray(i, j)) = 1
End If
Next
Next
Columns("A:A").ClearContents
Range("A1").Resize(dictionary.Count).Value = _
Application.Transpose(dictionary.keys)
Application.ScreenUpdating = True
End Sub
If for some odd reason the 35000 old records DO include dupes and you only want to allow these 35000 records to do so, then you can use 2 dictionaries, but this would be an unusual case since you'd be treating the old records differently than new...
Sub RemoveNewDupes()
Application.ScreenUpdating = False
Dim lastRow As Long
Dim varray As Variant
Dim oldDict As Object, newDict As Object
Set oldDict = CreateObject("scripting.dictionary")
Set newDict = CreateObject("scripting.dictionary")
On Error Resume Next
lastRow = Range("A" & Rows.Count).End(xlUp).Row
'Add old entries to dictionary
varray = Range("A1:A35000").Value
For i = 1 To UBound(varray, 1)
oldDict.Add varray(i, 1), 1
Next
'Check for dupes
varray = Range("A35001:A" & lastRow).Value
For i = 1 To UBound(varray, 1)
If oldDict.exists(varray(i, 1)) = False Then
newDict.Add varray(i, 1), 1
End If
Next
'Delete and slap back on the unique list
Range("A35001", "A" & Rows.Count).ClearContents
Range("A35001").Resize(newDict.Count).Value = _
Application.Transpose(newDict.keys)
Application.ScreenUpdating = True
End Sub
Thanks to Reafidy for the advice and getting me to relook at this.
This is also a response to some of the comments and solutions made by other members so sorry if it does not straight away answer your question.
Firstly I believe that using excel in a database scenario that raw data and presentation data should be separated. This usually means a single worksheet with raw data and multiple other worksheets with presentation data. Then delete the raw data when necessary or archive.
When speed testing it is very difficult to get a level playing field in excel as there are many things that affect the results. Computer specs, available RAM etc.. Code must first be compiled before running any of the procedures. The test data is also important, when considering duplicates - how many duplicates vs how many rows. This sub loads some test data, altering the amount of rows vs the range of random numbers (duplicates) will give very different results for your code. I don't know what your data looks like so we are kind of working blind and your results may be very different.
'// This is still not very good test data, but should suffice for this situation.
Sub TestFill()
'// 300000 rows
For i = 1 To 300000
'// This populates a random number between 1 & 10000 - adjust to suit
Cells(i, "A").value = Int((100000 + 1) * Rnd + 1)
Next
End Sub
If we are talking about advanced filter vs an array & dictonary method then advanced filter will be quicker with a lower amount of rows but once you get above a certain amount of rows then the array method will be quicker. Then see what happens when you change the amount of duplicates.... :)
As a guideline or as a general rule using excels built in functions will be faster and I recommend always develop attempting to use these inbuilt functions, however there are often exceptions, like above when removing duplicates. :)
Deleting rows can be slow when looping if used incorrectly. If looping is used then it is important to keep synchronisation between code and the workbook out of the loop. This usually means read data to an array, loop through the data, then load the data from the array back to the presentation worksheet essentially deleting the unwanted data.
Sub RemoveDuplicatesA()
'// Copy raw data to presentation sheet
Range("A1", Cells(Rows.Count, "A").End(xlUp)).AdvancedFilter _
Action:=xlFilterCopy, CopyToRange:=Sheet2.Range("B1"), Unique:=True
End Sub
This will be the fastest method:
Sub RemoveDuplicatesB()
Dim vData As Variant, vArray As Variant
Dim lCnt As Long, lRow As Long
vData = ActiveSheet.UsedRange.Columns(1).value
ReDim vArray(0 To UBound(vData, 1), 0)
lCnt = 0
With CreateObject("Scripting.Dictionary")
For lRow = 1 To UBound(vData, 1)
If Not .Exists(vData(lRow, 1)) Then
vArray(lCnt, 0) = vData(lRow, 1): lCnt = lCnt + 1
.Add vData(lRow, 1), Nothing
End If
Next lRow
End With
'// Copy raw data to presentation sheet
Sheet2.Range("B1").Resize(lCnt).value = vArray
End Sub
Application transpose has a limitation of 65536 rows but as you are using 2003 you should be fine using it, therefore you can simplify the above code with:
Sub RemoveDuplicatesC()
Dim vData As Variant
Dim lRow As Long
vData = ActiveSheet.UsedRange.Columns(1).value
With CreateObject("Scripting.Dictionary")
For lRow = 1 To UBound(vData, 1)
If Not .exists(vData(lRow, 1)) Then
.Add vData(lRow, 1), Nothing
End If
Next lRow
'// Copy raw data to presentation sheet or replace raw data
Sheet2.Columns(2).ClearContents
Sheet2.Columns(2).Resize(.Count).value = Application.Transpose(.keys)
End With
End Sub
EDIT
Okay so #Issun has mentioned you want the entire row deleted. My suggestion was to improve your spreadsheet layout by having a raw data and presentation sheet which means you dont need to delete anything hence it would have been the fastest method. If you dont want to do that and would like to edit the raw data directly then try this:
Sub RemoveDuplicatesD()
Dim vData As Variant, vArray As Variant
Dim lRow As Long
vData = ActiveSheet.UsedRange.Columns(1).value
ReDim vArray(1 To UBound(vData, 1), 0)
With CreateObject("Scripting.Dictionary")
For lRow = 1 To UBound(vData, 1)
If Not .exists(vData(lRow, 1)) Then
varray(lRow, 0) = "x"
.Add vData(lRow, 1), Nothing
End If
Next lRow
End With
Application.ScreenUpdating = False
'// Modify the raw data
With ActiveSheet
.Columns(2).Insert
.Range("B1").Resize(lRow).value = vArray
.Columns(2).SpecialCells(xlCellTypeBlanks).EntireRow.Delete
.Columns(2).Delete
End With
Application.ScreenUpdating = True
End Sub
Before starting again from scratch your whole code, here are a few things you can try:
Optimize your VBA
There are several tips on the web about optimizing vba. In particular, you can do:
'turn off some Excel functionality so your code runs faster
'these two are especially very efficient
Application.ScreenUpdating = False
Application.Calculation = xlCalculationManual
'use these if you really need to
Application.DisplayStatusBar = False
Application.EnableEvents = False
'code goes here
'at the end, restore the default behavior
'calculate the formulas
Application.Calculate
Application.ScreenUpdating = True
Application.Calculation = xlCalculationAutomatic
Application.DisplayStatusBar = True
Application.EnableEvents = True
See here for more information
Optimize your algorithm
Especially when your inserting your COUNTIF formula, you can try to fill in instead of inserting the formula in each row.
On the deleting row part, you should try the solution I gave you in your previous thread: Delete duplicate entries in a column in excel 2003 vba to filter first on the True values and then to delete the visible cells. It is probably the fastest way.
[EDIT] Seems like Doc Brown's answer would be probably the best way to handle this (hey, this is a dictionary solution that wasn't written by Issun :)). Anyway, the VBA optimization tips are still relevant because this is quite a slow language.
OK, here's the advancedfilter method. Don't know if it is faster than the dictionary method. It would be interesting to know though, so let me know after you try it. I also included the delete portion so you would have to stop that portion if you want to do a true comparison. Also, you can make this a function instead of a sub and put in your variables, however you want to change it.
Sub DeleteRepeats()
Dim d1 As Double
Dim r1 As Range, rKeepers As Range
Dim wks As Worksheet
d1 = Timer
Set wks = ActiveSheet
Application.EnableEvents = False
Application.ScreenUpdating = False
'Make sure all rows are visible
On Error Resume Next
wks.ShowAllData
wks.UsedRange.Rows.Hidden = False
wks.UsedRange.Columns.Hidden = False
On Error GoTo 0
'Get concerned range
Set r1 = wks.Range("A1:A35000")
'Filter
r1.AdvancedFilter Action:=xlFilterInPlace, Unique:=True
'Get range of cells not to be deleted
Set rKeepers = r1.SpecialCells(xlCellTypeVisible)
On Error Resume Next
wks.ShowAllData
On Error GoTo 0
rKeepers.EntireRow.Hidden = True
'Delete all undesirables
r1.SpecialCells(xlCellTypeVisible).EntireRow.Delete
'show all rows
On Error Resume Next
wks.UsedRange.Rows.Hidden = False
On Error GoTo 0
Application.EnableEvents = False
Application.ScreenUpdating = False
Debug.Print Timer() - d1
End Sub
OK, here's a take on Doc's and Issun's use of Dictionaries. Before I wasn't convinced but after looking at it and testing it and comparing to advanced filter, I am convinced, dictionaries are better for this application. I don't know why Excel isn't faster on this point since they should be using faster algorithms, it's not the hiding, unhiding of the rows since that happens very quickly. So if anyone knows, let me know. This procedure takes just over 1 second on my slow computer:
Sub FindDupesAndDelete()
Dim d1 As Double
Dim dict As Object
Dim sh As Worksheet
Dim v1 As Variant
' Dim s1() As String
Dim rDelete As Range
Dim bUnion As Boolean
d1 = Timer()
bUnion = False
Set dict = CreateObject("Scripting.Dictionary")
Set sh = ActiveSheet
v1 = Application.Transpose(sh.Range("A1", "A" _
& sh.Cells.SpecialCells(xlCellTypeLastCell).row))
' ReDim s1(1 To UBound(v1))
Dim row As Long, value As String ', newEntry As Boolean
For row = 1 To sh.Cells.SpecialCells(xlCellTypeLastCell).row
value = v1(row)
If dict.Exists(value) Then
' newEntry = False
If bUnion Then
Set rDelete = Union(rDelete, sh.Range("A" & row))
Else
Set rDelete = sh.Range("A" & row)
bUnion = True
End If
Else
' newEntry = True
dict.Add value, 1
End If
' s1(row) = newEntry
Next
rDelete.EntireRow.Delete
' sh.Range("B1", "B" & UBound(v1)) = Application.Transpose(s1)
Debug.Print Timer() - d1
End Sub
Okay so now we have some more info here is a solution. It should execute almost instantly.
The code works by filling column y with your concatenate formula. It then adds all of column y to a dictionary and using the dictionary marks each row as a duplicate in column z. It then removes all the duplicates found after row 35000. Then finally it clears both column y and column z to remove the redundant data.
Sub RemoveDuplicates()
Dim vData As Variant, vArray As Variant
Dim lRow As Long
'// Get used range of column A (excluding header) and offset to get column y
With ActiveSheet.Range("A2", Cells(Rows.Count, "A").End(xlUp)).Offset(, 24)
'// Adds the concatenate formula to the sheet column (y)
.FormulaR1C1 = "=RC[-24]&RC[-23]&RC[-19]&RC[-18]&RC[-17]&RC[-16]"
'// Adds the formula results to an array
vData = .Resize(, 1).value
End With
'// Re dimension the array to the correct size
ReDim vArray(1 To UBound(vData, 1), 0)
'// Create a dictionary object using late binding
With CreateObject("Scripting.Dictionary")
'// Loop through each row in the array
For lRow = 1 To UBound(vData, 1)
'// Check if value exists in the array
If Not .exists(vData(lRow, 1)) Then
'// Value does not exist mark as non duplicate.
vArray(lRow, 0) = "x"
'// Add value to dictionary
.Add vData(lRow, 1), Nothing
End If
Next lRow
End With
'// Turn off screen updating to speed up code and prevent screen flicker
Application.ScreenUpdating = False
With ActiveSheet
'// Populate column z with the array
.Range("Z2").Resize(UBound(vArray, 1)) = vArray
'// Use error handling as speciallcells throws an error when none exist.
On Error Resume Next
'// Delete all blank cells in column z
.Range("Y35001", .Cells(Rows.Count, "Y").End(xlUp)).Offset(, 1).SpecialCells(xlCellTypeBlanks).EntireRow.Delete
'// Remove error handling
On Error GoTo 0
'// Clear columns y and z
.Columns(25).Resize(, 2).ClearContents
End With
'// Turn screen updating back on.
Application.ScreenUpdating = True
End Sub
NOTE: you can change all references "activesheet" to your sheet codename if you want.
NOTE2: it assumes you have headers and has left row 1 alone.
I have used your columns and test data as best I can. Here is the test fill I used:
Sub TestFill()
For i = 1 To 37000
With Range("A" & i)
.value = Choose(Int(2 * Rnd + 1), "PTY", "GTY")
.Offset(, 1).value = Round((40000 * (Rnd + 1)), Choose(Int(4 * Rnd + 1), 1, 2, 3, 4))
.Offset(, 5).value = Int(4 * Rnd + 1)
.Offset(, 6).value = Int(2 * Rnd + 1)
.Offset(, 7).value = Choose(Int(2 * Rnd + 1), "230", "540")
.Offset(, 8).value = Int(3 * Rnd + 1)
End With
Next i
End Sub
Lets say you have your entries in column A, and you want the result of your formula in column B (but much faster). This VBA macro should do the trick:
Option Explicit
Sub FindDupes()
Dim dict As Object
Dim sh As Worksheet
Set dict = CreateObject("Scripting.Dictionary")
Set sh = ActiveSheet
Dim row As Long, value As String
For row = 1 To sh.Cells.SpecialCells(xlCellTypeLastCell).row
value = sh.Range("A" & row).Text
If dict.Exists(value) Then
sh.Range("B" & row) = "False"
Else
sh.Range("B" & row) = "True"
dict.Add value, 1
End If
Next
End Sub
(Using a dictionary gives here almost linear running time, which should be a matter of seconds for 35.0000 rows, where your original formula had quadratic running time complexity).
Edit: due to your comment: you will have to fill the dictionary first by reading each entry at least once, that is something you cannot avoid easily. What you can avoid is to fill the rows of column B again when they are already filled:
Option Explicit
Sub FindDupes()
Dim dict As Object
Dim sh As Worksheet
Set dict = CreateObject("Scripting.Dictionary")
Set sh = ActiveSheet
Dim row As Long, value As String, newEntry As Boolean
For row = 1 To sh.Cells.SpecialCells(xlCellTypeLastCell).row
value = sh.Range("A" & row).Text
If dict.Exists(value) Then
newEntry = False
Else
newEntry = True
dict.Add value, 1
End If
If Trim(sh.Range("B" & row)) = "" Then sh.Range("B" & row) = newEntry
Next
End Sub
But I suspect this won't be much faster than my first solution.
Now that you have updated that you want the entire rows deleted and that the first 35000 rows are allowed to have dupes, here is a function that will do that for you. I think I came up with a clever method and it's blazing fast, too:
Sub RemoveNewDupes()
Application.ScreenUpdating = False
Dim lastRow As Long
Dim varray As Variant
Dim oldDict As Object, newDict As Object
Set oldDict = CreateObject("scripting.dictionary")
Set newDict = CreateObject("scripting.dictionary")
On Error Resume Next
lastRow = Range("A" & Rows.Count).End(xlUp).Row
'Add old entries to dictionary
varray = Range("A1:A35000").Value
For i = 1 To UBound(varray, 1)
oldDict.Add varray(i, 1), 1
Next
'Check for dupes
varray = Range("A35001:A" & lastRow).Value
For i = 35000 + UBound(varray, 1) To 35001 Step -1
If oldDict.exists(varray(i - 35000, 1)) = True Or _
newDict.exists(varray(i - 35000, 1)) = True Then
Range("A" & i).EntireRow.Delete
Else
newDict.Add varray(i - 35000, 1), 1
End If
Next
Application.ScreenUpdating = True
'A status message at the end for finishing touch
MsgBox UBound(varray, 1) - newDict.Count & _
" duplicate row(s) found and deleted."
End Sub
How it works:
First I store the 35000 cells into a dictionary file. Then I take a variant array of every cell 35001 onward and loop through them backwards to see if it's in the 35k dictionary or not, or that we haven't come across the value yet in the loop. If it finds that it's a dupe, it deletes the row.
The cool (if I may say) way that it does the row deletion is that when you create the varray, for say A35001 - A37000, it stores them as (1, 1) (2, 1) (...). So if you set "i" to the Ubound of the array + 35000 and step back to 35001, you will loop through all the additions backwardsfrom A37000 to A35001. Then when you want to delete the row, "i" is perfectly set to the row number the value was found in, so you can delete it. And since it goes backwards, it does not screw up the loop!