Unable to get rid of unwanted links - VBA

I've written a script in VBA to parse links (connected to a "contact" keyword) from a few sites, one link from each site. I used xmlhttp requests to accomplish the task. When I execute my script, it does parse links from each site. The only problem is that a few sites do not have any such link, and as a result the output in my Excel sheet becomes messy: if a site has no such link, my scraper fills that cell with the previous value. I'm storing each collected link in the column next to its search link. The actual and expected output described below should make clear what I mean.
This is my attempt so far:
Sub GetConditionalLinks()
    Dim HTTP As New XMLHTTP60, Html As New HTMLDocument
    Dim post As Object, cel As Range, newlink$, R&

    For Each cel In Range("A1:A" & Cells(Rows.Count, 1).End(xlUp).Row)
        If cel.Value <> "" Then
            With HTTP
                .Open "GET", cel.Value, False
                .send
                Html.body.innerHTML = .responseText
            End With

            For Each post In Html.getElementsByTagName("a")
                If InStr(1, post.innerText, "contact", 1) > 0 Then newlink = post.getAttribute("href"): Exit For
            Next post

            cel(1, 2) = newlink
        End If
    Next cel
End Sub
Links I've tried with (I intentionally left a few rows blank to see how the script behaves):
https://www.yify-torrent.org/search/1080p/
https://www.houzz.com/professionals/
https://chandoo.org/forum/forums/vba-macros/
https://www.amazon.com/dp/B01LTIORC8
https://stackoverflow.com/questions
https://www.amazon.com/dp/B01LTIORC8
https://www.amazon.com/dp/B00GPAFHIO
The output I'm getting (screenshot omitted):
The output I'm expecting to have (screenshot omitted):
Search links are in column A and the collected links go in column B. You may already have noticed that the collected links are mismatched with their source links because of my looping logic.
My Questions:
How can I fix my loop to get the expected output?
What would the fully qualified form of cel(1, 2) be, if I were to mimic something like Worksheets("SomeSheet").Range("A1")?

I would expect you to be able to use something like the following:
Option Explicit
Public Sub GetConditionalLinks()
    Dim HTTP As New XMLHTTP60, Html As New HTMLDocument, post As Object, i As Long, arr()

    With ActiveSheet
        arr = .Range("A1:B" & .Cells(.Rows.Count, 1).End(xlUp).Row).Value
        With HTTP
            For i = LBound(arr, 1) To UBound(arr, 1)
                If arr(i, 1) <> vbNullString Then
                    .Open "GET", arr(i, 1), False
                    .send
                    Html.body.innerHTML = .responseText
                    For Each post In Html.getElementsByTagName("a")
                        If InStr(1, post.innerText, "contact", 1) > 0 Then arr(i, 2) = post.getAttribute("href"): Exit For
                    Next post
                End If
            Next i
        End With
        .Range("A1").Resize(UBound(arr, 1), UBound(arr, 2)) = arr
    End With
End Sub
Access Denied Sites:
I then started getting Access Denied responses for some sites, so I rewrote the code as follows. I'm open to suggestions on improving the error handling; it is pretty rudimentary, but I was trying to avoid GoTo statements.
Option Explicit
Public Sub GetConditionalLinks()
    Dim HTTP As New MSXML2.ServerXMLHTTP60, Html As New HTMLDocument, post As Object, i As Long, arr(), timeoutError As Boolean

    With ActiveSheet
        arr = .Range("A1:B" & .Cells(.Rows.Count, 1).End(xlUp).Row).Value
        With HTTP
            For i = LBound(arr, 1) To UBound(arr, 1)
                timeoutError = False
                If arr(i, 1) <> vbNullString Then
                    .Open "GET", arr(i, 1), False
                    On Error GoTo Errhand
                    .send
                    If Not timeoutError Then
                        Html.body.innerHTML = .responseText
                        For Each post In Html.getElementsByTagName("a")
                            If InStr(1, post.innerText, "contact", 1) > 0 Then arr(i, 2) = post.getAttribute("href"): Exit For
                        Next post
                    End If
                End If
            Next i
        End With
        .Range("A1").Resize(UBound(arr, 1), UBound(arr, 2)) = arr
    End With
    Exit Sub

Errhand:
    If Err.Number <> 0 Then
        Select Case Err.Number
            Case -2147012894 '<== Timeout; especially on access denied sites
                timeoutError = True
                Resume Next
            Case Else '<== Don't know what we are gonna do yet so let's exit
                Debug.Print Err.Number, Err.Description
        End Select
    End If
End Sub
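One way to avoid the GoTo-based handler entirely (a sketch only; FetchHtml is a hypothetical helper name, not part of the code above) is to isolate the request in a function with its own localized On Error Resume Next, so the main loop stays linear:

Private Function FetchHtml(ByVal HTTP As Object, ByVal url As String) As String
    'Returns the response text, or vbNullString if the request fails or times out
    On Error Resume Next
    HTTP.Open "GET", url, False
    HTTP.send
    If Err.Number = 0 Then FetchHtml = HTTP.responseText
    On Error GoTo 0
End Function

The main loop can then test If Len(FetchHtml(HTTP, arr(i, 1))) > 0 before parsing, with no error labels left in the main procedure.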
Without using an array, looping the sheet instead:
Option Explicit
Public Sub GetConditionalLinks()
    Dim HTTP As New MSXML2.ServerXMLHTTP60, Html As New HTMLDocument, cel As Range, post As Object, R As Long, timeoutError As Boolean

    Application.ScreenUpdating = False
    With ActiveSheet
        For Each cel In .Range("A1:A" & .Cells(.Rows.Count, 1).End(xlUp).Row)
            R = R + 1: timeoutError = False
            If Not IsEmpty(cel) Then
                HTTP.Open "GET", cel.Value, False
                On Error GoTo Errhand
                HTTP.send
                If Not timeoutError Then
                    Html.body.innerHTML = HTTP.responseText
                    For Each post In Html.getElementsByTagName("a")
                        If InStr(1, post.innerText, "contact", 1) > 0 Then
                            .Cells(R, 2) = post.getAttribute("href"): Exit For
                        End If
                    Next post
                End If
            End If
        Next cel
    End With
    Application.ScreenUpdating = True
    Exit Sub

Errhand:
    If Err.Number <> 0 Then
        Select Case Err.Number
            Case -2147012894 '<== Timeout; especially on access denied sites
                timeoutError = True
                Resume Next
            Case Else
                Debug.Print Err.Number, Err.Description
        End Select
    End If
    Application.ScreenUpdating = True
End Sub

How about doing it like the following? Simply defining newlink = "" just inside the For loop should fix the issue:
Sub GetConditionalLinks()
    Dim HTTP As New XMLHTTP60, HTML As New HTMLDocument
    Dim post As Object, elem As Object, newlink$
    Dim cel As Range, R&

    For Each cel In Range("A1:A" & Cells(Rows.Count, 1).End(xlUp).Row)
        newlink = "" '''THIS IS THE FIX
        If cel.Value <> "" Then
            With HTTP
                .Open "GET", cel.Value, False
                .send
                HTML.body.innerHTML = .responseText
            End With

            For Each post In HTML.getElementsByTagName("a")
                If InStr(1, post.innerText, "contact", 1) > 0 Then newlink = post.getAttribute("href"): Exit For
            Next post

            cel(1, 2) = newlink
        End If
    Next cel
End Sub
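As for the second question: cel(1, 2) is shorthand for Range.Item(1, 2), i.e. row 1, column 2 relative to cel itself, which for a cell in column A is the adjacent cell in column B. A fully qualified sketch, assuming the data sits on a sheet named "SomeSheet":

'cel(1, 2) addresses the cell one column to the right of cel, so:
ThisWorkbook.Worksheets("SomeSheet").Cells(cel.Row, 2).Value = newlink
'or, equivalently, relative to cel:
cel.Offset(0, 1).Value = newlink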

Related

Data Scraping from Website - receiving error on different player pages

I'm trying to scrape game logs from basketball-reference.com. It worked perfectly on the two players I chose (DeMar DeRozan and LaMarcus Aldridge), but when I started going through other players it just wouldn't scrape the data for many of them (e.g. Kevin Durant).
I have no idea why it wouldn't work. For example, I tried Stephen Curry and it worked fine, but for players like Draymond Green and Kevin Durant the code would not scrape the data at all. For some reason, everything after the Date column stopped working.
Sub Data()
    Dim ieObj As InternetExplorer
    Dim htmlEle As IHTMLElement
    Dim i As Integer

    i = 1
    Set ieObj = New InternetExplorer
    ieObj.Visible = True
    ieObj.navigate "https://www.basketball-reference.com/players/d/duranke01/gamelog/2019"
    Application.ScreenUpdating = False
    Application.DisplayStatusBar = True
    Application.Wait Now + TimeValue("00:00:05")

    For Each htmlEle In ieObj.document.getElementsByClassName("stats_table")(0).getElementsByTagName("tr")
        With ActiveSheet
            .Range("A" & i).Value = htmlEle.Children(0).textContent
            .Range("B" & i).Value = htmlEle.Children(1).textContent
            .Range("C" & i).Value = htmlEle.Children(2).textContent
            .Range("D" & i).Value = htmlEle.Children(3).textContent
        End With
        i = i + 1
        On Error Resume Next
    Next htmlEle
End Sub
The error always happens on this line:
.Range("D" & i).Value = htmlEle.Children(3).textContent
I tried skipping columns too, but it still wouldn't work.
I had no problem using the table's id (visible in the .responseText) and using xmlhttp rather than a browser.
I only tested with 3 URLs, in Sheet1 A1:A3:
https://www.basketball-reference.com/players/d/duranke01/gamelog/2019
https://www.basketball-reference.com/players/c/curryst01/gamelog/2019
https://www.basketball-reference.com/players/g/greendr01/gamelog/2019
With this site you can sometimes encounter tables inside comments, so I stripped out the comment tags before processing. This was not necessary for the links I tried.
I use the clipboard to copy and paste the table, but you could instead just use
Set hTable = html.getElementById("pgl_basic")
Then loop the tr and td as you wish using getElementsByTagName.
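A minimal sketch of that loop, assuming hTable has been set as above and writing to the active sheet (any header cells in th elements would need similar handling):

Dim tr As Object, td As Object, r As Long, c As Long
For Each tr In hTable.getElementsByTagName("tr")
    r = r + 1: c = 0
    For Each td In tr.getElementsByTagName("td")
        c = c + 1
        ActiveSheet.Cells(r, c).Value = td.innerText
    Next td
Next tr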
References (VBE>Tools>References):
Microsoft HTML Object Library
Option Explicit
Public Sub GetPlayerInfo()
    Dim urls(), i As Long, html As HTMLDocument, hTable As Object
    Dim ws As Worksheet, wsCurrent As Object, clipboard As Object
    Dim lastRow As Long, playerIdentifier As String, arr() As String

    Set clipboard = GetObject("New:{1C3B4210-F441-11CE-B9EA-00AA006B1A69}")
    Set ws = ThisWorkbook.Worksheets("Sheet1")
    lastRow = ws.Cells(ws.Rows.Count, "A").End(xlUp).Row
    urls = Application.Transpose(ws.Range("A1:A" & lastRow).Value)
    Set html = New HTMLDocument

    With CreateObject("MSXML2.XMLHTTP")
        For i = LBound(urls) To UBound(urls)
            .Open "GET", urls(i), False
            .send
            html.body.innerHTML = Replace$(Replace$(.responseText, "-->", vbNullString), "<!--", vbNullString) 'remove comments
            arr = Split(urls(i), "/")
            playerIdentifier = arr(5)
            If SheetExists(playerIdentifier) Then
                With ThisWorkbook.Worksheets(playerIdentifier).Cells
                    .ClearContents
                    .ClearFormats
                End With
                Set wsCurrent = ThisWorkbook.Worksheets(playerIdentifier)
            Else
                Set wsCurrent = ThisWorkbook.Worksheets.Add
                wsCurrent.Name = playerIdentifier
            End If
            Set hTable = html.querySelector("#pgl_basic")
            clipboard.SetText hTable.outerHTML
            clipboard.PutInClipboard
            wsCurrent.Range("A1").PasteSpecial
            Set wsCurrent = Nothing: Set hTable = Nothing: Erase arr: playerIdentifier = vbNullString
            Application.CutCopyMode = False
        Next
    End With
End Sub

Public Function SheetExists(ByVal sheetName As String) As Boolean '<== function by #Rory
    SheetExists = Evaluate("ISREF('" & sheetName & "'!A1)")
End Function
IE
Note the proper page-load wait used.
Option Explicit
Public Sub GetPlayerInfo()
    Dim ieObj As InternetExplorer, htmlEle As IHTMLElement
    Dim urls(), i As Long, j As Long, hTable As Object
    Dim ws As Worksheet, wsCurrent As Object
    Dim lastRow As Long, playerIdentifier As String, arr() As String

    Application.ScreenUpdating = False
    On Error GoTo errHand
    Set ws = ThisWorkbook.Worksheets("Sheet1")
    lastRow = ws.Cells(ws.Rows.Count, "A").End(xlUp).Row
    urls = Application.Transpose(ws.Range("A1:A" & lastRow).Value)
    Set ieObj = New InternetExplorer

    With ieObj
        .Visible = True
        For j = LBound(urls) To UBound(urls)
            .navigate urls(j)
            While .Busy Or .readyState <> 4: DoEvents: Wend
            arr = Split(urls(j), "/")
            playerIdentifier = arr(5)
            If SheetExists(playerIdentifier) Then
                With ThisWorkbook.Worksheets(playerIdentifier).Cells
                    .ClearContents
                    .ClearFormats
                End With
                Set wsCurrent = ThisWorkbook.Worksheets(playerIdentifier)
            Else
                Set wsCurrent = ThisWorkbook.Worksheets.Add
                wsCurrent.Name = playerIdentifier
            End If
            i = 1
            Set hTable = .document.getElementById("pgl_basic")
            If Not hTable Is Nothing Then
                For Each htmlEle In hTable.getElementsByTagName("tr")
                    With wsCurrent
                        .Range("A" & i).Value = htmlEle.Children(0).textContent
                        .Range("B" & i).Value = htmlEle.Children(1).textContent
                        .Range("C" & i).Value = htmlEle.Children(2).textContent
                        .Range("D" & i).Value = htmlEle.Children(3).textContent
                    End With
                    i = i + 1
                Next htmlEle
            End If
            Set wsCurrent = Nothing: Set hTable = Nothing: Erase arr: playerIdentifier = vbNullString
        Next
    End With

errHand: 'execution deliberately falls through here so cleanup always runs
    Application.ScreenUpdating = True
    If Err.Number <> 0 Then Debug.Print Err.Number, Err.Description
    ieObj.Quit
End Sub

Unable to make my script handle errors until some loop ends

I've written a script in VBA to scrape the IP address that is populated upon making a proxied request. I've used proxies (out of a list of proxies) within my VBA script to test (probably none of them are working at this moment).
However, what I want to achieve is this: when a request fails, the script should print that error message and move on to the next request; otherwise it should parse the IP address from the site, and so on until the loop is exhausted.
My attempt so far (consider the proxyList entries to be working ones):
Sub ValidateProxies()
    Dim Http As New ServerXMLHTTP60, elem As Object, S$
    Dim proxyList As Variant, oProxy As Variant

    proxyList = [{"98.163.59.8:8080","134.209.115.223:3128","191.101.233.198:3129","198.177.126.218:80","35.185.201.225:8080"}]

    For Each oProxy In proxyList
        On Error Resume Next
        With Http
            .Open "GET", "https://www.myip.com/", False
            .setRequestHeader "User-Agent", "Mozilla/5.0"
            .setProxy 2, oProxy
            .send
        End With
        On Error GoTo 0

        If Err.Number <> 0 Then
            Debug.Print "Encountered an error"
        Else:
            With New HTMLDocument
                .body.innerHTML = Http.responseText
                Set elem = .querySelector("#ip")
                R = R + 1: Cells(R, 1) = oProxy
                Cells(R, 2) = elem.innerText
            End With
        End If
    Next oProxy
End Sub
How can I make my script print any error when there is one and keep rolling until the loop ends?
Here is an example with an async request pool, logging statuses and errors to a worksheet. It uses a proxy list from free-proxy-list.net.
Option Explicit

Sub TestProxy()
    Const PoolCapacity = 50
    Const ReqTimeout = 15
    Dim sResp
    Dim aProxyList
    Dim oMatch
    Dim oWS
    Dim lIndex
    Dim ocPool
    Dim i
    Dim sResult
    Dim oReq

    ' Parsing proxy list from free-proxy-list.net
    With CreateObject("MSXML2.ServerXMLHTTP.6.0")
        .Open "GET", "https://free-proxy-list.net/", True
        .SetRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"
        .Send
        Do Until .ReadyState = 4: DoEvents: Loop
        sResp = .ResponseText
    End With
    With CreateObject("VBScript.RegExp")
        .Global = True
        .Pattern = "<td[^>]*>(\d+\.\d+\.\d+\.\d+)<\/td><td[^>]*>(\d+)<\/td>"
        aProxyList = Array()
        For Each oMatch In .Execute(sResp)
            ReDim Preserve aProxyList(UBound(aProxyList) + 1)
            aProxyList(UBound(aProxyList)) = oMatch.SubMatches(0) & ":" & oMatch.SubMatches(1)
        Next
    End With

    ' Proxy checking with api.myip.com requests
    Set oWS = ThisWorkbook.Sheets(1)
    oWS.Cells.Delete
    Set ocPool = New Collection
    lIndex = 0
    Do
        ' Check pool for completed requests
        For i = ocPool.Count To 1 Step -1
            On Error Resume Next
            sResult = ""
            With ocPool(i)(0)
                Select Case True
                    Case .ReadyState < 4
                    Case .Status \ 100 <> 2
                        sResult = "Status " & .Status & " / " & .StatusText
                    Case Else
                        sResult = .ResponseText
                End Select
            End With
            Select Case True
                Case Err.Number <> 0
                    sResult = "Error " & Err.Number & " / " & Err.Description
                Case (Now - ocPool(i)(1)) * 86400 > ReqTimeout
                    sResult = "Timeout"
            End Select
            On Error GoTo 0
            If sResult <> "" Then
                oWS.Cells(ocPool(i)(2), 2).Value = sResult
                ocPool.Remove i
            End If
            DoEvents
        Next
        ' Add new request to pool
        If ocPool.Count < PoolCapacity And lIndex <= UBound(aProxyList) Then
            Set oReq = CreateObject("MSXML2.ServerXMLHTTP.6.0")
            With oWS.Cells(lIndex + 1, 1)
                .Value = aProxyList(lIndex)
                .Select
            End With
            With oReq
                .Open "GET", "https://api.myip.com/", True
                .SetProxy 2, aProxyList(lIndex)
                .SetRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"
                .Send
            End With
            ocPool.Add Array( _
                oReq, _
                Now, _
                lIndex + 1 _
            )
            lIndex = lIndex + 1
            DoEvents
        End If
    Loop While ocPool.Count > 0
    MsgBox "Completed"
End Sub
This will print all errors encountered; you should tailor the handling by Err.Number:
Option Explicit
Public Sub ValidateProxies()
    Dim http As New ServerXMLHTTP60, elem As Object, S$
    Dim proxyList As Variant, oProxy As Variant, r As Long
    Dim html As HTMLDocument

    Set html = New HTMLDocument
    proxyList = [{"98.163.59.8:8080","134.209.115.223:3128","191.101.233.198:3129","198.177.126.218:80","35.185.201.225:8080"}]

    For Each oProxy In proxyList
        On Error GoTo errhand
        With http
            .Open "GET", "https://www.myip.com/", False
            .setRequestHeader "User-Agent", "Mozilla/5.0"
            .SetProxy 2, oProxy
            .send
            With html
                .body.innerHTML = http.responseText
                Set elem = .querySelector("#ip")
                r = r + 1: ActiveSheet.Cells(r, 1) = oProxy
                ActiveSheet.Cells(r, 2) = elem.innerText
            End With
        End With
    Next oProxy
    Exit Sub

errhand:
    If Err.Number <> 0 Then
        Debug.Print "Encountered an error " & Err.Description, oProxy
        Err.Clear
        Resume Next
    End If
End Sub

Unable to convert my existing scraper to xmlhttp request

I have written a script to get the title from any random website, and it works flawlessly. I wrote it using Internet Explorer. I've tried a lot, but I can't accomplish the same with an xmlhttp request, and performance is a big issue to consider. What I've tried so far:
Sub Title_scraping()
    Dim IE As Object
    Dim doc As Object, cel As Range

    For Each cel In Range("A1:A5")
        Set IE = CreateObject("InternetExplorer.Application")
        IE.navigate cel.Value
        While IE.Busy
            DoEvents
        Wend
        Set doc = IE.document
        x = x + 1
        Cells(x, 2) = doc.title
    Next cel
End Sub
Sites I've tried with and got results:
https://stackoverflow.com/documentation/
https://codereview.stackexchange.com/
https://yts.ag/browse-movies
Combining your code and the code posted here, here is your final code:
Sub GetData()
    Dim title As String
    Dim objHttp As Object, cel As Range, x As Long

    Set objHttp = CreateObject("MSXML2.ServerXMLHTTP")
    For Each cel In Range("A1:A5")
        objHttp.Open "GET", cel.Value, False
        On Error Resume Next
        objHttp.send ""
        title = objHttp.responseText

        If InStr(1, UCase(title), "<TITLE>") Then
            title = Mid(title, InStr(1, UCase(title), "<TITLE>") + Len("<TITLE>"))
            title = Mid(title, 1, InStr(1, UCase(title), "</TITLE>") - 1)
        Else
            title = ""
        End If

        x = x + 1
        Cells(x, 2) = title
    Next cel
End Sub
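As an alternative to slicing the response text, you could let MSHTML parse the response and read the title from the document object. A sketch only, assuming the late-bound htmlfile object accepts .write from VBA (it generally does):

Sub GetDataViaHtmlfile()
    Dim objHttp As Object, doc As Object, cel As Range, x As Long
    Set objHttp = CreateObject("MSXML2.ServerXMLHTTP")
    For Each cel In Range("A1:A5")
        objHttp.Open "GET", cel.Value, False
        On Error Resume Next
        objHttp.send ""
        Set doc = CreateObject("htmlfile")
        doc.write objHttp.responseText 'parsing the full markup populates doc.Title
        x = x + 1
        Cells(x, 2) = doc.Title
    Next cel
End Sub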

Bulk URL checker macro in Excel

I'm seeking help, as I have a bulk list of links to check for broken ones. I've tried the macro below, but it works twice and after that it no longer works. I am using MS Office 10 64-bit. I would also like the macro to check the image resolution: for example, if I paste URLs in column A, it should highlight the broken links and show each image's resolution in column B.
Sub Audit_WorkSheet_For_Broken_Links()
    If MsgBox("Is the Active Sheet a Sheet with Hyperlinks You Would Like to Check?", vbOKCancel) = vbCancel Then
        Exit Sub
    End If
    On Error Resume Next
    For Each alink In Cells.Hyperlinks
        strURL = alink.Address
        If Left(strURL, 4) <> "http" Then
            strURL = ThisWorkbook.BuiltinDocumentProperties("Hyperlink Base") & strURL
        End If
        Application.StatusBar = "Testing Link: " & strURL
        Set objhttp = CreateObject("MSXML2.XMLHTTP")
        objhttp.Open "HEAD", strURL, False
        objhttp.Send
        If objhttp.statustext <> "OK" Then
            alink.Parent.Interior.Color = 255
        End If
    Next alink
    Application.StatusBar = False
    On Error GoTo 0
    MsgBox ("Checking Complete!" & vbCrLf & vbCrLf & "Cells With Broken or Suspect Links are Highlighted in RED.")
End Sub
Edit: I changed your macro to declare variables properly and release objects upon macro completion; this should address any potential memory issues. Please try this code and let me know if it works.
Sub Audit_WorkSheet_For_Broken_Links()
    If MsgBox("Is the Active Sheet a Sheet with Hyperlinks You Would Like to Check?", vbOKCancel) = vbCancel Then
        Exit Sub
    End If

    Dim alink As Hyperlink
    Dim strURL As String
    Dim objhttp As Object

    On Error Resume Next
    For Each alink In Cells.Hyperlinks
        strURL = alink.Address
        If Left(strURL, 4) <> "http" Then
            strURL = ThisWorkbook.BuiltinDocumentProperties("Hyperlink Base") & strURL
        End If
        Application.StatusBar = "Testing Link: " & strURL
        Set objhttp = CreateObject("MSXML2.XMLHTTP")
        objhttp.Open "HEAD", strURL, False
        objhttp.Send
        If objhttp.statustext <> "OK" Then
            alink.Parent.Interior.Color = 255
        End If
    Next alink
    Application.StatusBar = False

    'Release objects to prevent memory issues
    Set alink = Nothing
    Set objhttp = Nothing
    On Error GoTo 0
    MsgBox ("Checking Complete!" & vbCrLf & vbCrLf & "Cells With Broken or Suspect Links are Highlighted in RED.")
End Sub
Old Answer Below
Combining your macro (which seems to be from here) with an alternative found on excelforum yields the code below. Give it a try and let me know if it works for you.
Sub TestHLinkValidity()
    Dim rRng As Range
    Dim fsoFSO As Object
    Dim strPath As String
    Dim cCell As Range

    If MsgBox("Is the Active Sheet a Sheet with Hyperlinks You Would Like to Check?", vbOKCancel) = vbCancel Then
        Exit Sub
    End If

    Set fsoFSO = CreateObject("Scripting.FileSystemObject")
    Set rRng = ActiveSheet.UsedRange.Cells
    For Each cCell In rRng.Cells
        If cCell.Hyperlinks.Count > 0 Then
            strPath = GetHlinkAddr(cCell)
            'Note: FileExists only validates local/UNC paths, not http(s) URLs
            If fsoFSO.FileExists(strPath) = False Then cCell.Interior.Color = 65535
        End If
    Next cCell
End Sub

Function GetHlinkAddr(rngHlinkCell As Range)
    GetHlinkAddr = rngHlinkCell.Hyperlinks(1).Address
End Function
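The image-resolution half of the question isn't addressed by either macro above. One possible approach (a sketch only, assuming the hyperlinks point directly at image files and that the Windows Image Acquisition automation library is available) is to download each image to a temp file and read its pixel dimensions:

Sub GetImageResolution(ByVal imgUrl As String, ByRef w As Long, ByRef h As Long)
    Dim http As Object, stm As Object, img As Object, tmp As String
    tmp = Environ$("TEMP") & "\imgcheck.tmp"
    Set http = CreateObject("MSXML2.XMLHTTP")
    http.Open "GET", imgUrl, False
    http.send
    If http.Status = 200 Then
        Set stm = CreateObject("ADODB.Stream")
        stm.Type = 1 'adTypeBinary
        stm.Open
        stm.Write http.responseBody
        stm.SaveToFile tmp, 2 'adSaveCreateOverWrite
        stm.Close
        Set img = CreateObject("WIA.ImageFile")
        img.LoadFile tmp
        w = img.Width: h = img.Height 'pixel dimensions
        Set img = Nothing 'release the file lock before the next call
    End If
End Sub

You could then write w & " x " & h into column B next to each checked link.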

Excel 2010 VBA - Pull Table from website

I'm very new to Excel VBA, and I was given some code to play with. I successfully modified it once, but then I tried to modify it again and it won't pull the right info. It might be because I don't know the table id for sure; I also don't understand all of the code from this example, which is probably the other issue. Anyway, I'm trying to pull the Historical Prices table from this page. It pulls some data, but not the correct data. Any help would be appreciated. Thanks!
Here is my current code:
Sub GrabHistData()
    Dim Ptrtbl As Long, r As Long, c As Long
    Dim htm As Object
    Dim elemCollection As Object

    Set htm = CreateObject("htmlFile")
    With CreateObject("msxml2.xmlhttp")
        .Open "GET", "http://finance.yahoo.com/q/hp?s=TWTR&a=04&b=30&c=2012&d=01&e=7&f=2014&g=d", False
        .send
        htm.body.innerhtml = .responsetext
    End With

    Set elemCollection = htm.getElementsByTagName("TABLE")
    Ptrtbl = 1
    For Each elem In elemCollection
        Ptrtbl = Ptrtbl + 1
        If elem.ID <> "yfncsumtab" Then GoTo Nxtelem
        With elemCollection(Ptrtbl)
            For c = 0 To (.Rows(r).Cells.Length - 1)
                Cells(r + 1, c + 1) = .Rows(r).Cells(c).innertext
            Next c
        End With
        Exit For
Nxtelem:
    Next elem
End Sub
If you want to stick with your current approach, this works for me...
Sub GrabHistData()
    Dim Ptrtbl As Long, r As Long, c As Long
    Dim htm As Object
    Dim elemCollection As Object

    Set htm = CreateObject("htmlFile")
    With CreateObject("msxml2.xmlhttp")
        .Open "GET", "http://finance.yahoo.com/q/hp?s=TWTR&a=04&b=30&c=2012&d=01&e=7&f=2014&g=d", False
        .send
        htm.body.innerhtml = .responseText
    End With

    Set elemCollection = htm.getElementsByTagName("td")
    For Each itm In elemCollection
        If itm.classname = "yfnc_tabledata1" Then
            ActiveCell = itm.innertext
            If ActiveCell.Column = 7 Then
                ActiveCell.Offset(1, -6).Select
            Else
                ActiveCell.Offset(0, 1).Select
            End If
        End If
    Next
End Sub
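If you would rather not rely on Select/ActiveCell navigation, a small variant of the same loop writes with explicit row/column counters instead. A sketch under the same seven-columns-per-row assumption, replacing the For Each body above (itm and elemCollection as already declared there):

Dim r As Long, c As Long
r = 1: c = 1
For Each itm In elemCollection
    If itm.classname = "yfnc_tabledata1" Then
        ActiveSheet.Cells(r, c) = itm.innertext
        If c = 7 Then
            r = r + 1: c = 1
        Else
            c = c + 1
        End If
    End If
Next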