I've written a script in vba to scrape the ip address populated upon making a proxied request. I've used proxy (out of list of proxies) within my vba script to test (probably none of them are working at this moment).
However, what I want to achieve is that when a requests is failed the following script will print that error message and keep going for the next requests otherwise it will parse the ip address from that site and keep going until the loops gets exhausted.
My attempt so far (consider the proxyList to be the working ones):
Sub ValidateProxies()
Dim Http As New ServerXMLHTTP60, elem As Object, S$
Dim proxyList As Variant, oProxy As Variant
proxyList = [{"98.163.59.8:8080","134.209.115.223:3128","191.101.233.198:3129","198.177.126.218:80","35.185.201.225:8080"}]
For Each oProxy In proxyList
On Error Resume Next
With Http
.Open "GET", "https://www.myip.com/", False
.setRequestHeader "User-Agent", "Mozilla/5.0"
.setProxy 2, oProxy
.send
End With
On Error GoTo 0
If Err.Number <> 0 Then
Debug.Print "Encountered an error"
Else:
With New HTMLDocument
.body.innerHTML = Http.responseText
Set elem = .querySelector("#ip")
R = R + 1: Cells(R, 1) = oProxy
Cells(R, 2) = elem.innerText
End With
End If
Next oProxy
End Sub
How can I make my script print any error when there is one and keep rolling until the loop ends?
Here is the example with async requests pool and logging statuses and errors to a worksheet. It uses a proxy list from free-proxy-list.net.
Option Explicit
Sub TestProxy()
Const PoolCapacity = 50
Const ReqTimeout = 15
Dim sResp
Dim aProxyList
Dim oMatch
Dim oWS
Dim lIndex
Dim ocPool
Dim i
Dim sResult
Dim oReq
' Parsing proxy list from free-proxy-list.net
With CreateObject("MSXML2.ServerXMLHTTP.6.0")
.Open "GET", "https://free-proxy-list.net/", True
.SetRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"
.Send
Do Until .ReadyState = 4: DoEvents: Loop
sResp = .ResponseText
End With
With CreateObject("VBScript.RegExp")
.Global = True
.Pattern = "<td[^>]*>(\d+\.\d+\.\d+\.\d+)<\/td><td[^>]*>(\d+)<\/td>"
aProxyList = Array()
For Each oMatch In .Execute(sResp)
ReDim Preserve aProxyList(UBound(aProxyList) + 1)
aProxyList(UBound(aProxyList)) = oMatch.SubMatches(0) & ":" & oMatch.SubMatches(1)
Next
End With
' Proxy checking with api.myip.com requests
Set oWS = ThisWorkbook.Sheets(1)
oWS.Cells.Delete
Set ocPool = New Collection
lIndex = 0
Do
' Check pool for completed requests
For i = ocPool.Count To 1 Step -1
On Error Resume Next
sResult = ""
With ocPool(i)(0)
Select Case True
Case .ReadyState < 4
Case .Status \ 100 <> 2
sResult = "Status " & .Status & " / " & .StatusText
Case Else
sResult = .ResponseText
End Select
End With
Select Case True
Case Err.Number <> 0
sResult = "Error " & Err.Number & " / " & Err.Description
Case (Now - ocPool(i)(1)) * 86400 > ReqTimeout
sResult = "Timeout"
End Select
On Error GoTo 0
If sResult <> "" Then
oWS.Cells(ocPool(i)(2), 2).Value = sResult
ocPool.Remove i
End If
DoEvents
Next
' Add new request to pool
If ocPool.Count < PoolCapacity And lIndex <= UBound(aProxyList) Then
Set oReq = CreateObject("MSXML2.ServerXMLHTTP.6.0")
With oWS.Cells(lIndex + 1, 1)
.Value = aProxyList(lIndex)
.Select
End With
With oReq
.Open "GET", "https://api.myip.com/", True
.SetProxy 2, aProxyList(lIndex)
.SetRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"
.Send
End With
ocPool.Add Array( _
oReq, _
Now, _
lIndex + 1 _
)
lIndex = lIndex + 1
DoEvents
End If
Loop While ocPool.Count > 0
MsgBox "Completed"
End Sub
This will print all errors encountered and you should tailor by err.Number
Option Explicit
Public Sub ValidateProxies()
Dim http As New ServerXMLHTTP60, elem As Object, S$
Dim proxyList As Variant, oProxy As Variant, r As Long
Dim html As HTMLDocument
Set html = New HTMLDocument
proxyList = [{"98.163.59.8:8080","134.209.115.223:3128","191.101.233.198:3129","198.177.126.218:80","35.185.201.225:8080"}]
For Each oProxy In proxyList
On Error GoTo errhand:
With http
.Open "GET", "https://www.myip.com/", False
.setRequestHeader "User-Agent", "Mozilla/5.0"
.SetProxy 2, oProxy
.send
With html
.body.innerHTML = http.responseText
Set elem = .querySelector("#ip")
r = r + 1: ActiveSheet.Cells(r, 1) = oProxy
ActiveSheet.Cells(r, 2) = elem.innerText
End With
End With
Next oProxy
Exit Sub
errhand:
If Err.Number <> 0 Then
Debug.Print "Encountered an error " & Err.Description, oProxy
Err.Clear
Resume Next
End If
End Sub
Related
I'm trying to create a script in vba using ServerXMLHTTP60 to parse the title of the first post from some identical links. My main goal here is to make the script asynchronous along with setting a highest time up to which the script will try before going for the next url.
However, the macro that I've created always goes for the next url when there is a timeout without being able to scrape the titles from the links.
Sub FetchContentWithinSpecificTime()
Dim oHttp As New ServerXMLHTTP60, HTML As New HTMLDocument
Dim URL As Variant, Urllist As Variant, t As Date, sResp As Boolean
Urllist = Array( _
"https://stackoverflow.com/questions/tagged/web-scraping?tab=newest&page=1", _
"https://stackoverflow.com/questions/tagged/web-scraping?tab=newest&page=2", _
"https://stackoverflow.com/questions/tagged/web-scraping?tab=newest&page=3", _
"https://stackoverflow.com/questions/tagged/web-scraping?tab=newest&page=4", _
"https://stackoverflow.com/questions/tagged/web-scraping?tab=newest&page=5" _
)
For Each URL In Urllist
Debug.Print "trying with: " & URL
With oHttp
.Open "GET", URL, True
.setRequestHeader "User-Agent", "Mozilla/5.0"
.setTimeouts 5000, 5000, 15000, 15000
.send
t = Now + TimeValue("00:00:10")
sResp = False
On Error Resume Next
Do
If .readyState = 4 Then sResp = True: Exit Do
If Now > t Then sResp = False: Exit Do
DoEvents
Loop
On Error GoTo 0
If sResp Then
HTML.body.innerHTML = .responseText
Debug.Print HTML.querySelector(".question-hyperlink").innerText
Else:
Debug.Print "failed with: " & URL
End If
End With
Next URL
End Sub
How can I make a script wait for a certain time for the titles to parse before going for the next url?
I don't know why those SO links take to long to respond but I tried with different urls and the following solution appears to be working in the right way. The credit for the rectified portion goes to the provider of this solution.
Sub FetchContentWithinSpecificTime()
Dim oHttp As New ServerXMLHTTP60, HTML As New HTMLDocument
Dim URL As Variant, Urllist As Variant, t As Date
Dim sPrice$, sResp As Boolean
Urllist = Array( _
"https://finance.yahoo.com/quote/NZDUSD=X?p=NZDUSD=X", _
"https://finance.yahoo.com/quote/FB?p=FB", _
"https://finance.yahoo.com/quote/AAPL?p=AAPL", _
"https://finance.yahoo.com/quote/IBM?p=IBM", _
"https://finance.yahoo.com/quote/UCO?p=UCO" _
)
For Each URL In Urllist
Debug.Print "trying with: " & URL
With oHttp
.Open "GET", URL, True
.setRequestHeader "User-Agent", "Mozilla/5.0"
.send
t = Now + TimeValue("00:00:10")
sResp = False
Do While .readyState < 4
If .readyState = 4 Then Exit Do
sResp = (Now > t) Or (Err.Number <> 0)
If sResp Then Exit Do
DoEvents
Loop
If Not sResp Then
HTML.body.innerHTML = .responseText
sPrice = HTML.querySelector(".Mb\(-4px\)").innerText
Debug.Print sPrice
Else:
Debug.Print "failed with: " & URL
End If
End With
Next URL
End Sub
I've written a script in vba using ServerXMLHTTP requests in order to be able to use proxy along with setting timeout parameter within it. When I run the script, it appears to be working but the problem is - it gets stuck after using the first proxy. I wish this to be running until there is no proxies left to be used. I defined this line While .readyState < 4: DoEvents: Wend only to let not the script freeze. Whether the proxies work or not the script should go on, right?
This is what I've tried:
Sub MakeProxiedRequests()
Dim Http As New ServerXMLHTTP60, Html As New HTMLDocument
Dim elem As Object, proxyList As Variant, oProxy As Variant
proxyList = Array( _
"191.96.42.184:3129", _
"138.197.108.5:3128", _
"35.245.145.147:8080", _
"173.46.67.172:58517", _
"191.96.42.82:3129", _
"157.55.201.224:8080", _
"67.205.172.239:3128", _
"191.96.42.106:3129" _
)
For Each oProxy In proxyList
Debug.Print "trying with: " & oProxy
With Http
.Open "GET", "https://stackoverflow.com/questions/tagged/web-scraping", True
.setRequestHeader "User-Agent", "Mozilla/5.0"
.setProxy 2, oProxy
.setTimeouts 600000, 600000, 15000, 15000 'I don't know the ideal timeout parameters
On Error Resume Next
.send
While .readyState < 4: DoEvents: Wend 'to let not freeze the script
Html.body.innerHTML = .responseText
Set elem = Html.querySelectorAll(".summary .question-hyperlink")
On Error GoTo 0
End With
If elem.Length > 0 Then
Debug.Print elem(0).innerText
Else:
Debug.Print "failed with: " & oProxy
End If
Next oProxy
End Sub
Note: The script will always produce the same result. However, my intention is to keep the script running until all the proxies have been used.
How can I let my script run until all the proxies have been exhausted?
The possible way is controlling request overall elapsed time and limiting it. Any run-time errors are being checked also.
Sub MakeProxiedRequests()
Const Timeout = "0:00:15"
Dim oHttp As New ServerXMLHTTP60
Dim oHtml As New HTMLDocument
Dim oElem As Object
Dim aProxyList
Dim sProxy
Dim t As Date
Dim bFailed As Boolean
aProxyList = Array( _
"191.96.42.184:3129", _
"138.197.108.5:3128", _
"35.245.145.147:8080", _
"173.46.67.172:58517", _
"191.96.42.82:3129", _
"157.55.201.224:8080", _
"67.205.172.239:3128", _
"191.96.42.106:3129" _
)
For Each sProxy In aProxyList
Debug.Print "Trying with: " & sProxy
With oHttp
.Open "GET", "https://stackoverflow.com/questions/tagged/web-scraping", True
.setRequestHeader "User-Agent", "Mozilla/5.0"
.setProxy 2, sProxy
.setTimeouts 60000, 60000, 60000, 60000
.send
t = Now() + TimeValue(Timeout)
bFailed = False
On Error Resume Next
Do
If .readyState = 4 Then Exit Do
bFailed = (Now() > t) Or (Err.Number <> 0)
If bFailed Then Exit Do
DoEvents
Loop
On Error GoTo 0
If Not bFailed Then
oHtml.body.innerHTML = .responseText
Set oElem = oHtml.querySelectorAll(".summary .question-hyperlink")
bFailed = oElem.Length = 0
End If
End With
If Not bFailed Then
Debug.Print oElem(0).innerText
Else
Debug.Print "Failed with: " & sProxy
End If
Next
End Sub
I've written a script in VBA to parse some links (connected to contact keyword) from a few sites. One link from each site. I used xmlhttp requests to accomplish the task. When I execute my script it does parse links from each site. The only problem is that few sites do not have any such links (connected to contact keyword) and as a result the output in my excel sheet becomes messy. To be clearer: if any site does not have such link, my scraper fill that column with the previous value. I'm storing those collected links just in the next columns of each search. I hope the below Image will bring you the clarity of what I meant.
This is my try so far:
Sub GetConditionalLinks()
Dim HTTP As New XMLHTTP60, Html As New HTMLDocument
Dim post As Object, cel As Range, newlink$, R&
For Each cel In Range("A1:A" & Cells(Rows.Count, 1).End(xlUp).Row)
If cel.Value <> "" Then
With HTTP
.Open "GET", cel.Value, False
.send
Html.body.innerHTML = .responseText
End With
For Each post In Html.getElementsByTagName("a")
If InStr(1, post.innerText, "contact", 1) > 0 Then newlink = post.getAttribute("href"): Exit For
Next post
cel(1, 2) = newlink
End If
Next cel
End Sub
Links I've tried with (I intentionally left few rows blank to see how the script behaves):
https://www.yify-torrent.org/search/1080p/
https://www.houzz.com/professionals/
https://chandoo.org/forum/forums/vba-macros/
https://www.amazon.com/dp/B01LTIORC8
https://stackoverflow.com/questions
https://www.amazon.com/dp/B01LTIORC8
https://www.amazon.com/dp/B00GPAFHIO
The output I'm having:
The output I'm expecting to have:
Search links are in column A and the collected links are in column B. You must have noticed already that the collected links mismatched with the source links because of my looping logic.
My Question:
How can I fix my loop to get the expected output?
What would be the fully qualified line of cel(1, 2), I meant If I mimic something like Worksheets("SomeSheet").Range("A1")?
I would expect you to be able to use something like the following:
Option Explicit
Public Sub GetConditionalLinks()
Dim HTTP As New XMLHTTP60, Html As New HTMLDocument, post As Object, i As Long, arr()
With ActiveSheet
arr = .Range("A1:B" & .Cells(.Rows.Count, 1).End(xlUp).Row).Value
With HTTP
For i = LBound(arr, 1) To UBound(arr, 1)
If arr(i, 1) <> vbNullString Then
.Open "GET", arr(i, 1), False
.send
Html.body.innerHTML = .responseText
For Each post In Html.getElementsByTagName("a")
If InStr(1, post.innerText, "contact", 1) > 0 Then arr(i, 2) = post.getAttribute("href"): Exit For
Next post
End If
Next i
End With
.Range("A1").Resize(UBound(arr, 1), UBound(arr, 2)) = arr
End With
End Sub
Access Denied Sites:
So I started getting access denied so re-wrote as the following. Open to suggestions on improving error handling. It is pretty rudimentary but I was trying to avoid GoTo statements.
Option Explicit
Public Sub GetConditionalLinks()
Dim HTTP As New MSXML2.ServerXMLHTTP60, Html As New HTMLDocument, post As Object, i As Long, arr(), timeoutError As Boolean
With ActiveSheet
arr = .Range("A1:B" & .Cells(.Rows.Count, 1).End(xlUp).Row).Value
With HTTP
For i = LBound(arr, 1) To UBound(arr, 1)
timeoutError = False
If arr(i, 1) <> vbNullString Then
.Open "GET", arr(i, 1), False
On Error GoTo Errhand
.send
If Not timeoutError Then
Html.body.innerHTML = .responseText
For Each post In Html.getElementsByTagName("a")
If InStr(1, post.innerText, "contact", 1) > 0 Then arr(i, 2) = post.getAttribute("href"): Exit For
Next post
End If
End If
Next i
End With
.Range("A1").Resize(UBound(arr, 1), UBound(arr, 2)) = arr
End With
Exit Sub
Errhand:
If Err.Number <> 0 Then
Select Case Err.Number
Case -2147012894 '<== Timeout; especially on access denied sites
timeoutError = True
Resume Next
Case Else '<== Don't know what we are gonna do yet so let's exit
Debug.Print Err.Number, Err.Description
End Select
End If
End Sub
Without using an array and looping sheet:
Option Explicit
Public Sub GetConditionalLinks()
Dim HTTP As New MSXML2.ServerXMLHTTP60, Html As New HTMLDocument, cel As Range, post As Object, R As Long, timeoutError As Boolean
Application.ScreenUpdating = False
With ActiveSheet
For Each cel In .Range("A1:A" & .Cells(Rows.Count, 1).End(xlUp).Row)
R = R + 1: timeoutError = False
If Not IsEmpty(cel) Then
HTTP.Open "GET", cel.Value, False
On Error GoTo Errhand
HTTP.send
If Not timeoutError Then
Html.body.innerHTML = HTTP.responseText
For Each post In Html.getElementsByTagName("a")
If InStr(1, post.innerText, "contact", 1) > 0 Then
.Cells(R, 2) = post.getAttribute("href"): Exit For
End If
Next post
End If
End If
Next cel
End With
Application.ScreenUpdating = True
Exit Sub
Errhand:
If Err.Number <> 0 Then
Select Case Err.Number
Case -2147012894 '<== Timeout; especially on access denied sites
timeoutError = True
Resume Next
Case Else
Debug.Print Err.Number, Err.Description
End Select
End If
Application.ScreenUpdating = True
End Sub
How about doing like the following? Only difining newlink = "" just after the for loop within the script should fix the issue:
Sub GetConditionalLinks()
Dim HTTP As New XMLHTTP60, HTML As New HTMLDocument
Dim post As Object, elem As Object, newlink$
Dim cel As Range, R&
For Each cel In Range("A1:A" & Cells(Rows.Count, 1).End(xlUp).Row)
newlink = "" '''THIS IS THE FIX
If cel.Value <> "" Then
With HTTP
.Open "GET", cel.Value, False
.send
HTML.body.innerHTML = .responseText
End With
For Each post In HTML.getElementsByTagName("a")
If InStr(1, post.innerText, "contact", 1) > 0 Then newlink = post.getAttribute("href"): Exit For
Next post
cel(1, 2) = newlink
End If
Next cel
End Sub
I've written a script in vba in combination with IE to perform clicks on some javascript links connected to each profile of a webpage. My script can click on the first link flawlessly but when it comes to click on the next link in it's second iteration, it throws permission denied error. There are on valid links connected to each profile so I can't use the links as navigation. How can I modify my script in order to click on links cyclically?
This is my script:
Sub ClickLinks()
Const Url As String = "https://intraweb.stockton.edu/eyos/page.cfm?siteID=58&pageID=7&action=dirmain&type=FAC&display=basic"
Dim IE As New InternetExplorer, Htmldoc As HTMLDocument, I&
With IE
.Visible = True
.navigate Url
While .Busy = True Or .readyState < 4: DoEvents: Wend
Set Htmldoc = .document
End With
With Htmldoc.querySelectorAll("#main table tr a")
For I = 0 To .Length - 1
.Item(I).Click 'in second iteration this line throws permission denied error
Application.Wait Now + TimeValue("00:00:03")
Next I
End With
End Sub
Using an XHR request. The following does an initial GET request to retrieve all the staff IDs. It then loops the ids issuing POST requests for each id. To show it visits each page, I retrieve the staff e-mail address from each page.
Option Explicit
Public Sub GetInfo()
Dim objHTTP As Object, URL As String, html As New HTMLDocument, i As Long, sBody As String
Set objHTTP = CreateObject("WinHttp.WinHttpRequest.5.1")
URL = "https://intraweb.stockton.edu/eyos/page.cfm?siteID=58&pageID=7&action=details"
With objHTTP
.Open "GET", "https://intraweb.stockton.edu/eyos/page.cfm?siteID=58&pageID=7&action=dirmain&type=FAC&display=basic", False
.send
html.body.innerHTML = .responseText
Dim staffIDs As Object
Set staffIDs = html.querySelectorAll("input[name=employeeID]")
For i = 0 To staffIDs.Length - 1
sBody = "employeeID=" & staffIDs(i).getAttribute("value")
.SetTimeouts 10000, 10000, 10000, 10000
.Open "POST", URL, False
.setRequestHeader "User-Agent", "User-Agent: Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"
.setRequestHeader "Content-type", "application/x-www-form-urlencoded"
On Error Resume Next
.send (sBody)
If Err.Number = 0 Then
If .Status = "200" Then
html.body.innerHTML = .responseText
Else
Debug.Print "HTTP " & .Status & " " & .statusText
Exit Sub
End If
Else
Debug.Print "Error " & Err.Number & " " & Err.Source & " " & Err.Description
Exit Sub
End If
On Error GoTo 0
Debug.Print html.querySelector("td a").innerText
Next i
End With
End Sub
Sample view on landing page:
Sample code printout from page:
Clunky time based wait for refresh and then navigation back to landing page so can submit next form. This needs improvement and some re-ordering.
Option Explicit
Public Sub ClickLinks2()
Const URL As String = "https://intraweb.stockton.edu/eyos/page.cfm?siteID=58&pageID=7&action=dirmain&type=FAC&display=basic"
Dim IE As New InternetExplorer, Htmldoc As HTMLDocument, i&
With IE
.Visible = True
.navigate URL
While .Busy = True Or .readyState < 4: DoEvents: Wend
Set Htmldoc = .document
Dim numEmployees As Long, a As Object
numEmployees = Htmldoc.querySelectorAll("a.names").Length
For i = 1 To 3 'numEmployees (1-792)
While .Busy = True Or .readyState < 4: DoEvents: Wend
.navigate URL
Application.Wait Now + TimeSerial(0, 0, 5)
.document.parentWindow.execScript "document.form" & i & ".submit();" ''javascript:document.form1.submit(); ''<== Adapted this
Next i
End With
End Sub
I have code like below. And I receive error
"run-time error '-2146697211 *800c0005)'': the system cannot locate the resource specified"
I do not know how to solve it thanks in advance for any help. Line in which error is handled is httpRequest.send
Function pullSomeSite(urlcheck As String) As Boolean
Dim httpRequest As xmlhttp
Set httpRequest = New xmlhttp
Dim URL As String
URL = urlcheck
With httpRequest
.Open "POST", URL, False
.setRequestHeader "Content-Type", "application/x-www-form-urlencoded"
.send !!!!!!here code stops!!!!!!!
End With
With httpRequest
While Not .readyState = 4 '<---------- wait
Application.Wait Now + TimeValue("0:00:01")
Wend
'Debug.Print .Status
If .Status = 200 Then
While InStr(1, .responseText, "Updating", 0) > 0 '<---------- wait again
Application.Wait Now + TimeValue("0:00:01")
Wend
pullSomeSite = True
Else
pullSomeSite = False
End If
End With
End Function
Test this :
Sub Test_URLExists()
Dim url As String
url = "http://stackoverflow.com/questions/33940044/url-check-in-vba-error"
MsgBox url, vbInformation, URLExists(url)
url = "http://stackoverflow.com/questions/12345678/url-check-in-vba-error"
MsgBox url, vbInformation, URLExists(url)
End Sub
here is how to test an URL with a function :
Function URLExists(url As String) As Boolean
Dim Request As Object
Dim ff As Integer
Dim rc As Variant
URLExists = False
On Error GoTo EndNow
Set Request = CreateObject("WinHttp.WinHttpRequest.5.1")
With Request
.Open "GET", url, False
.Send
rc = .StatusText
End With
Set Request = Nothing
If rc = "OK" Then URLExists = True
Exit Function
EndNow:
End Function
Instead of xmlhttp data type use object.
Use the below code.
. You need to give input as "http://google.com"
Sub test1()
a = pullSomeSite("http://www.flipkart.com")
MsgBox a
End Sub
Function pullSomeSite(urlcheck As String) As Boolean
Dim httpRequest As Object
Set httpRequest = CreateObject("MSXML2.XMLHTTP")
'Set httpRequest = New xmlhttp
Dim URL As String
URL = urlcheck
With httpRequest
.Open "POST", URL, False
.setRequestHeader "Content-Type", "application/x-www-form-urlencoded"
.send '!!!!!!here code stops!!!!!!!
End With
With httpRequest
While Not .readyState = 4 '<---------- wait
Application.Wait Now + TimeValue("0:00:01")
Wend
'Debug.Print .Status
If .Status = 200 Then
While InStr(1, .responseText, "Updating", 0) > 0 '<---------- wait again
Application.Wait Now + TimeValue("0:00:01")
Wend
pullSomeSite = True
Else
pullSomeSite = False
End If
End With
End Function