Web-scrape data without class name or ID - vba

I'm trying to make access login in site and get some data from it
that is my code :
Private Sub Command4_Click()
Dim i As SHDocVw.InternetExplorer
Dim ht As HTMLDocument
Set i = New InternetExplorer
i.Visible = True
i.navigate ("https://billing.te.eg/en-US")
Do While i.ReadyState <> READYSTATE_COMPLETE
Loop
Dim idoc As MSHTML.HTMLDocument
Set idoc = i.Document
idoc.all.TxtAreaCode.Value = "45"
idoc.all.TxtPhoneNumber.Value = "03824149"
Dim ele As MSHTML.IHTMLElement
Dim eles As MSHTML.IHTMLElementCollection
Set eles = idoc.getElementsByClassName("btn")
For Each ele In eles
If ele.Type = "button" Then
ele.Click
Else
End If
Next ele
Do While i.ReadyState <> READYSTATE_COMPLETE
Loop
If i.ReadyState = READYSTATE_COMPLETE Then
Dim Doc As HTMLDocument
Set Doc = i.Document
Dim sdd As String
sdd = Trim(Doc.getElementsByClassName("col-md-12").innerText)
MsgBox sdd
Else: End If
End Sub
and this is the part that i need to get data , I need to know the idea of how to get data which doesn't have a class name or id such like that

Try complying with the following approach. It is way faster than IE.
Sub FetchData()
Const Url$ = "https://billing.te.eg/api/Account/Inquiry"
Dim S$, elem As Object, payload As Variant
Dim phone$, areaCode$, counter&
counter = 1
areaCode = "45" 'put areacode here
phone = "03824149" 'put phone number here
payload = "AreaCode=" & areaCode & "&PhoneNumber=" & phone & "&PinCode=&InquiryBy=telephone&AccountNo="
Do
With CreateObject("MSXML2.XMLHTTP")
.Open "POST", Url, False
.setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1; ) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"
.setRequestHeader "Referer", "https://billing.te.eg/en-US"
.setRequestHeader "X-Requested-With", "XMLHttpRequest"
.setRequestHeader "Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"
.send payload
S = .responseText
End With
With CreateObject("VBScript.RegExp")
.Global = True
.MultiLine = True
.Pattern = "TotalAmount"":(.*?),"
Set elem = .Execute(S)
If elem.Count > 0 Then
MsgBox elem(0).SubMatches(0)
Exit Do
End If
End With
counter = counter + 1
If counter = 3 Then Exit Do
Loop
End Sub

Related

How do I scrape data from property tax website

I want to scrape a few things from this page http://bexar.trueautomation.com/clientdb/Property.aspx?cid=110&prop_id=1229623
I want the Property ID: under the Property table
Sqft and Market Value under the Land Table
Impprovement #1 under Improvement / Building
and running into a few issues
I can't find any combo of elementID or tag/class name to pull this info
Same as above
I only want to pull the first item in the list and adding (0) to multiple spots in my code below isn't working to make this happen
I was thinking the best way to do this is make a sub ProcessHTMLPage ProcessHTMLPage2 and ProcessHTMLPage3 that does all these and then I can work on the formatting to get them into the appropriate columns as needed
Sub GetHTMLDocumentXML()
Dim XMLPage As New MSXML2.XMLHTTP60
Dim HTMLDoc As New MSHTML.HTMLDocument
Dim HTMLPage As MSHTML.HTMLDocument
Dim URL As String
Dim HTMLDiv As MSHTML.IHTMLElement
Dim HTMLTable As MSHTML.IHTMLElement
XMLPage.Open "GET", "http://bexar.trueautomation.com/clientdb/Property.aspx?cid=110&prop_id=1229623", False
XMLPage.send
If XMLPage.Status <> 200 Then
MsgBox XMLPage.Status & " - " & XMLPage.statusText
Exit Sub
End If
HTMLDoc.body.innerHTML = XMLPage.responseText
ProcessHTMLPage2 HTMLDoc
End Sub
Sub ProcessHTMLPage2(HTMLPage As MSHTML.HTMLDocument)
Dim HTMLTable As MSHTML.IHTMLElement
Dim HTMLTables As MSHTML.IHTMLElementCollection
Dim HTMLRow As MSHTML.IHTMLElement
Dim HTMLCell As MSHTML.IHTMLElement
Dim RowNum As Long, ColNum As Integer
Set HTMLTables = HTMLPage.getElementsByClassName("improvements")
Cells.Clear
For Each HTMLTable In HTMLTables
Debug.Print HTMLTable.className
RowNum = RowNum + 1
For Each HTMLRow In HTMLTable.getElementsByTagName("tr")
Debug.Print vbTab & HTMLRow.innerText
ColNum = 1
For Each HTMLCell In HTMLRow.Children
Debug.Print vbTab & HTMLCell.innerText
Cells(RowNum, ColNum) = HTMLCell.innerText
ColNum = ColNum + 1
Next HTMLCell
Next HTMLRow
Next HTMLTable
Range("A1").Select
ActiveCell.CurrentRegion.EntireColumn.AutoFit
End Sub
Try the following to get Property ID,Sqft,Market Value from that webpage. I had to use hardcoded index to locate the last two elements as I could not find any specific marker.
Public Sub FetchInfo()
Const Url$ = "http://bexar.trueautomation.com/clientdb/Property.aspx?cid=110&prop_id=1229623"
Dim S$, oItem As Object
Dim propertyId$, Sqft$, marketValue$
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", Url, False
.setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1; ) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"
.send
S = .responseText
End With
With CreateObject("HTMLFile")
.write S
For Each oItem In .getElementsByTagName("td")
If InStr(oItem.innerText, "Property ID:") > 0 Then
propertyId = oItem.NextSibling.innerText
Exit For
End If
Next oItem
Sqft = .getElementById("landDetails").getElementsByTagName("td")(4).innerText
marketValue = .getElementById("landDetails").getElementsByTagName("td")(7).innerText
Debug.Print propertyId, Sqft, marketValue
End With
End Sub

Unable to make my script handle errors until some loop ends

I've written a script in vba to scrape the ip address populated upon making a proxied request. I've used proxy (out of list of proxies) within my vba script to test (probably none of them are working at this moment).
However, what I want to achieve is that when a requests is failed the following script will print that error message and keep going for the next requests otherwise it will parse the ip address from that site and keep going until the loops gets exhausted.
My attempt so far (consider the proxyList to be the working ones):
Sub ValidateProxies()
Dim Http As New ServerXMLHTTP60, elem As Object, S$
Dim proxyList As Variant, oProxy As Variant
proxyList = [{"98.163.59.8:8080","134.209.115.223:3128","191.101.233.198:3129","198.177.126.218:80","35.185.201.225:8080"}]
For Each oProxy In proxyList
On Error Resume Next
With Http
.Open "GET", "https://www.myip.com/", False
.setRequestHeader "User-Agent", "Mozilla/5.0"
.setProxy 2, oProxy
.send
End With
On Error GoTo 0
If Err.Number <> 0 Then
Debug.Print "Encountered an error"
Else:
With New HTMLDocument
.body.innerHTML = Http.responseText
Set elem = .querySelector("#ip")
R = R + 1: Cells(R, 1) = oProxy
Cells(R, 2) = elem.innerText
End With
End If
Next oProxy
End Sub
How can I make my script print any error when there is one and keep rolling until the loop ends?
Here is the example with async requests pool and logging statuses and errors to a worksheet. It uses a proxy list from free-proxy-list.net.
Option Explicit
Sub TestProxy()
Const PoolCapacity = 50
Const ReqTimeout = 15
Dim sResp
Dim aProxyList
Dim oMatch
Dim oWS
Dim lIndex
Dim ocPool
Dim i
Dim sResult
Dim oReq
' Parsing proxy list from free-proxy-list.net
With CreateObject("MSXML2.ServerXMLHTTP.6.0")
.Open "GET", "https://free-proxy-list.net/", True
.SetRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"
.Send
Do Until .ReadyState = 4: DoEvents: Loop
sResp = .ResponseText
End With
With CreateObject("VBScript.RegExp")
.Global = True
.Pattern = "<td[^>]*>(\d+\.\d+\.\d+\.\d+)<\/td><td[^>]*>(\d+)<\/td>"
aProxyList = Array()
For Each oMatch In .Execute(sResp)
ReDim Preserve aProxyList(UBound(aProxyList) + 1)
aProxyList(UBound(aProxyList)) = oMatch.SubMatches(0) & ":" & oMatch.SubMatches(1)
Next
End With
' Proxy checking with api.myip.com requests
Set oWS = ThisWorkbook.Sheets(1)
oWS.Cells.Delete
Set ocPool = New Collection
lIndex = 0
Do
' Check pool for completed requests
For i = ocPool.Count To 1 Step -1
On Error Resume Next
sResult = ""
With ocPool(i)(0)
Select Case True
Case .ReadyState < 4
Case .Status \ 100 <> 2
sResult = "Status " & .Status & " / " & .StatusText
Case Else
sResult = .ResponseText
End Select
End With
Select Case True
Case Err.Number <> 0
sResult = "Error " & Err.Number & " / " & Err.Description
Case (Now - ocPool(i)(1)) * 86400 > ReqTimeout
sResult = "Timeout"
End Select
On Error GoTo 0
If sResult <> "" Then
oWS.Cells(ocPool(i)(2), 2).Value = sResult
ocPool.Remove i
End If
DoEvents
Next
' Add new request to pool
If ocPool.Count < PoolCapacity And lIndex <= UBound(aProxyList) Then
Set oReq = CreateObject("MSXML2.ServerXMLHTTP.6.0")
With oWS.Cells(lIndex + 1, 1)
.Value = aProxyList(lIndex)
.Select
End With
With oReq
.Open "GET", "https://api.myip.com/", True
.SetProxy 2, aProxyList(lIndex)
.SetRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"
.Send
End With
ocPool.Add Array( _
oReq, _
Now, _
lIndex + 1 _
)
lIndex = lIndex + 1
DoEvents
End If
Loop While ocPool.Count > 0
MsgBox "Completed"
End Sub
This will print all errors encountered and you should tailor by err.Number
Option Explicit
Public Sub ValidateProxies()
Dim http As New ServerXMLHTTP60, elem As Object, S$
Dim proxyList As Variant, oProxy As Variant, r As Long
Dim html As HTMLDocument
Set html = New HTMLDocument
proxyList = [{"98.163.59.8:8080","134.209.115.223:3128","191.101.233.198:3129","198.177.126.218:80","35.185.201.225:8080"}]
For Each oProxy In proxyList
On Error GoTo errhand:
With http
.Open "GET", "https://www.myip.com/", False
.setRequestHeader "User-Agent", "Mozilla/5.0"
.SetProxy 2, oProxy
.send
With html
.body.innerHTML = http.responseText
Set elem = .querySelector("#ip")
r = r + 1: ActiveSheet.Cells(r, 1) = oProxy
ActiveSheet.Cells(r, 2) = elem.innerText
End With
End With
Next oProxy
Exit Sub
errhand:
If Err.Number <> 0 Then
Debug.Print "Encountered an error " & Err.Description, oProxy
Err.Clear
Resume Next
End If
End Sub

My script throws error while clicking on links cyclically

I've written a script in vba in combination with IE to perform clicks on some javascript links connected to each profile of a webpage. My script can click on the first link flawlessly but when it comes to click on the next link in it's second iteration, it throws permission denied error. There are on valid links connected to each profile so I can't use the links as navigation. How can I modify my script in order to click on links cyclically?
This is my script:
Sub ClickLinks()
Const Url As String = "https://intraweb.stockton.edu/eyos/page.cfm?siteID=58&pageID=7&action=dirmain&type=FAC&display=basic"
Dim IE As New InternetExplorer, Htmldoc As HTMLDocument, I&
With IE
.Visible = True
.navigate Url
While .Busy = True Or .readyState < 4: DoEvents: Wend
Set Htmldoc = .document
End With
With Htmldoc.querySelectorAll("#main table tr a")
For I = 0 To .Length - 1
.Item(I).Click 'in second iteration this line throws permission denied error
Application.Wait Now + TimeValue("00:00:03")
Next I
End With
End Sub
Using an XHR request. The following does an initial GET request to retrieve all the staff IDs. It then loops the ids issuing POST requests for each id. To show it visits each page, I retrieve the staff e-mail address from each page.
Option Explicit
Public Sub GetInfo()
Dim objHTTP As Object, URL As String, html As New HTMLDocument, i As Long, sBody As String
Set objHTTP = CreateObject("WinHttp.WinHttpRequest.5.1")
URL = "https://intraweb.stockton.edu/eyos/page.cfm?siteID=58&pageID=7&action=details"
With objHTTP
.Open "GET", "https://intraweb.stockton.edu/eyos/page.cfm?siteID=58&pageID=7&action=dirmain&type=FAC&display=basic", False
.send
html.body.innerHTML = .responseText
Dim staffIDs As Object
Set staffIDs = html.querySelectorAll("input[name=employeeID]")
For i = 0 To staffIDs.Length - 1
sBody = "employeeID=" & staffIDs(i).getAttribute("value")
.SetTimeouts 10000, 10000, 10000, 10000
.Open "POST", URL, False
.setRequestHeader "User-Agent", "User-Agent: Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"
.setRequestHeader "Content-type", "application/x-www-form-urlencoded"
On Error Resume Next
.send (sBody)
If Err.Number = 0 Then
If .Status = "200" Then
html.body.innerHTML = .responseText
Else
Debug.Print "HTTP " & .Status & " " & .statusText
Exit Sub
End If
Else
Debug.Print "Error " & Err.Number & " " & Err.Source & " " & Err.Description
Exit Sub
End If
On Error GoTo 0
Debug.Print html.querySelector("td a").innerText
Next i
End With
End Sub
Sample view on landing page:
Sample code printout from page:
Clunky time based wait for refresh and then navigation back to landing page so can submit next form. This needs improvement and some re-ordering.
Option Explicit
Public Sub ClickLinks2()
Const URL As String = "https://intraweb.stockton.edu/eyos/page.cfm?siteID=58&pageID=7&action=dirmain&type=FAC&display=basic"
Dim IE As New InternetExplorer, Htmldoc As HTMLDocument, i&
With IE
.Visible = True
.navigate URL
While .Busy = True Or .readyState < 4: DoEvents: Wend
Set Htmldoc = .document
Dim numEmployees As Long, a As Object
numEmployees = Htmldoc.querySelectorAll("a.names").Length
For i = 1 To 3 'numEmployees (1-792)
While .Busy = True Or .readyState < 4: DoEvents: Wend
.navigate URL
Application.Wait Now + TimeSerial(0, 0, 5)
.document.parentWindow.execScript "document.form" & i & ".submit();" ''javascript:document.form1.submit(); ''<== Adapted this
Next i
End With
End Sub

How to download data from web using TR and TD tag names?

Until recently, I was using the code below, which worked fine for a while. Now, all of a sudden it doesn't work.
Sub Dow_HistoricalData()
Dim xmlHttp As Object
Dim TR_col As Object, TR As Object
Dim TD_col As Object, TD As Object
Dim row As Long, col As Long
ThisSheet = ActiveSheet.Name
Range("A2").Select
Do Until ActiveCell.Value = ""
Symbol = ActiveCell.Value
Sheets(ThisSheet).Select
Sheets.Add
Set xmlHttp = CreateObject("MSXML2.XMLHTTP.6.0")
myURL = "http://finance.yahoo.com/quote/" & Symbol & "/financials?ltr=1"
xmlHttp.Open "GET", myURL, False
xmlHttp.setRequestHeader "Content-Type", "text/xml"
xmlHttp.send
Dim html As Object
Set html = CreateObject("htmlfile")
html.body.innerHTML = xmlHttp.responseText
Dim tbl As Object
Set tbl = html.getElementById("Lh(1.7) W(100%) M(0)")
'
row = 1
col = 1
Set TR_col = html.getElementsByTagName("TR")
For Each TR In TR_col
Set TD_col = TR.getElementsByTagName("TD")
For Each TD In TD_col
Cells(row, col) = TD.innerText
col = col + 1
Next
col = 1
row = row + 1
Next
Sheets(ActiveSheet.Name).Name = Symbol
Sheets(ThisSheet).Select
ActiveCell.Offset(1, 0).Select
Loop
End Sub
I'm getting an error message in this line:
xmlHttp.send
Here's the error message.
'Access is denied.' I did some research on this, and I think it has to do with security, but I don't know what has changed recently, either on my machine or on the Yahoo site.
Here is an image of my setup.
I believe URL has moved from http to https so the error. Also, I Changed to CreateObject("MSXML2.ServerXMLHTTP")
Sub Dow_HistoricalData()
Dim xmlHttp As Object, html As Object
Dim tbl As Object
Dim TR_col As Object, TR As Object
Dim TD_col As Object, TD As Object
Dim row As Long, col As Long, i As Long
Dim sht As Worksheet, newSht As Worksheet
Set sht = ActiveSheet
i = 2
Do Until sht.Cells(i, 1) = ""
Set newSht = Sheets.Add
Symbol = sht.Cells(i, 1)
newSht.Name = Symbol
Set xmlHttp = CreateObject("MSXML2.ServerXMLHTTP")
myURL = "https://finance.yahoo.com/quote/" & Symbol & "/financials?ltr=1"
xmlHttp.Open "GET", myURL, False
xmlHttp.setRequestHeader "Content-Type", "text/xml"
xmlHttp.send
Set html = CreateObject("htmlfile")
html.body.innerHTML = xmlHttp.responseText
Set tbl = html.getElementById("Lh(1.7) W(100%) M(0)")
'
row = 1
col = 1
Set TR_col = html.getElementsByTagName("TR")
For Each TR In TR_col
Set TD_col = TR.getElementsByTagName("TD")
For Each TD In TD_col
newSht.Cells(row, col) = TD.innerText
col = col + 1
Next
col = 1
row = row + 1
Next
i = i + 1
Loop
Set TR_col = Nothing
Set TR = Nothing
Set TD = Nothing
Set html = Nothing
Set xmlHttp = Nothing
End Sub
Seems to work if you fake the browser:
With CreateObject("WinHttp.WinHttpRequest.5.1") '
.Open "GET", "http://finance.yahoo.com/quote/IBM", False
.SetRequestHeader "User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9b5) Gecko/2008032620 Firefox/3.0b5"
.Send
MsgBox .ResponseText
End With

URL check in VBA error

I have code like below. And I receive error
"run-time error '-2146697211 *800c0005)'': the system cannot locate the resource specified"
I do not know how to solve it thanks in advance for any help. Line in which error is handled is httpRequest.send
Function pullSomeSite(urlcheck As String) As Boolean
Dim httpRequest As xmlhttp
Set httpRequest = New xmlhttp
Dim URL As String
URL = urlcheck
With httpRequest
.Open "POST", URL, False
.setRequestHeader "Content-Type", "application/x-www-form-urlencoded"
.send !!!!!!here code stops!!!!!!!
End With
With httpRequest
While Not .readyState = 4 '<---------- wait
Application.Wait Now + TimeValue("0:00:01")
Wend
'Debug.Print .Status
If .Status = 200 Then
While InStr(1, .responseText, "Updating", 0) > 0 '<---------- wait again
Application.Wait Now + TimeValue("0:00:01")
Wend
pullSomeSite = True
Else
pullSomeSite = False
End If
End With
End Function
Test this :
Sub Test_URLExists()
Dim url As String
url = "http://stackoverflow.com/questions/33940044/url-check-in-vba-error"
MsgBox url, vbInformation, URLExists(url)
url = "http://stackoverflow.com/questions/12345678/url-check-in-vba-error"
MsgBox url, vbInformation, URLExists(url)
End Sub
here is how to test an URL with a function :
Function URLExists(url As String) As Boolean
Dim Request As Object
Dim ff As Integer
Dim rc As Variant
URLExists = False
On Error GoTo EndNow
Set Request = CreateObject("WinHttp.WinHttpRequest.5.1")
With Request
.Open "GET", url, False
.Send
rc = .StatusText
End With
Set Request = Nothing
If rc = "OK" Then URLExists = True
Exit Function
EndNow:
End Function
Instead of xmlhttp data type use object.
Use the below code.
. You need to give input as "http://google.com"
Sub test1()
a = pullSomeSite("http://www.flipkart.com")
MsgBox a
End Sub
Function pullSomeSite(urlcheck As String) As Boolean
Dim httpRequest As Object
Set httpRequest = CreateObject("MSXML2.XMLHTTP")
'Set httpRequest = New xmlhttp
Dim URL As String
URL = urlcheck
With httpRequest
.Open "POST", URL, False
.setRequestHeader "Content-Type", "application/x-www-form-urlencoded"
.send '!!!!!!here code stops!!!!!!!
End With
With httpRequest
While Not .readyState = 4 '<---------- wait
Application.Wait Now + TimeValue("0:00:01")
Wend
'Debug.Print .Status
If .Status = 200 Then
While InStr(1, .responseText, "Updating", 0) > 0 '<---------- wait again
Application.Wait Now + TimeValue("0:00:01")
Wend
pullSomeSite = True
Else
pullSomeSite = False
End If
End With
End Function