Cant' fetch some information from a webpage using xhr - vba

I'm trying to grab a certain portion of information from a webpage using xmlhttp request. When I execute my script, it throws an error Object Variable Or With---. But, when I try the same using IE, I get the content like magic.
The most important thing to notice is that the content I'm expecting to grab are neither javascript encrypted nor generated dynamically. So, I should get them using xhr. Where I'm going wrong?
Here goes the website link
Using IE (working one):
Sub GetText()
Const Url As String = "https://www.baseball-reference.com/boxes/ANA/ANA201806180.shtml"
Dim IE As New InternetExplorer, HTML As HTMLDocument, post As Object
With IE
.Visible = False
.navigate Url
While .Busy = True Or .readyState < 4: DoEvents: Wend
Set HTML = .document
End With
Set post = HTML.querySelectorAll(".section_content")(2)
MsgBox post.innerText
End Sub
Using XHR (not working):
Sub GetText()
Const Url As String = "https://www.baseball-reference.com/boxes/ANA/ANA201806180.shtml"
Dim Http As New XMLHTTP60, HTML As New HTMLDocument, post As Object
With Http
.Open "GET", Url, False
.send
HTML.body.innerHTML = .responseText
End With
Set post = HTML.querySelectorAll(".section_content")(2)
MsgBox post.innerText
End Sub
The selector I've defined above is flawless.
I could have pasted here the relevant html elements but they are wrapped within comments. However, I've provided above the link to that site.
To be clearer: the portion of text I'm interested in looks exactly like below in that webpage.
My question: how can I get the aforementioned textblock (which is shown above within image) using XHR?

The solution is plain and simple. All you need to do is kick out the comment signs from responseText using Replace() function or so and then filter them using Html.body.innerHTML to make them proper html contents. The rest is as usual.
This is how you can get the content:
Sub GetTextFromComment()
Const URL As String = "https://www.baseball-reference.com/boxes/ANA/ANA201806180.shtml"
Dim Http As New XMLHTTP60, Html As New HTMLDocument, post As Object
With Http
.Open "GET", URL, False
.send
Html.body.innerHTML = Replace(Replace(.responseText, "<!--", ""), "-->", "")
End With
Set post = Html.querySelectorAll(".section_content")(2)
MsgBox post.innerText
End Sub

Using comment position:
Option Explicit
Public Sub GetInfo()
Dim sResponse As String, html As New HTMLDocument
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://www.baseball-reference.com/boxes/ANA/ANA201806180.shtml", False
.send
sResponse = StrConv(.responseBody, vbUnicode)
End With
sResponse = Mid$(sResponse, InStr(1, sResponse, "<!DOCTYPE "))
With html
.body.innerHTML = sResponse
html.body.innerHTML = html.querySelector("#all_9711922514").LastChild.Data
Debug.Print html.querySelector("#div_9711922514").innerText
End With
End Sub
Method using nodeType:
Option Explicit
Public Sub GetInfo()
Dim sResponse As String, html As New HTMLDocument, ele As Object
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://www.baseball-reference.com/boxes/ANA/ANA201806180.shtml", False
.send
sResponse = StrConv(.responseBody, vbUnicode)
End With
sResponse = Mid$(sResponse, InStr(1, sResponse, "<!DOCTYPE "))
With html
.body.innerHTML = sResponse
For Each ele In html.querySelector("#all_9711922514").Children
If ele.NodeType = 8 Then
html.body.innerHTML = ele.Data
Debug.Print html.querySelector("#div_9711922514").innerText
Exit For
End If
Next
End With
End Sub
Method using regex:
Option Explicit
Public Sub GetInfo()
Dim sResponse As String, html As New HTMLDocument
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://www.baseball-reference.com/boxes/ANA/ANA201806180.shtml", False
.send
sResponse = StrConv(.responseBody, vbUnicode)
End With
sResponse = Mid$(sResponse, InStr(1, sResponse, "<!DOCTYPE "))
With html
.body.innerHTML = sResponse
Dim s As String
s = .querySelector("div[id=all_1786105919]").outerHTML
s = regexRemove(s, "<([^>]+)>")
Debug.Print Replace$(Replace$(s, "&", "°"), "-->", vbNullString)
End With
End Sub
Public Function regexRemove(ByVal s As String, ByVal pattern As String) As String
Dim regex As Object
Set regex = CreateObject("VBScript.RegExp")
With regex
.Global = True
.MultiLine = True
.IgnoreCase = False
.pattern = pattern
End With
If regex.test(s) Then
regexRemove = regex.Replace(s, vbNullString)
Else
regexRemove = s
End If
End Function
Output:

Related

Can't make a script wait asynchronously for a certain time for the titles to parse before going for the next url

I'm trying to create a script in vba using ServerXMLHTTP60 to parse the title of the first post from some identical links. My main goal here is to make the script asynchronous along with setting a highest time up to which the script will try before going for the next url.
However, the macro that I've created always goes for the next url when there is a timeout without being able to scrape the titles from the links.
Sub FetchContentWithinSpecificTime()
Dim oHttp As New ServerXMLHTTP60, HTML As New HTMLDocument
Dim URL As Variant, Urllist As Variant, t As Date, sResp As Boolean
Urllist = Array( _
"https://stackoverflow.com/questions/tagged/web-scraping?tab=newest&page=1", _
"https://stackoverflow.com/questions/tagged/web-scraping?tab=newest&page=2", _
"https://stackoverflow.com/questions/tagged/web-scraping?tab=newest&page=3", _
"https://stackoverflow.com/questions/tagged/web-scraping?tab=newest&page=4", _
"https://stackoverflow.com/questions/tagged/web-scraping?tab=newest&page=5" _
)
For Each URL In Urllist
Debug.Print "trying with: " & URL
With oHttp
.Open "GET", URL, True
.setRequestHeader "User-Agent", "Mozilla/5.0"
.setTimeouts 5000, 5000, 15000, 15000
.send
t = Now + TimeValue("00:00:10")
sResp = False
On Error Resume Next
Do
If .readyState = 4 Then sResp = True: Exit Do
If Now > t Then sResp = False: Exit Do
DoEvents
Loop
On Error GoTo 0
If sResp Then
HTML.body.innerHTML = .responseText
Debug.Print HTML.querySelector(".question-hyperlink").innerText
Else:
Debug.Print "failed with: " & URL
End If
End With
Next URL
End Sub
How can I make a script wait for a certain time for the titles to parse before going for the next url?
I don't know why those SO links take to long to respond but I tried with different urls and the following solution appears to be working in the right way. The credit for the rectified portion goes to the provider of this solution.
Sub FetchContentWithinSpecificTime()
Dim oHttp As New ServerXMLHTTP60, HTML As New HTMLDocument
Dim URL As Variant, Urllist As Variant, t As Date
Dim sPrice$, sResp As Boolean
Urllist = Array( _
"https://finance.yahoo.com/quote/NZDUSD=X?p=NZDUSD=X", _
"https://finance.yahoo.com/quote/FB?p=FB", _
"https://finance.yahoo.com/quote/AAPL?p=AAPL", _
"https://finance.yahoo.com/quote/IBM?p=IBM", _
"https://finance.yahoo.com/quote/UCO?p=UCO" _
)
For Each URL In Urllist
Debug.Print "trying with: " & URL
With oHttp
.Open "GET", URL, True
.setRequestHeader "User-Agent", "Mozilla/5.0"
.send
t = Now + TimeValue("00:00:10")
sResp = False
Do While .readyState < 4
If .readyState = 4 Then Exit Do
sResp = (Now > t) Or (Err.Number <> 0)
If sResp Then Exit Do
DoEvents
Loop
If Not sResp Then
HTML.body.innerHTML = .responseText
sPrice = HTML.querySelector(".Mb\(-4px\)").innerText
Debug.Print sPrice
Else:
Debug.Print "failed with: " & URL
End If
End With
Next URL
End Sub

VBA scrape src instead of href

I am using the code below code but it brings the value of 'src' instead of 'href' for some reason. Anyone can help please?
Sub bringfox(txt As String)
Dim oHtml As HTMLDocument
Dim oElement As Object
Set oHtml = New HTMLDocument
maintext2 = "https://www.jjfox.co.uk/cigars/show/all.html"
With CreateObject("WINHTTP.WinHTTPRequest.5.1")
.Open "GET", maintext2 & gr, False
.send
oHtml.body.innerHTML = .responseText
End With
counter = cnt
'oElement(i).Children(0).getAttribute ("href")
Set oElement = oHtml.getElementsByClassName("products-grid products-grid--max-3-col")(0).getElementsByTagName("a")
i = 0
While i < oElement.Length
Debug.Print oElement(i).Children(0).getAttribute("href")
i = i + 1
Wend
End Sub
You could try using a CSS selector.
#wrapper div.category-products > ul a
This is a reduced version of the full selector that targets a tags within the products category. You then parse the outerHTML for the hrefs as that is where the information is located.
Site image (Sample view)
Output from code (Sample view)
Code
Option Explicit
Public Sub GetInfo()
Dim oHtml As HTMLDocument, nodeList As Object, currentItem As Long
Const URL As String = "https://www.jjfox.co.uk/cigars/show/all.html"
Set oHtml = New HTMLDocument
With CreateObject("WINHTTP.WinHTTPRequest.5.1")
.Open "GET", URL, False
.send
oHtml.body.innerHTML = .responseText
End With
Set nodeList = oHtml.querySelectorAll("#wrapper div.category-products > ul a")
For currentItem = 0 To nodeList.Length - 1
On Error Resume Next
Debug.Print Split(Split(nodeList.item(currentItem).outerHTML, "<A href=")(1), ">")(0)
On Error GoTo 0
Next currentItem
End Sub
Or more simply, use the following
For currentItem = 0 To nodeList.Length - 1
On Error Resume Next
Debug.Print nodeList.item(currentItem).href
On Error GoTo 0
Next currentItem

Open website, find specific value and return value to Excel in VBA

I would like to use VBA to open a website, look for a certain paragraph in the HTML code of this website (<p class="myClass">XYZ</p>) and return this value to Excel, in my example "XYZ".
The website has only one paragraph (p element) with the above class.
I know that this is possible but don't know where to start here.
My code:
Dim objIE As Object
Set objIE = CreateObject("InternetExplorer.Application")
With objIE
.Navigate varUrl
Do While .Busy
Application.Wait Now + TimeValue("0:00:01")
Loop
.Visible = True
End With
Instead of opening IE, use a web request:
Set oRequest = New WinHttp.WinHttpRequest
With oRequest
.Open "GET", sUrl, True
.SetRequestHeader "Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"
.Send "{range:9129370}"
.WaitForResponse
Set index = .ResponseText.IndexOf("<p class=""myClass"">")
Set text = .ResponseText.Substring(index,3)
Cells(row, col).Value = text
End With
If you don't know the length of the string you are looking for, you could also do a loop after index until you hit a "<" character.
Dim objIE As Object
Set objIE = CreateObject("InternetExplorer.Application")
With objIE
.Navigate varUrl
Do While .Busy
Application.Wait Now + TimeValue("0:00:01")
Loop
.Visible = True
End With
'HTML document
Dim doc As Object
Set doc = objIE.document
Dim el As Object
Dim myText as string
For Each el In doc.GetElementsByClassName("myClass")
'put paragrah text in cell A1
Cells(1, 1).Value = el.innerText
'put your paragraph text in a variable string
myText = el.innerText
Next el
That is a tricky and interesting question. Let's say that you want to obtain the title of this current website, which is in class question-hyperlink within StackOverflow. Thus, using the idea of the solution of #Matt Spinks you may come up with something like this:
Option Explicit
Public Sub TestMe()
Dim oRequest As Object
Dim strOb As String
Dim strInfo As String: strInfo = "class=""question-hyperlink"">"
Dim lngStart As Long
Dim lngEnd As Long
Set oRequest = CreateObject("WinHttp.WinHttpRequest.5.1")
With oRequest
.Open "GET", "http://stackoverflow.com/questions/42254051/vba-open-website-find-specific-value-and-return-value-to-excel#42254254", True
.SetRequestHeader "Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"
.Send "{range:9129370}"
.WaitForResponse
strOb = .ResponseText
End With
lngStart = InStr(1, strOb, strInfo)
lngEnd = InStr(lngStart, strOb, "<")
Debug.Print Mid(strOb, lngStart + Len(strInfo), lngEnd - lngStart - Len(strInfo))
End Sub
Instead of Debug.print you may get the Title in a string and work further.

Run time error 91, object variable or with block variable not set

My intention is to scrape all the app name from that page and the app link leading to the next page. However, when i run it, i see that after looping once it produces the following error "Run time error 91, object variable or with block variable not set".Here is the full code. Any help would really be appreciated. Thanks in advance.
Sub app_crawler()
Dim xmlpage As New XMLHTTP60, htmldoc As New HTMLDocument
Dim htmlas As Object, htmla As Object, sstr As String
xmlpage.Open "GET", "https://itunes.apple.com/us/app/candy-crush-saga/id553834731?mt=8", False
xmlpage.send
htmldoc.body.innerHTML = xmlpage.responseText
For Each htmlas In htmldoc.getElementsByClassName("lockup-info")(0).getElementsByTagName("a")
sstr = htmlas.href
xmlpage.Open "GET", sstr, False
xmlpage.send
htmldoc.body.innerHTML = xmlpage.responseText
For Each htmla In htmldoc.getElementsByClassName("intro")(1).getElementsByTagName("h1")
x = x + 1: Cells(x, 1) = htmla.innerText
Next htmla
Next htmlas
End Sub
As Mat's Mug commented, you'll have to test if htmlas(x) returns Nothing before getting elements from it, and the same applies for getElementByTagName and others:
Sub TestSth()
Dim xmlpage As New MSXML2.XMLHTTP60
Dim htmldoc As New MSHTML.HTMLDocument
Dim htmlas As Object, gist As Object
Dim htmla As Object, main As Object, lux As String
Dim x As Long, link As Object, thank As Object
Range("A1").Select
xmlpage.Open "GET", "https://itunes.apple.com/us/app/candy-crush-saga/id553834731?mt=8", False
xmlpage.send
htmldoc.body.innerHTML = xmlpage.responseText
Set xmlpage = Nothing
Set htmlas = htmldoc.getElementsByClassName("lockup-info")
For x = 0 To htmlas.Length - IIf(htmlas.Length > 0, 1, 0)
If Not htmlas(x) Is Nothing Then
If Not htmlas(x).getElementsByTagName("a") Is Nothing Then
If Not htmlas(x).getElementsByTagName("a")(0) Is Nothing Then
lux = htmlas(x).getElementsByTagName("a")(0).getAttribute("href")
If lux <> "" Then
xmlpage.Open "GET", lux, False
xmlpage.send
htmldoc.body.innerHTML = xmlpage.responseText
Set main = htmldoc.getElementsByClassName("intro")(1)
Set thank = main.getElementsByTagName("div")
For Each link In thank
ActiveCell.Value = link.innertext
ActiveCell.Offset(1, 0).Select
Next link
End If
End If
End If
End If
Next x
End Sub
This is the answer which fixes all the problems I was having:
Sub app_crawler()
Dim http As New XMLHTTP60, hdoc As New HTMLDocument, hdoc_one As New HTMLDocument
Dim elem As Object, post As Object, sstr As String
With http
.Open "GET", "https://itunes.apple.com/us/app/candy-crush-saga/id553834731?mt=8", False
.send
hdoc.body.innerHTML = .responseText
End With
For Each elem In hdoc.getElementsByClassName("lockup-info")
With elem.getElementsByTagName("li")(0).getElementsByTagName("a")
If .Length Then sstr = .Item(0).href
End With
With http
.Open "GET", sstr, False
.send
hdoc_one.body.innerHTML = .responseText
End With
For Each post In hdoc_one.getElementsByClassName("intro")
With post.getElementsByTagName("h1")
If .Length Then i = i + 1: Cells(i, 1) = .Item(0).innerText
End With
Next post
Next elem
End Sub

URL check in VBA error

I have code like below. And I receive error
"run-time error '-2146697211 *800c0005)'': the system cannot locate the resource specified"
I do not know how to solve it thanks in advance for any help. Line in which error is handled is httpRequest.send
Function pullSomeSite(urlcheck As String) As Boolean
Dim httpRequest As xmlhttp
Set httpRequest = New xmlhttp
Dim URL As String
URL = urlcheck
With httpRequest
.Open "POST", URL, False
.setRequestHeader "Content-Type", "application/x-www-form-urlencoded"
.send !!!!!!here code stops!!!!!!!
End With
With httpRequest
While Not .readyState = 4 '<---------- wait
Application.Wait Now + TimeValue("0:00:01")
Wend
'Debug.Print .Status
If .Status = 200 Then
While InStr(1, .responseText, "Updating", 0) > 0 '<---------- wait again
Application.Wait Now + TimeValue("0:00:01")
Wend
pullSomeSite = True
Else
pullSomeSite = False
End If
End With
End Function
Test this :
Sub Test_URLExists()
Dim url As String
url = "http://stackoverflow.com/questions/33940044/url-check-in-vba-error"
MsgBox url, vbInformation, URLExists(url)
url = "http://stackoverflow.com/questions/12345678/url-check-in-vba-error"
MsgBox url, vbInformation, URLExists(url)
End Sub
here is how to test an URL with a function :
Function URLExists(url As String) As Boolean
Dim Request As Object
Dim ff As Integer
Dim rc As Variant
URLExists = False
On Error GoTo EndNow
Set Request = CreateObject("WinHttp.WinHttpRequest.5.1")
With Request
.Open "GET", url, False
.Send
rc = .StatusText
End With
Set Request = Nothing
If rc = "OK" Then URLExists = True
Exit Function
EndNow:
End Function
Instead of xmlhttp data type use object.
Use the below code.
. You need to give input as "http://google.com"
Sub test1()
a = pullSomeSite("http://www.flipkart.com")
MsgBox a
End Sub
Function pullSomeSite(urlcheck As String) As Boolean
Dim httpRequest As Object
Set httpRequest = CreateObject("MSXML2.XMLHTTP")
'Set httpRequest = New xmlhttp
Dim URL As String
URL = urlcheck
With httpRequest
.Open "POST", URL, False
.setRequestHeader "Content-Type", "application/x-www-form-urlencoded"
.send '!!!!!!here code stops!!!!!!!
End With
With httpRequest
While Not .readyState = 4 '<---------- wait
Application.Wait Now + TimeValue("0:00:01")
Wend
'Debug.Print .Status
If .Status = 200 Then
While InStr(1, .responseText, "Updating", 0) > 0 '<---------- wait again
Application.Wait Now + TimeValue("0:00:01")
Wend
pullSomeSite = True
Else
pullSomeSite = False
End If
End With
End Function