VBA scrape src instead of href - vba

I am using the code below code but it brings the value of 'src' instead of 'href' for some reason. Anyone can help please?
Sub bringfox(txt As String)
Dim oHtml As HTMLDocument
Dim oElement As Object
Set oHtml = New HTMLDocument
maintext2 = "https://www.jjfox.co.uk/cigars/show/all.html"
With CreateObject("WINHTTP.WinHTTPRequest.5.1")
.Open "GET", maintext2 & gr, False
.send
oHtml.body.innerHTML = .responseText
End With
counter = cnt
'oElement(i).Children(0).getAttribute ("href")
Set oElement = oHtml.getElementsByClassName("products-grid products-grid--max-3-col")(0).getElementsByTagName("a")
i = 0
While i < oElement.Length
Debug.Print oElement(i).Children(0).getAttribute("href")
i = i + 1
Wend
End Sub

You could try using a CSS selector.
#wrapper div.category-products > ul a
This is a reduced version of the full selector that targets a tags within the products category. You then parse the outerHTML for the hrefs as that is where the information is located.
Site image (Sample view)
Output from code (Sample view)
Code
Option Explicit
Public Sub GetInfo()
Dim oHtml As HTMLDocument, nodeList As Object, currentItem As Long
Const URL As String = "https://www.jjfox.co.uk/cigars/show/all.html"
Set oHtml = New HTMLDocument
With CreateObject("WINHTTP.WinHTTPRequest.5.1")
.Open "GET", URL, False
.send
oHtml.body.innerHTML = .responseText
End With
Set nodeList = oHtml.querySelectorAll("#wrapper div.category-products > ul a")
For currentItem = 0 To nodeList.Length - 1
On Error Resume Next
Debug.Print Split(Split(nodeList.item(currentItem).outerHTML, "<A href=")(1), ">")(0)
On Error GoTo 0
Next currentItem
End Sub
Or more simply, use the following
For currentItem = 0 To nodeList.Length - 1
On Error Resume Next
Debug.Print nodeList.item(currentItem).href
On Error GoTo 0
Next currentItem

Related

Parsing website that doesnt fully load VBA

Trying my hand at a simple web parse, my problem is the page doesnt fully load until you scroll down. Google search has come up with possibly using selenium but as I have no idea how to use it I figured I would ask here
Code im using
Sub gfquote()
Dim oHttp As MSXML2.XMLHTTP
Dim sURL As String
Dim HTMLDoc As HTMLDocument
Dim dequote As String
Dim driver As New Webd
' Create an XMLHTTP object
Set oHttp = New MSXML2.XMLHTTP
Dim oElement As Object
' get the URL to open
sURL = "https://www.thevinylspectrum.com/siser-heat-transfer-vinyl/siser-easyweed/12in-x-59in-rolls/"
' open socket and get website html
oHttp.Open "GET", sURL, False
oHttp.send
Set HTMLDoc = New HTMLDocument
With HTMLDoc
' assign the returned text to a HTML document
.body.innerHTML = oHttp.responseText
dastring = oHttp.responseText
' parse the result
UserForm1.TextBox1.Text = dastring
Set prices = .getElementsByClassName("price product-price")
For Each oElement In prices
Sheets("Sheet1").Range("A" & i + 1) = prices(i).innerText
i = i + 1
Next oElement
End With
'Clean up
Set oHttp = Nothing
End Sub
Using selenium basic and using the technique by #Hubisan to handle lazy loading pages and scrolling until everything loaded:
Option Explicit
Public Sub GetNamesAndPrices()
Dim driver As New ChromeDriver, prevlen As Long, curlen As Long
Dim prices As Object, price As Object, name As Object, names As Object
Dim timeout As Long, startTime As Double
timeout = 10 ' set the timeout to 10 seconds
Application.ScreenUpdating = False
With driver
.get "https://www.thevinylspectrum.com/siser-heat-transfer-vinyl/siser-easyweed/12in-x-59in-rolls/"
prevlen = .FindElementsByCss(".price.product-price").Count
startTime = Timer ' set the initial starting time
Do
.ExecuteScript ("window.scrollTo(0, document.body.scrollHeight);")
Set prices = .FindElementsByCss(".price.product-price")
Set names = .FindElementsByCss(".product-name")
curlen = prices.Count
If curlen > prevlen Then
startTime = Timer
prevlen = curlen
End If
Loop While Round(Timer - startTime, 2) <= timeout
Dim r As Long
With ActiveSheet
For Each name In names
r = r + 1: .Cells(r, 1) = name.Text
Next
r = 0
For Each price In prices
r = r + 1: .Cells(r, 2) = price.Text
Next
End With
End With
Application.ScreenUpdating = True
End Sub
Some example output:

Run time error 91, object variable or with block variable not set

My intention is to scrape all the app name from that page and the app link leading to the next page. However, when i run it, i see that after looping once it produces the following error "Run time error 91, object variable or with block variable not set".Here is the full code. Any help would really be appreciated. Thanks in advance.
Sub app_crawler()
Dim xmlpage As New XMLHTTP60, htmldoc As New HTMLDocument
Dim htmlas As Object, htmla As Object, sstr As String
xmlpage.Open "GET", "https://itunes.apple.com/us/app/candy-crush-saga/id553834731?mt=8", False
xmlpage.send
htmldoc.body.innerHTML = xmlpage.responseText
For Each htmlas In htmldoc.getElementsByClassName("lockup-info")(0).getElementsByTagName("a")
sstr = htmlas.href
xmlpage.Open "GET", sstr, False
xmlpage.send
htmldoc.body.innerHTML = xmlpage.responseText
For Each htmla In htmldoc.getElementsByClassName("intro")(1).getElementsByTagName("h1")
x = x + 1: Cells(x, 1) = htmla.innerText
Next htmla
Next htmlas
End Sub
As Mat's Mug commented, you'll have to test if htmlas(x) returns Nothing before getting elements from it, and the same applies for getElementByTagName and others:
Sub TestSth()
Dim xmlpage As New MSXML2.XMLHTTP60
Dim htmldoc As New MSHTML.HTMLDocument
Dim htmlas As Object, gist As Object
Dim htmla As Object, main As Object, lux As String
Dim x As Long, link As Object, thank As Object
Range("A1").Select
xmlpage.Open "GET", "https://itunes.apple.com/us/app/candy-crush-saga/id553834731?mt=8", False
xmlpage.send
htmldoc.body.innerHTML = xmlpage.responseText
Set xmlpage = Nothing
Set htmlas = htmldoc.getElementsByClassName("lockup-info")
For x = 0 To htmlas.Length - IIf(htmlas.Length > 0, 1, 0)
If Not htmlas(x) Is Nothing Then
If Not htmlas(x).getElementsByTagName("a") Is Nothing Then
If Not htmlas(x).getElementsByTagName("a")(0) Is Nothing Then
lux = htmlas(x).getElementsByTagName("a")(0).getAttribute("href")
If lux <> "" Then
xmlpage.Open "GET", lux, False
xmlpage.send
htmldoc.body.innerHTML = xmlpage.responseText
Set main = htmldoc.getElementsByClassName("intro")(1)
Set thank = main.getElementsByTagName("div")
For Each link In thank
ActiveCell.Value = link.innertext
ActiveCell.Offset(1, 0).Select
Next link
End If
End If
End If
End If
Next x
End Sub
This is the answer which fixes all the problems I was having:
Sub app_crawler()
Dim http As New XMLHTTP60, hdoc As New HTMLDocument, hdoc_one As New HTMLDocument
Dim elem As Object, post As Object, sstr As String
With http
.Open "GET", "https://itunes.apple.com/us/app/candy-crush-saga/id553834731?mt=8", False
.send
hdoc.body.innerHTML = .responseText
End With
For Each elem In hdoc.getElementsByClassName("lockup-info")
With elem.getElementsByTagName("li")(0).getElementsByTagName("a")
If .Length Then sstr = .Item(0).href
End With
With http
.Open "GET", sstr, False
.send
hdoc_one.body.innerHTML = .responseText
End With
For Each post In hdoc_one.getElementsByClassName("intro")
With post.getElementsByTagName("h1")
If .Length Then i = i + 1: Cells(i, 1) = .Item(0).innerText
End With
Next post
Next elem
End Sub

vba htmldocument How can I delete a specific element

I am writing the code in VBA.
How can I delete a specific element?
Thank you
Dim html As MSHTML.HTMLDocument
Set html = New MSHTML.HTMLDocument
Dim document As MSHTML.HTMLDocument
Set document = html.createDocumentFromUrl("http://example.com/", vbNullString)
' wait download
Do While document.readyState <> "complete"
Loop
' Insert I Can
Call document.DocumentElement.insertAdjacentHTML("afterend", "<div>test</div>")
' I want to delete a specific element here But I fail
' Call document.removeNode("specific element")
You cant remove elements, but you can remove nodes, which is more or less the same.
Function DelTagById(strData As String, strID As String) As String
On Local Error GoTo MyError
Dim HTMLDoc As HTMLDocument
Dim Node As IHTMLDOMNode
DelTagById = strData
If strID = "" Then GoTo MyExit
Set HTMLDoc = New HTMLDocument
HTMLDoc.body.innerHTML = strData
Set Node = HTMLDoc.getElementById(strID)
If Node Is Nothing Then GoTo MyExit
Node.parentNode.removeChild Node
DelTagById = HTMLDoc.body.innerHTML
MyExit:
Set Node = Nothing
Set HTMLDoc = Nothing
Exit Function
MyError:
'Handle Error
Resume MyExit
End Function

Can't get to value inside XML spanclass

I'm trying to retrieve a value in one of the spanclasses in an online XML-file.
File: http://www.ecb.europa.eu/stats/eurofxref/eurofxref-daily.xml
I want to get to the USD rate but my code does not seem to loop through and span-classes, where is my mistake?
My code
Function response_Text(url As String) ' get the responsetext from an xml request
Dim xml As Object
Set xml = CreateObject("MSXML2.XMLHTTP")
With xml
.Open "Get", url, False
.send
response_Text = .responsetext
End With
Set xml = Nothing
End Function
Private Sub find_ClassElement(HTML_doc As MSHTML.HTMLDocument) ' return value inside span_class
Dim ticker As Variant
Dim XML_elements As MSHTML.IHTMLElementCollection
Dim XML_spanclass As MSHTML.HTMLSpanElement
Dim XML_targetElement As MSHTML.HTMLLIElement
Set XML_elements = HTML_doc.getElementsByClassName("line") **<--- something seems to be wrong here, the code does not loop through any span_classes after this point as intended ( the for statement is not being executed )**
For Each XML_spanclass In XML_elements
If InStr(XML_spanclass.innerHTML, "USD") > 0 Then
Debug.Print "success"
Set XML_targetElement = XML_spanclass.parentElement
Debug.Print CSng(XML_targetElement.getElementsByClassName("webkit-html-attribute-value")(0).innerHTML)
End If
Next
End Sub
Private Sub run() ' run the whole operation
Dim http_req As http_req: Set http_req = New http_req
Dim xml As MSHTML.HTMLDocument: Set xml = New MSHTML.HTMLDocument
Dim url As String: url = "http://www.ecb.europa.eu/stats/eurofxref/eurofxref-daily.xml"
xml.body.innerHTML = http_req.response_Text(url)
Call find_ClassElement(xml)
End Sub
There are no tags with a class of "line" so your collection is empty - nothing to loop through. Here's another way
Sub GetUSD()
Dim xHttp As MSXML2.XMLHTTP
Dim xDoc As MSXML2.DOMDocument
Dim xCube As MSXML2.IXMLDOMElement
Dim xCubes As MSXML2.IXMLDOMSelection
Dim sCurrency As String
'load the xml document
Set xDoc = New MSXML2.DOMDocument
xDoc.Load "http://www.ecb.europa.eu/stats/eurofxref/eurofxref-daily.xml"
'wait until it's completely loaded
Do
DoEvents
Loop Until xDoc.readyState = 4
'get all the cube tags
Set xCubes = xDoc.getElementsByTagName("Cube")
For Each xCube In xCubes
'some cube tags don't have attributes
On Error Resume Next
sCurrency = xCube.Attributes(0).Text
On Error GoTo 0
'if the first attribute is USD, get the second attribute
If sCurrency = "USD" Then
Debug.Print xCube.Attributes(1).Text
End If
Next xCube
End Sub
edit
I don't know xpath well enough to do this properly, but this works.
Sub GetUSD()
Dim xDoc As MSXML2.DOMDocument60
Dim xCube As MSXML2.IXMLDOMNode
Dim xCubes As MSXML2.IXMLDOMNodeList
Dim sCurrency As String
'load the xml document
Set xDoc = New MSXML2.DOMDocument60
xDoc.Load "http://www.ecb.europa.eu/stats/eurofxref/eurofxref-daily.xml"
'wait until it's completely loaded
Do
DoEvents
Loop Until xDoc.readyState = 4
'get all the cube tags
Set xCubes = xDoc.SelectNodes("//*")
For Each xCube In xCubes
On Error Resume Next
sCurrency = xCube.Attributes(0).NodeValue
On Error GoTo 0
'if the first attribute is USD, get the second attribute
If sCurrency = "USD" Then
Debug.Print xCube.Attributes(1).NodeValue
End If
Next xCube
End Sub

Query div element by class name using Excel VBA

I'm trying to get the data to Excel of a div element with a specific class name, like:
<div class="myClass">
<span>Text1</span>
<span>Text2</span>
</div>
My VBA code:
Sub GetData()
Dim oHtml As HTMLDocument
Dim oElement As Object
Set oHtml = New HTMLDocument
With CreateObject("WINHTTP.WinHTTPRequest.5.1")
.Open "GET", "http://www.example.com/", False
.send
oHtml.body.innerHTML = .responseText
End With
For Each oElement In oHtml.getElementsByClassName("myClass")
Debug.Print oElement.Children(0).src
Next oElement
End Sub
This is returning the error:
Run-time error: '438': Object doesn't support this property or method. The error is on line Debug.Print ...
I have activated the following refereces:
Microsoft HTML Object Library
Microsoft Internet Controls
I want to be able to select the text on the first or second span and paste it on a cell, how can I do this?
This worked for me:
Dim oHtml As HTMLDocument
Dim oElement As Object
Set oHtml = New HTMLDocument
With CreateObject("WINHTTP.WinHTTPRequest.5.1")
.Open "GET", "http://www.example.com", False
.send
oHtml.body.innerHTML = .responseText
End With
Set dados = oHtml.getElementsByClassName("myClass")(0).getElementsByTagName("span")
i = 0
For Each oElement In dados
Sheets("Sheet1").Range("A" & i + 1) = dados(i).innerText
i = i + 1
Next oElement
Here's how I've done it in the past:
Set oElement = oHtml.getElementsByClassName("myClass")
i = 0
While i < oElement.Length
Debug.Print oElement(i).innerText
i = i + 1
Wend
you might want innerHtml instead?