I am trying to scrape a table from a website via VBA. I am able to gather all the data from the table however I am only able to get it to populate a single cell, rather than break out the information as formatted on the website. Essentially the data should be in four columns and then go down as many rows as there is data. I know I'm missing something simple here and I just can't put my finger on it. Help please! coding is below:
Sub WebScrape()
Dim objIE As InternetExplore'
Dim ele As Object
Dim y As Integer
objIE.navigate "http://www.uscfinvestments.com/holdings/usci"
Do While objIE.Busy = True Or objIE.readyState <> 4: DoEvents: Loop
y = 1
For Each ele In objIE.document.getElementById("holdingsTableDiv").getElementsByTagName("table")
Debug.Print ele.textContent
Sheets("Sheet1").Range("A" & y).Value = ele.Children(0).textContent
Next
ActiveWorkbook.Save
End Sub
I show a number of methods below. Personally, I prefer the last which uses an API.
Using clipboard:
A nice easy way, if you want to appear as on page, is to simply copy the table to clipboard and paste
Option Explicit
Public Sub GetInfo()
Dim ie As InternetExplorer, hTable As Object, clipboard As Object, ws As Worksheet, t As Date
Const MAX_WAIT_SEC As Long = 10
Set ws = ThisWorkbook.Worksheets("Sheet1")
Set clipboard = GetObject("New:{1C3B4210-F441-11CE-B9EA-00AA006B1A69}")
Set ie = New InternetExplorer
With ie
.Visible = True
.Navigate2 "http://www.uscfinvestments.com/holdings/usci"
While .Busy Or .readyState < 4: DoEvents: Wend
t = Timer
Do
On Error Resume Next
Set hTable = .document.getElementById("holdingsTableID")
On Error GoTo 0
If Timer - t > MAX_WAIT_SEC Then Exit Do
Loop While hTable Is Nothing
If Not hTable Is Nothing Then
clipboard.SetText hTable.outerHTML
clipboard.PutInClipboard
ws.Cells(1, 1).PasteSpecial
End If
.Quit
End With
End Sub
Looping table:
If you want to loop rows and columns of a table then you can use row class name and row number to determine how to write out
Option Explicit
Public Sub GetInfo()
Dim ie As InternetExplorer, hTable As Object, ws As Worksheet, t As Date
Const MAX_WAIT_SEC As Long = 10
Set ws = ThisWorkbook.Worksheets("Sheet1")
Set ie = New InternetExplorer
With ie
.Visible = True
.Navigate2 "http://www.uscfinvestments.com/holdings/usci"
While .Busy Or .readyState < 4: DoEvents: Wend
t = Timer
Do
On Error Resume Next
Set hTable = .document.getElementById("holdingsTableID")
On Error GoTo 0
If Timer - t > MAX_WAIT_SEC Then Exit Do
Loop While hTable Is Nothing
If Not hTable Is Nothing Then
WriteTable hTable, 1, ws
End If
.Quit
End With
End Sub
Public Sub WriteTable(ByVal hTable As HTMLTable, Optional ByVal startRow As Long = 1, Optional ByVal ws As Worksheet)
If ws Is Nothing Then Set ws = ActiveSheet
Dim tRow As Object, td As Object, r As Long, c As Long, th As Object
r = startRow
With ws
For Each tRow In hTable.getElementsByTagName("tr")
c = 1
If r = startRow Or tRow.className = "subHeader" Then
For Each th In tRow.getElementsByTagName("th")
.Cells(r, c) = th.innerText
c = c + 1
Next
Else
For Each td In tRow.getElementsByTagName("td")
.Cells(r, c).Value = td.innerText
c = c + 1
Next td
End If
r = r + 1
Next tRow
End With
End Sub
API:
There is an API which serves the data in a json format
https://cssecure.alpsinc.com/api/v1//holding/usci
It requires authentication. I am using jsonconverter.bas to parse the json returned. After downloading and adding the .bas you need to go VBE > Tools > References > Add a reference to Microsoft Scripting Runtime.
Option Explicit
Public Sub GetValues()
Dim json As Object, authorization As String
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "http://www.uscfinvestments.com/uscfinvestments-template/assets/javascript/api_key.php", False
.send
authorization = Split(Split(.responseText, "'")(1), "'")(0)
.Open "GET", "https://cssecure.alpsinc.com/api/v1//holding/usci", False
.setRequestHeader "Authorization", authorization
.send
Set json = JsonConverter.ParseJson(.responseText)
End With
Dim arr(), headers(), item As Object, r As Long
headers = Array("Security", "Quantity", "Price", "Market Value")
r = 1
ReDim arr(1 To json.Count, 1 To 4)
For Each item In json
arr(r, 1) = item("name")
arr(r, 2) = item("shares")
Dim test As String
If IsNull(item("contractprice")) Then
arr(r, 3) = item("settlementprice")
Else
arr(r, 3) = item("contractprice")
End If
arr(r, 4) = item("marketvalue")
r = r + 1
Next
With ThisWorkbook.Worksheets("Sheet1")
.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
.Cells(2, 1).Resize(UBound(arr, 1), UBound(arr, 2)) = arr
End With
End Sub
There is nothing much to add here. QHarr has already covered them. I used hardcoded delay instead of explicit wait to make the script less verbose.
Sub GetContent()
Dim Html As HTMLDocument, elem As Object, tRow As Object, C&, R&
With New InternetExplorer
.Visible = False
.navigate "http://www.uscfinvestments.com/holdings/usci"
While .Busy = True Or .readyState < 4: DoEvents: Wend
Set Html = .Document
End With
Application.Wait Now + TimeValue("00:00:03") 'make it 05 if it fails somehow
For Each elem In Html.getElementById("holdingsTableID").Rows
For Each tRow In elem.Cells
C = C + 1: ThisWorkbook.Worksheets("Sheet1").Cells(R + 1, C) = tRow.innerText
Next tRow
C = 0: R = R + 1
Next elem
End Sub
Related
Sub Web_Table_Option_Two()
Dim HTMLDoc As New HTMLDocument
Dim objTable As Object
Dim lRow As Long
Dim lngTable As Long
Dim lngRow As Long
Dim lngCol As Long
Dim ActRw As Long
Dim objIE As InternetExplorer
Set objIE = New InternetExplorer
objIE.navigate "https://finviz.com/screener.ashx?v=152"
Do Until objIE.readyState = 4 And Not objIE.Busy
DoEvents
Loop
Application.Wait (Now + TimeValue("0:00:03")) 'wait for java script to load
HTMLDoc.body.innerHTML = objIE.document.body.innerHTML
With HTMLDoc.body
Set objTable = .getElementsByTagName("table")
For lngTable = 0 To objTable.Length - 1
For lngRow = 0 To objTable(lngTable).Rows.Length - 1
For lngCol = 0 To objTable(lngTable).Rows(lngRow).Cells.Length - 1
ThisWorkbook.Sheets("Sheet1").Cells(ActRw + lngRow + 1, lngCol + 1) = objTable(lngTable).Rows(lngRow).Cells(lngCol).innerText
Next lngCol
Next lngRow
ActRw = ActRw + objTable(lngTable).Rows.Length + 1
Next lngTable
End With
objIE.Quit
End Sub
`
With the code above I am trying to get the Stock Screener data from the website in the code but the table isn't labeled in HTML code so I am not sure how I am supposed to grab this info. Currently it is grabbing everything on the screen.
For just the bottom table info you can use the following and target the tbody tag collection, and the required index within, to avoid all the unwanted fluff that comes with table selection.
I would use XMLHTTP request as faster. The appropriate index changes between the two methods.
XMLHTTP request:
Option Explicit
Public Sub GetTable()
Dim sResponse As String, html As HTMLDocument, hTable As Object
Set html = New HTMLDocument
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://finviz.com/screener.ashx?v=152", False
.setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
.send
sResponse = StrConv(.responseBody, vbUnicode)
End With
With html
.body.innerHTML = sResponse
Set hTable = .getElementsByTagName("tbody")(9)
WriteTable hTable, 1, ThisWorkbook.Worksheets("Sheet1")
End With
End Sub
Public Sub WriteTable(ByVal hTable As Object, Optional ByVal startRow As Long = 1, Optional ByVal ws As Worksheet)
If ws Is Nothing Then Set ws = ActiveSheet
Dim tRow As Object, tCell As Object, tr As Object, td As Object, r As Long, c As Long
r = startRow
With ws
Set tRow = hTable.getElementsByTagName("tr")
For Each tr In tRow
r = r + 1: c = 1
Set tCell = tr.getElementsByTagName("td")
For Each td In tCell
.Cells(r, c).Value = td.innerText
c = c + 1
Next td
Next tr
End With
End Sub
Internet Explorer (using WriteTable sub from above):
Option Explicit
Public Sub GetInfo()
Dim IE As New InternetExplorer, hTable As Object
With IE
.Visible = True
.navigate "https://finviz.com/screener.ashx?v=152"
While .Busy Or .readyState < 4: DoEvents: Wend
Set hTable = .document.getElementsByTagName("tbody")(13)
WriteTable hTable, 1, ThisWorkbook.Worksheets("Sheet1")
.Quit
End With
End Sub
Output:
References (VBE > Tools > References):
Microsoft Internet Controls
Microsoft HTML Object Library
I've been strugling with this issue, and trying to find a solution here in StackOverflow, but nothing helped.
I have thousands of links of images (Column A), that will get you to the final JPG url. It's not a redirect link because I've tried with different websites and it doesn't detect it.
Here is an example:
https://www.pepperl-fuchs.com/global/en/doci.htm?docilang=ENG&view=showproductpicbypartno&partno=000046
It will get you here:
https://files.pepperl-fuchs.com/webcat/navi/productInfo/pd/d428540a.jpg
So I would like to extrapolate all the final links in Column B.
I found some code that opens IE for each link, but it probably misses the function to copy the URL and paste it in the cell:
Sub Test()
Dim IE As Object
Dim URL As Range
Dim objDocument As Object
Dim x As Integer
Set IE = CreateObject("InternetExplorer.Application")
With IE
.Visible = True
For Each URL In Range("A2:A16")
.Navigate URL.Value
While .busy Or .ReadyState <> 4: DoEvents: Wend
If LCase(TypeName(objDocument)) = "htmldocument" Then
Cells(A, 1).Value = objDocument.URL
Cells(A, 2).Value = objDocument.Title
x = x + 1
End If
Next
End With
End Sub
Can you guys help me figure out what is missing? Unfortunately I'm not really familiar with VBA.
Thank you very much
Try this
Sub Test()
Dim IE As Object
Dim URL As Range
Dim objDocument As Object
Dim x As Integer
Set IE = CreateObject("InternetExplorer.Application")
With IE
.Visible = True
For Each URL In Range("A2:A16")
.Navigate URL.Value
While .busy Or .ReadyState <> 4: DoEvents: Wend
URL.Offset(, 1).Value = .LocationURL
Next
End With
End Sub
Try this code
Sub Test()
Dim html As HTMLDocument
Dim ie As Object
Dim objDocument As Object
Dim url As Range
Dim x As Integer
Set ie = CreateObject("InternetExplorer.Application")
x = 1
With ie
.Visible = True
For Each url In Range("A2:A3")
.navigate url.Value
While .Busy Or .readyState <> 4: DoEvents: Wend
Set html = .document
x = x + 1
Cells(x, 2).Value = html.url
Cells(x, 3).Value = html.Title
Next url
End With
End Sub
Need more to test with but this will be a lot faster and if you can easily adapt to using an array to loop faster than looping sheet by using Dim arr(): arr = .Range("A1:A" & .Cells(.Rows.Count, "A").End(xlUp).Row).Value and looping the first dimension.
Option Explicit
Public Sub GetInfo()
Dim rng As Range
With Worksheets("Sheet1")
For Each rng In .Range("A1:A" & .Cells(.Rows.Count, "A").End(xlUp).Row)
If InStr(rng.Value, "http") > 0 Then Debug.Print GetURL(rng.Value)
Next
End With
End Sub
Public Function GetURL(ByVal url As String) As String
Dim sResponse As String, s As Long, e As Long
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", url, False
.send
sResponse = StrConv(.responseBody, vbUnicode)
End With
s = InStr(1, sResponse, "https")
e = InStr(1, sResponse, ".jpg") + 4
GetURL = Mid(sResponse, s, e - s)
End Function
This does assume that all your links follow the same pattern as the first.
I currently have the following coding for a set of data on an excel spreadsheet:
Sub DemoMutual2()
Dim URL As String
Dim ieDoc As Object, dObj As Object
Dim cel As Range
URL = Range("G2").Value
With CreateObject("InternetExplorer.Application")
.Visible = False
.Navigate URL
Do Until .ReadyState = 4: DoEvents: Loop
Set ieDoc = .Document
Set dObj = ieDoc.getElementsByClassName("_50f3")
[K2].Value = Split(ieDoc.getElementsByClassName("_50f3")(0).innerText, " ")(0)
For i = 0 To dObj.Length - 1
If InStr(1, dObj(i).getElementsByTagName("a")(0).innerText, "since") Then
[M2].Value = Trim(Split(dObj(i).getElementsByTagName("a")(0).innerText, "since")(1))
End If
Next
.Quit
End With
Set ieDoc = Nothing
Set dObj = Nothing
End Sub
At present this only works on one row, based on the URL in cell G2. What I want this code to do is run for all of the URLs in column G (G2,G3,G4 etc) and return the required results in the cells specified in the code (K2, K3, K4 etc and M2, M3, M4 etc).
So to search URL in G2 and return results in K2 and M2
then G3, results in K3 and M3 and so on.
At present this works perfectly for the one row, but I am struggling to change it so I can do it for multiple rows.
Any help is much appreciated.
How does this work?
Sub DemoMutual2()
Dim URL As String
Dim ieDoc As Object, dObj As Object
Dim cel As Range
Dim lastRow As Long, i As Long, iRow As Long
lastRow = Cells(Rows.Count, 7).End(xlUp).Row
For iRow = 2 To lastRow
URL = Range("G" & iRow).Value
With CreateObject("InternetExplorer.Application")
.Visible = False
.Navigate URL
Do Until .ReadyState = 4: DoEvents: Loop
Set ieDoc = .Document
Set dObj = ieDoc.getElementsByClassName("_50f3")
Cells(iRow, 11).Value = Split(ieDoc.getElementsByClassName("_50f3")(0).innerText, " ")(0)
For i = 0 To dObj.Length - 1
If InStr(1, dObj(i).getElementsByTagName("a")(0).innerText, "since") Then
Cells(iRow, 13).Value = Trim(Split(dObj(i).getElementsByTagName("a")(0).innerText, "since")(1))
End If
Next i
.Quit
End With
Set ieDoc = Nothing
Set dObj = Nothing
Next iRow
End Sub
Note: It may be better to move the IE.Quit and Set ieDoc = Nothing lines to outside the whole loop for performance, but this should work if you don't have too many rows.
I'm trying to get the innertext of 'wfm-bodyText'. So, I want to loop through items on a webpage and import, in this case, 'H. Validate Gamer Correct'
I thought the script below would work, but it actually gives me all kinds of extra text!!
Public Sub CopyFromURL()
Dim IE As InternetExplorer, doc As HTMLDocument
Dim thisClass As IHTMLElement2, thisLink As IHTMLElement
Dim rng As Range, cell As Range
Const READYSTATE_COMPLETE As Integer = 4
Dim TR_col As Object, TR As Object
Dim TD_col As Object, TD As Object
Dim row As Long, col As Long
row = 1
'Set rng = Range("A1:A5")
'For Each cell In rng
Set IE = CreateObject("InternetExplorer.Application")
IE.Navigate URL
Do While (IE.Busy Or IE.ReadyState <> READYSTATE_COMPLETE)
DoEvents
Loop
Set TR_col = IE.Document.getElementsByTagName("TR")
For Each TR In TR_col
Set TD_col = TR.getElementsByTagName("TD")
row = ActiveSheet.Cells(ActiveSheet.Rows.Count, "A").End(xlUp).row
col = 1
For Each TD In TD_col
Cells(row, col) = TD.innerText
col = col + 1
Next
row = row + 1
Next
'Next cell
IE.Quit
End Sub
The URL is behind a firewall. I can't share it.
Again, below is the structure of the page. I want to copy the 'E. Verify phone number if applicable' into my sheet.
I tried the script below in an attempt to get every element from the page.
Sub DumpData()
Set IE = CreateObject("InternetExplorer.Application")
IE.Visible = True
URL = "http://www.SharePoint.aspx"
'Wait for site to fully load
IE.Navigate2 URL
Do While IE.Busy = True
DoEvents
Loop
RowCount = 1
With Sheets("Sheet1")
.Cells.ClearContents
RowCount = 1
For Each itm In IE.document.all
.Range("A" & RowCount) = itm.tagname
.Range("B" & RowCount) = itm.ID
.Range("C" & RowCount) = itm.classname
.Range("D" & RowCount) = Left(itm.innertext, 1024)
RowCount = RowCount + 1
Next itm
End With
End Sub
I still can't tell where the relevant data is coming from.
Sub GetAllLinks()
Dim IE As Object
Set IE = CreateObject("InternetExplorer.Application")
IE.Visible = True
url_name = "http://www.trovanumeri.com/?azione=cerca&cerca=portoscuso"
If url_name = "" Then Exit Sub
IE.navigate url_name
Do
DoEvents
Loop Until IE.readyState = 4
'<a href="http://google.com">Click Here
Set AllHyperLinks = IE.document.getElementsByTagName("A")
Foglio1.ListBox1.Clear
'this is the code
For Each hyper_link In AllHyperLinks
Foglio1.ListBox1.AddItem hyper_link
Next
MsgBox "Done!"
End Sub
Take a look at the below example, showing how to retrieve the data from via IE automation and DOM processing:
Option Explicit
Sub GetData()
Dim oIE As Object
Dim lCurRow As Long
Dim lResultIndex As Long
Dim sUrl As String
Dim oTable0 As Object
Dim oTable1 As Object
Dim oTable2 As Object
Dim oTable3 As Object
Dim sTbl3Text As String
Dim cAncorNodes As Object
Dim oAncorNode As Variant
Dim sRowText As String
Dim oRowNode As Object
Dim aData() As Variant
Set oIE = CreateObject("InternetExplorer.Application")
With oIE
.Visible = True
Sheets(1).Cells.Delete
lCurRow = 1 ' Worksheet rows counter
lResultIndex = 0 ' Search result index counter
Do
' Navigate to the page
sUrl = "http://www.trovanumeri.com/?azione=cerca&cerca=cagliari&da=" & lResultIndex
.Navigate sUrl
' Wait IE
Do While .ReadyState < 3 Or .Busy
DoEvents
Loop
' Wait Document
Do Until .Document.ReadyState = "complete"
DoEvents
Loop
' Retrieve target tables
Set oTable0 = .Document.getElementsByTagName("table")(0)
Set oTable1 = oTable0.getElementsByTagName("table")(1)
Set oTable2 = oTable1.getElementsByTagName("table")(3)
' Get and process ancor nodes
Set cAncorNodes = oTable2.getElementsByTagName("a")
For Each oAncorNode In cAncorNodes
With CreateObject("Scripting.Dictionary")
' Add .href to result
.Add .Count, oAncorNode.href
' Get ancor's parent row
Set oRowNode = oAncorNode.ParentNode.ParentNode.ParentNode
Do
' Add nonemtpy row to result
sRowText = Trim(Replace(oRowNode.innerText, vbCrLf, ""))
If sRowText <> "" Then .Add .Count, sRowText
' If last row then exit
If IsNull(oRowNode.nextElementSibling) Then Exit Do
' Proceed with next row
Set oRowNode = oRowNode.nextElementSibling
' If net row contains oAncorNode then exit
If oRowNode.getElementsByTagName("a").Length > 0 Then Exit Do
DoEvents
Loop
' Get results as array
aData = .Items
End With
' Output array to worksheet row
With Sheets(1).Cells(lCurRow, 1)
.Resize(1, UBound(aData) + 1) = aData
.Select
End With
lCurRow = lCurRow + 1
DoEvents
Next
' Get table containing 'Next' button
Set oTable3 = oTable0.getElementsByTagName("table")(7)
sTbl3Text = oTable3.innerText
' If no 'Next' button then exit
If InStr(sTbl3Text, "Avanti >>") = 0 Then Exit Do
lResultIndex = lResultIndex + 10
DoEvents
Loop
.Quit
End With
End Sub