Data Extraction from Webpage to Excel using VBA - vba

I was trying to pull a table from Webpage, so far i was successful pull a table from Webpage, unfortunately i have some links in each row of table, when i pulled the table from Webpage, i am getting output without link, just text, is there any way we can pull table from webpage using VBA including hyperlinks.
Here is my code:
Sub TableExample()
Dim IE As Object
Dim doc As Object
Dim strURL As String
strURL = "HERE I USED MY URL"
' replace with URL of your choice
Set IE = CreateObject("InternetExplorer.Application")
With IE
'.Visible = True
.Navigate strURL
Do Until .readyState = 4: DoEvents: Loop
Do While .Busy: DoEvents: Loop
Set doc = IE.Document
GetAllTables doc
.Quit
End With
End Sub
Sub GetAllTables(doc As Object)
' get all the tables from a webpage document, doc, and put them in a new worksheet
Dim ws As Worksheet
Dim rng As Range
Dim tbl As Object
Dim rw As Object
Dim cl As Object
Dim tabno As Long
Dim nextrow As Long
Dim I As Long
Set ws = Worksheets.Add
For Each tbl In doc.getElementsByTagName("TABLE")
tabno = tabno + 1
nextrow = nextrow + 1
Set rng = ws.Range("B" & nextrow)
rng.Offset(, -1) = "Table " & tabno
For Each rw In tbl.Rows
For Each cl In rw.Cells
rng.Value = cl.outerText
Set rng = rng.Offset(, 1)
I = I + 1
Next cl
nextrow = nextrow + 1
Set rng = rng.Offset(1, -I)
I = 0
Next rw
Next tbl
ws.Cells.ClearFormats
End Sub

When you execute "rng.Value = cl.outerText" you get only text. If you need to have all links and other html - please use innerHTML property.
Just replace "rng.Value = cl.outerText" to "rng.Value = cl.innerHTML". This will return entire html with links ;)

Related

VBA WEB scraping loop

This is the code I wrote I need it to loop when it copy and paste the table and then click the next button till there no more next button , the next button is a tagname "a" and it has a "href" that I used to click the hyper link
Any help would be appreciated thanks
Sub GetData()
'define variables
Dim i As SHDocVw.InternetExplorer
Set i = New InternetExplorer
i.Visible = True
Dim IE As Object, obj As Object
Dim r As Long, c As Long, t As Long
Dim elemCollection As Object
Dim eRow As Long
Dim HTMLDoc As MSHTML.HTMLDocument
Dim HTMLInput As MSHTML.IHTMLElement
Dim HTMLAs As MSHTML.IHTMLElementCollection
Dim HTMLA As MSHTML.IHTMLElement
url_name = Sheet2.Range("e4")
If url_name = "" Then Exit Sub
i.Visible = True
i.navigate (url_name)
Do While i.readyState <> READYSTATE_COMPLETE
Loop
'we ensure that the web page is downloaded completely
ThisWorkbook.Sheets("Sheet1").Range("a2:ai1000").ClearContents
Set elemCollection = i.document.getElementsByTagName("TABLE")
For t = 0 To (elemCollection.Length - 1)
For r = 0 To (elemCollection(t).Rows.Length - 1)
eRow = Sheet1.Cells(Rows.Count, 1).End(xlUp).Offset(1, 0).Row
For c = 0 To (elemCollection(t).Rows(r).Cells.Length - 1)
ThisWorkbook.Worksheets(1).Cells(eRow, c + 1) = elemCollection(t).Rows(r).Cells(c).innerText
Next c
Next r
Next t
Range("a1:ai1000").Columns.AutoFit
'Clean up Memory
Set IE = Nothing
Set HTMLDoc = i.document
Set HTMLAs = HTMLDoc.getElementsByTagName("a")
For Each HTMLA In HTMLAs
'Debug.Print HTMLA.getAttribute("classname"), HTMLA.getAttribute("href")
If HTMLA.getAttribute("classname") = "button2 next" And HTMLA.getAttribute("href") = "https://stathead.com/football/pgl_finder.cgi?request=1&game_num_max=99&week_num_max=99&order_by=all_td&match=game&season_start=1&year_max=2020&qb_gwd=0&qb_comeback=0&season_end=-1&game_type=R&age_max=99&year_min=2020&offset=100" Then
HTMLA.Click
Exit For
End If
Next
End Sub

Data Scraping from Website - receiving error on different player pages

I'm trying to data scrape game logs from basketball reference.com. It worked perfectly on two players I choose (demar derozan and lamarcus aldridge). But then I started going through other players and it just wouldn't scrape the data for many of the other players (Kevin Durant).
I have NO Idea why it wouldn't work. For example, I tried Stephen Curry and it worked fine, but players like Draymond Green and Kevin Durant, the code would just not scrape the data at all. For some reason after the column Date, everything stopped working.
Sub Data()
Dim ieObj As InternetExplorer
Dim htmlEle As IHTMLElement
Dim i As Integer
i = 1
Set ieObj = New InternetExplorer
ieObj.Visible = True
ieObj.navigate "https://www.basketball-reference.com/players/d/duranke01/gamelog/2019"
Application.ScreenUpdating = False
Application.DisplayStatusBar = True
Application.Wait Now + TimeValue("00:00:05")
For Each htmlEle In ieObj.document.getElementsByClassName("stats_table")(0).getElementsByTagName("tr")
With ActiveSheet
.Range("A" & i).Value = htmlEle.Children(0).textContent
.Range("B" & i).Value = htmlEle.Children(1).textContent
.Range("C" & i).Value = htmlEle.Children(2).textContent
.Range("D" & i).Value = htmlEle.Children(3).textContent
End With
i = i + 1
On Error Resume Next
Next htmlEle
End Sub
The error always happens on this line:
.Range("D" & i).Value = htmlEle.Children(3).textContent
I tried skipping columns to but it still wouldn't work.
I found no problem with using id for table, as seen in .responseText and using xmlhttp over browser.
I only tested with 3 urls - in sheet 1 A1:A3
https://www.basketball-reference.com/players/d/duranke01/gamelog/2019
https://www.basketball-reference.com/players/c/curryst01/gamelog/2019
https://www.basketball-reference.com/players/g/greendr01/gamelog/2019
With this site sometimes you can encounter tables inside comments so I stripped out the comment tags before processing. This was not necessary for the links I tried.
I use clipboard to copy paste but you could just use
Set hTable = html.getElementById("pgl_basic")
Then loop the tr and td as you wish using getElementsByTagName.
References (VBE>Tools>References):
Microsoft HTML Object Library
Option Explicit
Public Sub GetPlayerInfo()
Dim urls(), i As Long, html As HTMLDocument, hTable As Object
Dim ws As Worksheet, wsCurrent As Object, clipboard As Object
Dim lastRow As Long, playerIdentifier As String, arr() As String
Set clipboard = GetObject("New:{1C3B4210-F441-11CE-B9EA-00AA006B1A69}")
Set ws = ThisWorkbook.Worksheets("Sheet1")
lastRow = ws.Cells(ws.rows.Count, "A").End(xlUp).Row
urls = Application.Transpose(ws.Range("A1:A" & lastRow).Value)
Set html = New HTMLDocument
With CreateObject("MSXML2.XMLHTTP")
For i = LBound(urls) To UBound(urls)
.Open "GET", urls(i), False
.send
html.body.innerHTML = Replace$(Replace$(.responseText, "-->", vbNullString), "<!--", vbNullString) 'remove comments
arr = Split(urls(i), "/")
playerIdentifier = arr(5)
If SheetExists(playerIdentifier) Then
With ThisWorkbook.Worksheets(playerIdentifier).Cells
.ClearContents
.ClearFormats
Set wsCurrent = ThisWorkbook.Worksheets(playerIdentifier)
End With
Else
Set wsCurrent = ThisWorkbook.Worksheets.Add
wsCurrent.name = playerIdentifier
End If
Set hTable = html.querySelector("#pgl_basic")
clipboard.SetText hTable.outerHTML
clipboard.PutInClipboard
wsCurrent.Range("A1").PasteSpecial
Set wsCurrent = Nothing: Set hTable = Nothing: Erase arr: playerIdentifier = vbNullString
Application.CutCopyMode = False
Next
End With
End Sub
Public Function SheetExists(ByVal sheetName As String) As Boolean '<== function by #Rory
SheetExists = Evaluate("ISREF('" & sheetName & "'!A1)")
End Function
IE
Note proper page load wait used.
Option Explicit
Public Sub GetPlayerInfo()
Dim ieObj As InternetExplorer, htmlEle As IHTMLElement
Dim urls(), i As Long, j As Long, hTable As Object
Dim ws As Worksheet, wsCurrent As Object
Dim lastRow As Long, playerIdentifier As String, arr() As String
Application.ScreenUpdating = False
On Error GoTo errHand
Set ws = ThisWorkbook.Worksheets("Sheet1")
lastRow = ws.Cells(ws.Rows.Count, "A").End(xlUp).Row
urls = Application.Transpose(ws.Range("A1:A" & lastRow).Value)
Set ieObj = New InternetExplorer
With ieObj
.Visible = True
For j = LBound(urls) To UBound(urls)
.navigate urls(j)
While .Busy Or .readyState <> 4: DoEvents: Wend
arr = Split(urls(j), "/")
playerIdentifier = arr(5)
If SheetExists(playerIdentifier) Then
With ThisWorkbook.Worksheets(playerIdentifier).Cells
.ClearContents
.ClearFormats
Set wsCurrent = ThisWorkbook.Worksheets(playerIdentifier)
End With
Else
Set wsCurrent = ThisWorkbook.Worksheets.Add
wsCurrent.Name = playerIdentifier
End If
i = 1
Set hTable = .document.getElementById("pgl_basic")
If Not hTable Is Nothing Then
For Each htmlEle In hTable.getElementsByTagName("tr")
With wsCurrent
.Range("A" & i).Value = htmlEle.Children(0).textContent
.Range("B" & i).Value = htmlEle.Children(1).textContent
.Range("C" & i).Value = htmlEle.Children(2).textContent
.Range("D" & i).Value = htmlEle.Children(3).textContent
End With
i = i + 1
Next htmlEle
End If
Set wsCurrent = Nothing: Set hTable = Nothing: Erase arr: playerIdentifier = vbNullString
Next
End With
errHand:
Application.ScreenUpdating = True
If Err.Number <> 0 Then Debug.Print Err.Number, Err.Description
ie.Quit
End Sub

I'm trying to import innertext from a td-class. Getting lots of extra text

I'm trying to get the innertext of 'wfm-bodyText'. So, I want to loop through items on a webpage and import, in this case, 'H. Validate Gamer Correct'
I thought the script below would work, but it actually gives me all kinds of extra text!!
Public Sub CopyFromURL()
Dim IE As InternetExplorer, doc As HTMLDocument
Dim thisClass As IHTMLElement2, thisLink As IHTMLElement
Dim rng As Range, cell As Range
Const READYSTATE_COMPLETE As Integer = 4
Dim TR_col As Object, TR As Object
Dim TD_col As Object, TD As Object
Dim row As Long, col As Long
row = 1
'Set rng = Range("A1:A5")
'For Each cell In rng
Set IE = CreateObject("InternetExplorer.Application")
IE.Navigate URL
Do While (IE.Busy Or IE.ReadyState <> READYSTATE_COMPLETE)
DoEvents
Loop
Set TR_col = IE.Document.getElementsByTagName("TR")
For Each TR In TR_col
Set TD_col = TR.getElementsByTagName("TD")
row = ActiveSheet.Cells(ActiveSheet.Rows.Count, "A").End(xlUp).row
col = 1
For Each TD In TD_col
Cells(row, col) = TD.innerText
col = col + 1
Next
row = row + 1
Next
'Next cell
IE.Quit
End Sub
The URL is behind a firewall. I can't share it.
Again, below is the structure of the page. I want to copy the 'E. Verify phone number if applicable' into my sheet.
I tried the script below in an attempt to get every element from the page.
Sub DumpData()
Set IE = CreateObject("InternetExplorer.Application")
IE.Visible = True
URL = "http://www.SharePoint.aspx"
'Wait for site to fully load
IE.Navigate2 URL
Do While IE.Busy = True
DoEvents
Loop
RowCount = 1
With Sheets("Sheet1")
.Cells.ClearContents
RowCount = 1
For Each itm In IE.document.all
.Range("A" & RowCount) = itm.tagname
.Range("B" & RowCount) = itm.ID
.Range("C" & RowCount) = itm.classname
.Range("D" & RowCount) = Left(itm.innertext, 1024)
RowCount = RowCount + 1
Next itm
End With
End Sub
I still can't tell where the relevant data is coming from.

VBA returns [object] when using Doc.getElementsByTagName. Can't locate correct TagName from browser

I'm having trouble locating the correct tag name from Chrome. When I run the following code all that is returned is "[object]". Does anyone have any suggestions to pull in the correct field?
Here's the website I'm attempting to pull information. Specifically Census Tract but at this point if someone could show me how to return any information I will mine the correct one.
Here's my code, currently running the output to a MsgBox just to find the right field. From there I will reenable to for statement and make the URL in the VBA dynamic. Also note that I've attempted to use getElementsByName but no progress there either:
Sub censusTract()
Dim sht As Worksheet
Dim lastRow As Long
Set sht = ActiveWorkbook.Sheets("Sheet1")
lastRow = sht.Cells(sht.Rows.Count, "A").End(xlUp).Row
'For i = 2 To lastRow
Dim IE As New InternetExplorer
'IE.Visible = True
IE.navigate "http://geocoding.geo.census.gov/geocoder/geographies/address?street=1308+Lapwing+Rd&city=Edmond&state=OK&zip=73003&benchmark=4&vintage=4"
Do
DoEvents
Loop Until IE.readyState = READYSTATE_COMPLETE
Dim Doc As HTMLDocument
Set Doc = IE.document
Dim sDD As String
'sDD = Doc.getElementsByTagName("br")(0)
sDD = Doc.getElementsByName("pl_gov_census_geo_geocoder_domain_AddressResult")(0)
MsgBox sDD
'IE.Quit
'sht.Cells(i, 41).Value = sDD
'Next i
End Sub
You are very close. All you need to do is pull the correct data from the object. You can use one of the following
sDD = Doc.getElementsByName("pl_gov_census_geo_geocoder_domain_AddressResult").item(0).innerText
sDD = Doc.getElementsByName("pl_gov_census_geo_geocoder_domain_AddressResult").item(0).innerHTML
sDD = Doc.getElementsByName("pl_gov_census_geo_geocoder_domain_AddressResult").item(0).outerHTML
Right now its just pulling the object, adding innerText, innerHTML or outerHTML should pull your result.
Once you have that pulled you may want to split the results into an array so you can do what you need with each of the elements. the code below should get you started.
Sub censusTract()
Dim sht As Worksheet
Dim lastRow As Long, v As Variant, block As Variant, x As Integer
Set sht = ActiveWorkbook.Sheets("Sheet1")
lastRow = sht.Cells(sht.Rows.Count, "A").End(xlUp).Row
'For i = 2 To lastRow
Dim IE As New InternetExplorer
'IE.Visible = True
IE.navigate "http://geocoding.geo.census.gov/geocoder/geographies/address?street=1308+Lapwing+Rd&city=Edmond&state=OK&zip=73003&benchmark=4&vintage=4"
Do
DoEvents
Loop Until IE.readyState = READYSTATE_COMPLETE
Dim Doc As HTMLDocument
Set Doc = IE.document
Dim sDD As String
'sDD = Doc.getElementsByTagName("br")(0)
sDD = Doc.getElementsByName("pl_gov_census_geo_geocoder_domain_AddressResult").Item(0).innerText
block = Split(sDD, vbLf)
x = 2 'start column
For Each v In block
If v <> "" Then
Cells(i, x).Value = v
x = x + 1
End If
Next v
'IE.Quit
'sht.Cells(i, 41).Value = sDD
'Next i
End Sub
Let me know if you have any questions.

Web table not fetching the correct data by VBA

With the followoing code I can fetch the price table from this webpage http://www.idealo.de/preisvergleich/OffersOfProduct/143513.html
But from another page here this table is not being fetched...though these two pages are identical. I can't figure out where am lacking.
Any help on this is deeply appreciable.
Sub TableExample()
Dim IE As Object
Dim doc As Object
Dim strURL As String
strURL = "http://www.idealo.de/preisvergleich/OffersOfProduct/143513.html"
Set IE = CreateObject("InternetExplorer.Application")
With IE
.navigate strURL
Do Until .readyState = 4: DoEvents: Loop
Do While .Busy: DoEvents: Loop
Set doc = IE.document
GetAllTables doc
.Quit
End With
End Sub
Sub GetAllTables(doc As Object)
Dim ws As Worksheet
Dim rng As Range
Dim tbl As Object
Dim rw As Object
Dim cl As Object
Dim tabno As Long
Dim nextrow As Long
Dim i As Long
Set ws = Sheets("Sheet1")
For Each tbl In doc.getElementsByTagName("TABLE")
tabno = tabno + 1
nextrow = nextrow + 1
Set rng = ws.Range("B" & nextrow)
rng.Offset(, -1) = "Table " & tabno
If tabno = 5 Then
For Each rw In tbl.Rows
colno = 5
For Each cl In rw.Cells
If colno = 5 And nextrow > 5 Then
Set classColl = doc.getElementsByClassName("shop")
Set imgTgt = classColl(nextrow - 6).getElementsByTagName("img")
rng.Value = imgTgt(0).getAttribute("alt")
Else
rng.Value = cl.innerText
End If
Set rng = rng.Offset(, 1)
i = i + 1
colno = colno + 1
Next cl
nextrow = nextrow + 1
Set rng = rng.Offset(1, -i)
i = 0
Next rw
End If
Next tbl
End Sub
The table numbers change between the two urls. Table 5 is the table you're interested in for the first url, but Table 6 is the one of interest in the 2nd url. However both tables of interest have the same id ("offers-list"), so instead of looking for the fifth table, adjust your code to look for the Table with the ID "offers-list"
change
If tabno = 5 Then
to
If InStr(1, tbl.outerhtml, "Produktbezeichnung des Shops", vbTextCompare) > 0 Then
This will get you close. There are other changes on the second web page that your current code isn't quite handling - but like I said this will get you close.
I have changed the If tabno = 5 Then with
For Each tbl In doc.getElementsByTagName("table")
' tabno = tabno + 1
If tbl.className = "orangebox_rowborder" Then
Thanks #Ron for guiding me for this...thanks a ton Dude
The following works for each URL so is more robust and is a lot faster than the method you are currently using as it does away with the IE browser nvaigation.
For a lengthy code explanation please see here.
Option Explicit
'Tools > References > HTML Object Library
Public Sub GetTable()
Const URL = "https://www.idealo.de/preisvergleich/OffersOfProduct/1866742_-335-billingham.html" '<==Change this
Dim sResponse As String, listItems As Object, html As HTMLDocument, headers()
headers = Array("product_id", "product_name", "product_price", "product_category", "currency", "spr", "shop_name", "delivery_time", "shop_rating", "position", "free_return", "approved_shipping")
Application.ScreenUpdating = False
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", URL, False
.send
sResponse = StrConv(.responseBody, vbUnicode)
End With
sResponse = Mid$(sResponse, InStr(1, sResponse, "<!DOCTYPE "))
Set html = New HTMLDocument
With html
.body.innerHTML = sResponse
Set listItems = .getElementsByClassName("productOffers-listItemOfferPrice")
End With
Dim currentItem As Long
With ActiveSheet
.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
For currentItem = 0 To listItems.Length - 1
Dim tempString As String, columnValues() As String
tempString = TidyString(GetTransactionInfo(listItems(currentItem).outerHTML), "&#\d+;")
columnValues = GetColumnValues(tempString, headers)
.Cells(currentItem + 2, 1).Resize(1, UBound(columnValues) + 1) = columnValues
Next currentItem
End With
Application.ScreenUpdating = True
End Sub
Public Function GetTransactionInfo(ByVal inputString) As String
'Split to get just the transaction items i.e. Headers and associated values
GetTransactionInfo = Split(Split(inputString, """transaction"",")(1), "}")(0)
End Function
Public Function TidyString(ByVal inputString As String, ByVal matchPattern As String) As String
'Extract transaction info
'Use regex to find these unwanted strings and replace pattern e.g. &#\d+;
'Example inputString
Dim regex As Object, tempString As String
Set regex = CreateObject("VBScript.RegExp")
With regex
.Global = True
.MultiLine = True
.IgnoreCase = False
.Pattern = matchPattern
End With
If regex.test(inputString) Then
TidyString = regex.Replace(inputString, vbNullString)
Else
TidyString = inputString
End If
End Function
Public Function GetColumnValues(ByVal inputString As String, ByVal headers As Variant) As Variant
' Example input string "product_id": "143513","product_name": "Canon 500D Nahlinse 72mm","product_price": "128.0","product_category": "26570","currency": "EUR","spr": "cfd","shop_name": "computeruniverse.net","delivery_time": "long","shop_rating": "100","position": "1","free_return": "14","approved_shipping": "false"
' Extract just the inner string value of each header e.g. 143513
Dim arr() As String, currentItem As Long, tempString As String
tempString = inputString
For currentItem = LBound(headers) To UBound(headers)
tempString = TidyString(tempString, Chr$(34) & headers(currentItem) & Chr$(34) & ":")
Next currentItem
arr = Split(Replace$(tempString, Chr$(34), vbNullString), ",")
GetColumnValues = arr
End Function