How to Extract Data From HTML (.aspx) website into Excel using VBA
In this example I used the following VBA code to extract the data stored in a script variable. You can see the example data from an HTML file at the end of this post.
Below is the snapshot of the page that I was trying to extract data from.
VBA CODE:
The following snapshot is the output of the Excel File.
Below is the snapshot of the page that I was trying to extract data from.
VBA CODE:
'********************************************************
' DESCRIPTION: GET DATA FROM TABLE OF A .ASPX WEBSITE
' BY: CROMWELL D. BAYON
' EMAIL: OMELSOFT@GMAIL.COM
'********************************************************
Sub GetOfflineData()
Dim file As String, Data As String
Dim delimQ As String
Dim lineContentLoc As Long
delimQ = Chr(34)
file = Application.ActiveWorkbook.Path & "\Source of Select.html"
Open file For Input As #1
Do Until EOF(1)
Line Input #1, linefromfile
'Check only the line containing this pattern (var best_idear_data =[)
If InStr(1, linefromfile, "var best_idear_data =[") > 0 Then
lineContentLoc = InStr(1, linefromfile, delimQ & "line-content" & delimQ & ">")
Data = Mid(linefromfile, lineContentLoc + 15, Len(linefromfile))
Exit Do
End If
Loop
Close #1
Data = Replace(Data, "var best_idear_data =[", "", 1, Len(Data))
Data = Replace(Data, "]//]]>", "")
'Replace Variables with equivalent characters
ReplaceString (Data)
End Sub
'This replaces the strings with the original texts
Sub ReplaceString(Data As String)
ReDim filter(0 To 10) As Variant
ReDim replacewith(0 To 10) As Variant
Dim delim As String, delimQ As String
delim = ","
delimQ = Chr(34) 'Quote
Dim i As Integer 'Filter Index
'Patterns
filter(0) = "\u003ca"
filter(1) = "\u003c/a\u003e"
filter(2) = "title='"
filter(3) = "\u003e"
filter(4) = "\u003cimg"
filter(5) = "\u0027"
filter(6) = "["
filter(7) = "]"
filter(8) = delimQ & delim & delimQ 'Comma delimiter
filter(9) = "//>"
replacewith(0) = "<a"
replacewith(1) = "</a>"
replacewith(2) = "title="
replacewith(3) = ">"
replacewith(4) = "<img"
replacewith(5) = "'"
replacewith(6) = ""
replacewith(7) = ""
replacewith(8) = delimQ & "*" & delimQ 'replace comma delimiter with an * to unclutter strings
replacewith(9) = ""
'Replace/remove original pattern
For i = 0 To UBound(filter)
Data = Replace(Data, filter(i), replacewith(i))
Next
'Enable this to see the output in Immediate window
'Debug.Print Data
'Split Data and Populate to Worksheet
SplitAndPopulateData Data
End Sub
Sub SplitAndPopulateData(Data As String)
Dim LineItems As Variant
Dim Items As Long, Item As Long
Dim colIndex As Long 'Column Index
Dim row As Long
Dim ws As Worksheet
Dim lRange As Long 'Item Range
Dim delimQ As String 'Quote
Dim ancData As String 'Anchor Tag
Dim imgData As String 'Image Tag for Rating
delimQ = Chr(34) 'Quote
LineItems = Split(Data, "*") 'Now we put each item into an array
Items = UBound(LineItems)
Set ws = Sheets(1)
row = 2
lRange = 0 'Start item count
'Populate data to worksheet
For Item = 0 To Items / 12 - 1
For colIndex = 0 To 11
If colIndex = 0 Then
'Get only the text of the anchor tag
ancData = Replace(LineItems(lRange), delimQ, "")
ancData = Replace(LineItems(lRange), "</a>", "")
ancData = Mid(ancData, InStr(1, ancData, ">") + 1, Len(ancData) - InStr(1, ancData, ">") - 1)
'Set cell's Value
ws.Cells(row, colIndex + 1).Value = ancData
ElseIf colIndex = 8 Then
'Get the ALT attribute value
imgData = LineItems(lRange)
imgData = Mid(imgData, InStr(1, imgData, "alt='") + 5, 1)
'set cell's value
ws.Cells(row, colIndex + 1).Value = imgData
Else
'Remove the Quotes then populate to cell
ws.Cells(row, colIndex + 1).Value = Replace(LineItems(lRange), delimQ, "")
End If
lRange = lRange + 1
Next colIndex
row = row + 1
Next Item
End Sub
The following snapshot is the output of the Excel File.
comtaOmam_ma Kathy Tate https://wakelet.com/wake/3z_vXAiovN1_lZXytrVwq
ReplyDeletetrimcheareagest
0incaan-ze_Madison Mark Price Crack
ReplyDeleteruiserpode
esunPjes_ro1982 Maria Robinson click
ReplyDeletehttps://colab.research.google.com/drive/1creqMzUrWPVHNMkTiqSR8G_rujJ_UbB7
click here
click here
disrirosig
linaor_pa Erica Cain Programs
ReplyDeleteprogram
scoutareshos