How to Extract Data From HTML (.aspx) website into Excel using VBA

In this example I used the following VBA code to extract the data stored in a script variable. You can see the example data from an HTML file at the end of this post.

Below is the snapshot of the page that I was trying to extract data from.


VBA CODE:

'********************************************************
' DESCRIPTION:  GET DATA FROM TABLE OF A .ASPX WEBSITE
' BY:           CROMWELL D. BAYON
' EMAIL:        OMELSOFT@GMAIL.COM
'********************************************************

Sub GetOfflineData()
    Dim file As String, Data As String
    Dim delimQ As String
    Dim lineContentLoc As Long
    
    delimQ = Chr(34)
    file = Application.ActiveWorkbook.Path & "\Source of Select.html"
    
    Open file For Input As #1
    Do Until EOF(1)
        
        Line Input #1, linefromfile
        
        'Check only the line containing this pattern (var best_idear_data =[)
        If InStr(1, linefromfile, "var best_idear_data =[") > 0 Then
            lineContentLoc = InStr(1, linefromfile, delimQ & "line-content" & delimQ & ">")
            Data = Mid(linefromfile, lineContentLoc + 15, Len(linefromfile))
            Exit Do
        End If
        
    Loop
    Close #1
    
    Data = Replace(Data, "var best_idear_data =[", "", 1, Len(Data))
    Data = Replace(Data, "]//]]>", "")
    
    'Replace Variables with equivalent characters
    ReplaceString (Data)
    
End Sub

'This replaces the strings with the original texts
Sub ReplaceString(Data As String)
    ReDim filter(0 To 10) As Variant
    ReDim replacewith(0 To 10) As Variant
    Dim delim As String, delimQ As String
    
    delim = ","
    delimQ = Chr(34)    'Quote
    Dim i As Integer    'Filter Index
    
    'Patterns
    filter(0) = "\u003ca"
    filter(1) = "\u003c/a\u003e"
    filter(2) = "title='"
    filter(3) = "\u003e"
    filter(4) = "\u003cimg"
    filter(5) = "\u0027"
    filter(6) = "["
    filter(7) = "]"
    filter(8) = delimQ & delim & delimQ 'Comma delimiter
    filter(9) = "//>"
    
    replacewith(0) = "<a"
    replacewith(1) = "</a>"
    replacewith(2) = "title="
    replacewith(3) = ">"
    replacewith(4) = "<img"
    replacewith(5) = "'"
    replacewith(6) = ""
    replacewith(7) = ""
    replacewith(8) = delimQ & "*" & delimQ  'replace comma delimiter with an * to unclutter strings
    replacewith(9) = ""
    
    'Replace/remove original pattern
    For i = 0 To UBound(filter)
        
        Data = Replace(Data, filter(i), replacewith(i))
    
    Next
    
    'Enable this to see the output in Immediate window
    'Debug.Print Data
    
    'Split Data and Populate to Worksheet
    SplitAndPopulateData Data
    
End Sub

Sub SplitAndPopulateData(Data As String)
    Dim LineItems As Variant
    Dim Items As Long, Item As Long
    Dim colIndex As Long    'Column Index
    Dim row As Long
    Dim ws As Worksheet
    Dim lRange As Long      'Item Range
    Dim delimQ As String    'Quote
    Dim ancData As String   'Anchor Tag
    Dim imgData As String   'Image Tag for Rating
    
    delimQ = Chr(34)        'Quote
    LineItems = Split(Data, "*")    'Now we put each item into an array
    Items = UBound(LineItems)
    
    Set ws = Sheets(1)
    row = 2
    lRange = 0  'Start item count
    
    'Populate data to worksheet
    For Item = 0 To Items / 12 - 1
    
        For colIndex = 0 To 11
            
            If colIndex = 0 Then
            'Get only the text of the anchor tag
            ancData = Replace(LineItems(lRange), delimQ, "")
            ancData = Replace(LineItems(lRange), "</a>", "")
            ancData = Mid(ancData, InStr(1, ancData, ">") + 1, Len(ancData) - InStr(1, ancData, ">") - 1)
            
            'Set cell's Value
            ws.Cells(row, colIndex + 1).Value = ancData
            
            ElseIf colIndex = 8 Then
            
            'Get the ALT attribute value
            imgData = LineItems(lRange)
            imgData = Mid(imgData, InStr(1, imgData, "alt='") + 5, 1)
            
            'set cell's value
            ws.Cells(row, colIndex + 1).Value = imgData
            
            Else
            'Remove the Quotes then populate to cell
            ws.Cells(row, colIndex + 1).Value = Replace(LineItems(lRange), delimQ, "")
            
            End If
            
            lRange = lRange + 1
        Next colIndex
        
        row = row + 1
        
    Next Item

End Sub

The following snapshot is the output of the Excel File.

Comments

Post a Comment

Popular posts from this blog

How to Create a Configuration.INI Files in VB6

How to Set Windows Form Always on Top of Other Applications in VB6

Send Email with Excel VBA via CDO through GMail