Skip to content

Instantly share code, notes, and snippets.

@lafleurh
Created March 7, 2018 22:08
Show Gist options
  • Save lafleurh/5ee677713a30e4af6aa3292463fd6cbb to your computer and use it in GitHub Desktop.
Save lafleurh/5ee677713a30e4af6aa3292463fd6cbb to your computer and use it in GitHub Desktop.
VB Script to Convert HTML to Text for Excel
' This function uses HTMLAgilityPack, available from NuGet.
' You must COM Register HTMLAgilityPack and reference it from your Excel macros.
' This is not a proper implementation, but should work in general.
Public Function StripHTML(html As Object, Optional NL As Boolean = False, Optional LI As Boolean = False) As String
Dim HDoc As HtmlAgilityPack.HtmlDocument
Dim ret As String
Dim htmlStr As String
ret = ""
On Error GoTo NoAgilityPack
' Create the HTML document object
Set HDoc = New HtmlAgilityPack.HtmlDocument
' Do not do anything if there is an error.
On Error Resume Next
' Load the HTML into the document (surround with HTML)
If InStr(1, "<html", CStr(html)) <= 0 Then
If InStr(1, "<body", CStr(html)) <= 0 Then
html = "<body>" & CStr(html) & "</body>"
End If
html = "<html>" & CStr(html) & "</html>"
End If
' Optionally put newlines where line breaks are.
If NL Then
htmlStr = Replace(CStr(html), "<br", vbCrLf & "<br")
Else
htmlStr = CStr(html)
End If
' Optionally put bullets where list items are.
If LI Then
htmlStr = Replace(CStr(htmlStr), "<li", vbCrLf & "• <li")
Else
htmlStr = CStr(htmlStr)
End If
Call HDoc.LoadHtml(htmlStr)
' Extract the raw text.
ret = HDoc.DocumentNode.innerText
ret = HttpDecode(ret)
StripHTML = ret
Exit Function
NoAgilityPack:
StripHTML = "Could not load HtmlAgilityPack .NET library. Make sure it is available to COM."
End Function
' This is a rough translation of an HttpDecode function (needs work)
Public Function HttpDecode(StringToDecode As String) As String
Dim TempAns As String
Dim CurChr As Integer
CurChr = 1
Do Until CurChr - 1 = Len(StringToDecode)
Select Case Mid(StringToDecode, CurChr, 1)
Case "&"
If InStr(CurChr, StringToDecode, ";") > 0 Then
If Mid(StringToDecode, CurChr + 2, 1) >= 0 And Mid(StringToDecode, CurChr + 2, 1) <= 9 Then
TempAns = TempAns & Chr(Val( _
Mid(StringToDecode, CurChr + 2, InStr(CurChr, StringToDecode, ";") - CurChr - 2)))
ElseIf Mid(StringToDecode, CurChr + 2, InStr(CurChr, StringToDecode, ";") - CurChr - 2) = "nbsp" Then
TempAns = TempAns & " "
End If
CurChr = InStr(CurChr, StringToDecode, ";")
End If
Case Else
TempAns = TempAns & Mid(StringToDecode, CurChr, 1)
End Select
CurChr = CurChr + 1
Loop
HttpDecode = TempAns
End Function
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment