Created
March 7, 2018 22:08
-
-
Save lafleurh/5ee677713a30e4af6aa3292463fd6cbb to your computer and use it in GitHub Desktop.
VB Script to Convert HTML to Text for Excel
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
' This function uses HTMLAgilityPack, available from NuGet. | |
' You must COM Register HTMLAgilityPack and reference it from your Excel macros. | |
' This is not a proper implementation, but should work in general. | |
Public Function StripHTML(html As Object, Optional NL As Boolean = False, Optional LI As Boolean = False) As String | |
Dim HDoc As HtmlAgilityPack.HtmlDocument | |
Dim ret As String | |
Dim htmlStr As String | |
ret = "" | |
On Error GoTo NoAgilityPack | |
' Create the HTML document object | |
Set HDoc = New HtmlAgilityPack.HtmlDocument | |
' Do not do anything if there is an error. | |
On Error Resume Next | |
' Load the HTML into the document (surround with HTML) | |
If InStr(1, "<html", CStr(html)) <= 0 Then | |
If InStr(1, "<body", CStr(html)) <= 0 Then | |
html = "<body>" & CStr(html) & "</body>" | |
End If | |
html = "<html>" & CStr(html) & "</html>" | |
End If | |
' Optionally put newlines where line breaks are. | |
If NL Then | |
htmlStr = Replace(CStr(html), "<br", vbCrLf & "<br") | |
Else | |
htmlStr = CStr(html) | |
End If | |
' Optionally put bullets where list items are. | |
If LI Then | |
htmlStr = Replace(CStr(htmlStr), "<li", vbCrLf & "• <li") | |
Else | |
htmlStr = CStr(htmlStr) | |
End If | |
Call HDoc.LoadHtml(htmlStr) | |
' Extract the raw text. | |
ret = HDoc.DocumentNode.innerText | |
ret = HttpDecode(ret) | |
StripHTML = ret | |
Exit Function | |
NoAgilityPack: | |
StripHTML = "Could not load HtmlAgilityPack .NET library. Make sure it is available to COM." | |
End Function | |
' This is a rough translation of an HttpDecode function (needs work) | |
Public Function HttpDecode(StringToDecode As String) As String | |
Dim TempAns As String | |
Dim CurChr As Integer | |
CurChr = 1 | |
Do Until CurChr - 1 = Len(StringToDecode) | |
Select Case Mid(StringToDecode, CurChr, 1) | |
Case "&" | |
If InStr(CurChr, StringToDecode, ";") > 0 Then | |
If Mid(StringToDecode, CurChr + 2, 1) >= 0 And Mid(StringToDecode, CurChr + 2, 1) <= 9 Then | |
TempAns = TempAns & Chr(Val( _ | |
Mid(StringToDecode, CurChr + 2, InStr(CurChr, StringToDecode, ";") - CurChr - 2))) | |
ElseIf Mid(StringToDecode, CurChr + 2, InStr(CurChr, StringToDecode, ";") - CurChr - 2) = "nbsp" Then | |
TempAns = TempAns & " " | |
End If | |
CurChr = InStr(CurChr, StringToDecode, ";") | |
End If | |
Case Else | |
TempAns = TempAns & Mid(StringToDecode, CurChr, 1) | |
End Select | |
CurChr = CurChr + 1 | |
Loop | |
HttpDecode = TempAns | |
End Function | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment