Last active
May 14, 2018 19:13
-
-
Save tonmcg/1173759b95943b2b9ed290b9edbe74d3 to your computer and use it in GitHub Desktop.
M Language Helper Functions for HTML Parsing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// HTML.GetTables
// Downloads the page at `url`, rebuilds a minimal HTML document that keeps only
// the page's doctype, <html> opening tag, <head> element and <table> elements,
// parses that document with Web.Page, and returns the rows whose [Source]
// column equals "Table".
let
    GetTables =
        (url as text) =>
            let
                // Raw page markup as text.
                DOM = Text.FromBinary(Web.Contents(url)),

                // Rebuilds "<prefix ... >" from the first occurrence of `prefix` in the page.
                // Returns `fallback` when the page does not contain `prefix`, so that no
                // malformed fragment (e.g. "<!>") ends up in the rebuilt document.
                OpeningTag = (prefix as text, fallback as text) as text =>
                    if Text.Contains(DOM, prefix) then
                        prefix & Text.BetweenDelimiters(DOM, prefix, ">") & ">"
                    else
                        fallback,

                // Doctype (or whatever "<!...>" declaration comes first); empty when absent.
                DOCTYPE = OpeningTag("<!", ""),

                // Opening <html ...> tag including its attributes; plain "<html>" when absent.
                HTMLOpeningTag = OpeningTag("<html", "<html>"),

                // Whole <head>...</head> element. Without both delimiters the original
                // concatenation would produce the invalid text "<head</head>", so an
                // empty head element is emitted instead.
                HEAD =
                    if Text.Contains(DOM, "<head") and Text.Contains(DOM, "</head>") then
                        "<head" & Text.BetweenDelimiters(DOM, "<head", "</head>") & "</head>"
                    else
                        "<head></head>",

                // The body's own attributes are deliberately not carried over.
                BODYOpeningTag = "<body>",

                // Collect every "<table ... </table>" fragment by asking for the n-th
                // occurrence of "<table" (n = 0, 1, 2, ...) until an empty result signals
                // that there are no more occurrences.
                // NOTE(review): each fragment ends at the first "</table>" after its
                // "<table", so nested tables are presumably truncated/duplicated - confirm
                // against pages that nest tables.
                TableFragments =
                    List.Generate(
                        () => [
                            Index = 0,
                            Inner = Text.BetweenDelimiters(DOM, "<table", "</table>", 0)
                        ],
                        each [Inner] <> "",
                        each [
                            Index = [Index] + 1,
                            Inner = Text.BetweenDelimiters(DOM, "<table", "</table>", [Index] + 1)
                        ],
                        each "<table" & [Inner] & "</table>"
                    ),
                TABLES = Text.Combine(TableFragments),

                // Reassemble a document that contains nothing but the tables.
                HTML = Text.Combine({DOCTYPE, HTMLOpeningTag, HEAD, BODYOpeningTag, TABLES, "</body></html>"}),
                Page = Web.Page(HTML),

                // Keep only the entries Web.Page classified as tables.
                Tables = Table.SelectRows(Page, each ([Source] = "Table"))
            in
                Tables,

    // Metadata record attached to the function's type below.
    DefineDocs = [
        Documentation.Name = " HTML.GetTables",
        Documentation.Description = " Returns the contents of all table nodes within the HTML document broken into its constituent structures",
        Documentation.LongDescription = " Returns the contents of all table nodes within the HTML document broken into its constituent structures of a user-supplied URL.",
        Documentation.Category = " Html.Modification",
        Documentation.Source = " Inspired by solutions after Imke Feldmann",
        Documentation.Author = " Tony McGovern: www.emdata.ai",
        Documentation.Examples = {
            [
                Description = "",
                Code = " GetTables(""https://www.census.gov/geo/reference/ansi_statetables.html"")",
                Result = ""
            ]
        }
    ]
in
    // Return the function with the documentation metadata attached to its type.
    Value.ReplaceType(
        GetTables,
        Value.ReplaceMetadata(
            Value.Type(GetTables),
            DefineDocs
        )
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Fixed hard-coded reference to URL