Created
December 7, 2016 14:57
-
-
Save kseo/ca4d5ec49bf40cedb60f4ed4e11acef2 to your computer and use it in GitHub Desktop.
DOM parser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# LANGUAGE FlexibleInstances #-} | |
{-# LANGUAGE TypeFamilies #-} | |
module Beard.DOM | |
( Element(..) | |
, Node(..) | |
, parseDOM | |
) where | |
import Control.Monad (when) | |
import Data.Foldable (fold) | |
import Data.List.NonEmpty (NonEmpty(..)) | |
import qualified Data.List.NonEmpty as NE | |
import Data.Semigroup ((<>)) | |
import Data.Text (Text) | |
import qualified Data.Text as T | |
import Data.HashMap.Strict (HashMap) | |
import qualified Data.HashMap.Strict as HMS | |
import qualified Data.Set as Set | |
import Text.HTML.TagSoup | |
import Text.Megaparsec | |
import Text.Megaparsec.Pos (SourcePos) | |
import Text.Megaparsec.Error (Dec, ShowToken) | |
type TextTag = Tag Text | |
type TextTagStream = [TextTag] | |
data Element = Element | |
{ elementName :: !Text | |
, elementAttrs :: !(HashMap Text Text) | |
, elementChildren :: [Node] | |
} deriving (Eq, Show) | |
data Node = | |
ElementNode Element | |
| TextNode Text | |
deriving (Eq, Show) | |
type DOMParser = Parsec Dec TextTagStream | |
instance Stream TextTagStream where | |
type Token TextTagStream = TextTag | |
uncons [] = Nothing | |
uncons (x:xs) = Just (x, xs) | |
updatePos = const updatePos' | |
updatePos' | |
:: Pos -- ^ Tab width | |
-> SourcePos -- ^ Current position | |
-> TextTag -- ^ Current token | |
-> (SourcePos, SourcePos) -- ^ Actual position and incremented position | |
updatePos' _ apos@(SourcePos n l c) token = (apos, npos) | |
where | |
u = unsafePos 1 | |
npos = SourcePos n l (c <> u) | |
instance ShowToken TextTag where | |
showTokens = fold . NE.intersperse " " . fmap show | |
tagSatisfy :: (TextTag -> Bool) -> DOMParser TextTag | |
tagSatisfy f = token testTag Nothing | |
where testTag x = if f x | |
then Right x | |
else Left (Set.singleton (Tokens (x:|[])), Set.empty, Set.empty) | |
tagOpen = tagSatisfy isTagOpen <?> "tag open" | |
tagClose = tagSatisfy isTagClose <?> "tag close" | |
tagText = tagSatisfy isTagText <?> "tag text" | |
text :: DOMParser Text | |
text = fromTagText <$> tagText | |
element :: DOMParser Element | |
element = do | |
(TagOpen tagName attrs) <- tagOpen | |
children <- many node | |
closeTag@(TagClose tagName') <- tagClose | |
when (tagName /= tagName') $ fail $ "unexpected close tag: " ++ show closeTag | |
return $ Element tagName (HMS.fromList attrs) children | |
node :: DOMParser Node | |
node = ElementNode <$> element <|> TextNode <$> text | |
eraseComments :: TextTagStream -> TextTagStream | |
eraseComments = filter (not .isTagComment) | |
parseDOM :: Text -> Either (ParseError TextTag Dec) [Node] | |
parseDOM html = do | |
let tags = eraseComments (parseTags html) | |
parse (many node) "" tags |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
test