Skip to content

Instantly share code, notes, and snippets.

@chrisdone
Last active January 17, 2020 15:07
Show Gist options
  • Save chrisdone/12468cab7b7d151fc6844dfd75235605 to your computer and use it in GitHub Desktop.
Save chrisdone/12468cab7b7d151fc6844dfd75235605 to your computer and use it in GitHub Desktop.
XMLParser wrapper for xml-conduit
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE PartialTypeSignatures #-}
{-# LANGUAGE ParallelListComp #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE DisambiguateRecordFields #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
-- | A very convenient XML parser with helpful error messages.
module XMLParser
( parseFile
, parseHTMLFile
, runParser
, sinkParser
, getContext
, nth
, with
, withMaybe
, elem
, elemMaybe
, elems
, attr
, text
, get
, foreach
, ParseError(..)
, Parser
, Context(..)
) where
import Control.Applicative
import Control.DeepSeq
import Control.Monad.Catch
import Control.Monad.State
import Data.ByteString (ByteString)
import Data.Conduit
import Data.Generics (Typeable)
import Data.List (intercalate, foldl)
import qualified Data.Map.Strict as M
import Data.Maybe
import Data.Text (Text)
import qualified Data.Text as T
import GHC.Generics
import Prelude hiding (elem)
import qualified Text.HTML.DOM as HTML
import qualified Text.XML as DOM
--------------------------------------------------------------------------------
-- Types
data Parser a where
ElementParser :: DOM.Name -> Parser a -> Parser a
ElementParserMaybe :: DOM.Name -> Parser a -> Parser (Maybe a)
AttributeParser :: DOM.Name -> (Text -> Either String a) -> Parser a
TextParser :: (Text -> Either String a) -> Parser a
GetContext :: Parser Context
WithContext :: Context -> Parser a -> Parser a
Pure :: a -> Parser a
Bind :: Parser a -> (a -> Parser b) -> Parser b
Alt :: [Parser a] -> Parser a
Foreach :: (Int -> DOM.Name -> Parser a) -> Parser [a]
Nth :: Int -> Parser a -> Parser a
instance Monad Parser where return = Pure; (>>=) = Bind
instance Applicative Parser where (<*>) = ap; pure = return
instance Functor Parser where fmap = liftM
instance Semigroup (Parser a) where
x <> y = Alt (list x <> list y)
where
list =
\case
Alt xs -> xs
e -> [e]
data ParseError
= ParseError Context Reason
| AllBranchesFailed [ParseError]
deriving (Show, Typeable, Generic)
instance NFData ParseError
data Context = Context
{ contextParents :: [(Text, Maybe Int)]
, contextElement :: DOM.Element
} deriving (Show, Generic)
instance NFData Context
data Reason
= ExpectedElement DOM.Name
| ExpectedAttribute DOM.Name
| AttributeParserFail DOM.Name String
| TextParsingFail Text String
| MissingIndexedNode Int
deriving (Show, Generic)
instance NFData Reason
--------------------------------------------------------------------------------
-- Top level parser runners
parseFile ::
(MonadIO m, MonadThrow m, MonadCatch m)
=> DOM.ParseSettings
-> FilePath
-> Parser a
-> m (Either ParseError a)
parseFile settings fp parser0 = do
doc <- liftIO (DOM.readFile settings fp)
try (runParser parser0 (DOM.documentRoot doc))
parseHTMLFile ::
(MonadIO m, MonadThrow m, MonadCatch m)
=> FilePath
-> Parser a
-> m (Either ParseError a)
parseHTMLFile fp parser0 = do
doc <- liftIO (HTML.readFile fp)
try (runParser parser0 (DOM.documentRoot doc))
sinkParser ::
(MonadThrow m, MonadCatch m')
=> DOM.ParseSettings
-> Parser a
-> ConduitM ByteString o m (m' a)
sinkParser settings p = do
r <- DOM.sinkDoc settings
pure (runParser p (DOM.documentRoot r))
runParser ::
forall m a. (MonadThrow m, MonadCatch m)
=> Parser a
-> DOM.Element
-> m a
runParser parser0 element0 =
evalStateT
(go parser0)
(Context
{ contextParents =
[(DOM.nameLocalName (DOM.elementName element0), Nothing)]
, contextElement = element0
})
where
go :: forall x. Parser x -> StateT Context m x
go =
\case
GetContext -> get
WithContext ctx m -> do
s <- get
put ctx
v <- go m
put s
pure v
Pure a -> pure a
Bind m f -> go m >>= go . f
Alt xs0 ->
let loop es (x:xs) = do
s <- get
result <- try (go x)
case result of
Right v -> pure v
Left (e :: ParseError) -> do
put s
loop (e : es) xs
loop es [] = throwM (AllBranchesFailed (reverse es))
in loop [] xs0
TextParser parser -> do
element <- gets contextElement
let txt =
(mconcat
(mapMaybe
(\case
DOM.NodeContent t -> Just t
_ -> Nothing)
(DOM.elementNodes element)))
case parser txt of
Right ok -> pure ok
Left e -> do
context <- get
throwM (ParseError context (TextParsingFail txt e))
ElementParser name inner -> do
result <- lookupElementByName name
case result of
Nothing -> do
context <- get
throwM (ParseError context (ExpectedElement name))
Just element' -> runElementParser Nothing name element' inner go
ElementParserMaybe name inner -> do
result <- lookupElementByName name
case result of
Nothing -> pure Nothing
Just element' ->
fmap Just (runElementParser Nothing name element' inner go)
AttributeParser name parser -> do
element <- gets contextElement
context <- get
case M.lookup name (DOM.elementAttributes element) of
Nothing -> throwM (ParseError context (ExpectedAttribute name))
Just t ->
case parser t of
Right ok -> pure ok
Left e ->
throwM (ParseError context (AttributeParserFail name e))
Foreach cont -> do
element <- gets contextElement
fmap
catMaybes
(traverse
(\(i, el) ->
case el of
DOM.NodeElement e ->
fmap
Just
(runElementParser
(pure i)
(DOM.elementName e)
e
(cont i (DOM.elementName e))
go)
_ -> pure Nothing)
(zip
[0 ..]
(filter
(\case
DOM.NodeElement {} -> True
_ -> False)
(DOM.elementNodes element))))
Nth i cont -> do
element <- gets contextElement
context <- get
result <-
fmap
(listToMaybe . catMaybes)
(traverse
(\(j, x) ->
case x of
DOM.NodeElement e
| j == i ->
fmap
Just
(runElementParser
(pure i)
(DOM.elementName e)
e
cont
go)
_ -> pure Nothing)
(zip
[0 ..]
(filter
(\case
DOM.NodeElement {} -> True
_ -> False)
(DOM.elementNodes element))))
case result of
Nothing -> throwM (ParseError context (MissingIndexedNode i))
Just r -> pure r
runElementParser ::
MonadState Context m => Maybe Int -> DOM.Name -> DOM.Element -> t -> (t -> m b) -> m b
runElementParser i name element' inner go = do
orig <- get
modify
(\(Context parents _) ->
Context ((DOM.nameLocalName name, i) : parents) element')
a <- go inner
put orig
pure a
lookupElementByName :: (MonadState Context m, MonadThrow m) => DOM.Name -> m (Maybe DOM.Element)
lookupElementByName name = do
element <- gets contextElement
pure
(listToMaybe
(mapMaybe
(\case
DOM.NodeElement this
| DOM.elementName this == name -> Just this
_ -> Nothing)
(DOM.elementNodes element)))
--------------------------------------------------------------------------------
-- Parser combinators
nth :: Int -> Parser a -> Parser a
nth = Nth
foreach :: (Int -> DOM.Name -> Parser a) -> Parser [a]
foreach = Foreach
getContext :: Parser Context
getContext = GetContext
with :: Context -> Parser a -> Parser a
with = WithContext
withMaybe :: Maybe Context -> Parser a -> Parser (Maybe a)
withMaybe mctx p = maybe (pure Nothing) (fmap Just . (\ctx -> with ctx p)) mctx
elem :: DOM.Name -> Parser a -> Parser a
elem = ElementParser
elemMaybe :: DOM.Name -> Parser a -> Parser (Maybe a)
elemMaybe = ElementParserMaybe
elems :: [DOM.Name] -> Parser a -> Parser a
elems xs p = foldr elem p xs
attr :: DOM.Name -> (Text -> Either String a) -> Parser a
attr = AttributeParser
text :: (Text -> Either String a) -> Parser a
text = TextParser
--------------------------------------------------------------------------------
-- Very helpful error messages
instance Exception ParseError where
displayException = go 0
where
go i =
\case
AllBranchesFailed es ->
("All of these parse branches failed:\n\n" <>
indent (i + 4) (intercalate "\n\n" (map (go (i + 4)) es)))
ParseError ctx reason ->
"In the context:\n\n" <>
indent (i+4) (showContext ctx) <> "\n\n" <>
showReason i ctx reason
indent :: Int -> String -> [Char]
indent n = intercalate "\n" . map (replicate n ' ' ++) . lines
showReason :: Int -> Context -> Reason -> [Char]
showReason i (Context _ el) =
\case
MissingIndexedNode i -> "Missing node at index " ++ show i ++ "\n\n" ++
case mapMaybe
(\case
DOM.NodeElement element ->
Just (showTagName (DOM.elementName element))
_ -> Nothing)
(DOM.elementNodes el) of
[] -> "There are no elements here!"
nodes ->
"Elements you can expect at this level:\n\n" ++
indent (i + 4) (intercalate ", " nodes)
ExpectedElement name ->
"Expected element " ++
showTagName name ++
"\n\n" ++
case mapMaybe
(\case
DOM.NodeElement element ->
Just (showTagName (DOM.elementName element))
_ -> Nothing)
(DOM.elementNodes el) of
[] -> "There are no elements here!"
nodes ->
"Elements you can expect at this level:\n\n" ++
indent (i + 4) (intercalate ", " nodes)
ExpectedAttribute name ->
"Expected attribute " ++ showName name ++ "=\"...\""
AttributeParserFail name err ->
"Attribute " ++
showName name ++
"=\"...\" failed to parse,\n" ++ "with error:\n" ++ indent (i + 4) err
TextParsingFail tex err ->
"The text " ++
show (ellipsis 40 tex) ++
" failed to parse,\n" ++ "with error:\n\n" ++ indent (i + 4) err
where ellipsis n t =
T.take
n
(if T.length t > n
then t <> ".."
else t)
showName :: DOM.Name -> String
showName (DOM.Name {DOM.nameLocalName = ln}) = T.unpack ln
showContext :: Context -> [Char]
showContext (Context parents _elem) =
foldl
(\rents (this, i) ->
let midx =
case i of
Just i' -> " [" ++ show i' ++ "]"
Nothing -> ""
in showTag this ++
midx ++
"\n" ++
(if not (null rents)
then indent 2 rents
else ""))
""
parents
where
showTag :: Text -> [Char]
showTag = \name -> "<" ++ T.unpack name ++ ">"
showTagName :: DOM.Name -> [Char]
showTagName = \name -> "<" ++ showName name ++ ">"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment