Last active
January 17, 2020 15:07
-
-
Save chrisdone/12468cab7b7d151fc6844dfd75235605 to your computer and use it in GitHub Desktop.
XMLParser wrapper for xml-conduit
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# LANGUAGE DeriveGeneric #-} | |
{-# LANGUAGE FlexibleContexts #-} | |
{-# LANGUAGE GADTs #-} | |
{-# LANGUAGE PartialTypeSignatures #-} | |
{-# LANGUAGE ParallelListComp #-} | |
{-# LANGUAGE ScopedTypeVariables #-} | |
{-# LANGUAGE DisambiguateRecordFields #-} | |
{-# LANGUAGE LambdaCase #-} | |
{-# LANGUAGE OverloadedStrings #-} | |
-- | A very convenient XML parser with helpful error messages. | |
module XMLParser | |
( parseFile | |
, parseHTMLFile | |
, runParser | |
, sinkParser | |
, getContext | |
, nth | |
, with | |
, withMaybe | |
, elem | |
, elemMaybe | |
, elems | |
, attr | |
, text | |
, get | |
, foreach | |
, ParseError(..) | |
, Parser | |
, Context(..) | |
) where | |
import Control.Applicative | |
import Control.DeepSeq | |
import Control.Monad.Catch | |
import Control.Monad.State | |
import Data.ByteString (ByteString) | |
import Data.Conduit | |
import Data.Generics (Typeable) | |
import Data.List (intercalate, foldl) | |
import qualified Data.Map.Strict as M | |
import Data.Maybe | |
import Data.Text (Text) | |
import qualified Data.Text as T | |
import GHC.Generics | |
import Prelude hiding (elem) | |
import qualified Text.HTML.DOM as HTML | |
import qualified Text.XML as DOM | |
-------------------------------------------------------------------------------- | |
-- Types | |
data Parser a where | |
ElementParser :: DOM.Name -> Parser a -> Parser a | |
ElementParserMaybe :: DOM.Name -> Parser a -> Parser (Maybe a) | |
AttributeParser :: DOM.Name -> (Text -> Either String a) -> Parser a | |
TextParser :: (Text -> Either String a) -> Parser a | |
GetContext :: Parser Context | |
WithContext :: Context -> Parser a -> Parser a | |
Pure :: a -> Parser a | |
Bind :: Parser a -> (a -> Parser b) -> Parser b | |
Alt :: [Parser a] -> Parser a | |
Foreach :: (Int -> DOM.Name -> Parser a) -> Parser [a] | |
Nth :: Int -> Parser a -> Parser a | |
instance Monad Parser where return = Pure; (>>=) = Bind | |
instance Applicative Parser where (<*>) = ap; pure = return | |
instance Functor Parser where fmap = liftM | |
instance Semigroup (Parser a) where | |
x <> y = Alt (list x <> list y) | |
where | |
list = | |
\case | |
Alt xs -> xs | |
e -> [e] | |
data ParseError | |
= ParseError Context Reason | |
| AllBranchesFailed [ParseError] | |
deriving (Show, Typeable, Generic) | |
instance NFData ParseError | |
data Context = Context | |
{ contextParents :: [(Text, Maybe Int)] | |
, contextElement :: DOM.Element | |
} deriving (Show, Generic) | |
instance NFData Context | |
data Reason | |
= ExpectedElement DOM.Name | |
| ExpectedAttribute DOM.Name | |
| AttributeParserFail DOM.Name String | |
| TextParsingFail Text String | |
| MissingIndexedNode Int | |
deriving (Show, Generic) | |
instance NFData Reason | |
-------------------------------------------------------------------------------- | |
-- Top level parser runners | |
parseFile :: | |
(MonadIO m, MonadThrow m, MonadCatch m) | |
=> DOM.ParseSettings | |
-> FilePath | |
-> Parser a | |
-> m (Either ParseError a) | |
parseFile settings fp parser0 = do | |
doc <- liftIO (DOM.readFile settings fp) | |
try (runParser parser0 (DOM.documentRoot doc)) | |
parseHTMLFile :: | |
(MonadIO m, MonadThrow m, MonadCatch m) | |
=> FilePath | |
-> Parser a | |
-> m (Either ParseError a) | |
parseHTMLFile fp parser0 = do | |
doc <- liftIO (HTML.readFile fp) | |
try (runParser parser0 (DOM.documentRoot doc)) | |
sinkParser :: | |
(MonadThrow m, MonadCatch m') | |
=> DOM.ParseSettings | |
-> Parser a | |
-> ConduitM ByteString o m (m' a) | |
sinkParser settings p = do | |
r <- DOM.sinkDoc settings | |
pure (runParser p (DOM.documentRoot r)) | |
runParser :: | |
forall m a. (MonadThrow m, MonadCatch m) | |
=> Parser a | |
-> DOM.Element | |
-> m a | |
runParser parser0 element0 = | |
evalStateT | |
(go parser0) | |
(Context | |
{ contextParents = | |
[(DOM.nameLocalName (DOM.elementName element0), Nothing)] | |
, contextElement = element0 | |
}) | |
where | |
go :: forall x. Parser x -> StateT Context m x | |
go = | |
\case | |
GetContext -> get | |
WithContext ctx m -> do | |
s <- get | |
put ctx | |
v <- go m | |
put s | |
pure v | |
Pure a -> pure a | |
Bind m f -> go m >>= go . f | |
Alt xs0 -> | |
let loop es (x:xs) = do | |
s <- get | |
result <- try (go x) | |
case result of | |
Right v -> pure v | |
Left (e :: ParseError) -> do | |
put s | |
loop (e : es) xs | |
loop es [] = throwM (AllBranchesFailed (reverse es)) | |
in loop [] xs0 | |
TextParser parser -> do | |
element <- gets contextElement | |
let txt = | |
(mconcat | |
(mapMaybe | |
(\case | |
DOM.NodeContent t -> Just t | |
_ -> Nothing) | |
(DOM.elementNodes element))) | |
case parser txt of | |
Right ok -> pure ok | |
Left e -> do | |
context <- get | |
throwM (ParseError context (TextParsingFail txt e)) | |
ElementParser name inner -> do | |
result <- lookupElementByName name | |
case result of | |
Nothing -> do | |
context <- get | |
throwM (ParseError context (ExpectedElement name)) | |
Just element' -> runElementParser Nothing name element' inner go | |
ElementParserMaybe name inner -> do | |
result <- lookupElementByName name | |
case result of | |
Nothing -> pure Nothing | |
Just element' -> | |
fmap Just (runElementParser Nothing name element' inner go) | |
AttributeParser name parser -> do | |
element <- gets contextElement | |
context <- get | |
case M.lookup name (DOM.elementAttributes element) of | |
Nothing -> throwM (ParseError context (ExpectedAttribute name)) | |
Just t -> | |
case parser t of | |
Right ok -> pure ok | |
Left e -> | |
throwM (ParseError context (AttributeParserFail name e)) | |
Foreach cont -> do | |
element <- gets contextElement | |
fmap | |
catMaybes | |
(traverse | |
(\(i, el) -> | |
case el of | |
DOM.NodeElement e -> | |
fmap | |
Just | |
(runElementParser | |
(pure i) | |
(DOM.elementName e) | |
e | |
(cont i (DOM.elementName e)) | |
go) | |
_ -> pure Nothing) | |
(zip | |
[0 ..] | |
(filter | |
(\case | |
DOM.NodeElement {} -> True | |
_ -> False) | |
(DOM.elementNodes element)))) | |
Nth i cont -> do | |
element <- gets contextElement | |
context <- get | |
result <- | |
fmap | |
(listToMaybe . catMaybes) | |
(traverse | |
(\(j, x) -> | |
case x of | |
DOM.NodeElement e | |
| j == i -> | |
fmap | |
Just | |
(runElementParser | |
(pure i) | |
(DOM.elementName e) | |
e | |
cont | |
go) | |
_ -> pure Nothing) | |
(zip | |
[0 ..] | |
(filter | |
(\case | |
DOM.NodeElement {} -> True | |
_ -> False) | |
(DOM.elementNodes element)))) | |
case result of | |
Nothing -> throwM (ParseError context (MissingIndexedNode i)) | |
Just r -> pure r | |
runElementParser :: | |
MonadState Context m => Maybe Int -> DOM.Name -> DOM.Element -> t -> (t -> m b) -> m b | |
runElementParser i name element' inner go = do | |
orig <- get | |
modify | |
(\(Context parents _) -> | |
Context ((DOM.nameLocalName name, i) : parents) element') | |
a <- go inner | |
put orig | |
pure a | |
lookupElementByName :: (MonadState Context m, MonadThrow m) => DOM.Name -> m (Maybe DOM.Element) | |
lookupElementByName name = do | |
element <- gets contextElement | |
pure | |
(listToMaybe | |
(mapMaybe | |
(\case | |
DOM.NodeElement this | |
| DOM.elementName this == name -> Just this | |
_ -> Nothing) | |
(DOM.elementNodes element))) | |
-------------------------------------------------------------------------------- | |
-- Parser combinators | |
nth :: Int -> Parser a -> Parser a | |
nth = Nth | |
foreach :: (Int -> DOM.Name -> Parser a) -> Parser [a] | |
foreach = Foreach | |
getContext :: Parser Context | |
getContext = GetContext | |
with :: Context -> Parser a -> Parser a | |
with = WithContext | |
withMaybe :: Maybe Context -> Parser a -> Parser (Maybe a) | |
withMaybe mctx p = maybe (pure Nothing) (fmap Just . (\ctx -> with ctx p)) mctx | |
elem :: DOM.Name -> Parser a -> Parser a | |
elem = ElementParser | |
elemMaybe :: DOM.Name -> Parser a -> Parser (Maybe a) | |
elemMaybe = ElementParserMaybe | |
elems :: [DOM.Name] -> Parser a -> Parser a | |
elems xs p = foldr elem p xs | |
attr :: DOM.Name -> (Text -> Either String a) -> Parser a | |
attr = AttributeParser | |
text :: (Text -> Either String a) -> Parser a | |
text = TextParser | |
-------------------------------------------------------------------------------- | |
-- Very helpful error messages | |
instance Exception ParseError where | |
displayException = go 0 | |
where | |
go i = | |
\case | |
AllBranchesFailed es -> | |
("All of these parse branches failed:\n\n" <> | |
indent (i + 4) (intercalate "\n\n" (map (go (i + 4)) es))) | |
ParseError ctx reason -> | |
"In the context:\n\n" <> | |
indent (i+4) (showContext ctx) <> "\n\n" <> | |
showReason i ctx reason | |
indent :: Int -> String -> [Char] | |
indent n = intercalate "\n" . map (replicate n ' ' ++) . lines | |
showReason :: Int -> Context -> Reason -> [Char] | |
showReason i (Context _ el) = | |
\case | |
MissingIndexedNode i -> "Missing node at index " ++ show i ++ "\n\n" ++ | |
case mapMaybe | |
(\case | |
DOM.NodeElement element -> | |
Just (showTagName (DOM.elementName element)) | |
_ -> Nothing) | |
(DOM.elementNodes el) of | |
[] -> "There are no elements here!" | |
nodes -> | |
"Elements you can expect at this level:\n\n" ++ | |
indent (i + 4) (intercalate ", " nodes) | |
ExpectedElement name -> | |
"Expected element " ++ | |
showTagName name ++ | |
"\n\n" ++ | |
case mapMaybe | |
(\case | |
DOM.NodeElement element -> | |
Just (showTagName (DOM.elementName element)) | |
_ -> Nothing) | |
(DOM.elementNodes el) of | |
[] -> "There are no elements here!" | |
nodes -> | |
"Elements you can expect at this level:\n\n" ++ | |
indent (i + 4) (intercalate ", " nodes) | |
ExpectedAttribute name -> | |
"Expected attribute " ++ showName name ++ "=\"...\"" | |
AttributeParserFail name err -> | |
"Attribute " ++ | |
showName name ++ | |
"=\"...\" failed to parse,\n" ++ "with error:\n" ++ indent (i + 4) err | |
TextParsingFail tex err -> | |
"The text " ++ | |
show (ellipsis 40 tex) ++ | |
" failed to parse,\n" ++ "with error:\n\n" ++ indent (i + 4) err | |
where ellipsis n t = | |
T.take | |
n | |
(if T.length t > n | |
then t <> ".." | |
else t) | |
showName :: DOM.Name -> String | |
showName (DOM.Name {DOM.nameLocalName = ln}) = T.unpack ln | |
showContext :: Context -> [Char] | |
showContext (Context parents _elem) = | |
foldl | |
(\rents (this, i) -> | |
let midx = | |
case i of | |
Just i' -> " [" ++ show i' ++ "]" | |
Nothing -> "" | |
in showTag this ++ | |
midx ++ | |
"\n" ++ | |
(if not (null rents) | |
then indent 2 rents | |
else "")) | |
"" | |
parents | |
where | |
showTag :: Text -> [Char] | |
showTag = \name -> "<" ++ T.unpack name ++ ">" | |
showTagName :: DOM.Name -> [Char] | |
showTagName = \name -> "<" ++ showName name ++ ">" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment