Last active
July 20, 2024 20:24
-
-
Save matthewbauer/6bd1cc57bd32860b36b0e0c949310eb2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# LANGUAGE DerivingStrategies #-} | |
{-# LANGUAGE FlexibleContexts #-} | |
{-# LANGUAGE NumericUnderscores #-} | |
{-# LANGUAGE OverloadedStrings #-} | |
module Text.Megaparsec.ParseNumWordsEn (parseNumWordsEn, showNumWordsEn, test) where | |
import Control.Exception hiding (try) | |
import Control.Monad | |
import Data.CaseInsensitive (FoldCase) | |
import Data.Int | |
import Data.List (foldl') | |
import Data.Maybe | |
import Data.String | |
import Data.Text (Text) | |
import Data.Void | |
import Data.Word | |
import Prelude | |
import System.Random | |
import Text.Megaparsec | |
import Text.Megaparsec.Byte | |
type MyToken s = (Stream s, FoldCase (Tokens s), IsString (Tokens s)) | |
-- | List of names to use for 10^3n parsing and the value of n. | |
ions :: [(String, Word8)] | |
ions = [ ("thousand", 1) | |
, ("million", 2) | |
, ("billion", 3) | |
, ("trillion", 4) | |
, ("quadrillion", 5) | |
, ("quintillion", 6) | |
] | |
-- | Parser for the names of numbers in English. | |
-- For example, "five hundred fifty five" would become 555. | |
-- | |
-- This supports: | |
-- | |
-- - supports input of Text, ByteString, and String | |
-- - can parse up to 10^21-1 numbers (one sextillion minus one) | |
-- - can output any instance of Fractional (which is any rational number) | |
-- - can also parse phone number style like "nine oh two one oh" or "five five five five five five five" | |
-- - can parse "negative" prefix | |
parseNumWordsEn :: (MyToken s, Fractional a) => Parsec Void s a | |
parseNumWordsEn = do | |
negative <- fmap isJust <$> optional $ string' "negative" | |
skipFormatting | |
number <- allDigits <|> foldl' goion l1000r ions | |
eof | |
pure $ if negative then -number else number | |
where | |
-- | Skip any formatting like whitespace or dashes or commas. | |
skipFormatting :: MyToken s => Parsec Void s () | |
skipFormatting = skipMany (string " " <|> string "-" <|> string ",") <?> "formatting" | |
-- | Match 1-9 whole numbers. | |
posDigits :: MyToken s => Parsec Void s Word8 | |
posDigits = (1 <$ string' "one") | |
<|> (2 <$ string' "two") | |
<|> (3 <$ string' "three") | |
<|> (4 <$ string' "four") | |
<|> (5 <$ string' "five") | |
<|> (6 <$ string' "six") | |
<|> (7 <$ string' "seven") | |
<|> (8 <$ string' "eight") | |
<|> (9 <$ string' "nine") | |
-- | Match zero, including some unusual uses like "oh" and "none". | |
zero :: MyToken s => Parsec Void s Word8 | |
zero = 0 <$ (string' "zero" <|> string' "oh" <|> string' "none" <|> string' "no" <|> string' "nil" <|> string "nothing") <?> "zero" | |
-- | Match 0-9 whole numbers. | |
l10 :: MyToken s => Parsec Void s Word8 | |
l10 = zero <|> posDigits | |
-- | Match 10-19 whole numbers. | |
teens :: MyToken s => Parsec Void s Word8 | |
teens = (10 <$ string' "ten") | |
<|> (11 <$ string' "eleven") | |
<|> (12 <$ string' "twelve") | |
<|> (13 <$ string' "thirteen") | |
<|> (14 <$ string' "fourteen") | |
<|> (15 <$ string' "fifteen") | |
<|> (16 <$ string' "sixteen") | |
<|> (17 <$ string' "seventeen") | |
<|> (18 <$ string' "eighteen") | |
<|> (19 <$ string' "nineteen") | |
-- | Match <20 non-negative whole numbers. | |
l20 :: MyToken s => Parsec Void s Word8 | |
l20 = teens <|> l10 | |
-- | Match 20-99 prefixes like "twenty". | |
tys :: MyToken s => Parsec Void s Word8 | |
tys = (20 <$ string' "twenty") | |
<|> (30 <$ string' "thirty") | |
<|> (40 <$ string' "forty") | |
<|> (50 <$ string' "fifty") | |
<|> (60 <$ string' "sixty") | |
<|> (70 <$ string' "seventy") | |
<|> (80 <$ string' "eighty") | |
<|> (90 <$ string' "ninety") | |
-- | Match 20-99 whole numbers. | |
tens :: MyToken s => Parsec Void s Word8 | |
tens = do | |
prefix <- tys | |
skipFormatting | |
mSuffix <- optional posDigits | |
pure $ prefix + fromMaybe 0 mSuffix | |
-- | Match <100 non-negative whole numbers. | |
l100 :: MyToken s => Parsec Void s Word8 | |
l100 = tens <|> l20 | |
-- | Match 100-999 whole numbers. | |
hundreds :: MyToken s => Parsec Void s Word16 | |
hundreds = do | |
prefix <- try $ do | |
prefix <- fromIntegral <$> l100 -- could be l10, but want to support "nineteen hundred twenty two" | |
skipFormatting | |
_ <- string' "hundred" | |
pure prefix | |
skipFormatting | |
_ <- optional $ do | |
_ <- string' "and" | |
skipFormatting | |
mSuffix <- optional $ fromIntegral <$> l100 | |
pure $ prefix * 100 + fromMaybe 0 mSuffix | |
-- | Match decimal place. | |
fractional :: (MyToken s, Fractional a) => Parsec Void s a | |
fractional = do | |
_ <- string' "point" | |
skipFormatting | |
xs <- some $ do | |
x <- fromIntegral <$> l10 | |
skipFormatting | |
pure x | |
pure $ foldl' (\a (b, n) -> a + b / (10 ^ n)) 0 (zip xs ([1..] :: [Integer])) | |
-- | Match <1000 non-negative rational numbers. | |
l1000r :: (MyToken s, Fractional a) => Parsec Void s a | |
l1000r = do | |
number <- fromIntegral <$> hundreds <|> fromIntegral <$> l100 | |
skipFormatting | |
mFractional <- optional fractional | |
pure $ number + fromMaybe 0 mFractional | |
-- | Match >= 10^(3(n-1)) and < 10^3n rational numbers. | |
l1000nr :: (MyToken s, Fractional a) => Parsec Void s a -> String -> Word8 -> Parsec Void s a | |
l1000nr pSmaller name n = do | |
prefix <- try $ do | |
prefix <- l1000r | |
skipFormatting | |
_ <- string' (fromString name) | |
pure prefix | |
skipFormatting | |
_ <- optional $ do | |
_ <- string' "and" | |
skipFormatting | |
mSuffix <- optional $ do | |
suffix <- pSmaller | |
skipFormatting | |
pure suffix | |
mFractional <- optional fractional | |
pure $ prefix * 10 ^ (3 * n) + fromMaybe 0 mSuffix + fromMaybe 0 mFractional | |
-- | Process one iteration of "ion" suffix. | |
goion a (name, n) = (l1000nr a name n <?> (name <> " cardinal")) <|> a | |
-- | Parser for all digits words like "four oh five". | |
allDigits = do | |
xs <- try $ do | |
x1 <- l10 | |
skipFormatting | |
xs <- some $ do -- need at least two l10’s, otherwise we’d take from normal numbers | |
x <- l10 | |
skipFormatting | |
pure x | |
pure $ fromIntegral <$> (x1 : xs) | |
mFractional <- optional fractional | |
pure $ foldl' (\a (b, n) -> a + b * (10 ^ n)) 0 (zip (reverse xs) ([0..] :: [Integer])) + fromMaybe 0 mFractional | |
-- | Convert provided number into English words. | |
-- | |
-- So 101 becomes "one hundred one". | |
-- | |
-- This supports: | |
-- - numbers up to 10^21-1 (one sextillion minus one) | |
-- - can create input of Text, ByteString, and String | |
-- - supports negative numbers | |
showNumWordsEn :: (RealFrac a, Show a, IsString b, Monoid b) => a -> b | |
showNumWordsEn = go | |
where | |
l20 :: IsString a => Word8 -> a | |
l20 0 = "zero" | |
l20 1 = "one" | |
l20 2 = "two" | |
l20 3 = "three" | |
l20 4 = "four" | |
l20 5 = "five" | |
l20 6 = "six" | |
l20 7 = "seven" | |
l20 8 = "eight" | |
l20 9 = "nine" | |
l20 10 = "ten" | |
l20 11 = "eleven" | |
l20 12 = "twelve" | |
l20 13 = "thirteen" | |
l20 14 = "fourteen" | |
l20 15 = "fifteen" | |
l20 16 = "sixteen" | |
l20 17 = "seventeen" | |
l20 18 = "eighteen" | |
l20 19 = "nineteen" | |
l20 n = error $ "l20: unexpected number >=20: " <> show n | |
fractional :: (RealFrac a, Show a, IsString b, Monoid b) => a -> b | |
fractional n | n < 0 = error $ "fractional: unexpected negative number: " <> show n | |
fractional n | n >= 1 = error $ "fractional: unexpected number >=1: " <> show n | |
fractional n = | |
let (x, y) = properFraction (n * 10) | |
in l20 x <> if y == 0 then mempty else " " <> fractional y | |
tys :: IsString a => Word8 -> a | |
tys 2 = "twenty" | |
tys 3 = "thirty" | |
tys 4 = "forty" | |
tys 5 = "fifty" | |
tys 6 = "sixty" | |
tys 7 = "seventy" | |
tys 8 = "eighty" | |
tys 9 = "ninety" | |
tys x = error $ "tys: unexpected number less than 2 or greater than 9: " <> show x | |
l100 :: (IsString a, Monoid a) => Word8 -> a | |
l100 n | n < 20 = l20 n | |
l100 n | n < 100 = | |
let (x, y) = n `quotRem` 10 | |
in tys x <> (if y == 0 then mempty else "-" <> l20 y) | |
l100 x = error $ "l100: unexpected number >= 100: " <> show x | |
l1000 :: (IsString a, Monoid a) => Word16 -> a | |
l1000 n | n < 100 = l100 (fromIntegral n) | |
l1000 n | n < 1000 = | |
let (x, y) = n `quotRem` 100 | |
in l20 (fromIntegral x) <> " hundred" <> if y == 0 then mempty else " " <> l100 (fromIntegral y) | |
l1000 x = error $ "l10000: unexpected number >= 1000: " <> show x | |
l1000n :: (IsString a, Monoid a) => [(String, Word8)] -> Integer -> a | |
l1000n _ n | n < 0 = error $ "l1000n: unexpected negative number: " <> show n | |
l1000n _ n | n < 1000 = l1000 (fromIntegral n) | |
l1000n ((name, e) : xs) n | n < 10 ^ (3 * (e + 1)) = | |
let (x, y) = n `quotRem` (10 ^ (3 * e)) | |
in if x == 0 | |
then l1000n xs y | |
else l1000 (fromIntegral x) <> " " <> fromString name <> (if y == 0 then mempty else " and " <> l1000n xs y) | |
l1000n ((_, e) : _) x = error $ "l1000n: unexpected number >= 10^" <> show (3 * (e + 1)) <> ": " <> show x | |
l1000n _ x = error $ "l1000n: unexpected number: " <> show x | |
go n | n < 0 = "negative " <> go (-n) | |
go n = let (x, y) = properFraction n | |
in l1000n (reverse ions) x <> (if y == 0 then mempty else " point " <> fractional y) | |
------------------------------------------------------------------------------------------------------------ | |
data TestError = TestError { testExpected :: Double, testGot :: Double} | |
deriving stock Show | |
instance Exception TestError | |
test :: IO () | |
test = do | |
let testParse :: Text -> Double -> IO () | |
testParse input expected = do | |
putStrLn $ "testing " <> show input | |
case parse parseNumWordsEn "" input of | |
Right res | res == expected -> do | |
putStrLn $ " => got " <> show res | |
pure () | |
Right got -> throwIO $ TestError expected got | |
Left err -> do | |
putStrLn $ errorBundlePretty err | |
throwIO err | |
testParse "none" 0 | |
testParse "nineteen" 19 | |
testParse "twenty" 20 | |
testParse "ninetynine" 99 | |
testParse "fiftyfive" 55 | |
testParse "one-hundred" 100 | |
testParse "one hundred fifty-five" 155 | |
testParse "fifty-five" 55 | |
testParse "one thousand two hundred forty four" 1244 | |
testParse "fifty five point five" 55.5 | |
testParse "four million five hundred twentytwo" 4_000_522.0 | |
testParse "seventy six" 76 | |
testParse "nineteen hundred forty-four" 1944 | |
testParse "Seventy Thousand and one" 70_001 | |
testParse "Negative five" (-5) | |
testParse "Five Billion And Five Hundred And Fifty Five Million And Five Hundred And Fifty Five Thousand And Five Hundred And Fifty Five" 5_555_555_555 | |
testParse "Ten point five five five" 10.555 | |
testParse "four oh five" 405 | |
testParse "five five five" 555 | |
testParse "five five" 55 | |
testParse "five point five five five five" 5.555_5 | |
testParse "nineteen hundred twenty two" 1922 | |
testParse "nine oh two one oh" 90_210 | |
testParse "one hundred and one" 101 | |
testParse "five point five million" 5_500_000 | |
testParse "five point five million point five" 5_500_000.5 | |
replicateM_ 10 $ do | |
-- Testing with Random Double doesn’t seem to work because we get rounding issues. | |
n <- fromIntegral <$> (randomIO :: IO Int32) | |
putStrLn $ "roundtrip " <> show n | |
let str :: Text = showNumWordsEn n | |
putStrLn $ " => showNumWordsEn " <> show n <> " = " <> show str | |
case parse parseNumWordsEn "" str of | |
Right res -> do | |
putStrLn $ " => parseNumWordsEn " <> show str <> " = " <> show res | |
when (n /= res) $ throwIO $ TestError n res | |
Left err -> do | |
putStrLn $ errorBundlePretty err | |
throwIO err |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment