shithub: MicroHs

ref: e1db93e902de05e48d3362d13f0b809b888b6dbc
dir: /lib/Text/Read/Lex.hs/

View raw version
{-# LANGUAGE Trustworthy #-}
{-# LANGUAGE NoImplicitPrelude #-}

-----------------------------------------------------------------------------
-- |
-- Module      :  GHC.Internal.Text.Read.Lex
-- Copyright   :  (c) The University of Glasgow 2002
-- License     :  BSD-style (see the file libraries/base/LICENSE)
--
-- Maintainer  :  libraries@haskell.org
-- Stability   :  provisional
-- Portability :  non-portable (uses Text.ParserCombinators.ReadP)
--
-- The cut-down Haskell lexer, used by GHC.Internal.Text.Read
--
-----------------------------------------------------------------------------

module Text.Read.Lex
  -- lexing types
  ( Lexeme(..), Number

  , numberToInteger, numberToFixed, numberToRational, numberToRangedRational

  -- lexer
  , lex, expect
  , hsLex
  , lexChar

  , readBinP
  , readIntP
  , readOctP
  , readDecP
  , readHexP

  , isSymbolChar
  )
 where
import Prelude()
import Control.Error
import Control.Monad
import Data.Char
import Data.Bool
import Data.Bounded
import Data.Eq
import Data.Function
import Data.Int
import Data.Integer
import Data.Integral
import Data.List
import Data.Maybe
import Data.Num
import Data.Ord
import Data.Ratio
import Data.Tuple
import Text.Show
import Text.ParserCombinators.ReadP
{-
import Control.Monad
import Data.Enum
import Data.List
import Data.Maybe
import Data.Real
-}
-- -----------------------------------------------------------------------------
-- Lexing types

-- ^ Haskell lexemes.
data Lexeme
  = Char   Char         -- ^ Character literal
  | String String       -- ^ String literal, with escapes interpreted
  | Punc   String       -- ^ Punctuation or reserved symbol, e.g. @(@, @::@
  | Ident  String       -- ^ Haskell identifier, e.g. @foo@, @Baz@
  | Symbol String       -- ^ Haskell symbol, e.g. @>>@, @:%@
  | Number Number       -- ^ @since base-4.6.0.0
  | EOF
 deriving ( Eq   -- ^ @since base-2.01
          , Show -- ^ @since base-2.01
          )

-- | @since base-4.6.0.0
data Number = MkNumber Int              -- Base
                       Digits           -- Integral part
            | MkDecimal Digits          -- Integral part
                        (Maybe Digits)  -- Fractional part
                        (Maybe Integer) -- Exponent
 deriving ( Eq   -- ^ @since base-4.6.0.0
          , Show -- ^ @since base-4.6.0.0
          )

-- | @since base-4.5.1.0
numberToInteger :: Number -> Maybe Integer
numberToInteger (MkNumber base iPart) = Just (val (fromIntegral base) iPart)
numberToInteger (MkDecimal iPart Nothing Nothing) = Just (val 10 iPart)
numberToInteger _ = Nothing

-- | @since base-4.7.0.0
numberToFixed :: Integer -> Number -> Maybe (Integer, Integer)
numberToFixed _ (MkNumber base iPart) = Just (val (fromIntegral base) iPart, 0)
numberToFixed _ (MkDecimal iPart Nothing Nothing) = Just (val 10 iPart, 0)
numberToFixed p (MkDecimal iPart (Just fPart) Nothing)
    = let i = val 10 iPart
          f = val 10 (integerTake p (fPart ++ repeat 0))
          -- Sigh, we really want genericTake, but that's above us in
          -- the hierarchy, so we define our own version here (actually
          -- specialised to Integer)
          integerTake             :: Integer -> [a] -> [a]
          integerTake n _ | n <= 0 = []
          integerTake _ []        =  []
          integerTake n (x:xs)    =  x : integerTake (n-1) xs
      in Just (i, f)
numberToFixed _ _ = Nothing

-- This takes a floatRange, and if the Rational would be outside of
-- the floatRange then it may return Nothing. Not that it will not
-- /necessarily/ return Nothing, but it is good enough to fix the
-- space problems in #5688
-- Ways this is conservative:
-- * the floatRange is in base 2, but we pretend it is in base 10
-- * we pad the floatRange a bit, just in case it is very small
--   and we would otherwise hit an edge case
-- * We only worry about numbers that have an exponent. If they don't
--   have an exponent then the Rational won't be much larger than the
--   Number, so there is no problem
-- | @since base-4.5.1.0
numberToRangedRational :: (Int, Int) -> Number
                       -> Maybe Rational -- Nothing = Inf
numberToRangedRational (neg, pos) n@(MkDecimal iPart mFPart (Just exp))
    -- Calculate amount to increase/decrease the exponent, based on (non
    -- leading zero) places in the iPart, or leading zeros in the fPart.
    -- If iPart and fPart are all zeros, return Nothing.
    = let mFirstDigit = case dropWhile (0 ==) iPart of
                        iPart'@(_ : _) -> Just (length iPart')
                        [] -> case mFPart of
                              Nothing -> Nothing
                              Just fPart ->
                                  case span (0 ==) fPart of
                                  (_, []) -> Nothing
                                  (zeroes, _) ->
                                      Just (negate (length zeroes))
      in case mFirstDigit of
         Nothing -> Just 0
         Just firstDigit ->
             -- compare exp to bounds as Integer to avoid over/underflow
             let firstDigit' = toInteger firstDigit + exp
             in if firstDigit' > toInteger (pos + 3)
                then Nothing
                else if firstDigit' < toInteger (neg - 3)
                then Just 0
                else Just (numberToRational n)
numberToRangedRational _ n = Just (numberToRational n)

-- | @since base-4.6.0.0
numberToRational :: Number -> Rational
numberToRational (MkNumber base iPart) = val (fromIntegral base) iPart % 1
numberToRational (MkDecimal iPart mFPart mExp)
 = let i = val 10 iPart
   in case (mFPart, mExp) of
      (Nothing, Nothing)     -> i % 1
      (Nothing, Just exp)
       | exp >= 0            -> (i * (10 ^ exp)) % 1
       | otherwise           -> i % (10 ^ (- exp))
      (Just fPart, Nothing)  -> fracExp 0   i fPart
      (Just fPart, Just exp) -> fracExp exp i fPart
      -- fracExp is a bit more efficient in calculating the Rational.
      -- Instead of calculating the fractional part alone, then
      -- adding the integral part and finally multiplying with
      -- 10 ^ exp if an exponent was given, do it all at once.

-- -----------------------------------------------------------------------------
-- Lexing

lex :: ReadP Lexeme
lex = skipSpaces >> lexToken

-- | @since base-4.7.0.0
expect :: Lexeme -> ReadP ()
expect lexeme = do { skipSpaces
                   ; thing <- lexToken
                   ; if thing == lexeme then return () else pfail }

hsLex :: ReadP String
-- ^ Haskell lexer: returns the lexed string, rather than the lexeme
hsLex = do skipSpaces
           (s,_) <- gather lexToken
           return s

lexToken :: ReadP Lexeme
lexToken = lexEOF     +++
           lexLitChar +++
           lexString  +++
           lexPunc    +++
           lexSymbol  +++
           lexId      +++
           lexNumber


-- ----------------------------------------------------------------------
-- End of file
lexEOF :: ReadP Lexeme
lexEOF = do s <- look
            guard (null s)
            return EOF

-- ---------------------------------------------------------------------------
-- Single character lexemes

lexPunc :: ReadP Lexeme
lexPunc =
  do c <- satisfy isPuncChar
     return (Punc [c])

-- | The @special@ character class as defined in the Haskell Report.
isPuncChar :: Char -> Bool
isPuncChar c = c `elem` ",;()[]{}`"

-- ----------------------------------------------------------------------
-- Symbols

lexSymbol :: ReadP Lexeme
lexSymbol =
  do s <- munch1 isSymbolChar
     if s `elem` reserved_ops then
        return (Punc s)         -- Reserved-ops count as punctuation
      else
        return (Symbol s)
  where
    reserved_ops   = ["..", "::", "=", "\\", "|", "<-", "->", "@", "~", "=>"] :: [String]

{-
isSymbolChar :: Char -> Bool
isSymbolChar c = not (isPuncChar c) && case generalCategory c of
    MathSymbol              -> True
    CurrencySymbol          -> True
    ModifierSymbol          -> True
    OtherSymbol             -> True
    DashPunctuation         -> True
    OtherPunctuation        -> not (c `elem` "'\"")
    ConnectorPunctuation    -> c /= '_'
    _                       -> False
-}
isSymbolChar :: Char -> Bool
isSymbolChar c = c `elem` ("!@#$%&?+./<=>?\\^|:-~"::String)

-- ----------------------------------------------------------------------
-- identifiers

lexId :: ReadP Lexeme
lexId = do c <- satisfy isIdsChar
           s <- munch isIdfChar
           return (Ident (c:s))
  where
          -- Identifiers can start with a '_'
    isIdsChar c = isAlpha c || c == '_'
    isIdfChar c = isAlphaNum c || c `elem` "_'"

-- ---------------------------------------------------------------------------
-- Lexing character literals

lexLitChar :: ReadP Lexeme
lexLitChar =
  do _ <- char '\''
     (c,esc) <- lexCharE
     guard (esc || c /= '\'')   -- Eliminate '' possibility
     _ <- char '\''
     return (Char c)

lexChar :: ReadP Char
lexChar = do { (c,_) <- lexCharE; consumeEmpties; return c }
    where
    -- Consumes the string "\&" repeatedly and greedily (will only produce one match)
    consumeEmpties :: ReadP ()
    consumeEmpties = do
        rest <- look
        case rest of
            ('\\':'&':_) -> string "\\&" >> consumeEmpties
            _ -> return ()


lexCharE :: ReadP (Char, Bool)  -- "escaped or not"?
lexCharE =
  do c1 <- get
     if c1 == '\\'
       then do c2 <- lexEsc; return (c2, True)
       else return (c1, False)
 where
  lexEsc =
    lexEscChar
      +++ lexNumeric
        +++ lexCntrlChar
          +++ lexAscii

  lexEscChar =
    do c <- get
       case c of
         'a'  -> return '\a'
         'b'  -> return '\b'
         'f'  -> return '\f'
         'n'  -> return '\n'
         'r'  -> return '\r'
         't'  -> return '\t'
         'v'  -> return '\v'
         '\\' -> return '\\'
         '\"' -> return '\"'
         '\'' -> return '\''
         _    -> pfail

  lexNumeric =
    do base <- lexBaseChar <++ return 10
       n    <- lexInteger base
       guard (n <= toInteger (ord maxBound))
       return (chr (fromInteger n))

  lexCntrlChar =
    do _ <- char '^'
       c <- get
       case c of
         '@'  -> return '\^@'
         'A'  -> return '\^A'
         'B'  -> return '\^B'
         'C'  -> return '\^C'
         'D'  -> return '\^D'
         'E'  -> return '\^E'
         'F'  -> return '\^F'
         'G'  -> return '\^G'
         'H'  -> return '\^H'
         'I'  -> return '\^I'
         'J'  -> return '\^J'
         'K'  -> return '\^K'
         'L'  -> return '\^L'
         'M'  -> return '\^M'
         'N'  -> return '\^N'
         'O'  -> return '\^O'
         'P'  -> return '\^P'
         'Q'  -> return '\^Q'
         'R'  -> return '\^R'
         'S'  -> return '\^S'
         'T'  -> return '\^T'
         'U'  -> return '\^U'
         'V'  -> return '\^V'
         'W'  -> return '\^W'
         'X'  -> return '\^X'
         'Y'  -> return '\^Y'
         'Z'  -> return '\^Z'
         '['  -> return '\^['
         '\\' -> return '\^\'
         ']'  -> return '\^]'
         '^'  -> return '\^^'
         '_'  -> return '\^_'
         _    -> pfail

  lexAscii =
     choice
         [ (string "SOH" >> return '\SOH') <++
           (string "SO"  >> return '\SO')
                -- \SO and \SOH need maximal-munch treatment
                -- See the Haskell report Sect 2.6

         , string "NUL" >> return '\NUL'
         , string "STX" >> return '\STX'
         , string "ETX" >> return '\ETX'
         , string "EOT" >> return '\EOT'
         , string "ENQ" >> return '\ENQ'
         , string "ACK" >> return '\ACK'
         , string "BEL" >> return '\BEL'
         , string "BS"  >> return '\BS'
         , string "HT"  >> return '\HT'
         , string "LF"  >> return '\LF'
         , string "VT"  >> return '\VT'
         , string "FF"  >> return '\FF'
         , string "CR"  >> return '\CR'
         , string "SI"  >> return '\SI'
         , string "DLE" >> return '\DLE'
         , string "DC1" >> return '\DC1'
         , string "DC2" >> return '\DC2'
         , string "DC3" >> return '\DC3'
         , string "DC4" >> return '\DC4'
         , string "NAK" >> return '\NAK'
         , string "SYN" >> return '\SYN'
         , string "ETB" >> return '\ETB'
         , string "CAN" >> return '\CAN'
         , string "EM"  >> return '\EM'
         , string "SUB" >> return '\SUB'
         , string "ESC" >> return '\ESC'
         , string "FS"  >> return '\FS'
         , string "GS"  >> return '\GS'
         , string "RS"  >> return '\RS'
         , string "US"  >> return '\US'
         , string "SP"  >> return '\SP'
         , string "DEL" >> return '\DEL'
         ]


-- ---------------------------------------------------------------------------
-- string literal

lexString :: ReadP Lexeme
lexString =
  do _ <- char '"'
     body id
 where
  body f =
    do (c,esc) <- lexStrItem
       if c /= '"' || esc
         then body (f.(c:))
         else let s = f "" in
              return (String s)

  lexStrItem = (lexEmpty >> lexStrItem)
               +++ lexCharE

  lexEmpty =
    do _ <- char '\\'
       c <- get
       case c of
         '&'           -> return ()
         _ | isSpace c -> do skipSpaces; _ <- char '\\'; return ()
         _             -> pfail

-- ---------------------------------------------------------------------------
--  Lexing numbers

type Base   = Int
type Digits = [Int]

lexNumber :: ReadP Lexeme
lexNumber
  = lexHexOct  <++      -- First try for hex or octal 0x, 0o etc
                        -- If that fails, try for a decimal number
    lexDecNumber        -- Start with ordinary digits

lexHexOct :: ReadP Lexeme
lexHexOct
  = do  _ <- char '0'
        base <- lexBaseChar
        digits <- lexDigits base
        return (Number (MkNumber base digits))

lexBaseChar :: ReadP Int
-- Lex a single character indicating the base; fail if not there
lexBaseChar = do
  c <- get
  case c of
    'b' -> return 2
    'B' -> return 2
    'o' -> return 8
    'O' -> return 8
    'x' -> return 16
    'X' -> return 16
    _   -> pfail

lexDecNumber :: ReadP Lexeme
lexDecNumber =
  do xs    <- lexDigits 10
     mFrac <- lexFrac <++ return Nothing
     mExp  <- lexExp  <++ return Nothing
     return (Number (MkDecimal xs mFrac mExp))

lexFrac :: ReadP (Maybe Digits)
-- Read the fractional part; fail if it doesn't
-- start ".d" where d is a digit
lexFrac = do _ <- char '.'
             fraction <- lexDigits 10
             return (Just fraction)

lexExp :: ReadP (Maybe Integer)
lexExp = do _ <- char 'e' +++ char 'E'
            exp <- signedExp +++ lexInteger 10
            return (Just exp)
 where
   signedExp
     = do c <- char '-' +++ char '+'
          n <- lexInteger 10
          return (if c == '-' then -n else n)

lexDigits :: Int -> ReadP Digits
-- Lex a non-empty sequence of digits in specified base
lexDigits base =
  do s  <- look
     xs <- scan s id
     guard (not (null xs))
     return xs
 where
  scan (c:cs) f = case valDig base c of
                    Just n  -> do _ <- get; scan cs (f.(n:))
                    Nothing -> return (f [])
  scan []     f = return (f [])

lexInteger :: Base -> ReadP Integer
lexInteger base =
  do xs <- lexDigits base
     return (val (fromIntegral base) xs)

val :: Num a => a -> Digits -> a
val = valSimple
{-# RULES
"val/Integer" val = valInteger
  #-}
{-# INLINE [1] val #-}

-- The following algorithm is only linear for types whose Num operations
-- are in constant time.
valSimple :: (Num a, Integral d) => a -> [d] -> a
valSimple base = go 0
  where
    go r [] = r
    go r (d : ds) = r' `seq` go r' ds
      where
        r' = r * base + fromIntegral d
{-# INLINE valSimple #-}

-- A sub-quadratic algorithm for Integer. Pairs of adjacent radix b
-- digits are combined into a single radix b^2 digit. This process is
-- repeated until we are left with a single digit. This algorithm
-- performs well only on large inputs, so we use the simple algorithm
-- for smaller inputs.
valInteger :: Integer -> Digits -> Integer
valInteger b0 ds0 = go b0 (length ds0) $ map fromIntegral ds0
  where
    go _ _ []  = 0
    go _ _ [d] = d
    go b l ds
        | l > 40 = b' `seq` go b' l' (combine b ds')
        | otherwise = valSimple b ds
      where
        -- ensure that we have an even number of digits
        -- before we call combine:
        ds' = if even l then ds else 0 : ds
        b' = b * b
        l' = (l + 1) `quot` 2
    combine b (d1 : d2 : ds) = d `seq` (d : combine b ds)
      where
        d = d1 * b + d2
    combine _ []  = []
    combine _ [_] = errorWithoutStackTrace "this should not happen"

-- Calculate a Rational from the exponent [of 10 to multiply with],
-- the integral part of the mantissa and the digits of the fractional
-- part. Leaving the calculation of the power of 10 until the end,
-- when we know the effective exponent, saves multiplications.
-- More importantly, this way we need at most one gcd instead of three.
--
-- frac was never used with anything but Integer and base 10, so
-- those are hardcoded now (trivial to change if necessary).
fracExp :: Integer -> Integer -> Digits -> Rational
fracExp exp mant []
  | exp < 0     = mant % (10 ^ (-exp))
  | otherwise   = fromInteger (mant * 10 ^ exp)
fracExp exp mant (d:ds) = exp' `seq` mant' `seq` fracExp exp' mant' ds
  where
    exp'  = exp - 1
    mant' = mant * 10 + fromIntegral d

valDig :: (Eq a, Num a) => a -> Char -> Maybe Int
valDig 2 c
  | '0' <= c && c <= '1' = Just (ord c - ord '0')
  | otherwise            = Nothing

valDig 8 c
  | '0' <= c && c <= '7' = Just (ord c - ord '0')
  | otherwise            = Nothing

valDig 10 c = valDecDig c

valDig 16 c
  | '0' <= c && c <= '9' = Just (ord c - ord '0')
  | 'a' <= c && c <= 'f' = Just (ord c - ord 'a' + 10)
  | 'A' <= c && c <= 'F' = Just (ord c - ord 'A' + 10)
  | otherwise            = Nothing

valDig _ _ = errorWithoutStackTrace "valDig: Bad base"

valDecDig :: Char -> Maybe Int
valDecDig c
  | '0' <= c && c <= '9' = Just (ord c - ord '0')
  | otherwise            = Nothing

-- ----------------------------------------------------------------------
-- other numeric lexing functions

readIntP :: Num a => a -> (Char -> Bool) -> (Char -> Int) -> ReadP a
readIntP base isDigit valDigit =
  do s <- munch1 isDigit
     return (val base (map valDigit s))
{-# SPECIALISE readIntP
        :: Integer -> (Char -> Bool) -> (Char -> Int) -> ReadP Integer #-}

readIntP' :: (Eq a, Num a) => a -> ReadP a
readIntP' base = readIntP base isDigit valDigit
 where
  isDigit  c = maybe False (const True) (valDig base c)
  valDigit c = maybe 0     id           (valDig base c)
{-# SPECIALISE readIntP' :: Integer -> ReadP Integer #-}

readBinP, readOctP, readDecP, readHexP :: (Eq a, Num a) => ReadP a
readBinP = readIntP' 2
readOctP = readIntP' 8
readDecP = readIntP' 10
readHexP = readIntP' 16
{-# SPECIALISE readBinP :: ReadP Integer #-}
{-# SPECIALISE readOctP :: ReadP Integer #-}
{-# SPECIALISE readDecP :: ReadP Integer #-}
{-# SPECIALISE readHexP :: ReadP Integer #-}