Skip to content

Commit 83b0cd0

Browse files
committed
Add lexeme IDs during parsing rather than computing them from source spans later.
1 parent 9b1d22e commit 83b0cd0

8 files changed

+163
-463
lines changed

dex.cabal

+1-1
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,6 @@ library
9191
, SourceRename
9292
, TopLevel
9393
, Transpose
94-
, TraverseSourceInfo
9594
, Types.Core
9695
, Types.Imp
9796
, Types.Misc
@@ -143,6 +142,7 @@ library
143142
if flag(live)
144143
build-depends: binary
145144
, blaze-html
145+
, blaze-markup
146146
, cmark
147147
, http-types
148148
, wai

src/lib/ConcreteSyntax.hs

+8-7
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ parseUModule name s = do
6161
{-# SCC parseUModule #-}
6262

6363
preludeImportBlock :: SourceBlock
64-
preludeImportBlock = SourceBlock 0 0 LogNothing "" $ Misc $ ImportModule Prelude
64+
preludeImportBlock = SourceBlock 0 0 LogNothing "" mempty (Misc $ ImportModule Prelude)
6565

6666
sourceBlocks :: Parser [SourceBlock]
6767
sourceBlocks = manyTill (sourceBlock <* outputLines) eof
@@ -108,11 +108,12 @@ sourceBlock :: Parser SourceBlock
108108
sourceBlock = do
109109
offset <- getOffset
110110
pos <- getSourcePos
111-
(src, (level, b)) <- withSource $ withRecovery recover $ do
111+
(src, (sm, (level, b))) <- withSource $ withSourceMaps $ withRecovery recover do
112112
level <- logLevel <|> logTime <|> logBench <|> return LogNothing
113113
b <- sourceBlock'
114114
return (level, b)
115-
return $ SourceBlock (unPos (sourceLine pos)) offset level src b
115+
let sm' = sm { lexemeInfo = lexemeInfo sm <&> \(t, (l, r)) -> (t, (l-offset, r-offset))}
116+
return $ SourceBlock (unPos (sourceLine pos)) offset level src sm' b
116117

117118
recover :: ParseError Text Void -> Parser (LogLevel, SourceBlock')
118119
recover e = do
@@ -154,7 +155,7 @@ consumeTillBreak = void $ manyTill anySingle $ eof <|> void (try (eol >> eol))
154155

155156
logLevel :: Parser LogLevel
156157
logLevel = do
157-
void $ try $ lexeme $ char '%' >> string "passes"
158+
void $ try $ lexeme MiscLexeme $ char '%' >> string "passes"
158159
passes <- many passName
159160
eol
160161
case passes of
@@ -163,13 +164,13 @@ logLevel = do
163164

164165
logTime :: Parser LogLevel
165166
logTime = do
166-
void $ try $ lexeme $ char '%' >> string "time"
167+
void $ try $ lexeme MiscLexeme $ char '%' >> string "time"
167168
eol
168169
return PrintEvalTime
169170

170171
logBench :: Parser LogLevel
171172
logBench = do
172-
void $ try $ lexeme $ char '%' >> string "bench"
173+
void $ try $ lexeme MiscLexeme $ char '%' >> string "bench"
173174
benchName <- strLit
174175
eol
175176
return $ PrintBench benchName
@@ -391,7 +392,7 @@ immediateLParen = label "'(' (without preceding whitespace)" do
391392
nextChar >>= \case
392393
'(' -> precededByWhitespace >>= \case
393394
True -> empty
394-
False -> charLexeme '('
395+
False -> lParen
395396
_ -> empty
396397

397398
immediateParens :: Parser a -> Parser a

src/lib/Lexing.hs

+50-19
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import Data.Text (Text)
1515
import Data.Text qualified as T
1616
import Data.Void
1717
import Data.Word
18+
import qualified Data.Map.Strict as M
1819

1920
import Text.Megaparsec hiding (Label, State)
2021
import Text.Megaparsec.Char hiding (space, eol)
@@ -25,15 +26,18 @@ import Text.Megaparsec.Debug
2526
import Err
2627
import SourceInfo
2728
import Types.Primitives
29+
import Types.Source
30+
import Util (toSnocList)
2831

2932
data ParseCtx = ParseCtx
3033
{ curIndent :: Int -- used Reader-style (i.e. ask/local)
3134
, canBreak :: Bool -- used Reader-style (i.e. ask/local)
3235
, prevWhitespace :: Bool -- tracks whether we just consumed whitespace
33-
}
36+
, sourceIdCounter :: Int
37+
, curSourceMap :: SourceMaps } -- append to, writer-style
3438

3539
initParseCtx :: ParseCtx
36-
initParseCtx = ParseCtx 0 False False
40+
initParseCtx = ParseCtx 0 False False 0 mempty
3741

3842
type Parser = StateT ParseCtx (Parsec Void Text)
3943

@@ -64,7 +68,7 @@ nextChar = do
6468
{-# INLINE nextChar #-}
6569

6670
anyCaseName :: Lexer SourceName
67-
anyCaseName = label "name" $ lexeme $
71+
anyCaseName = label "name" $ lexeme LowerName $ -- TODO: distinguish lowercase/uppercase
6872
checkNotKeyword $ (:) <$> satisfy (\c -> isLower c || isUpper c) <*>
6973
(T.unpack <$> takeWhileP Nothing (\c -> isAlphaNum c || c == '\'' || c == '_'))
7074

@@ -121,7 +125,7 @@ keyWordToken = \case
121125
PassKW -> "pass"
122126

123127
keyWord :: KeyWord -> Lexer ()
124-
keyWord kw = lexeme $ try $ string (fromString $ keyWordToken kw)
128+
keyWord kw = lexeme Keyword $ try $ string (fromString $ keyWordToken kw)
125129
>> notFollowedBy nameTailChar
126130

127131
keyWordSet :: HS.HashSet String
@@ -131,19 +135,19 @@ keyWordStrs :: [String]
131135
keyWordStrs = map keyWordToken [DefKW .. PassKW]
132136

133137
primName :: Lexer String
134-
primName = lexeme $ try $ char '%' >> some alphaNumChar
138+
primName = lexeme MiscLexeme $ try $ char '%' >> some alphaNumChar
135139

136140
charLit :: Lexer Char
137-
charLit = lexeme $ char '\'' >> L.charLiteral <* char '\''
141+
charLit = lexeme MiscLexeme $ char '\'' >> L.charLiteral <* char '\''
138142

139143
strLit :: Lexer String
140-
strLit = lexeme $ char '"' >> manyTill L.charLiteral (char '"')
144+
strLit = lexeme StringLiteralLexeme $ char '"' >> manyTill L.charLiteral (char '"')
141145

142146
natLit :: Lexer Word64
143-
natLit = lexeme $ try $ L.decimal <* notFollowedBy (char '.')
147+
natLit = lexeme LiteralLexeme $ try $ L.decimal <* notFollowedBy (char '.')
144148

145149
doubleLit :: Lexer Double
146-
doubleLit = lexeme $
150+
doubleLit = lexeme LiteralLexeme $
147151
try L.float
148152
<|> try (fromIntegral <$> (L.decimal :: Parser Int) <* char '.')
149153
<|> try do
@@ -161,22 +165,22 @@ knownSymStrs = HS.fromList
161165

162166
-- string must be in `knownSymStrs`
163167
sym :: Text -> Lexer ()
164-
sym s = lexeme $ try $ string s >> notFollowedBy symChar
168+
sym s = lexeme Symbol $ try $ string s >> notFollowedBy symChar
165169

166170
anySym :: Lexer String
167-
anySym = lexeme $ try $ do
171+
anySym = lexeme Symbol $ try $ do
168172
s <- some symChar
169173
failIf (s `HS.member` knownSymStrs) ""
170174
return s
171175

172176
symName :: Lexer SourceName
173-
symName = label "symbol name" $ lexeme $ try $ do
177+
symName = label "symbol name" $ lexeme Symbol $ try $ do
174178
s <- between (char '(') (char ')') $ some symChar
175179
return $ "(" <> s <> ")"
176180

177181
backquoteName :: Lexer SourceName
178182
backquoteName = label "backquoted name" $
179-
lexeme $ try $ between (char '`') (char '`') anyCaseName
183+
lexeme Symbol $ try $ between (char '`') (char '`') anyCaseName
180184

181185
-- brackets and punctuation
182186
-- (can't treat as sym because e.g. `((` is two separate lexemes)
@@ -192,7 +196,7 @@ semicolon = charLexeme ';'
192196
underscore = charLexeme '_'
193197

194198
charLexeme :: Char -> Parser ()
195-
charLexeme c = void $ lexeme $ char c
199+
charLexeme c = void $ lexeme Symbol $ char c
196200

197201
nameTailChar :: Parser Char
198202
nameTailChar = alphaNumChar <|> char '\'' <|> char '_'
@@ -243,10 +247,10 @@ recordNonWhitespace = modify \ctx -> ctx { prevWhitespace = False }
243247
{-# INLINE recordNonWhitespace #-}
244248

245249
nameString :: Parser String
246-
nameString = lexeme . try $ (:) <$> lowerChar <*> many alphaNumChar
250+
nameString = lexeme LowerName . try $ (:) <$> lowerChar <*> many alphaNumChar
247251

248252
thisNameString :: Text -> Parser ()
249-
thisNameString s = lexeme $ try $ string s >> notFollowedBy alphaNumChar
253+
thisNameString s = lexeme MiscLexeme $ try $ string s >> notFollowedBy alphaNumChar
250254

251255
bracketed :: Parser () -> Parser () -> Parser a -> Parser a
252256
bracketed left right p = between left right $ mayBreak $ sc >> p
@@ -310,10 +314,37 @@ failIf :: Bool -> String -> Parser ()
310314
failIf True s = fail s
311315
failIf False _ = return ()
312316

313-
lexeme :: Parser a -> Parser a
314-
lexeme p = L.lexeme sc (p <* recordNonWhitespace)
317+
newSourceId :: Parser SourceId
318+
newSourceId = do
319+
c <- gets sourceIdCounter
320+
modify \ctx -> ctx { sourceIdCounter = c + 1 }
321+
return $ SourceId c
322+
323+
withSourceMaps :: Parser a -> Parser (SourceMaps, a)
324+
withSourceMaps cont = do
325+
smPrev <- gets curSourceMap
326+
modify \ctx -> ctx { curSourceMap = mempty }
327+
result <- cont
328+
sm <- gets curSourceMap
329+
modify \ctx -> ctx { curSourceMap = smPrev }
330+
return (sm, result)
331+
332+
emitSourceMaps :: SourceMaps -> Parser ()
333+
emitSourceMaps m = modify \ctx -> ctx { curSourceMap = curSourceMap ctx <> m }
334+
335+
lexeme :: LexemeType -> Parser a -> Parser a
336+
lexeme lexemeType p = do
337+
start <- getOffset
338+
ans <- p
339+
end <- getOffset
340+
recordNonWhitespace
341+
sc
342+
name <- newSourceId
343+
emitSourceMaps $ mempty
344+
{ lexemeList = toSnocList [name]
345+
, lexemeInfo = M.singleton name (lexemeType, (start, end)) }
346+
return ans
315347
{-# INLINE lexeme #-}
316348

317349
symbol :: Text -> Parser ()
318350
symbol s = void $ L.symbol sc s
319-

0 commit comments

Comments
 (0)