Parse non-syntactic operator tokens as K"Identifier" kind (#523)
Most operators are semantically just normal identifiers after parsing, so they
should get the kind `K"Identifier"`. For example, after this change the `+`
token in `a + b` parses with kind `K"Identifier"`.
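
For illustration, a minimal sketch of the new behaviour (assuming a JuliaSyntax build that includes this change; `parsestmt`, `SyntaxNode` and `kind` are public API, while `children` is qualified here in case it is not exported):

```julia
using JuliaSyntax

node = parsestmt(SyntaxNode, "a + b")
kind(node)                          # K"call"
# Children are kept in source order, so the infix `+` is the second child and
# now carries kind K"Identifier" rather than K"+".
kind.(JuliaSyntax.children(node))   # K"Identifier", K"Identifier", K"Identifier"
```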

As an exception, standalone syntactic operators keep their kind - they can't
sensibly be used as identifiers or interpolated into expressions in the normal
way because they have their own syntactic forms. Keeping their kind also helps
during `Expr` conversion, where dotted syntactic operators have their own rules
for coalescing with the dot into a single symbol.

Also introduce a new keyword `operators_as_identifiers` in the `tokenize()`
API to accommodate simple uses of this API, such as colouring token strings by
operator type, even when the operator is semantically in identifier position.
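
A rough sketch of the two tokenization modes, mirroring the tests added in `test/parser_api.jl` below (names are imported explicitly in case they are not exported):

```julia
using JuliaSyntax: tokenize, untokenize, kind

str = "a .+ b"

# Default: operators in identifier position come back as K"Identifier".
[kind(t) => untokenize(t, str) for t in tokenize(str)]
# K"Identifier" => "a", K"Whitespace" => " ", K"." => ".",
# K"Identifier" => "+", K"Whitespace" => " ", K"Identifier" => "b"

# Opting out preserves the specific operator kind, which suits token colouring.
[kind(t) => untokenize(t, str) for t in tokenize(str; operators_as_identifiers=false)]
# ..., K"." => ".", K"+" => "+", ...
```
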
c42f authored Dec 30, 2024
1 parent 4a7e846 commit 35dc3bc
Showing 9 changed files with 119 additions and 41 deletions.
2 changes: 1 addition & 1 deletion src/expr.jl
@@ -297,7 +297,7 @@ function _internal_node_to_Expr(source, srcrange, head, childranges, childheads,
if !@isexpr(a2, :quote) && !(a2 isa QuoteNode)
args[2] = QuoteNode(a2)
end
elseif length(args) == 1 && is_operator(childheads[1])
elseif length(args) == 1
# Hack: Here we preserve the head of the operator to determine whether
# we need to coalesce it with the dot into a single symbol later on.
args[1] = (childheads[1], args[1])
9 changes: 9 additions & 0 deletions src/kinds.jl
@@ -1230,3 +1230,12 @@ function is_whitespace(x)
k = kind(x)
return k == K"Whitespace" || k == K"NewlineWs" || k == K"Comment"
end

function is_syntactic_operator(x)
k = kind(x)
# TODO: Do we need to disallow dotted and suffixed forms when this is used
# in the parser? The lexer itself usually disallows such tokens, so it's
# not clear whether we need to handle them. (Though note `.->` is a
# token...)
return k in KSet"&& || . ... ->" || is_syntactic_assignment(k)
end
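
A rough illustration of how the new predicate classifies kinds, based on the `KSet` above (it is an internal helper, so it is accessed through the module here rather than assumed to be exported):

```julia
using JuliaSyntax: JuliaSyntax, @K_str

JuliaSyntax.is_syntactic_operator(K"&&")  # true  - has its own syntactic form
JuliaSyntax.is_syntactic_operator(K"->")  # true
JuliaSyntax.is_syntactic_operator(K"=")   # true  - syntactic assignment
JuliaSyntax.is_syntactic_operator(K"+")   # false - parsed as a normal identifier
```
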
3 changes: 2 additions & 1 deletion src/parse_stream.jl
@@ -890,7 +890,8 @@ function bump_split(stream::ParseStream, split_spec::Vararg{Any, N}) where {N}
for (i, (nbyte, k, f)) in enumerate(split_spec)
h = SyntaxHead(k, f)
b = (i == length(split_spec)) ? tok.next_byte : b + nbyte
push!(stream.tokens, SyntaxToken(h, kind(tok), false, b))
orig_k = k == K"." ? K"." : kind(tok)
push!(stream.tokens, SyntaxToken(h, orig_k, false, b))
end
stream.peek_count = 0
return position(stream)
52 changes: 25 additions & 27 deletions src/parser.jl
@@ -382,22 +382,22 @@ function parse_LtoR(ps::ParseState, down, is_op)
down(ps)
while is_op(peek(ps))
t = peek_token(ps)
bump_dotsplit(ps)
bump_dotsplit(ps, remap_kind=K"Identifier")
down(ps)
emit(ps, mark, is_dotted(t) ? K"dotcall" : K"call", INFIX_FLAG)
end
end

# parse right-to-left binary operator
# produces structures like (= a (= b (= c d)))
# produces structures like (=> a (=> b (=> c d)))
#
# flisp: parse-RtoL
function parse_RtoL(ps::ParseState, down, is_op, self)
mark = position(ps)
down(ps)
t = peek_token(ps)
if is_op(kind(t))
bump_dotsplit(ps)
bump_dotsplit(ps, remap_kind=K"Identifier")
self(ps)
emit(ps, mark, is_dotted(t) ? K"dotcall" : K"call", INFIX_FLAG)
end
@@ -624,7 +624,7 @@ function parse_assignment_with_initial_ex(ps::ParseState, mark, down::T) where {
# a .~ b ==> (dotcall-i a ~ b)
# [a ~ b c] ==> (hcat (call-i a ~ b) c)
# [a~b] ==> (vect (call-i a ~ b))
bump_dotsplit(ps)
bump_dotsplit(ps, remap_kind=K"Identifier")
bump_trivia(ps)
parse_assignment(ps, down)
emit(ps, mark, is_dotted(t) ? K"dotcall" : K"call", INFIX_FLAG)
@@ -759,7 +759,7 @@ function parse_arrow(ps::ParseState)
# x <--> y ==> (call-i x <--> y)
# x .--> y ==> (dotcall-i x --> y)
# x -->₁ y ==> (call-i x -->₁ y)
bump_dotsplit(ps)
bump_dotsplit(ps, remap_kind=K"Identifier")
parse_arrow(ps)
emit(ps, mark, is_dotted(t) ? K"dotcall" : K"call", INFIX_FLAG)
end
@@ -821,7 +821,7 @@ function parse_comparison(ps::ParseState, subtype_comparison=false)
while (t = peek_token(ps); is_prec_comparison(t))
n_comparisons += 1
op_dotted = is_dotted(t)
op_pos = bump_dotsplit(ps, emit_dot_node=true)
op_pos = bump_dotsplit(ps, emit_dot_node=true, remap_kind=K"Identifier")
parse_pipe_lt(ps)
end
if n_comparisons == 1
@@ -881,7 +881,7 @@ function parse_range(ps::ParseState)
# a..b ==> (call-i a .. b)
# a … b ==> (call-i a … b)
# a .… b ==> (dotcall-i a … b)
bump_dotsplit(ps)
bump_dotsplit(ps, remap_kind=K"Identifier")
parse_invalid_ops(ps)
emit(ps, mark, is_dotted(initial_tok) ? K"dotcall" : K"call", INFIX_FLAG)
elseif initial_kind == K":" && ps.range_colon_enabled
@@ -904,17 +904,17 @@
# a :> b ==> (call-i a (error : >) b)
bump_trivia(ps, skip_newlines=false)
emark = position(ps)
bump(ps) # K":"
bump(ps, remap_kind=K"Identifier") # K":"
ks = untokenize(peek(ps))
bump(ps) # K"<" or K">"
bump(ps, remap_kind=K"Identifier") # K"<" or K">"
emit(ps, emark, K"error",
error="Invalid `:$ks` found, maybe replace with `$ks:`")
parse_invalid_ops(ps)
emit(ps, mark, K"call", INFIX_FLAG)
break
end
n_colons += 1
bump(ps, n_colons == 1 ? EMPTY_FLAGS : TRIVIA_FLAG)
bump(ps, n_colons == 1 ? EMPTY_FLAGS : TRIVIA_FLAG; remap_kind=K"Identifier")
had_newline = peek(ps) == K"NewlineWs"
t = peek_token(ps)
if is_closing_token(ps, kind(t))
@@ -1008,7 +1008,7 @@ function parse_with_chains(ps::ParseState, down, is_op, chain_ops)
# [x+y + z] ==> (vect (call-i x + y z))
break
end
bump_dotsplit(ps)
bump_dotsplit(ps, remap_kind=K"Identifier")
down(ps)
if kind(t) in chain_ops && !is_decorated(t)
# a + b + c ==> (call-i a + b c)
@@ -1217,7 +1217,7 @@ function parse_unary(ps::ParseState)
# unary negation
# -2^x ==> (call-pre - (call-i 2 ^ x))
# -2[1, 3] ==> (call-pre - (ref 2 1 3))
bump(ps)
bump(ps, remap_kind=K"Identifier")
parse_factor(ps)
emit(ps, mark, K"call", PREFIX_OP_FLAG)
else
@@ -1256,7 +1256,7 @@ function parse_unary(ps::ParseState)
#
# (The flisp parser only considers commas before `;` and thus gets this
# last case wrong)
op_pos = bump_dotsplit(ps, emit_dot_node=true)
op_pos = bump_dotsplit(ps, emit_dot_node=true, remap_kind=K"Identifier")

space_before_paren = preceding_whitespace(t2)
if space_before_paren
@@ -1303,7 +1303,7 @@ function parse_unary(ps::ParseState)
if is_type_operator(op_t)
# <:(a,) ==> (<: a)
emit(ps, mark, op_k, opts.delim_flags)
reset_node!(ps, op_pos, flags=TRIVIA_FLAG)
reset_node!(ps, op_pos, flags=TRIVIA_FLAG, kind=op_k)
else
emit(ps, mark, K"call", opts.delim_flags)
end
@@ -1329,7 +1329,7 @@ function parse_unary(ps::ParseState)
if is_type_operator(op_t)
# <:(a) ==> (<:-pre (parens a))
emit(ps, mark, op_k, PREFIX_OP_FLAG)
reset_node!(ps, op_pos, flags=TRIVIA_FLAG)
reset_node!(ps, op_pos, flags=TRIVIA_FLAG, kind=op_k)
else
if is_dotted(op_t)
emit(ps, mark, K"dotcall", PREFIX_OP_FLAG)
@@ -1349,12 +1349,12 @@
# -0x1 ==> (call-pre - 0x01)
# - 2 ==> (call-pre - 2)
# .-2 ==> (dotcall-pre - 2)
op_pos = bump_dotsplit(ps, EMPTY_FLAGS)
op_pos = bump_dotsplit(ps, EMPTY_FLAGS, remap_kind=K"Identifier")
else
# /x ==> (call-pre (error /) x)
# +₁ x ==> (call-pre (error +₁) x)
# .<: x ==> (dotcall-pre (error (. <:)) x)
bump_dotsplit(ps, EMPTY_FLAGS, emit_dot_node=true)
bump_dotsplit(ps, EMPTY_FLAGS, emit_dot_node=true, remap_kind=K"Identifier")
op_pos = emit(ps, mark, K"error", error="not a unary operator")
end
parse_unary(ps)
@@ -1385,7 +1385,7 @@ end
function parse_factor_with_initial_ex(ps::ParseState, mark)
parse_decl_with_initial_ex(ps, mark)
if (t = peek_token(ps); is_prec_power(kind(t)))
bump_dotsplit(ps)
bump_dotsplit(ps, remap_kind=K"Identifier")
parse_factor_after(ps)
emit(ps, mark, is_dotted(t) ? K"dotcall" : K"call", INFIX_FLAG)
end
@@ -1687,11 +1687,12 @@ function parse_call_chain(ps::ParseState, mark, is_macrocall=false)
macro_atname_range = (m, position(ps))
emit(ps, mark, K".")
elseif k == K"'"
# f.' => f (error-t (. '))
bump_dotsplit(ps, remap_kind=K"Identifier")
# TODO: Reclaim dotted postfix operators :-)
# f.' => f (error-t ')
bump(ps)
emit(ps, emark, K"error", TRIVIA_FLAG,
emit(ps, emark, K"error",
error="the .' operator for transpose is discontinued")
emit(ps, mark, K"dotcall", POSTFIX_OP_FLAG)
else
# Field/property syntax
# f.x.y ==> (. (. f x) y)
@@ -1703,7 +1704,7 @@ function parse_call_chain(ps::ParseState, mark, is_macrocall=false)
elseif k == K"'" && !preceding_whitespace(t)
# f' ==> (call-post f ')
# f'ᵀ ==> (call-post f 'ᵀ)
bump(ps)
bump(ps, remap_kind=K"Identifier")
emit(ps, mark, K"call", POSTFIX_OP_FLAG)
elseif k == K"{"
# Type parameter curlies and macro calls
@@ -3554,11 +3555,8 @@ function parse_atom(ps::ParseState, check_identifiers=true)
# + ==> +
# .+ ==> (. +)
# .= ==> (. =)
if is_dotted(peek_token(ps))
bump_dotsplit(ps, emit_dot_node=true)
else
bump(ps, remap_kind=K"Identifier")
end
bump_dotsplit(ps, emit_dot_node=true, remap_kind=
is_syntactic_operator(leading_kind) ? leading_kind : K"Identifier")
if check_identifiers && !is_valid_identifier(leading_kind)
# += ==> (error +=)
# ? ==> (error ?)
19 changes: 16 additions & 3 deletions src/parser_api.jl
@@ -174,15 +174,20 @@ Token() = Token(SyntaxHead(K"None", EMPTY_FLAGS), 0:0)
head(t::Token) = t.head

"""
tokenize(text)
tokenize(text; operators_as_identifiers=true)
Returns the tokenized UTF-8 encoded `text` as a vector of `Token`s. The
text for the token can be retrieved by using `untokenize()`. The full text can be
reconstructed with, for example, `join(untokenize.(tokenize(text), text))`.
This interface works on UTF-8 encoded string or buffer data only.
The keyword `operators_as_identifiers` specifies whether operators in
identifier-position should have `K"Identifier"` as their kind, or be emitted as
more specific operator kinds. For example, whether the `+` in `a + b` should be
emitted as `K"Identifier"` (the default) or as `K"+"`.
"""
function tokenize(text)
function tokenize(text; operators_as_identifiers=true)
ps = ParseStream(text)
parse!(ps, rule=:all)
ts = ps.tokens
@@ -192,7 +197,15 @@ function tokenize(text)
continue
end
r = ts[i-1].next_byte:ts[i].next_byte-1
push!(output_tokens, Token(head(ts[i]), r))
k = kind(ts[i])
if k == K"Identifier" && !operators_as_identifiers
orig_k = ts[i].orig_kind
if is_operator(orig_k) && !is_word_operator(orig_k)
k = orig_k
end
end
f = flags(ts[i])
push!(output_tokens, Token(SyntaxHead(k,f), r))
end
output_tokens
end
2 changes: 1 addition & 1 deletion test/green_node.jl
@@ -8,7 +8,7 @@
@test head.(children(t)) == [
SyntaxHead(K"Identifier", 0x0000)
SyntaxHead(K"Whitespace", 0x0001)
SyntaxHead(K"+", 0x0000)
SyntaxHead(K"Identifier", 0x0000)
SyntaxHead(K"Whitespace", 0x0001)
SyntaxHead(K"Identifier", 0x0000)
]
46 changes: 42 additions & 4 deletions test/parser.jl
@@ -1,14 +1,14 @@
"""
Parse string to SyntaxNode tree and show as an sexpression
"""
function parse_to_sexpr_str(production, code::AbstractString; v=v"1.6")
function parse_to_sexpr_str(production, code::AbstractString; v=v"1.6", show_kws...)
stream = ParseStream(code, version=v)
production(ParseState(stream))
JuliaSyntax.validate_tokens(stream)
t = build_tree(GreenNode, stream)
source = SourceFile(code)
s = SyntaxNode(source, t, keep_parens=true)
return sprint(show, MIME("text/x.sexpression"), s)
return sprint(io->show(io, MIME("text/x.sexpression"), s; show_kws...))
end

function test_parse(production, input, output)
@@ -29,7 +29,7 @@ function test_parse(inout::Pair)
test_parse(JuliaSyntax.parse_toplevel, inout...)
end

const PARSE_ERROR = r"\(error-t "
PARSE_ERROR = r"\(error-t "

with_version(v::VersionNumber, (i,o)::Pair) = ((;v=v), i) => o

@@ -436,7 +436,7 @@ tests = [
"A.@x a" => "(macrocall (. A @x) a)"
"@A.B.@x a" => "(macrocall (. (. A B) (error-t) @x) a)"
# .' discontinued
"f.'" => "(wrapper f (error-t '))"
"f.'" => "(dotcall-post f (error '))"
# Field/property syntax
"f.x.y" => "(. (. f x) y)"
"x .y" => "(. x (error-t) y)"
@@ -1112,6 +1112,44 @@ parsestmt_test_specs = [
end
end

parsestmt_with_kind_tests = [
# Most operators are semantically just normal identifiers after parsing so
# get the Kind K"Identifier"
"+" => "+::Identifier"
"a + b" => "(call-i a::Identifier +::Identifier b::Identifier)"
"a .+ b" => "(dotcall-i a::Identifier +::Identifier b::Identifier)"
"a |> b" => "(call-i a::Identifier |>::Identifier b::Identifier)"
"a => b" => "(call-i a::Identifier =>::Identifier b::Identifier)"
"a → b" => "(call-i a::Identifier →::Identifier b::Identifier)"
"a < b < c" => "(comparison a::Identifier <::Identifier b::Identifier <::Identifier c::Identifier)"
"a .<: b"=> "(dotcall-i a::Identifier <:::Identifier b::Identifier)"
"a .. b" => "(call-i a::Identifier ..::Identifier b::Identifier)"
"a : b" => "(call-i a::Identifier :::Identifier b::Identifier)"
"-2^x" => "(call-pre -::Identifier (call-i 2::Integer ^::Identifier x::Identifier))"
"-(2)" => "(call-pre -::Identifier (parens 2::Integer))"
"<:(a,)" => "(<:-, a::Identifier)"
"- 2" => "(call-pre -::Identifier 2::Integer)"
"/x" => "(call-pre (error /::Identifier) x::Identifier)"
"a^b" => "(call-i a::Identifier ^::Identifier b::Identifier)"
"f.'" => "(dotcall-post f::Identifier (error '::Identifier))"
"f'" => "(call-post f::Identifier '::Identifier)"
# Standalone syntactic ops which keep their kind - they can't really be
# used in a sane way as identifiers or interpolated into expressions
# because they have their own syntactic forms.
":(::)" => "(quote-: (parens ::::::))"
":(\$)" => "(quote-: (parens \$::\$))"
":(<:)" => "(quote-: (parens <:::<:))"
":(&&)" => "(quote-: (parens &&::&&))"
":(=)" => "(quote-: (parens =::=))"
]

@testset "parser `Kind` remapping" begin
@testset "$(repr(input))" for (input, output) in parsestmt_with_kind_tests
input = ((show_kind=true,), input)
test_parse(JuliaSyntax.parse_stmts, input, output)
end
end

@testset "Trivia attachment" begin
# TODO: Need to expand this greatly to cover as many forms as possible!
@test show_green_tree("f(a;b)") == """
21 changes: 20 additions & 1 deletion test/parser_api.jl
@@ -170,14 +170,25 @@ end
end
end

tokensplit(str) = [kind(tok) => untokenize(tok, str) for tok in tokenize(str)]
tokensplit(str; kws...) = [kind(tok) => untokenize(tok, str) for tok in tokenize(str; kws...)]

@testset "tokenize() API" begin
# tokenize() is eager
@test tokenize("aba") isa Vector{JuliaSyntax.Token}

# . is a separate token from + in `.+`
@test tokensplit("a .+ β") == [
K"Identifier" => "a",
K"Whitespace" => " ",
K"." => ".",
K"Identifier" => "+",
K"Whitespace" => " ",
K"Identifier" => "β",
]

# + is kind K"+" when operators in identifier position are emitted as
# operator kinds.
@test tokensplit("a .+ β"; operators_as_identifiers=false) == [
K"Identifier" => "a",
K"Whitespace" => " ",
K"." => ".",
@@ -194,6 +205,14 @@ tokensplit(str) = [kind(tok) => untokenize(tok, str) for tok in tokenize(str)]
K"Whitespace" => " ",
K"Integer" => "1",
]
# Including word operators
@test tokensplit("where = 1"; operators_as_identifiers=false) == [
K"Identifier" => "where",
K"Whitespace" => " ",
K"=" => "=",
K"Whitespace" => " ",
K"Integer" => "1",
]

# A predicate based on flags()
@test JuliaSyntax.is_suffixed(tokenize("+₁")[1])