Parse non-syntactic operator tokens as K"Identifier" kind (#523)
Most operators are semantically just normal identifiers after parsing, so they
should get the kind `K"Identifier"`. For example, after this change the `+`
token in `a + b` parses with kind `K"Identifier"`.
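
For illustration, a minimal sketch of the new behaviour (assuming a JuliaSyntax build that includes this change; `parsestmt`, `SyntaxNode` and `kind` are public API, while `children` is qualified here in case it is not exported):

```julia
using JuliaSyntax

node = parsestmt(SyntaxNode, "a + b")
kind(node)                          # K"call"
# Children are kept in source order, so the infix `+` is the second child and
# now carries kind K"Identifier" rather than K"+".
kind.(JuliaSyntax.children(node))   # K"Identifier", K"Identifier", K"Identifier"
```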

As an exception, standalone syntactic operators keep their kind - they can't
sensibly be used as identifiers or interpolated into expressions in the normal
way because they have their own syntactic forms. Keeping their kind also helps
during `Expr` conversion, where dotted syntactic operators have their own rules
for coalescing with the dot into a single symbol.

Also introduce a new keyword `operators_as_identifiers` in the `tokenize()`
API to accommodate simple uses of this API, such as colouring token strings by
operator type, even when the operator is semantically in identifier position.
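
A rough sketch of the two tokenization modes, mirroring the tests added in `test/parser_api.jl` below (names are imported explicitly in case they are not exported):

```julia
using JuliaSyntax: tokenize, untokenize, kind

str = "a .+ b"

# Default: operators in identifier position come back as K"Identifier".
[kind(t) => untokenize(t, str) for t in tokenize(str)]
# K"Identifier" => "a", K"Whitespace" => " ", K"." => ".",
# K"Identifier" => "+", K"Whitespace" => " ", K"Identifier" => "b"

# Opting out preserves the specific operator kind, which suits token colouring.
[kind(t) => untokenize(t, str) for t in tokenize(str; operators_as_identifiers=false)]
# ..., K"." => ".", K"+" => "+", ...
```
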
c42f authored Dec 30, 2024
1 parent 4a7e846 commit 35dc3bc
Showing 9 changed files with 119 additions and 41 deletions.
2 changes: 1 addition & 1 deletion src/expr.jl
@@ -297,7 +297,7 @@ function _internal_node_to_Expr(source, srcrange, head, childranges, childheads,
if !@isexpr(a2, :quote) && !(a2 isa QuoteNode)
args[2] = QuoteNode(a2)
end
elseif length(args) == 1 && is_operator(childheads[1])
elseif length(args) == 1
# Hack: Here we preserve the head of the operator to determine whether
# we need to coalesce it with the dot into a single symbol later on.
args[1] = (childheads[1], args[1])
9 changes: 9 additions & 0 deletions src/kinds.jl
@@ -1230,3 +1230,12 @@ function is_whitespace(x)
k = kind(x)
return k == K"Whitespace" || k == K"NewlineWs" || k == K"Comment"
end

function is_syntactic_operator(x)
k = kind(x)
# TODO: Do we need to disallow dotted and suffixed forms when this is used
# in the parser? The lexer itself usually disallows such tokens, so it's
# not clear whether we need to handle them. (Though note `.->` is a
# token...)
return k in KSet"&& || . ... ->" || is_syntactic_assignment(k)
end
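
A rough illustration of how the new predicate classifies kinds, based on the `KSet` above (it is an internal helper, so it is accessed through the module here rather than assumed to be exported):

```julia
using JuliaSyntax: JuliaSyntax, @K_str

JuliaSyntax.is_syntactic_operator(K"&&")  # true  - has its own syntactic form
JuliaSyntax.is_syntactic_operator(K"->")  # true
JuliaSyntax.is_syntactic_operator(K"=")   # true  - syntactic assignment
JuliaSyntax.is_syntactic_operator(K"+")   # false - parsed as a normal identifier
```
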
3 changes: 2 additions & 1 deletion src/parse_stream.jl
@@ -890,7 +890,8 @@ function bump_split(stream::ParseStream, split_spec::Vararg{Any, N}) where {N}
for (i, (nbyte, k, f)) in enumerate(split_spec)
h = SyntaxHead(k, f)
b = (i == length(split_spec)) ? tok.next_byte : b + nbyte
push!(stream.tokens, SyntaxToken(h, kind(tok), false, b))
orig_k = k == K"." ? K"." : kind(tok)
push!(stream.tokens, SyntaxToken(h, orig_k, false, b))
end
stream.peek_count = 0
return position(stream)
52 changes: 25 additions & 27 deletions src/parser.jl
@@ -382,22 +382,22 @@ function parse_LtoR(ps::ParseState, down, is_op)
down(ps)
while is_op(peek(ps))
t = peek_token(ps)
bump_dotsplit(ps)
bump_dotsplit(ps, remap_kind=K"Identifier")
down(ps)
emit(ps, mark, is_dotted(t) ? K"dotcall" : K"call", INFIX_FLAG)
end
end

# parse right-to-left binary operator
# produces structures like (= a (= b (= c d)))
# produces structures like (=> a (=> b (=> c d)))
#
# flisp: parse-RtoL
function parse_RtoL(ps::ParseState, down, is_op, self)
mark = position(ps)
down(ps)
t = peek_token(ps)
if is_op(kind(t))
bump_dotsplit(ps)
bump_dotsplit(ps, remap_kind=K"Identifier")
self(ps)
emit(ps, mark, is_dotted(t) ? K"dotcall" : K"call", INFIX_FLAG)
end
@@ -624,7 +624,7 @@ function parse_assignment_with_initial_ex(ps::ParseState, mark, down::T) where {
# a .~ b ==> (dotcall-i a ~ b)
# [a ~ b c] ==> (hcat (call-i a ~ b) c)
# [a~b] ==> (vect (call-i a ~ b))
bump_dotsplit(ps)
bump_dotsplit(ps, remap_kind=K"Identifier")
bump_trivia(ps)
parse_assignment(ps, down)
emit(ps, mark, is_dotted(t) ? K"dotcall" : K"call", INFIX_FLAG)
@@ -759,7 +759,7 @@ function parse_arrow(ps::ParseState)
# x <--> y ==> (call-i x <--> y)
# x .--> y ==> (dotcall-i x --> y)
# x -->₁ y ==> (call-i x -->₁ y)
bump_dotsplit(ps)
bump_dotsplit(ps, remap_kind=K"Identifier")
parse_arrow(ps)
emit(ps, mark, is_dotted(t) ? K"dotcall" : K"call", INFIX_FLAG)
end
@@ -821,7 +821,7 @@ function parse_comparison(ps::ParseState, subtype_comparison=false)
while (t = peek_token(ps); is_prec_comparison(t))
n_comparisons += 1
op_dotted = is_dotted(t)
op_pos = bump_dotsplit(ps, emit_dot_node=true)
op_pos = bump_dotsplit(ps, emit_dot_node=true, remap_kind=K"Identifier")
parse_pipe_lt(ps)
end
if n_comparisons == 1
@@ -881,7 +881,7 @@ function parse_range(ps::ParseState)
# a..b ==> (call-i a .. b)
# a … b ==> (call-i a … b)
# a .… b ==> (dotcall-i a … b)
bump_dotsplit(ps)
bump_dotsplit(ps, remap_kind=K"Identifier")
parse_invalid_ops(ps)
emit(ps, mark, is_dotted(initial_tok) ? K"dotcall" : K"call", INFIX_FLAG)
elseif initial_kind == K":" && ps.range_colon_enabled
@@ -904,17 +904,17 @@
# a :> b ==> (call-i a (error : >) b)
bump_trivia(ps, skip_newlines=false)
emark = position(ps)
bump(ps) # K":"
bump(ps, remap_kind=K"Identifier") # K":"
ks = untokenize(peek(ps))
bump(ps) # K"<" or K">"
bump(ps, remap_kind=K"Identifier") # K"<" or K">"
emit(ps, emark, K"error",
error="Invalid `:$ks` found, maybe replace with `$ks:`")
parse_invalid_ops(ps)
emit(ps, mark, K"call", INFIX_FLAG)
break
end
n_colons += 1
bump(ps, n_colons == 1 ? EMPTY_FLAGS : TRIVIA_FLAG)
bump(ps, n_colons == 1 ? EMPTY_FLAGS : TRIVIA_FLAG; remap_kind=K"Identifier")
had_newline = peek(ps) == K"NewlineWs"
t = peek_token(ps)
if is_closing_token(ps, kind(t))
@@ -1008,7 +1008,7 @@ function parse_with_chains(ps::ParseState, down, is_op, chain_ops)
# [x+y + z] ==> (vect (call-i x + y z))
break
end
bump_dotsplit(ps)
bump_dotsplit(ps, remap_kind=K"Identifier")
down(ps)
if kind(t) in chain_ops && !is_decorated(t)
# a + b + c ==> (call-i a + b c)
@@ -1217,7 +1217,7 @@ function parse_unary(ps::ParseState)
# unary negation
# -2^x ==> (call-pre - (call-i 2 ^ x))
# -2[1, 3] ==> (call-pre - (ref 2 1 3))
bump(ps)
bump(ps, remap_kind=K"Identifier")
parse_factor(ps)
emit(ps, mark, K"call", PREFIX_OP_FLAG)
else
@@ -1256,7 +1256,7 @@ function parse_unary(ps::ParseState)
#
# (The flisp parser only considers commas before `;` and thus gets this
# last case wrong)
op_pos = bump_dotsplit(ps, emit_dot_node=true)
op_pos = bump_dotsplit(ps, emit_dot_node=true, remap_kind=K"Identifier")

space_before_paren = preceding_whitespace(t2)
if space_before_paren
@@ -1303,7 +1303,7 @@ function parse_unary(ps::ParseState)
if is_type_operator(op_t)
# <:(a,) ==> (<: a)
emit(ps, mark, op_k, opts.delim_flags)
reset_node!(ps, op_pos, flags=TRIVIA_FLAG)
reset_node!(ps, op_pos, flags=TRIVIA_FLAG, kind=op_k)
else
emit(ps, mark, K"call", opts.delim_flags)
end
@@ -1329,7 +1329,7 @@ function parse_unary(ps::ParseState)
if is_type_operator(op_t)
# <:(a) ==> (<:-pre (parens a))
emit(ps, mark, op_k, PREFIX_OP_FLAG)
reset_node!(ps, op_pos, flags=TRIVIA_FLAG)
reset_node!(ps, op_pos, flags=TRIVIA_FLAG, kind=op_k)
else
if is_dotted(op_t)
emit(ps, mark, K"dotcall", PREFIX_OP_FLAG)
@@ -1349,12 +1349,12 @@
# -0x1 ==> (call-pre - 0x01)
# - 2 ==> (call-pre - 2)
# .-2 ==> (dotcall-pre - 2)
op_pos = bump_dotsplit(ps, EMPTY_FLAGS)
op_pos = bump_dotsplit(ps, EMPTY_FLAGS, remap_kind=K"Identifier")
else
# /x ==> (call-pre (error /) x)
# +₁ x ==> (call-pre (error +₁) x)
# .<: x ==> (dotcall-pre (error (. <:)) x)
bump_dotsplit(ps, EMPTY_FLAGS, emit_dot_node=true)
bump_dotsplit(ps, EMPTY_FLAGS, emit_dot_node=true, remap_kind=K"Identifier")
op_pos = emit(ps, mark, K"error", error="not a unary operator")
end
parse_unary(ps)
@@ -1385,7 +1385,7 @@ end
function parse_factor_with_initial_ex(ps::ParseState, mark)
parse_decl_with_initial_ex(ps, mark)
if (t = peek_token(ps); is_prec_power(kind(t)))
bump_dotsplit(ps)
bump_dotsplit(ps, remap_kind=K"Identifier")
parse_factor_after(ps)
emit(ps, mark, is_dotted(t) ? K"dotcall" : K"call", INFIX_FLAG)
end
@@ -1687,11 +1687,12 @@ function parse_call_chain(ps::ParseState, mark, is_macrocall=false)
macro_atname_range = (m, position(ps))
emit(ps, mark, K".")
elseif k == K"'"
# f.' => f (error-t (. '))
bump_dotsplit(ps, remap_kind=K"Identifier")
# TODO: Reclaim dotted postfix operators :-)
# f.' => f (error-t ')
bump(ps)
emit(ps, emark, K"error", TRIVIA_FLAG,
emit(ps, emark, K"error",
error="the .' operator for transpose is discontinued")
emit(ps, mark, K"dotcall", POSTFIX_OP_FLAG)
else
# Field/property syntax
# f.x.y ==> (. (. f x) y)
@@ -1703,7 +1704,7 @@ function parse_call_chain(ps::ParseState, mark, is_macrocall=false)
elseif k == K"'" && !preceding_whitespace(t)
# f' ==> (call-post f ')
# f'ᵀ ==> (call-post f 'ᵀ)
bump(ps)
bump(ps, remap_kind=K"Identifier")
emit(ps, mark, K"call", POSTFIX_OP_FLAG)
elseif k == K"{"
# Type parameter curlies and macro calls
@@ -3554,11 +3555,8 @@ function parse_atom(ps::ParseState, check_identifiers=true)
# + ==> +
# .+ ==> (. +)
# .= ==> (. =)
if is_dotted(peek_token(ps))
bump_dotsplit(ps, emit_dot_node=true)
else
bump(ps, remap_kind=K"Identifier")
end
bump_dotsplit(ps, emit_dot_node=true, remap_kind=
is_syntactic_operator(leading_kind) ? leading_kind : K"Identifier")
if check_identifiers && !is_valid_identifier(leading_kind)
# += ==> (error +=)
# ? ==> (error ?)
19 changes: 16 additions & 3 deletions src/parser_api.jl
@@ -174,15 +174,20 @@ Token() = Token(SyntaxHead(K"None", EMPTY_FLAGS), 0:0)
head(t::Token) = t.head

"""
tokenize(text)
tokenize(text; operators_as_identifiers=true)
Returns the tokenized UTF-8 encoded `text` as a vector of `Token`s. The
text for the token can be retrieved by using `untokenize()`. The full text can be
reconstructed with, for example, `join(untokenize.(tokenize(text), text))`.
This interface works on UTF-8 encoded string or buffer data only.
The keyword `operators_as_identifiers` specifies whether operators in
identifier-position should have `K"Identifier"` as their kind, or be emitted as
more specific operator kinds. For example, whether the `+` in `a + b` should be
emitted as `K"Identifier"` (the default) or as `K"+"`.
"""
function tokenize(text)
function tokenize(text; operators_as_identifiers=true)
ps = ParseStream(text)
parse!(ps, rule=:all)
ts = ps.tokens
@@ -192,7 +197,15 @@ function tokenize(text)
continue
end
r = ts[i-1].next_byte:ts[i].next_byte-1
push!(output_tokens, Token(head(ts[i]), r))
k = kind(ts[i])
if k == K"Identifier" && !operators_as_identifiers
orig_k = ts[i].orig_kind
if is_operator(orig_k) && !is_word_operator(orig_k)
k = orig_k
end
end
f = flags(ts[i])
push!(output_tokens, Token(SyntaxHead(k,f), r))
end
output_tokens
end
2 changes: 1 addition & 1 deletion test/green_node.jl
@@ -8,7 +8,7 @@
@test head.(children(t)) == [
SyntaxHead(K"Identifier", 0x0000)
SyntaxHead(K"Whitespace", 0x0001)
SyntaxHead(K"+", 0x0000)
SyntaxHead(K"Identifier", 0x0000)
SyntaxHead(K"Whitespace", 0x0001)
SyntaxHead(K"Identifier", 0x0000)
]
46 changes: 42 additions & 4 deletions test/parser.jl
@@ -1,14 +1,14 @@
"""
Parse string to SyntaxNode tree and show as an sexpression
"""
function parse_to_sexpr_str(production, code::AbstractString; v=v"1.6")
function parse_to_sexpr_str(production, code::AbstractString; v=v"1.6", show_kws...)
stream = ParseStream(code, version=v)
production(ParseState(stream))
JuliaSyntax.validate_tokens(stream)
t = build_tree(GreenNode, stream)
source = SourceFile(code)
s = SyntaxNode(source, t, keep_parens=true)
return sprint(show, MIME("text/x.sexpression"), s)
return sprint(io->show(io, MIME("text/x.sexpression"), s; show_kws...))
end

function test_parse(production, input, output)
@@ -29,7 +29,7 @@ function test_parse(inout::Pair)
test_parse(JuliaSyntax.parse_toplevel, inout...)
end

const PARSE_ERROR = r"\(error-t "
PARSE_ERROR = r"\(error-t "

with_version(v::VersionNumber, (i,o)::Pair) = ((;v=v), i) => o

@@ -436,7 +436,7 @@ tests = [
"A.@x a" => "(macrocall (. A @x) a)"
"@A.B.@x a" => "(macrocall (. (. A B) (error-t) @x) a)"
# .' discontinued
"f.'" => "(wrapper f (error-t '))"
"f.'" => "(dotcall-post f (error '))"
# Field/property syntax
"f.x.y" => "(. (. f x) y)"
"x .y" => "(. x (error-t) y)"
@@ -1112,6 +1112,44 @@ parsestmt_test_specs = [
end
end

parsestmt_with_kind_tests = [
# Most operators are semantically just normal identifiers after parsing so
# get the Kind K"Identifier"
"+" => "+::Identifier"
"a + b" => "(call-i a::Identifier +::Identifier b::Identifier)"
"a .+ b" => "(dotcall-i a::Identifier +::Identifier b::Identifier)"
"a |> b" => "(call-i a::Identifier |>::Identifier b::Identifier)"
"a => b" => "(call-i a::Identifier =>::Identifier b::Identifier)"
"a → b" => "(call-i a::Identifier →::Identifier b::Identifier)"
"a < b < c" => "(comparison a::Identifier <::Identifier b::Identifier <::Identifier c::Identifier)"
"a .<: b"=> "(dotcall-i a::Identifier <:::Identifier b::Identifier)"
"a .. b" => "(call-i a::Identifier ..::Identifier b::Identifier)"
"a : b" => "(call-i a::Identifier :::Identifier b::Identifier)"
"-2^x" => "(call-pre -::Identifier (call-i 2::Integer ^::Identifier x::Identifier))"
"-(2)" => "(call-pre -::Identifier (parens 2::Integer))"
"<:(a,)" => "(<:-, a::Identifier)"
"- 2" => "(call-pre -::Identifier 2::Integer)"
"/x" => "(call-pre (error /::Identifier) x::Identifier)"
"a^b" => "(call-i a::Identifier ^::Identifier b::Identifier)"
"f.'" => "(dotcall-post f::Identifier (error '::Identifier))"
"f'" => "(call-post f::Identifier '::Identifier)"
# Standalone syntactic ops which keep their kind - they can't really be
# used in a sane way as identifiers or interpolated into expressions
# because they have their own syntactic forms.
":(::)" => "(quote-: (parens ::::::))"
":(\$)" => "(quote-: (parens \$::\$))"
":(<:)" => "(quote-: (parens <:::<:))"
":(&&)" => "(quote-: (parens &&::&&))"
":(=)" => "(quote-: (parens =::=))"
]

@testset "parser `Kind` remapping" begin
@testset "$(repr(input))" for (input, output) in parsestmt_with_kind_tests
input = ((show_kind=true,), input)
test_parse(JuliaSyntax.parse_stmts, input, output)
end
end

@testset "Trivia attachment" begin
# TODO: Need to expand this greatly to cover as many forms as possible!
@test show_green_tree("f(a;b)") == """
21 changes: 20 additions & 1 deletion test/parser_api.jl
@@ -170,14 +170,25 @@ end
end
end

tokensplit(str) = [kind(tok) => untokenize(tok, str) for tok in tokenize(str)]
tokensplit(str; kws...) = [kind(tok) => untokenize(tok, str) for tok in tokenize(str; kws...)]

@testset "tokenize() API" begin
# tokenize() is eager
@test tokenize("aba") isa Vector{JuliaSyntax.Token}

# . is a separate token from + in `.+`
@test tokensplit("a .+ β") == [
K"Identifier" => "a",
K"Whitespace" => " ",
K"." => ".",
K"Identifier" => "+",
K"Whitespace" => " ",
K"Identifier" => "β",
]

# + is kind K"+" when operators in identifier position are emitted as
# operator kinds.
@test tokensplit("a .+ β"; operators_as_identifiers=false) == [
K"Identifier" => "a",
K"Whitespace" => " ",
K"." => ".",
@@ -194,6 +205,14 @@ tokensplit(str) = [kind(tok) => untokenize(tok, str) for tok in tokenize(str)]
K"Whitespace" => " ",
K"Integer" => "1",
]
# Including word operators
@test tokensplit("where = 1"; operators_as_identifiers=false) == [
K"Identifier" => "where",
K"Whitespace" => " ",
K"=" => "=",
K"Whitespace" => " ",
K"Integer" => "1",
]

# A predicate based on flags()
@test JuliaSyntax.is_suffixed(tokenize("+₁")[1])