Skip to content

Commit

Permalink
Rewrite the MarkupLexer as a state machine
Browse files Browse the repository at this point in the history
  • Loading branch information
sliekens committed Oct 29, 2024
1 parent 8d2e85a commit d26b324
Show file tree
Hide file tree
Showing 5 changed files with 166 additions and 69 deletions.
36 changes: 33 additions & 3 deletions GW2SDK.Tests/Features/Markup/MarkupParserTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ public void Ignores_invalid_tags()
var input = "5 <REDACTED> Dye kits";
var lexer = new MarkupLexer();
var parser = new MarkupParser();

var actual = parser.Parse(lexer.Tokenize(input));
var tokens = lexer.Tokenize(input);
var actual = parser.Parse(tokens);

Assert.NotNull(actual);
Assert.Collection(actual.Children,
Expand All @@ -33,7 +33,8 @@ public void Forgives_mismatched_tags()
var input = "<c=@reminder>This coat hides leg armor.<c>";
var lexer = new MarkupLexer();
var parser = new MarkupParser();
var actual = parser.Parse(lexer.Tokenize(input));
var tokens = lexer.Tokenize(input);
var actual = parser.Parse(tokens);

Assert.NotNull(actual);
Assert.Collection(actual.Children,
Expand All @@ -51,4 +52,33 @@ public void Forgives_mismatched_tags()
}
);
}

[Fact]
public void Keeps_trailing_newline()
{
    // A line feed that follows the closing tag must surface as its own
    // LineBreakNode after the colored text, and must not end up in the text.
    var input = "<c=@flavor>A gift given in gratitude from the leaders of Tyria.</c>\n";
    var lexer = new MarkupLexer();
    var parser = new MarkupParser();

    var tokens = lexer.Tokenize(input);
    var actual = parser.Parse(tokens);

    Assert.NotNull(actual);
    Assert.Collection(actual.Children,
        first =>
        {
            var coloredText = Assert.IsType<ColoredTextNode>(first);
            Assert.Equal("@flavor", coloredText.Color);
            Assert.Collection(coloredText.Children,
                child =>
                {
                    var text = Assert.IsType<TextNode>(child);
                    Assert.Equal("A gift given in gratitude from the leaders of Tyria.", text.Text);
                }
            );
        },
        // Only the node type matters here; the result of IsType is not needed.
        second => Assert.IsType<LineBreakNode>(second)
    );
}
}
2 changes: 1 addition & 1 deletion GW2SDK.Tests/Features/Markup/MarkupSyntaxValidator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ private static void ValidateNode(MarkupNode node)

private static void ValidateTextNode(TextNode text)
{
Assert.Matches("[a-zA-Z0-9. \n]+", text.Text);
Assert.Matches("[a-zA-Z0-9 .]+", text.Text);
}

private static void ValidateColoredTextNode(ColoredTextNode coloredText)
Expand Down
150 changes: 110 additions & 40 deletions GW2SDK/Features/Markup/MarkupLexer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@ namespace GuildWars2.Markup;
[PublicAPI]
public sealed class MarkupLexer
{
private static readonly char[] EqualSign = ['='];

private static readonly HashSet<string> VoidElements = new(StringComparer.OrdinalIgnoreCase)
{
"br"
Expand All @@ -22,59 +20,131 @@ public sealed class MarkupLexer
/// </returns>
public IEnumerable<MarkupToken> Tokenize(string input)
{
// The language can be very roughly described by the following grammar:
// MARKUP = *(TEXT / LF / TAG_OPEN / TAG_CLOSE / TAG_VOID)
// TEXT = *VCHAR; except "<" and LF
// TAG_OPEN = "<" TAG_NAME [ "=" TAG_VALUE ] ">"
// TAG_CLOSE = "</" TAG_NAME ">"
// TAG_VOID = "<" TAG_NAME ">"
// TAG_NAME = 1*ALPHA
// TAG_VALUE = 1*VCHAR; except ">"
var scanner = new Scanner(input);
var state = MarkupLexerState.Text;
var start = scanner.Position;
while (scanner.CanAdvance)
{
if (scanner.Current == '<')
switch (state)
{
scanner.Advance();
if (scanner.Current == '/')
{
scanner.Advance();
var tagName = scanner.ReadUntil('>');
scanner.Advance();
yield return new MarkupToken(MarkupTokenType.TagClose, tagName);
}
else
{
var tagName = scanner.ReadUntil('>');
scanner.Advance();

// Self-closing tags like <br /> are not valid, ignore the trailing slash
if (tagName.EndsWith("/"))
case MarkupLexerState.Text:
if (scanner.Current == '<')
{
tagName = tagName[..^1];
if (scanner.Position > start)
{
yield return new MarkupToken(MarkupTokenType.Text, input[start..scanner.Position]);
}

state = MarkupLexerState.TagOpen;
start = scanner.Position + 1;
}
else if (scanner.Current == '\n')
{
if (scanner.Position > start)
{
yield return new MarkupToken(MarkupTokenType.Text, input[start..scanner.Position]);
}

// Also ignore any meaningless whitespace in the tag name
tagName = tagName.Trim();
yield return new MarkupToken(MarkupTokenType.LineBreak, "");
state = MarkupLexerState.Text;
start = scanner.Position + 1;
}

if (VoidElements.Contains(tagName))
break;

case MarkupLexerState.TagOpen:
if (scanner.Current == '/')
{
yield return new MarkupToken(MarkupTokenType.TagVoid, tagName);
if (scanner.Position == start)
{
state = MarkupLexerState.TagClose;
start = scanner.Position + 1;
}
else if (scanner.Peek() == '>')
{
// Ignore the '/' in '/>'
var tagName = input[start..scanner.Position].Trim();
if (VoidElements.Contains(tagName))
{
yield return new MarkupToken(MarkupTokenType.TagVoid, tagName);
}
else
{
yield return new MarkupToken(MarkupTokenType.TagStart, tagName);
}

state = MarkupLexerState.Text;
start = scanner.Position + 2;
}
else
{
// Invalid tag
state = MarkupLexerState.Text;
start = scanner.Position + 1;
}
}
else if (tagName.Contains('='))
else if (scanner.Current == '=')
{
var parts = tagName.Split(EqualSign, 2);
yield return new MarkupToken(MarkupTokenType.TagStart, parts[0]);
yield return new MarkupToken(MarkupTokenType.TagValue, parts[1]);
var tagName = input[start..scanner.Position].Trim();
yield return new MarkupToken(MarkupTokenType.TagStart, tagName);
state = MarkupLexerState.TagValue;
start = scanner.Position + 1;
}
else
else if (scanner.Current == '>')
{
yield return new MarkupToken(MarkupTokenType.TagStart, tagName);
var tagName = input[start..scanner.Position].Trim();
if (VoidElements.Contains(tagName))
{
yield return new MarkupToken(MarkupTokenType.TagVoid, tagName);
}
else
{
yield return new MarkupToken(MarkupTokenType.TagStart, tagName);
}

state = MarkupLexerState.Text;
start = scanner.Position + 1;
}
}
}
else if (scanner.Current == '\n')
{
scanner.Advance();
yield return new MarkupToken(MarkupTokenType.LineBreak, "");
}
else
{
var text = scanner.ReadUntilAny('<', '\n');
yield return new MarkupToken(MarkupTokenType.Text, text);

break;

case MarkupLexerState.TagValue:
if (scanner.Current == '>')
{
var tagValue = input[start..scanner.Position].Trim();
yield return new MarkupToken(MarkupTokenType.TagValue, tagValue);
state = MarkupLexerState.Text;
start = scanner.Position + 1;
}

break;

case MarkupLexerState.TagClose:
if (scanner.Current == '>')
{
var tagName = input[start..scanner.Position].Trim();
yield return new MarkupToken(MarkupTokenType.TagClose, tagName);
state = MarkupLexerState.Text;
start = scanner.Position + 1;
}

break;
}

scanner.Advance();
}

if (scanner.Position > start)
{
yield return new MarkupToken(MarkupTokenType.Text, input[start..scanner.Position]);
}

yield return new MarkupToken(MarkupTokenType.End, "");
Expand Down
13 changes: 13 additions & 0 deletions GW2SDK/Features/Markup/MarkupLexerState.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@

namespace GuildWars2.Markup;

/// <summary>
/// The scanning modes that the markup lexer switches between while it walks the input
/// one character at a time.
/// </summary>
internal enum MarkupLexerState
{
    /// <summary>
    /// Outside of any tag. Characters accumulate as text until a '&lt;' starts a tag
    /// or a line feed produces a line break token.
    /// </summary>
    Text = 0,

    /// <summary>
    /// Directly after '&lt;'. The tag name is being read until '/', '=' or '&gt;' is found.
    /// </summary>
    TagOpen = 1,

    /// <summary>
    /// After the '=' of an opening tag. The tag value is being read until '&gt;' is found.
    /// </summary>
    TagValue = 2,

    /// <summary>
    /// After '&lt;/'. The name of the closing tag is being read until '&gt;' is found.
    /// </summary>
    TagClose = 3
}
34 changes: 9 additions & 25 deletions GW2SDK/Features/Markup/Scanner.cs
Original file line number Diff line number Diff line change
@@ -1,35 +1,19 @@

using System.Diagnostics;

namespace GuildWars2.Markup;

internal class Scanner(string input)
{
private int position;

public char Current => position >= input.Length ? '\0' : input[position];

public bool CanAdvance => position < input.Length;

public void Advance() => position++;
public int Position { get; private set; }

public string ReadUntil(char c)
{
var start = position;
while (Current != c && CanAdvance)
{
Advance();
}
public char Current => Position >= input.Length ? '\0' : input[Position];

return input[start..position];
}
public bool CanAdvance => Position < input.Length;

public string ReadUntilAny(params char[] chars)
{
var start = position;
while (!chars.Contains(Current) && CanAdvance)
{
Advance();
}
[DebuggerStepThrough]
public char Peek() => Position + 1 >= input.Length ? '\0' : input[Position + 1];

return input[start..position];
}
[DebuggerStepThrough]
public void Advance() => Position++;
}

0 comments on commit d26b324

Please sign in to comment.