Skip to content

Commit

Permalink
Improve regex parsing to skip empty attribute and handle html encoded…
Browse files Browse the repository at this point in the history
… separators to avoid an extra call to HtmlDecode
  • Loading branch information
onizet committed Nov 10, 2024
1 parent 25eba15 commit d506001
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 9 deletions.
11 changes: 2 additions & 9 deletions src/Html2OpenXml/Collections/HtmlAttributeCollection.cs
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,7 @@ namespace HtmlToOpenXml;
/// </summary>
sealed class HtmlAttributeCollection
{
private static readonly Regex stripStyleAttributesRegex = new(@"(?<name>.+?):\s*(?<val>[^;]+);*\s*");

private static readonly Regex stripStyleAttributesRegex = new(@"(?<name>[^;\s]+)\s?(&\#58;|:)\s?(?<val>[^;&]+)\s?(;|&\#59;)*");
private readonly Dictionary<string, string> attributes = [];


Expand All @@ -37,13 +36,7 @@ public static HtmlAttributeCollection ParseStyle(string? htmlTag)

// Encoded ':' and ';' characters (&#58; and &#59;) are valid for browsers, so the regex accepts them as separators alongside the literal ones (bug #13812 reported by robin391)
// ex= <span style="text-decoration&#58;underline&#59;color:red">
MatchCollection matches = stripStyleAttributesRegex.Matches(
#if NET5_0_OR_GREATER
System.Web.HttpUtility.HtmlDecode(htmlTag)
#else
HttpUtility.HtmlDecode(htmlTag)
#endif
);
MatchCollection matches = stripStyleAttributesRegex.Matches(htmlTag);
foreach (Match m in matches)
collection.attributes[m.Groups["name"].Value] = m.Groups["val"].Value;

Expand Down
16 changes: 16 additions & 0 deletions test/HtmlToOpenXml.Tests/StyleTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -165,5 +165,21 @@ public void DuplicateStyle_ReturnsLatter()
var styleAttributes = HtmlAttributeCollection.ParseStyle("color:red;color:blue");
Assert.That(styleAttributes["color"], Is.EqualTo("blue"));
}

[Test(Description = "Encoded ':' and ';' characters are valid")]
public void EncodedStyle_ShouldSucceed()
{
    // The first declaration uses the HTML-encoded separators (&#58; for ':' and &#59; for ';'),
    // the second one uses the literal ':' so both forms are exercised in a single style string.
    const string style = "text-decoration&#58;underline&#59;color:red";

    HtmlAttributeCollection parsed = HtmlAttributeCollection.ParseStyle(style);

    Assert.That(parsed["text-decoration"], Is.EqualTo("underline"));
    Assert.That(parsed["color"], Is.EqualTo("red"));
}

[Test(Description = "Key style with no value should be ignored")]
public void EmptyStyle_ShouldBeIgnoredd()
{
    // "text-decoration" is followed directly by ';' (no separator, no value):
    // it must not be registered, while the well-formed "color:red" still is.
    const string style = "text-decoration;color:red";

    HtmlAttributeCollection parsed = HtmlAttributeCollection.ParseStyle(style);

    Assert.That(parsed["text-decoration"], Is.Null);
    Assert.That(parsed["color"], Is.EqualTo("red"));
}
}
}

0 comments on commit d506001

Please sign in to comment.