Skip to content

Commit

Permalink
Optimize the parse_attributes method to use Source#match to parse XML.
Browse files Browse the repository at this point in the history
## Why?

Improve maintainability by consolidating processing into `Source#match`.

## [Benchmark]

```
RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.0/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin22]
Calculating -------------------------------------
                         before       after  before(YJIT)  after(YJIT)
                 dom     10.193      10.886        16.969       18.136 i/s -     100.000 times in 9.810390s 9.186087s 5.893093s 5.513848s
                 sax     30.812      30.131        50.759       55.520 i/s -     100.000 times in 3.245535s 3.318887s 1.970105s 1.801142s
                pull     36.413      35.511        60.692       68.538 i/s -     100.000 times in 2.746293s 2.816067s 1.647660s 1.459039s
              stream     34.968      34.461        52.797       60.311 i/s -     100.000 times in 2.859738s 2.901858s 1.894046s 1.658079s

Comparison:
                              dom
         after(YJIT):        18.1 i/s
        before(YJIT):        17.0 i/s - 1.07x  slower
               after:        10.9 i/s - 1.67x  slower
              before:        10.2 i/s - 1.78x  slower

                              sax
         after(YJIT):        55.5 i/s
        before(YJIT):        50.8 i/s - 1.09x  slower
              before:        30.8 i/s - 1.80x  slower
               after:        30.1 i/s - 1.84x  slower

                             pull
         after(YJIT):        68.5 i/s
        before(YJIT):        60.7 i/s - 1.13x  slower
              before:        36.4 i/s - 1.88x  slower
               after:        35.5 i/s - 1.93x  slower

                           stream
         after(YJIT):        60.3 i/s
        before(YJIT):        52.8 i/s - 1.14x  slower
              before:        35.0 i/s - 1.72x  slower
               after:        34.5 i/s - 1.75x  slower

```

- YJIT=ON : 1.07x - 1.14x faster
- YJIT=OFF : 0.97x - 1.06x faster
  • Loading branch information
naitoh committed Mar 12, 2024
1 parent 303dd15 commit cce06cd
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 78 deletions.
116 changes: 44 additions & 72 deletions lib/rexml/parsers/baseparser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ class BaseParser

module Private
INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um
TAG_PATTERN = /((?>#{QNAME_STR}))/um
TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
NAME_PATTERN = /\s*#{NAME}/um
Expand All @@ -128,7 +128,6 @@ module Private
def initialize( source )
self.stream = source
@listeners = []
@attributes_scanner = StringScanner.new('')
end

def add_listener( listener )
Expand Down Expand Up @@ -614,87 +613,60 @@ def process_instruction(start_position)
def parse_attributes(prefixes, curr_ns)
attributes = {}
closed = false
match_data = @source.match(/^(.*?)(\/)?>/um, true)
if match_data.nil?
message = "Start tag isn't ended"
raise REXML::ParseException.new(message, @source)
end

raw_attributes = match_data[1]
closed = !match_data[2].nil?
return attributes, closed if raw_attributes.nil?
return attributes, closed if raw_attributes.empty?

@attributes_scanner.string = raw_attributes
scanner = @attributes_scanner
until scanner.eos?
if scanner.scan(/\s+/)
break if scanner.eos?
end

start_position = scanner.pos
while true
break if scanner.scan(ATTRIBUTE_PATTERN)
unless scanner.scan(QNAME)
message = "Invalid attribute name: <#{scanner.rest}>"
raise REXML::ParseException.new(message, @source)
end
name = scanner[0]
unless scanner.scan(/\s*=\s*/um)
while true
if @source.match(">", true)
return attributes, closed
elsif @source.match("/>", true)
closed = true
return attributes, closed
elsif match = @source.match(QNAME, true)
name = match[1]
prefix = match[2]
local_part = match[3]

unless @source.match(/\s*=\s*/um, true)
message = "Missing attribute equal: <#{name}>"
raise REXML::ParseException.new(message, @source)
end
quote = scanner.scan(/['"]/)
unless quote
message = "Missing attribute value start quote: <#{name}>"
raise REXML::ParseException.new(message, @source)
end
unless scanner.scan(/.*#{Regexp.escape(quote)}/um)
@source.ensure_buffer
match_data = @source.match(/^(.*?)(\/)?>/um, true)
if match_data
scanner << "/" if closed
scanner << ">"
scanner << match_data[1]
scanner.pos = start_position
closed = !match_data[2].nil?
next
unless match = @source.match(/(['"])(.*?)\1\s*/um, true)
if match = @source.match(/(['"])/, true)
message =
"Missing attribute value end quote: <#{name}>: <#{match[1]}>"
raise REXML::ParseException.new(message, @source)
else
message = "Missing attribute value start quote: <#{name}>"
raise REXML::ParseException.new(message, @source)
end
message =
"Missing attribute value end quote: <#{name}>: <#{quote}>"
raise REXML::ParseException.new(message, @source)
end
end
name = scanner[1]
prefix = scanner[2]
local_part = scanner[3]
# quote = scanner[4]
value = scanner[5]
if prefix == "xmlns"
if local_part == "xml"
if value != "http://www.w3.org/XML/1998/namespace"
msg = "The 'xml' prefix must not be bound to any other namespace "+
value = match[2]
if prefix == "xmlns"
if local_part == "xml"
if value != "http://www.w3.org/XML/1998/namespace"
msg = "The 'xml' prefix must not be bound to any other namespace "+
"(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
raise REXML::ParseException.new( msg, @source, self )
end
elsif local_part == "xmlns"
msg = "The 'xmlns' prefix must not be declared "+
"(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
raise REXML::ParseException.new( msg, @source, self )
raise REXML::ParseException.new( msg, @source, self)
end
elsif local_part == "xmlns"
msg = "The 'xmlns' prefix must not be declared "+
"(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
raise REXML::ParseException.new( msg, @source, self)
curr_ns << local_part
elsif prefix
prefixes << prefix unless prefix == "xml"
end
curr_ns << local_part
elsif prefix
prefixes << prefix unless prefix == "xml"
end

if attributes.has_key?(name)
msg = "Duplicate attribute #{name.inspect}"
raise REXML::ParseException.new(msg, @source, self)
end
if attributes[name]
msg = "Duplicate attribute #{name.inspect}"
raise REXML::ParseException.new(msg, @source, self)
end

attributes[name] = value
attributes[name] = value
else
message = "Invalid attribute name: <#{@source.buffer}>"
raise REXML::ParseException.new(message, @source)
end
end
return attributes, closed
end
end
end
Expand Down
6 changes: 3 additions & 3 deletions test/parse/test_element.rb
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,11 @@ def test_empty_namespace_attribute_name
parse("<x :a=\"\"></x>")
end
assert_equal(<<-DETAIL.chomp, exception.to_s)
Invalid attribute name: <:a="">
Invalid attribute name: <:a=""></x>>
Line: 1
Position: 9
Position: 13
Last 80 unconsumed characters:
:a=""></x>
DETAIL
end

Expand Down
9 changes: 6 additions & 3 deletions test/test_core.rb
Original file line number Diff line number Diff line change
Expand Up @@ -116,11 +116,12 @@ def test_attribute

def test_attribute_namespace_conflict
# https://www.w3.org/TR/xml-names/#uniqAttrs
message = <<-MESSAGE
message = <<-MESSAGE.chomp
Duplicate attribute "a"
Line: 4
Position: 140
Last 80 unconsumed characters:
/>
MESSAGE
assert_raise(REXML::ParseException.new(message)) do
Document.new(<<-XML)
Expand Down Expand Up @@ -1323,11 +1324,12 @@ def test_ticket_21
exception = assert_raise(ParseException) do
Document.new(src)
end
assert_equal(<<-DETAIL, exception.to_s)
assert_equal(<<-DETAIL.chomp, exception.to_s)
Missing attribute value start quote: <bar>
Line: 1
Position: 16
Last 80 unconsumed characters:
value/>
DETAIL
end

Expand All @@ -1336,11 +1338,12 @@ def test_ticket_21_part_2
exception = assert_raise(ParseException) do
Document.new(src)
end
assert_equal(<<-DETAIL, exception.to_s)
assert_equal(<<-DETAIL.chomp, exception.to_s)
Missing attribute value end quote: <bar>: <">
Line: 1
Position: 17
Last 80 unconsumed characters:
value/>
DETAIL
end

Expand Down

0 comments on commit cce06cd

Please sign in to comment.