Skip to content

Commit

Permalink
Use string scanner with baseparser (#105)
Browse files Browse the repository at this point in the history
Using StringScanner reduces the string copying process and speeds up the
process.

And I removed unnecessary methods.


https://github.com/ruby/rexml/actions/runs/7549990000/job/20554906140?pr=105

```
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [x86_64-linux]
Calculating -------------------------------------
                     rexml 3.2.6      master  3.2.6(YJIT)  master(YJIT) 
                 dom       4.868       5.077        8.137         8.303 i/s -     100.000 times in 20.540529s 19.696590s 12.288900s 12.043666s
                 sax      13.597      13.953       19.206        20.948 i/s -     100.000 times in 7.354343s 7.167142s 5.206745s 4.773765s
                pull      15.641      16.918       22.266        25.378 i/s -     100.000 times in 6.393424s 5.910955s 4.491201s 3.940471s
              stream      14.339      15.844       19.810        22.206 i/s -     100.000 times in 6.973856s 6.311350s 5.047957s 4.503244s

Comparison:
                              dom
        master(YJIT):         8.3 i/s 
         3.2.6(YJIT):         8.1 i/s - 1.02x  slower
              master:         5.1 i/s - 1.64x  slower
         rexml 3.2.6:         4.9 i/s - 1.71x  slower

                              sax
        master(YJIT):        20.9 i/s 
         3.2.6(YJIT):        19.2 i/s - 1.09x  slower
              master:        14.0 i/s - 1.50x  slower
         rexml 3.2.6:        13.6 i/s - 1.54x  slower

                             pull
        master(YJIT):        25.4 i/s 
         3.2.6(YJIT):        22.3 i/s - 1.14x  slower
              master:        16.9 i/s - 1.50x  slower
         rexml 3.2.6:        15.6 i/s - 1.62x  slower

                           stream
        master(YJIT):        22.2 i/s 
         3.2.6(YJIT):        19.8 i/s - 1.12x  slower
              master:        15.8 i/s - 1.40x  slower
         rexml 3.2.6:        14.3 i/s - 1.55x  slower
```

- YJIT=ON : 1.02x - 1.14x faster
- YJIT=OFF : 1.02x - 1.10x faster

---------

Co-authored-by: Sutou Kouhei <[email protected]>
  • Loading branch information
naitoh and kou authored Jan 17, 2024
1 parent 72a26d6 commit 810d228
Show file tree
Hide file tree
Showing 6 changed files with 99 additions and 115 deletions.
4 changes: 4 additions & 0 deletions benchmark/parse.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ contexts:
require: false
prelude: require 'rexml'
- name: master
gems:
strscan: 3.0.8
prelude: |
$LOAD_PATH.unshift(File.expand_path("lib"))
require 'rexml'
Expand All @@ -16,6 +18,8 @@ contexts:
require 'rexml'
RubyVM::YJIT.enable
- name: master(YJIT)
gems:
strscan: 3.0.8
prelude: |
$LOAD_PATH.unshift(File.expand_path("lib"))
require 'rexml'
Expand Down
21 changes: 9 additions & 12 deletions lib/rexml/parsers/baseparser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ class BaseParser
ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um

NOTATIONDECL_START = /\A\s*<!NOTATION/um
EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
Expand Down Expand Up @@ -259,7 +259,7 @@ def pull_event
else
@document_status = :after_doctype
if @source.encoding == "UTF-8"
@source.buffer.force_encoding(::Encoding::UTF_8)
@source.buffer_encoding = ::Encoding::UTF_8
end
end
end
Expand All @@ -274,8 +274,7 @@ def pull_event
return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]

when ENTITY_START
match = @source.match( ENTITYDECL, true ).to_a.compact
match[0] = :entitydecl
match = [:entitydecl, *@source.match( ENTITYDECL, true ).captures.compact]
ref = false
if match[1] == '%'
ref = true
Expand Down Expand Up @@ -392,6 +391,7 @@ def pull_event
unless md
raise REXML::ParseException.new("malformed XML: missing tag start", @source)
end
tag = md[1]
@document_status = :in_element
prefixes = Set.new
prefixes << md[2] if md[2]
Expand All @@ -405,23 +405,20 @@ def pull_event
end

if closed
@closed = md[1]
@closed = tag
@nsstack.shift
else
@tags.push( md[1] )
@tags.push( tag )
end
return [ :start_element, md[1], attributes ]
return [ :start_element, tag, attributes ]
end
else
md = @source.match( TEXT_PATTERN, true )
text = md[1]
if md[0].length == 0
@source.match( /(\s+)/, true )
end
#STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
#return [ :text, "" ] if md[0].length == 0
# unnormalized = Text::unnormalize( md[1], self )
# return PullEvent.new( :text, md[1], unnormalized )
return [ :text, md[1] ]
return [ :text, text ]
end
rescue REXML::UndefinedNamespaceException
raise
Expand Down
149 changes: 47 additions & 102 deletions lib/rexml/source.rb
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,6 @@ def SourceFactory::create_from(arg)
# objects and provides consumption of text
class Source
include Encoding
# The current buffer (what we're going to read next)
attr_reader :buffer
# The line number of the last consumed text
attr_reader :line
attr_reader :encoding
Expand All @@ -41,7 +39,8 @@ class Source
# @param encoding if non-null, sets the encoding of the source to this
# value, overriding all encoding detection
def initialize(arg, encoding=nil)
@orig = @buffer = arg
@orig = arg
@scanner = StringScanner.new(@orig)
if encoding
self.encoding = encoding
else
Expand All @@ -50,6 +49,14 @@ def initialize(arg, encoding=nil)
@line = 0
end

# The current buffer (what we're going to read next)
def buffer
@scanner.rest
end

def buffer_encoding=(encoding)
@scanner.string.force_encoding(encoding)
end

# Inherited from Encoding
# Overridden to support optimized en/decoding
Expand All @@ -58,98 +65,57 @@ def encoding=(enc)
encoding_updated
end

# Scans the source for a given pattern. Note, that this is not your
# usual scan() method. For one thing, the pattern argument has some
# requirements; for another, the source can be consumed. You can easily
# confuse this method. Originally, the patterns were easier
# to construct and this method more robust, because this method
# generated search regexps on the fly; however, this was
# computationally expensive and slowed down the entire REXML package
# considerably, since this is by far the most commonly called method.
# @param pattern must be a Regexp, and must be in the form of
# /^\s*(#{your pattern, with no groups})(.*)/. The first group
# will be returned; the second group is used if the consume flag is
# set.
# @param consume if true, the pattern returned will be consumed, leaving
# everything after it in the Source.
# @return the pattern, if found, or nil if the Source is empty or the
# pattern is not found.
def scan(pattern, cons=false)
return nil if @buffer.nil?
rv = @buffer.scan(pattern)
@buffer = $' if cons and rv.size>0
rv
end

def read
end

def consume( pattern )
@buffer = $' if pattern.match( @buffer )
end

def match_to( char, pattern )
return pattern.match(@buffer)
end

def match_to_consume( char, pattern )
md = pattern.match(@buffer)
@buffer = $'
return md
end

def match(pattern, cons=false)
md = pattern.match(@buffer)
@buffer = $' if cons and md
return md
if cons
@scanner.scan(pattern).nil? ? nil : @scanner
else
@scanner.check(pattern).nil? ? nil : @scanner
end
end

# @return true if the Source is exhausted
def empty?
@buffer == ""
end

def position
@orig.index( @buffer )
@scanner.eos?
end

# @return the current line in the source
def current_line
lines = @orig.split
res = lines.grep @buffer[0..30]
res = lines.grep @scanner.rest[0..30]
res = res[-1] if res.kind_of? Array
lines.index( res ) if res
end

private

def detect_encoding
buffer_encoding = @buffer.encoding
scanner_encoding = @scanner.rest.encoding
detected_encoding = "UTF-8"
begin
@buffer.force_encoding("ASCII-8BIT")
if @buffer[0, 2] == "\xfe\xff"
@buffer[0, 2] = ""
@scanner.string.force_encoding("ASCII-8BIT")
if @scanner.scan(/\xfe\xff/n)
detected_encoding = "UTF-16BE"
elsif @buffer[0, 2] == "\xff\xfe"
@buffer[0, 2] = ""
elsif @scanner.scan(/\xff\xfe/n)
detected_encoding = "UTF-16LE"
elsif @buffer[0, 3] == "\xef\xbb\xbf"
@buffer[0, 3] = ""
elsif @scanner.scan(/\xef\xbb\xbf/n)
detected_encoding = "UTF-8"
end
ensure
@buffer.force_encoding(buffer_encoding)
@scanner.string.force_encoding(scanner_encoding)
end
self.encoding = detected_encoding
end

def encoding_updated
if @encoding != 'UTF-8'
@buffer = decode(@buffer)
@scanner.string = decode(@scanner.rest)
@to_utf = true
else
@to_utf = false
@buffer.force_encoding ::Encoding::UTF_8
@scanner.string.force_encoding(::Encoding::UTF_8)
end
end
end
Expand All @@ -172,7 +138,7 @@ def initialize(arg, block_size=500, encoding=nil)
end

if !@to_utf and
@buffer.respond_to?(:force_encoding) and
@orig.respond_to?(:force_encoding) and
@source.respond_to?(:external_encoding) and
@source.external_encoding != ::Encoding::UTF_8
@force_utf8 = true
Expand All @@ -181,65 +147,44 @@ def initialize(arg, block_size=500, encoding=nil)
end
end

def scan(pattern, cons=false)
rv = super
# You'll notice that this next section is very similar to the same
# section in match(), but just a liiittle different. This is
# because it is a touch faster to do it this way with scan()
# than the way match() does it; enough faster to warrant duplicating
# some code
if rv.size == 0
until @buffer =~ pattern or @source.nil?
begin
@buffer << readline
rescue Iconv::IllegalSequence
raise
rescue
@source = nil
end
end
rv = super
end
rv.taint if RUBY_VERSION < '2.7'
rv
end

def read
begin
@buffer << readline
# NOTE: `@scanner << readline` does not free memory, so when parsing huge XML in JRuby's DOM,
# out-of-memory error `Java::JavaLang::OutOfMemoryError: Java heap space` occurs.
# `@scanner.string = @scanner.rest + readline` frees memory that is already consumed
# and avoids this problem.
@scanner.string = @scanner.rest + readline
rescue Exception, NameError
@source = nil
end
end

def consume( pattern )
match( pattern, true )
end

def match( pattern, cons=false )
rv = pattern.match(@buffer)
@buffer = $' if cons and rv
while !rv and @source
if cons
md = @scanner.scan(pattern)
else
md = @scanner.check(pattern)
end
while md.nil? and @source
begin
@buffer << readline
rv = pattern.match(@buffer)
@buffer = $' if cons and rv
@scanner << readline
if cons
md = @scanner.scan(pattern)
else
md = @scanner.check(pattern)
end
rescue
@source = nil
end
end
rv.taint if RUBY_VERSION < '2.7'
rv

md.nil? ? nil : @scanner
end

def empty?
super and ( @source.nil? || @source.eof? )
end

def position
@er_source.pos rescue 0
end

# @return the current line in the source
def current_line
begin
Expand Down Expand Up @@ -290,7 +235,7 @@ def encoding_updated
@source.set_encoding(@encoding, @encoding)
end
@line_break = encode(">")
@pending_buffer, @buffer = @buffer, ""
@pending_buffer, @scanner.string = @scanner.rest, ""
@pending_buffer.force_encoding(@encoding)
super
end
Expand Down
2 changes: 2 additions & 0 deletions rexml.gemspec
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ Gem::Specification.new do |spec|

spec.required_ruby_version = '>= 2.5.0'

spec.add_runtime_dependency("strscan", ">= 3.0.8")

spec.add_development_dependency "benchmark_driver"
spec.add_development_dependency "bundler"
spec.add_development_dependency "rake"
Expand Down
36 changes: 36 additions & 0 deletions test/parse/test_entity_declaration.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# frozen_string_literal: false
require 'test/unit'
require 'rexml/document'

module REXMLTests
class TestParseEntityDeclaration < Test::Unit::TestCase
private
def xml(internal_subset)
<<-XML
<!DOCTYPE r SYSTEM "urn:x-henrikmartensson:test" [
#{internal_subset}
]>
<r/>
XML
end

def parse(internal_subset)
REXML::Document.new(xml(internal_subset)).doctype
end

def test_empty
exception = assert_raise(REXML::ParseException) do
parse(<<-INTERNAL_SUBSET)
<!ENTITY>
INTERNAL_SUBSET
end
assert_equal(<<-DETAIL.chomp, exception.to_s)
Malformed notation declaration: name is missing
Line: 5
Position: 72
Last 80 unconsumed characters:
<!ENTITY> ]> <r/>
DETAIL
end
end
end
2 changes: 1 addition & 1 deletion test/test_core.rb
Original file line number Diff line number Diff line change
Expand Up @@ -727,7 +727,7 @@ def test_iso_8859_1_output_function
koln_iso_8859_1 = "K\xF6ln"
koln_utf8 = "K\xc3\xb6ln"
source = Source.new( koln_iso_8859_1, 'iso-8859-1' )
results = source.scan(/.*/)[0]
results = source.match(/.*/)[0]
koln_utf8.force_encoding('UTF-8') if koln_utf8.respond_to?(:force_encoding)
assert_equal koln_utf8, results
output << results
Expand Down

0 comments on commit 810d228

Please sign in to comment.