From 810d2285235d5501a0a124f300832e6e9515da3c Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Wed, 17 Jan 2024 15:32:57 +0900 Subject: [PATCH] Use string scanner with baseparser (#105) Using StringScanner reduces the string copying process and speeds up the process. And I removed unnecessary methods. https://github.com/ruby/rexml/actions/runs/7549990000/job/20554906140?pr=105 ``` ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [x86_64-linux] Calculating ------------------------------------- rexml 3.2.6 master 3.2.6(YJIT) master(YJIT) dom 4.868 5.077 8.137 8.303 i/s - 100.000 times in 20.540529s 19.696590s 12.288900s 12.043666s sax 13.597 13.953 19.206 20.948 i/s - 100.000 times in 7.354343s 7.167142s 5.206745s 4.773765s pull 15.641 16.918 22.266 25.378 i/s - 100.000 times in 6.393424s 5.910955s 4.491201s 3.940471s stream 14.339 15.844 19.810 22.206 i/s - 100.000 times in 6.973856s 6.311350s 5.047957s 4.503244s Comparison: dom master(YJIT): 8.3 i/s 3.2.6(YJIT): 8.1 i/s - 1.02x slower master: 5.1 i/s - 1.64x slower rexml 3.2.6: 4.9 i/s - 1.71x slower sax master(YJIT): 20.9 i/s 3.2.6(YJIT): 19.2 i/s - 1.09x slower master: 14.0 i/s - 1.50x slower rexml 3.2.6: 13.6 i/s - 1.54x slower pull master(YJIT): 25.4 i/s 3.2.6(YJIT): 22.3 i/s - 1.14x slower master: 16.9 i/s - 1.50x slower rexml 3.2.6: 15.6 i/s - 1.62x slower stream master(YJIT): 22.2 i/s 3.2.6(YJIT): 19.8 i/s - 1.12x slower master: 15.8 i/s - 1.40x slower rexml 3.2.6: 14.3 i/s - 1.55x slower ``` - YJIT=ON : 1.02x - 1.14x faster - YJIT=OFF : 1.02x - 1.10x faster --------- Co-authored-by: Sutou Kouhei --- benchmark/parse.yaml | 4 + lib/rexml/parsers/baseparser.rb | 21 ++-- lib/rexml/source.rb | 149 ++++++++------------------ rexml.gemspec | 2 + test/parse/test_entity_declaration.rb | 36 +++++++ test/test_core.rb | 2 +- 6 files changed, 99 insertions(+), 115 deletions(-) create mode 100644 test/parse/test_entity_declaration.rb diff --git a/benchmark/parse.yaml b/benchmark/parse.yaml index e7066fcb..8818b50c 100644 --- a/benchmark/parse.yaml +++ b/benchmark/parse.yaml @@ -5,6 +5,8 @@ contexts: require: false prelude: require 'rexml' - name: master + gems: + strscan: 3.0.8 prelude: | $LOAD_PATH.unshift(File.expand_path("lib")) require 'rexml' @@ -16,6 +18,8 @@ contexts: require 'rexml' RubyVM::YJIT.enable - name: master(YJIT) + gems: + strscan: 3.0.8 prelude: | $LOAD_PATH.unshift(File.expand_path("lib")) require 'rexml' diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 305b1207..65bad260 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -96,7 +96,7 @@ class BaseParser ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))" PEDECL = "" GEDECL = "" - ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um + ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um NOTATIONDECL_START = /\A\s*0 - rv - end - def read end - def consume( pattern ) - @buffer = $' if pattern.match( @buffer ) - end - - def match_to( char, pattern ) - return pattern.match(@buffer) - end - - def match_to_consume( char, pattern ) - md = pattern.match(@buffer) - @buffer = $' - return md - end - def match(pattern, cons=false) - md = pattern.match(@buffer) - @buffer = $' if cons and md - return md + if cons + @scanner.scan(pattern).nil? ? nil : @scanner + else + @scanner.check(pattern).nil? ? nil : @scanner + end end # @return true if the Source is exhausted def empty? - @buffer == "" - end - - def position - @orig.index( @buffer ) + @scanner.eos? end # @return the current line in the source def current_line lines = @orig.split - res = lines.grep @buffer[0..30] + res = lines.grep @scanner.rest[0..30] res = res[-1] if res.kind_of? Array lines.index( res ) if res end private + def detect_encoding - buffer_encoding = @buffer.encoding + scanner_encoding = @scanner.rest.encoding detected_encoding = "UTF-8" begin - @buffer.force_encoding("ASCII-8BIT") - if @buffer[0, 2] == "\xfe\xff" - @buffer[0, 2] = "" + @scanner.string.force_encoding("ASCII-8BIT") + if @scanner.scan(/\xfe\xff/n) detected_encoding = "UTF-16BE" - elsif @buffer[0, 2] == "\xff\xfe" - @buffer[0, 2] = "" + elsif @scanner.scan(/\xff\xfe/n) detected_encoding = "UTF-16LE" - elsif @buffer[0, 3] == "\xef\xbb\xbf" - @buffer[0, 3] = "" + elsif @scanner.scan(/\xef\xbb\xbf/n) detected_encoding = "UTF-8" end ensure - @buffer.force_encoding(buffer_encoding) + @scanner.string.force_encoding(scanner_encoding) end self.encoding = detected_encoding end def encoding_updated if @encoding != 'UTF-8' - @buffer = decode(@buffer) + @scanner.string = decode(@scanner.rest) @to_utf = true else @to_utf = false - @buffer.force_encoding ::Encoding::UTF_8 + @scanner.string.force_encoding(::Encoding::UTF_8) end end end @@ -172,7 +138,7 @@ def initialize(arg, block_size=500, encoding=nil) end if !@to_utf and - @buffer.respond_to?(:force_encoding) and + @orig.respond_to?(:force_encoding) and @source.respond_to?(:external_encoding) and @source.external_encoding != ::Encoding::UTF_8 @force_utf8 = true @@ -181,65 +147,44 @@ def initialize(arg, block_size=500, encoding=nil) end end - def scan(pattern, cons=false) - rv = super - # You'll notice that this next section is very similar to the same - # section in match(), but just a liiittle different. This is - # because it is a touch faster to do it this way with scan() - # than the way match() does it; enough faster to warrant duplicating - # some code - if rv.size == 0 - until @buffer =~ pattern or @source.nil? - begin - @buffer << readline - rescue Iconv::IllegalSequence - raise - rescue - @source = nil - end - end - rv = super - end - rv.taint if RUBY_VERSION < '2.7' - rv - end - def read begin - @buffer << readline + # NOTE: `@scanner << readline` does not free memory, so when parsing huge XML in JRuby's DOM, + # out-of-memory error `Java::JavaLang::OutOfMemoryError: Java heap space` occurs. + # `@scanner.string = @scanner.rest + readline` frees memory that is already consumed + # and avoids this problem. + @scanner.string = @scanner.rest + readline rescue Exception, NameError @source = nil end end - def consume( pattern ) - match( pattern, true ) - end - def match( pattern, cons=false ) - rv = pattern.match(@buffer) - @buffer = $' if cons and rv - while !rv and @source + if cons + md = @scanner.scan(pattern) + else + md = @scanner.check(pattern) + end + while md.nil? and @source begin - @buffer << readline - rv = pattern.match(@buffer) - @buffer = $' if cons and rv + @scanner << readline + if cons + md = @scanner.scan(pattern) + else + md = @scanner.check(pattern) + end rescue @source = nil end end - rv.taint if RUBY_VERSION < '2.7' - rv + + md.nil? ? nil : @scanner end def empty? super and ( @source.nil? || @source.eof? ) end - def position - @er_source.pos rescue 0 - end - # @return the current line in the source def current_line begin @@ -290,7 +235,7 @@ def encoding_updated @source.set_encoding(@encoding, @encoding) end @line_break = encode(">") - @pending_buffer, @buffer = @buffer, "" + @pending_buffer, @scanner.string = @scanner.rest, "" @pending_buffer.force_encoding(@encoding) super end diff --git a/rexml.gemspec b/rexml.gemspec index b51df33b..2ba1c64d 100644 --- a/rexml.gemspec +++ b/rexml.gemspec @@ -55,6 +55,8 @@ Gem::Specification.new do |spec| spec.required_ruby_version = '>= 2.5.0' + spec.add_runtime_dependency("strscan", ">= 3.0.8") + spec.add_development_dependency "benchmark_driver" spec.add_development_dependency "bundler" spec.add_development_dependency "rake" diff --git a/test/parse/test_entity_declaration.rb b/test/parse/test_entity_declaration.rb new file mode 100644 index 00000000..e15deec6 --- /dev/null +++ b/test/parse/test_entity_declaration.rb @@ -0,0 +1,36 @@ +# frozen_string_literal: false +require 'test/unit' +require 'rexml/document' + +module REXMLTests + class TestParseEntityDeclaration < Test::Unit::TestCase + private + def xml(internal_subset) + <<-XML + + + XML + end + + def parse(internal_subset) + REXML::Document.new(xml(internal_subset)).doctype + end + + def test_empty + exception = assert_raise(REXML::ParseException) do + parse(<<-INTERNAL_SUBSET) + + INTERNAL_SUBSET + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed notation declaration: name is missing +Line: 5 +Position: 72 +Last 80 unconsumed characters: + ]> + DETAIL + end + end +end diff --git a/test/test_core.rb b/test/test_core.rb index 7c18c03f..8c33d834 100644 --- a/test/test_core.rb +++ b/test/test_core.rb @@ -727,7 +727,7 @@ def test_iso_8859_1_output_function koln_iso_8859_1 = "K\xF6ln" koln_utf8 = "K\xc3\xb6ln" source = Source.new( koln_iso_8859_1, 'iso-8859-1' ) - results = source.scan(/.*/)[0] + results = source.match(/.*/)[0] koln_utf8.force_encoding('UTF-8') if koln_utf8.respond_to?(:force_encoding) assert_equal koln_utf8, results output << results