diff --git a/benchmark/parse.yaml b/benchmark/parse.yaml index e7066fcb..8818b50c 100644 --- a/benchmark/parse.yaml +++ b/benchmark/parse.yaml @@ -5,6 +5,8 @@ contexts: require: false prelude: require 'rexml' - name: master + gems: + strscan: 3.0.8 prelude: | $LOAD_PATH.unshift(File.expand_path("lib")) require 'rexml' @@ -16,6 +18,8 @@ contexts: require 'rexml' RubyVM::YJIT.enable - name: master(YJIT) + gems: + strscan: 3.0.8 prelude: | $LOAD_PATH.unshift(File.expand_path("lib")) require 'rexml' diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 305b1207..65bad260 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -96,7 +96,7 @@ class BaseParser ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))" PEDECL = "" GEDECL = "" - ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um + ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um NOTATIONDECL_START = /\A\s*0 - rv - end - def read end - def consume( pattern ) - @buffer = $' if pattern.match( @buffer ) - end - - def match_to( char, pattern ) - return pattern.match(@buffer) - end - - def match_to_consume( char, pattern ) - md = pattern.match(@buffer) - @buffer = $' - return md - end - def match(pattern, cons=false) - md = pattern.match(@buffer) - @buffer = $' if cons and md - return md + if cons + @scanner.scan(pattern).nil? ? nil : @scanner + else + @scanner.check(pattern).nil? ? nil : @scanner + end end # @return true if the Source is exhausted def empty? - @buffer == "" - end - - def position - @orig.index( @buffer ) + @scanner.eos? end # @return the current line in the source def current_line lines = @orig.split - res = lines.grep @buffer[0..30] + res = lines.grep @scanner.rest[0..30] res = res[-1] if res.kind_of? Array lines.index( res ) if res end private + def detect_encoding - buffer_encoding = @buffer.encoding + scanner_encoding = @scanner.rest.encoding detected_encoding = "UTF-8" begin - @buffer.force_encoding("ASCII-8BIT") - if @buffer[0, 2] == "\xfe\xff" - @buffer[0, 2] = "" + @scanner.string.force_encoding("ASCII-8BIT") + if @scanner.scan(/\xfe\xff/n) detected_encoding = "UTF-16BE" - elsif @buffer[0, 2] == "\xff\xfe" - @buffer[0, 2] = "" + elsif @scanner.scan(/\xff\xfe/n) detected_encoding = "UTF-16LE" - elsif @buffer[0, 3] == "\xef\xbb\xbf" - @buffer[0, 3] = "" + elsif @scanner.scan(/\xef\xbb\xbf/n) detected_encoding = "UTF-8" end ensure - @buffer.force_encoding(buffer_encoding) + @scanner.string.force_encoding(scanner_encoding) end self.encoding = detected_encoding end def encoding_updated if @encoding != 'UTF-8' - @buffer = decode(@buffer) + @scanner.string = decode(@scanner.rest) @to_utf = true else @to_utf = false - @buffer.force_encoding ::Encoding::UTF_8 + @scanner.string.force_encoding(::Encoding::UTF_8) end end end @@ -172,7 +138,7 @@ def initialize(arg, block_size=500, encoding=nil) end if !@to_utf and - @buffer.respond_to?(:force_encoding) and + @orig.respond_to?(:force_encoding) and @source.respond_to?(:external_encoding) and @source.external_encoding != ::Encoding::UTF_8 @force_utf8 = true @@ -181,65 +147,44 @@ def initialize(arg, block_size=500, encoding=nil) end end - def scan(pattern, cons=false) - rv = super - # You'll notice that this next section is very similar to the same - # section in match(), but just a liiittle different. This is - # because it is a touch faster to do it this way with scan() - # than the way match() does it; enough faster to warrant duplicating - # some code - if rv.size == 0 - until @buffer =~ pattern or @source.nil? - begin - @buffer << readline - rescue Iconv::IllegalSequence - raise - rescue - @source = nil - end - end - rv = super - end - rv.taint if RUBY_VERSION < '2.7' - rv - end - def read begin - @buffer << readline + # NOTE: `@scanner << readline` does not free memory, so when parsing huge XML in JRuby's DOM, + # out-of-memory error `Java::JavaLang::OutOfMemoryError: Java heap space` occurs. + # `@scanner.string = @scanner.rest + readline` frees memory that is already consumed + # and avoids this problem. + @scanner.string = @scanner.rest + readline rescue Exception, NameError @source = nil end end - def consume( pattern ) - match( pattern, true ) - end - def match( pattern, cons=false ) - rv = pattern.match(@buffer) - @buffer = $' if cons and rv - while !rv and @source + if cons + md = @scanner.scan(pattern) + else + md = @scanner.check(pattern) + end + while md.nil? and @source begin - @buffer << readline - rv = pattern.match(@buffer) - @buffer = $' if cons and rv + @scanner << readline + if cons + md = @scanner.scan(pattern) + else + md = @scanner.check(pattern) + end rescue @source = nil end end - rv.taint if RUBY_VERSION < '2.7' - rv + + md.nil? ? nil : @scanner end def empty? super and ( @source.nil? || @source.eof? ) end - def position - @er_source.pos rescue 0 - end - # @return the current line in the source def current_line begin @@ -290,7 +235,7 @@ def encoding_updated @source.set_encoding(@encoding, @encoding) end @line_break = encode(">") - @pending_buffer, @buffer = @buffer, "" + @pending_buffer, @scanner.string = @scanner.rest, "" @pending_buffer.force_encoding(@encoding) super end diff --git a/rexml.gemspec b/rexml.gemspec index b51df33b..2ba1c64d 100644 --- a/rexml.gemspec +++ b/rexml.gemspec @@ -55,6 +55,8 @@ Gem::Specification.new do |spec| spec.required_ruby_version = '>= 2.5.0' + spec.add_runtime_dependency("strscan", ">= 3.0.8") + spec.add_development_dependency "benchmark_driver" spec.add_development_dependency "bundler" spec.add_development_dependency "rake" diff --git a/test/parse/test_entity_declaration.rb b/test/parse/test_entity_declaration.rb new file mode 100644 index 00000000..e15deec6 --- /dev/null +++ b/test/parse/test_entity_declaration.rb @@ -0,0 +1,36 @@ +# frozen_string_literal: false +require 'test/unit' +require 'rexml/document' + +module REXMLTests + class TestParseEntityDeclaration < Test::Unit::TestCase + private + def xml(internal_subset) + <<-XML + + + XML + end + + def parse(internal_subset) + REXML::Document.new(xml(internal_subset)).doctype + end + + def test_empty + exception = assert_raise(REXML::ParseException) do + parse(<<-INTERNAL_SUBSET) + + INTERNAL_SUBSET + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed notation declaration: name is missing +Line: 5 +Position: 72 +Last 80 unconsumed characters: + ]> + DETAIL + end + end +end diff --git a/test/test_core.rb b/test/test_core.rb index 7c18c03f..8c33d834 100644 --- a/test/test_core.rb +++ b/test/test_core.rb @@ -727,7 +727,7 @@ def test_iso_8859_1_output_function koln_iso_8859_1 = "K\xF6ln" koln_utf8 = "K\xc3\xb6ln" source = Source.new( koln_iso_8859_1, 'iso-8859-1' ) - results = source.scan(/.*/)[0] + results = source.match(/.*/)[0] koln_utf8.force_encoding('UTF-8') if koln_utf8.respond_to?(:force_encoding) assert_equal koln_utf8, results output << results