From 77cb0dcf0af1b31acf7fc813315c7c3defac23f8 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Thu, 7 Mar 2024 07:02:34 +0900 Subject: [PATCH] Separate `IOSource#ensure_buffer` from `IOSource#match`. (#118) ## Why? Doing a read check in `IOSource#match` on every call hurts performance, so the read processing is separated out into `IOSource#ensure_buffer`. Call `IOSource#ensure_buffer` in the following cases, where `@source.buffer` can be empty: 1. At the start of `pull_event`. 2. When a trailing `'>'` pattern matches, as in `@source.match(/\s*>/um)`. ## Benchmark ``` RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.0/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin22] Calculating ------------------------------------- before after before(YJIT) after(YJIT) dom 10.278 10.986 16.430 16.941 i/s - 100.000 times in 9.729858s 9.102574s 6.086579s 5.902885s sax 30.166 30.496 49.851 51.596 i/s - 100.000 times in 3.315008s 3.279069s 2.005961s 1.938123s pull 35.459 36.380 60.266 63.134 i/s - 100.000 times in 2.820181s 2.748745s 1.659301s 1.583928s stream 33.762 34.636 55.173 55.859 i/s - 100.000 times in 2.961948s 2.887131s 1.812485s 1.790218s Comparison: dom after(YJIT): 16.9 i/s before(YJIT): 16.4 i/s - 1.03x slower after: 11.0 i/s - 1.54x slower before: 10.3 i/s - 1.65x slower sax after(YJIT): 51.6 i/s before(YJIT): 49.9 i/s - 1.04x slower after: 30.5 i/s - 1.69x slower before: 30.2 i/s - 1.71x slower pull after(YJIT): 63.1 i/s before(YJIT): 60.3 i/s - 1.05x slower after: 36.4 i/s - 1.74x slower before: 35.5 i/s - 1.78x slower stream after(YJIT): 55.9 i/s before(YJIT): 55.2 i/s - 1.01x slower after: 34.6 i/s - 1.61x slower before: 33.8 i/s - 1.65x slower ``` - YJIT=ON : 1.01x - 1.05x faster - YJIT=OFF : 1.01x - 1.06x faster --- lib/rexml/parsers/baseparser.rb | 5 +++++ lib/rexml/source.rb | 8 +++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git 
a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index c79de0eb..c01b087b 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -210,6 +210,8 @@ def pull_event return @stack.shift if @stack.size > 0 #STDERR.puts @source.encoding #STDERR.puts "BUFFER = #{@source.buffer.inspect}" + + @source.ensure_buffer if @document_status == nil start_position = @source.position if @source.match("/um, true) id = [nil, nil, nil] @document_status = :after_doctype + @source.ensure_buffer else id = parse_id(base_error_message, accept_external_id: true, @@ -248,6 +251,7 @@ def pull_event @document_status = :in_doctype elsif @source.match(/\s*>/um, true) @document_status = :after_doctype + @source.ensure_buffer else message = "#{base_error_message}: garbage after external ID" raise REXML::ParseException.new(message, @source) @@ -646,6 +650,7 @@ def parse_attributes(prefixes, curr_ns) raise REXML::ParseException.new(message, @source) end unless scanner.scan(/.*#{Regexp.escape(quote)}/um) + @source.ensure_buffer match_data = @source.match(/^(.*?)(\/)?>/um, true) if match_data scanner << "/" if closed diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 81d96451..7f47c2be 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -68,6 +68,9 @@ def encoding=(enc) def read end + def ensure_buffer + end + def match(pattern, cons=false) if cons @scanner.scan(pattern).nil? ? nil : @scanner @@ -165,11 +168,14 @@ def read end end + def ensure_buffer + read if @scanner.eos? && @source + end + # Note: When specifying a string for 'pattern', it must not include '>' except in the following formats: # - ">" # - "XXX>" (X is any string excluding '>') def match( pattern, cons=false ) - read if @scanner.eos? && @source while true if cons md = @scanner.scan(pattern)