Skip to content

Commit

Permalink
Changed processing in REXML::Parsers::BaseParser#pull_event from regu…
Browse files Browse the repository at this point in the history
…lar expression to processing using StringScanner.

## Why
Improve maintainability by optimizing the process so that the parsing process proceeds using StringScanner#scan.

# Changed
- Added read_source option to IOSource#match to suppress read from @source.
- Added Source#string= method for error message output.

## Benchmark

```
RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.0/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin22]
Calculating -------------------------------------
                         before       after  before(YJIT)  after(YJIT)
                 dom     11.308      11.437        17.833       18.369 i/s -     100.000 times in 8.843230s 8.743769s 5.607477s 5.443861s
                 sax     31.280      31.835        48.002       51.767 i/s -     100.000 times in 3.196881s 3.141148s 2.083259s 1.931737s
                pull     36.954      37.981        59.502       62.359 i/s -     100.000 times in 2.706080s 2.632914s 1.680629s 1.603608s
              stream     34.328      36.263        50.594       56.571 i/s -     100.000 times in 2.913063s 2.757657s 1.976527s 1.767694s

Comparison:
                              dom
         after(YJIT):        18.4 i/s
        before(YJIT):        17.8 i/s - 1.03x  slower
               after:        11.4 i/s - 1.61x  slower
              before:        11.3 i/s - 1.62x  slower

                              sax
         after(YJIT):        51.8 i/s
        before(YJIT):        48.0 i/s - 1.08x  slower
               after:        31.8 i/s - 1.63x  slower
              before:        31.3 i/s - 1.65x  slower

                             pull
         after(YJIT):        62.4 i/s
        before(YJIT):        59.5 i/s - 1.05x  slower
               after:        38.0 i/s - 1.64x  slower
              before:        37.0 i/s - 1.69x  slower

                           stream
         after(YJIT):        56.6 i/s
        before(YJIT):        50.6 i/s - 1.12x  slower
               after:        36.3 i/s - 1.56x  slower
              before:        34.3 i/s - 1.65x  slower

```

- YJIT=ON : 1.03x - 1.12x faster
- YJIT=OFF : 1.01x - 1.05x faster
  • Loading branch information
naitoh committed Feb 24, 2024
1 parent 0656925 commit 9a075bf
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 109 deletions.
195 changes: 89 additions & 106 deletions lib/rexml/parsers/baseparser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -48,29 +48,15 @@ class BaseParser
REFERENCE = "&(?:#{NAME};|#\\d+;|#x[0-9a-fA-F]+;)"
REFERENCE_RE = /#{REFERENCE}/

DOCTYPE_START = /\A\s*<!DOCTYPE\s/um
DOCTYPE_END = /\A\s*\]\s*>/um
ATTRIBUTE_PATTERN = /\s*(#{QNAME_STR})\s*=\s*(["'])(.*?)\4/um
COMMENT_START = /\A<!--/u
COMMENT_PATTERN = /<!--(.*?)-->/um
CDATA_START = /\A<!\[CDATA\[/u
CDATA_END = /\A\s*\]\s*>/um
CDATA_PATTERN = /<!\[CDATA\[(.*?)\]\]>/um
XMLDECL_START = /\A<\?xml\s/u;
XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um
INSTRUCTION_START = /\A<\?/u
INSTRUCTION_PATTERN = /<\?#{NAME}(\s+.*?)?\?>/um
TAG_MATCH = /\A<((?>#{QNAME_STR}))/um
CLOSE_MATCH = /\A\s*<\/(#{QNAME_STR})\s*>/um
INSTRUCTION_PATTERN = /#{NAME}(\s+.*?)?\?>/um
TAG_MATCH = /((?>#{QNAME_STR}))/um
CLOSE_MATCH = /(#{QNAME_STR})\s*>/um

VERSION = /\bversion\s*=\s*["'](.*?)['"]/um
ENCODING = /\bencoding\s*=\s*["'](.*?)['"]/um
STANDALONE = /\bstandalone\s*=\s*["'](.*?)['"]/um

ENTITY_START = /\A\s*<!ENTITY/
ELEMENTDECL_START = /\A\s*<!ELEMENT/um
ELEMENTDECL_PATTERN = /\A\s*(<!ELEMENT.*?)>/um
SYSTEMENTITY = /\A\s*(%.*?;)\s*$/um
ENUMERATION = "\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)"
NOTATIONTYPE = "NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)"
ENUMERATEDTYPE = "(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))"
Expand All @@ -79,10 +65,7 @@ class BaseParser
DEFAULTDECL = "(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))"
ATTDEF = "\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}"
ATTDEF_RE = /#{ATTDEF}/
ATTLISTDECL_START = /\A\s*<!ATTLIST/um
ATTLISTDECL_PATTERN = /\A\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um

TEXT_PATTERN = /\A([^<]*)/um
ATTLISTDECL_PATTERN = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um

# Entity constants
PUBIDCHAR = "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#"
Expand All @@ -94,11 +77,10 @@ class BaseParser
ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))}
PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})"
ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
PEDECL = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
GEDECL = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
ENTITYDECL = /(?:#{GEDECL})|(?:#{PEDECL})/um

NOTATIONDECL_START = /\A\s*<!NOTATION/um
EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
EXTERNAL_ID_SYSTEM = /\A\s*SYSTEM\s+#{SYSTEMLITERAL}\s*/um
PUBLIC_ID = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s*/um
Expand Down Expand Up @@ -198,65 +180,67 @@ def pull_event
#STDERR.puts @source.encoding
#STDERR.puts "BUFFER = #{@source.buffer.inspect}"
if @document_status == nil
word = @source.match( /\A((?:\s+)|(?:<[^>]*>))/um )
word = word[1] unless word.nil?
#STDERR.puts "WORD = #{word.inspect}"
case word
when COMMENT_START
return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
when XMLDECL_START
#STDERR.puts "XMLDECL"
results = @source.match( XMLDECL_PATTERN, true )[1]
version = VERSION.match( results )
version = version[1] unless version.nil?
encoding = ENCODING.match(results)
encoding = encoding[1] unless encoding.nil?
if need_source_encoding_update?(encoding)
@source.encoding = encoding
end
if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
encoding = "UTF-16"
end
standalone = STANDALONE.match(results)
standalone = standalone[1] unless standalone.nil?
return [ :xmldecl, version, encoding, standalone ]
when INSTRUCTION_START
return process_instruction
when DOCTYPE_START
base_error_message = "Malformed DOCTYPE"
@source.match(DOCTYPE_START, true)
@nsstack.unshift(curr_ns=Set.new)
name = parse_name(base_error_message)
if @source.match(/\A\s*\[/um, true)
id = [nil, nil, nil]
@document_status = :in_doctype
elsif @source.match(/\A\s*>/um, true)
id = [nil, nil, nil]
@document_status = :after_doctype
else
id = parse_id(base_error_message,
accept_external_id: true,
accept_public_id: false)
if id[0] == "SYSTEM"
# For backward compatibility
id[1], id[2] = id[2], nil
@source.read
if @source.match("<?", true, false)
if results = @source.match(/xml\s+(.*?)\?>/um, true, false)
results = results[1]
version = VERSION.match( results )
version = version[1] unless version.nil?
encoding = ENCODING.match(results)
encoding = encoding[1] unless encoding.nil?
if need_source_encoding_update?(encoding)
@source.encoding = encoding
end
if @source.match(/\A\s*\[/um, true)
if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
encoding = "UTF-16"
end
standalone = STANDALONE.match(results)
standalone = standalone[1] unless standalone.nil?
return [ :xmldecl, version, encoding, standalone ]
else # instruction
return process_instruction
end
elsif @source.match("<!", true, false)
if @source.match("--", true, false)
return [ :comment, @source.match( /(.*?)-->/um, true )[1] ]
elsif @source.match(/DOCTYPE\s/um, true, false)
base_error_message = "Malformed DOCTYPE"
@nsstack.unshift(curr_ns=Set.new)
name = parse_name(base_error_message)
if @source.match(/\s*\[/um, true)
id = [nil, nil, nil]
@document_status = :in_doctype
elsif @source.match(/\A\s*>/um, true)
elsif @source.match(/\s*>/um, true)
id = [nil, nil, nil]
@document_status = :after_doctype
else
message = "#{base_error_message}: garbage after external ID"
raise REXML::ParseException.new(message, @source)
id = parse_id(base_error_message,
accept_external_id: true,
accept_public_id: false)
if id[0] == "SYSTEM"
# For backward compatibility
id[1], id[2] = id[2], nil
end
if @source.match(/\s*\[/um, true)
@document_status = :in_doctype
elsif @source.match(/\s*>/um, true)
@document_status = :after_doctype
else
message = "#{base_error_message}: garbage after external ID"
raise REXML::ParseException.new(message, @source)
end
end
args = [:start_doctype, name, *id]
if @document_status == :after_doctype
@source.match(/\s*/um, true)
@stack << [ :end_doctype ]
end
return args
else
message = "Invalid XML"
raise REXML::ParseException.new(message, @source)
end
args = [:start_doctype, name, *id]
if @document_status == :after_doctype
@source.match(/\A\s*/um, true)
@stack << [ :end_doctype ]
end
return args
when /\A\s+/
elsif @source.match( /\s+/, false, false )
else
@document_status = :after_doctype
if @source.encoding == "UTF-8"
Expand All @@ -265,16 +249,13 @@ def pull_event
end
end
if @document_status == :in_doctype
md = @source.match(/\A\s*(.*?>)/um)
case md[1]
when SYSTEMENTITY
match = @source.match( SYSTEMENTITY, true )[1]
return [ :externalentity, match ]

when ELEMENTDECL_START
return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]

when ENTITY_START
@source.read
@source.match(/\s*/um, true, false) # skip spaces
if match = @source.match( /(%.*?;)\s*$/um, true, false)
return [ :externalentity, match[1] ]
elsif match = @source.match(/(<!ELEMENT.*?)>/um, true, false)
return [ :elementdecl, match[1] ]
elsif @source.match( "<!ENTITY", true, false)
match = [:entitydecl, *@source.match( ENTITYDECL, true ).captures.compact]
ref = false
if match[1] == '%'
Expand All @@ -300,7 +281,7 @@ def pull_event
end
match << '%' if ref
return match
when ATTLISTDECL_START
elsif @source.match( "<!ATTLIST", true, false)
md = @source.match( ATTLISTDECL_PATTERN, true )
raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
element = md[1]
Expand All @@ -320,42 +301,41 @@ def pull_event
end
end
return [ :attlistdecl, element, pairs, contents ]
when NOTATIONDECL_START
elsif @source.match( "<!NOTATION", true, false)
base_error_message = "Malformed notation declaration"
unless @source.match(/\A\s*<!NOTATION\s+/um, true)
if @source.match(/\A\s*<!NOTATION\s*>/um)
unless @source.match(/\s+/um, true)
if @source.match(/\s*>/um)
message = "#{base_error_message}: name is missing"
else
message = "#{base_error_message}: invalid declaration name"
end
@source.string = " <!NOTATION" + @source.buffer
raise REXML::ParseException.new(message, @source)
end
name = parse_name(base_error_message)
id = parse_id(base_error_message,
accept_external_id: true,
accept_public_id: true)
unless @source.match(/\A\s*>/um, true)
unless @source.match(/\s*>/um, true)
message = "#{base_error_message}: garbage before end >"
raise REXML::ParseException.new(message, @source)
end
return [:notationdecl, name, *id]
when DOCTYPE_END
elsif @source.match( /\]\s*>/um, true, false)
@document_status = :after_doctype
@source.match( DOCTYPE_END, true )
return [ :end_doctype ]
end
end
if @document_status == :after_doctype
@source.match(/\A\s*/um, true)
@source.match(/\s*/um, true)
end
begin
next_data = @source.buffer
if next_data.size < 2
@source.read
next_data = @source.buffer
end
if next_data[0] == ?<
if next_data[1] == ?/
if @source.match("<", true, false)
if @source.match("/", true, false)
@nsstack.shift
last_tag = @tags.pop
md = @source.match( CLOSE_MATCH, true )
Expand All @@ -366,15 +346,16 @@ def pull_event
if md.nil? or last_tag != md[1]
message = "Missing end tag for '#{last_tag}'"
message += " (got '#{md[1]}')" if md
@source.string = "</" + @source.buffer if md.nil?
raise REXML::ParseException.new(message, @source)
end
return [ :end_element, last_tag ]
elsif next_data[1] == ?!
md = @source.match(/\A(\s*[^>]*>)/um)
elsif @source.match("!", true, false)
md = @source.match(/([^>]*>)/um)
#STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
raise REXML::ParseException.new("Malformed node", @source) unless md
if md[0][2] == ?-
md = @source.match( COMMENT_PATTERN, true )
if md[0][0] == ?-
md = @source.match( /--(.*?)-->/um, true )

case md[1]
when /--/, /-\z/
Expand All @@ -383,17 +364,18 @@ def pull_event

return [ :comment, md[1] ] if md
else
md = @source.match( CDATA_PATTERN, true )
md = @source.match( /\[CDATA\[(.*?)\]\]>/um, true )
return [ :cdata, md[1] ] if md
end
raise REXML::ParseException.new( "Declarations can only occur "+
"in the doctype declaration.", @source)
elsif next_data[1] == ??
elsif @source.match("?", true, false)
return process_instruction
else
# Get the next tag
md = @source.match(TAG_MATCH, true)
unless md
@source.string = "<" + @source.buffer
raise REXML::ParseException.new("malformed XML: missing tag start", @source)
end
tag = md[1]
Expand All @@ -418,7 +400,7 @@ def pull_event
return [ :start_element, tag, attributes ]
end
else
md = @source.match( TEXT_PATTERN, true )
md = @source.match( /([^<]*)/um, true )
text = md[1]
return [ :text, text ]
end
Expand Down Expand Up @@ -579,6 +561,7 @@ def process_instruction
match_data = @source.match(INSTRUCTION_PATTERN, true)
unless match_data
message = "Invalid processing instruction node"
@source.string = "<?" + @source.buffer
raise REXML::ParseException.new(message, @source)
end
[:processing_instruction, match_data[1], match_data[2]]
Expand Down
10 changes: 7 additions & 3 deletions lib/rexml/source.rb
Original file line number Diff line number Diff line change
Expand Up @@ -68,14 +68,18 @@ def encoding=(enc)
def read
end

def match(pattern, cons=false)
def match(pattern, cons=false, read_source=false)
if cons
@scanner.scan(pattern).nil? ? nil : @scanner
else
@scanner.check(pattern).nil? ? nil : @scanner
end
end

def string=(string)
@scanner.string = string
end

# @return true if the Source is exhausted
def empty?
@scanner.eos?
Expand Down Expand Up @@ -155,13 +159,13 @@ def read
end
end

def match( pattern, cons=false )
def match( pattern, cons=false, read_source=true )
if cons
md = @scanner.scan(pattern)
else
md = @scanner.check(pattern)
end
while md.nil? and @source
while read_source && md.nil? && @source
begin
@scanner << readline
if cons
Expand Down

0 comments on commit 9a075bf

Please sign in to comment.