Skip to content

Commit

Permalink
Changed processing of @document_status == nil in REXML::Parsers::Base…
Browse files Browse the repository at this point in the history
…Parser#pull_event from regular expression to processing using StringScanner.

[Why]
Improve maintainability by restructuring the code so that parsing advances through the input with StringScanner#scan.

[Changed]
- Added a read_source option to IOSource#match to suppress reading from @source.
- Added the Source#string= method, which is used when building error messages.
  • Loading branch information
naitoh committed Feb 17, 2024
1 parent 372daf1 commit fbff363
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 19 deletions.
36 changes: 20 additions & 16 deletions lib/rexml/parsers/baseparser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -48,17 +48,19 @@ class BaseParser
REFERENCE = "&(?:#{NAME};|#\\d+;|#x[0-9a-fA-F]+;)"
REFERENCE_RE = /#{REFERENCE}/

DOCTYPE_START = /\A\s*<!DOCTYPE\s/um
DOCTYPE_END = /\A\s*\]\s*>/um
DOCTYPE_START = /<!DOCTYPE\s/um
DOCTYPE_END = /\s*\]\s*>/um
ATTRIBUTE_PATTERN = /\s*(#{QNAME_STR})\s*=\s*(["'])(.*?)\4/um
COMMENT_START = /\A<!--/u
COMMENT_END = /(.*?)-->/um
COMMENT_PATTERN = /<!--(.*?)-->/um
CDATA_START = /\A<!\[CDATA\[/u
CDATA_END = /\A\s*\]\s*>/um
CDATA_PATTERN = /<!\[CDATA\[(.*?)\]\]>/um
XMLDECL_START = /\A<\?xml\s/u;
XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um
XMLDECL_PATTERN = /\s*<\?xml\s+(.*?)\?>/um
INSTRUCTION_START = /\A<\?/u
INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um
INSTRUCTION_PATTERN = /<\?#{NAME}(\s+.*?)?\?>/um
TAG_MATCH = /\A<((?>#{QNAME_STR}))/um
CLOSE_MATCH = /\A\s*<\/(#{QNAME_STR})\s*>/um
Expand Down Expand Up @@ -198,15 +200,10 @@ def pull_event
#STDERR.puts @source.encoding
#STDERR.puts "BUFFER = #{@source.buffer.inspect}"
if @document_status == nil
word = @source.match( /\A((?:\s+)|(?:<[^>]*>))/um )
word = word[1] unless word.nil?
#STDERR.puts "WORD = #{word.inspect}"
case word
when COMMENT_START
return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
when XMLDECL_START
@source.read
if results = @source.match(XMLDECL_PATTERN, true, false)
#STDERR.puts "XMLDECL"
results = @source.match( XMLDECL_PATTERN, true )[1]
results = results[1]
version = VERSION.match( results )
version = version[1] unless version.nil?
encoding = ENCODING.match(results)
Expand All @@ -220,11 +217,18 @@ def pull_event
standalone = STANDALONE.match(results)
standalone = standalone[1] unless standalone.nil?
return [ :xmldecl, version, encoding, standalone ]
when INSTRUCTION_START
return process_instruction
when DOCTYPE_START
elsif @source.match(COMMENT_START, true, false)
return [ :comment, @source.match( COMMENT_END, true )[1] ]
elsif @source.match(INSTRUCTION_START, true, false)
md = @source.match(INSTRUCTION_END, true)
unless md
message = "Invalid processing instruction node"
@source.string = "<?" + @source.buffer
raise REXML::ParseException.new(message, @source)
end
return [:processing_instruction, md[1], md[2]]
elsif @source.match(DOCTYPE_START, true, false)
base_error_message = "Malformed DOCTYPE"
@source.match(DOCTYPE_START, true)
@nsstack.unshift(curr_ns=Set.new)
name = parse_name(base_error_message)
if @source.match(/\A\s*\[/um, true)
Expand Down Expand Up @@ -256,7 +260,7 @@ def pull_event
@stack << [ :end_doctype ]
end
return args
when /\A\s+/
elsif @source.match( /\A\s+/, false, false )
else
@document_status = :after_doctype
if @source.encoding == "UTF-8"
Expand Down
10 changes: 7 additions & 3 deletions lib/rexml/source.rb
Original file line number Diff line number Diff line change
Expand Up @@ -68,14 +68,18 @@ def encoding=(enc)
# No-op: this implementation has an empty body and returns nil.
def read
end

def match(pattern, cons=false)
def match(pattern, cons=false, read_source=false)
if cons
@scanner.scan(pattern).nil? ? nil : @scanner
else
@scanner.check(pattern).nil? ? nil : @scanner
end
end

# Replaces the text held by @scanner with +string+.
#
# BaseParser#pull_event calls this just before raising a ParseException for
# an invalid processing instruction, so that the already-consumed "<?" is
# put back in front of the remaining buffer for the error message.
# NOTE(review): @scanner is presumably a StringScanner, whose #string= also
# resets the scan position -- confirm against the constructor.
def string=(string)
@scanner.string = string
end

# @return true if the Source is exhausted
def empty?
@scanner.eos?
Expand Down Expand Up @@ -155,13 +159,13 @@ def read
end
end

def match( pattern, cons=false )
def match( pattern, cons=false, read_source=true )
if cons
md = @scanner.scan(pattern)
else
md = @scanner.check(pattern)
end
while md.nil? and @source
while read_source && md.nil? && @source
begin
@scanner << readline
if cons
Expand Down

0 comments on commit fbff363

Please sign in to comment.