From 0e49e2b0a7105c4e211e340d29489ace167ea1c8 Mon Sep 17 00:00:00 2001 From: otegami Date: Mon, 5 Aug 2024 20:45:14 +0800 Subject: [PATCH] wikipedia: increase REXML entity expansion limit during XML parsing Using `Datasets::Wikipedia#each` raised an `entity expansion has grown too large (RuntimeError)` error. This error occurs because REXML enforces an entity expansion limit since https://github.com/ruby/rexml/pull/187, and `Datasets::Wikipedia#each` exceeds that limit. In Red Datasets, increasing the entity expansion limit is not a problem because we want to handle large datasets. Therefore, we temporarily increase the limit while parsing and restore the previous limit afterwards. ```ruby require 'datasets' wikipedia = Datasets::Wikipedia.new wikipedia.each do |wiki| pp wiki end ``` ```console $ cd red-datasets && bundle && bundle exec ruby wiki /home/otegami/.rbenv/versions/3.3.3/lib/ruby/gems/3.3.0/gems/rexml-3.3.4/lib/rexml/parsers/baseparser.rb:560:in `block in unnormalize': entity expansion has grown too large (RuntimeError) ``` --- lib/datasets/wikipedia.rb | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/lib/datasets/wikipedia.rb b/lib/datasets/wikipedia.rb index ab818bf..1d73156 100644 --- a/lib/datasets/wikipedia.rb +++ b/lib/datasets/wikipedia.rb @@ -48,11 +48,16 @@ def each(&block) open_data do |input| listener = ArticlesListener.new(block) parser = REXML::Parsers::StreamParser.new(input, listener) - parser.parse + with_increased_entity_expansion_text_limit do + parser.parse + end end end private + + ENTITY_EXPANSION_TEXT_LIMIT = 1_342_177_280 + def base_name "#{@language}wiki-latest-#{type_in_path}.xml.bz2" end @@ -80,6 +85,14 @@ def type_in_path end end + def with_increased_entity_expansion_text_limit + default_limit = REXML::Security.entity_expansion_text_limit + REXML::Security.entity_expansion_text_limit = ENTITY_EXPANSION_TEXT_LIMIT + yield + ensure + REXML::Security.entity_expansion_text_limit = default_limit + end + class ArticlesListener include REXML::StreamListener