diff --git a/lib/datasets/wikipedia.rb b/lib/datasets/wikipedia.rb
index ab818bf..1d73156 100644
--- a/lib/datasets/wikipedia.rb
+++ b/lib/datasets/wikipedia.rb
@@ -48,11 +48,18 @@ def each(&block)
       open_data do |input|
         listener = ArticlesListener.new(block)
         parser = REXML::Parsers::StreamParser.new(input, listener)
-        parser.parse
+        with_increased_entity_expansion_text_limit do
+          parser.parse
+        end
       end
     end
 
     private
+
+    # Value assigned to REXML::Security.entity_expansion_text_limit while
+    # a dump is being parsed (1_342_177_280 == 1.25 * 1024**3).
+    ENTITY_EXPANSION_TEXT_LIMIT = 1_342_177_280
+
     def base_name
       "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
     end
@@ -80,6 +87,24 @@ def type_in_path
       end
     end
 
+    # Runs the given block with REXML's entity expansion text limit
+    # raised to ENTITY_EXPANSION_TEXT_LIMIT, then puts the previous limit
+    # back, even when the block raises.
+    #
+    # NOTE(review): the limit is set on the REXML::Security module, not on
+    # the parser, so presumably other REXML users see it too -- confirm.
+    def with_increased_entity_expansion_text_limit
+      # Read the current limit before entering begin/ensure: if this read
+      # raised inside the protected region, ensure would store nil.
+      default_limit = REXML::Security.entity_expansion_text_limit
+      begin
+        REXML::Security.entity_expansion_text_limit = ENTITY_EXPANSION_TEXT_LIMIT
+        yield
+      ensure
+        REXML::Security.entity_expansion_text_limit = default_limit
+      end
+    end
+
     class ArticlesListener
       include REXML::StreamListener
 