From b8ca0c28c672a061d7371a74f6ffb00b68f59693 Mon Sep 17 00:00:00 2001 From: hmdne <54514036+hmdne@users.noreply.github.com> Date: Fri, 23 Aug 2024 20:37:02 +0200 Subject: [PATCH] input/output/converter: input output pattern This provides a backbone for implementing #72. Convert reverse_adoc to Coradoc::Input::HTML (fixes #100) This commit also implements Input::{Adoc,Docx} and Output::Adoc, based on existing code. Everything is handled by a new Converter class and our current CLI tools are refactored to use the new Converter class. This gives us a solid framework for implementing #64. --- exe/reverse_adoc | 54 ++++----- exe/w2a | 42 +++---- lib/coradoc.rb | 13 ++- lib/coradoc/converter.rb | 139 +++++++++++++++++++++++ lib/coradoc/input.rb | 8 +- lib/coradoc/input/adoc.rb | 19 ++++ lib/coradoc/input/docx.rb | 35 ++++++ lib/coradoc/input/html.rb | 30 ++++- lib/coradoc/input/html/cleaner.rb | 4 + lib/coradoc/input/html/html_converter.rb | 51 ++++++--- lib/coradoc/output.rb | 10 ++ lib/coradoc/output/adoc.rb | 17 +++ 12 files changed, 338 insertions(+), 84 deletions(-) create mode 100644 lib/coradoc/converter.rb create mode 100644 lib/coradoc/input/adoc.rb create mode 100644 lib/coradoc/input/docx.rb create mode 100644 lib/coradoc/output.rb create mode 100644 lib/coradoc/output/adoc.rb diff --git a/exe/reverse_adoc b/exe/reverse_adoc index d00f692..515d09a 100755 --- a/exe/reverse_adoc +++ b/exe/reverse_adoc @@ -5,27 +5,36 @@ require "rubygems" require "bundler/setup" require "coradoc/input/html" +require "coradoc/converter" require "optparse" require "fileutils" +config = { + input_options: input_options = {}, + input_processor: :html, + output_options: output_options = {}, + output_processor: :adoc, +} +destination = nil + OptionParser.new do |opts| opts.banner = "Usage: reverse_adoc [options] " opts.on("-m", "--mathml2asciimath", "Convert MathML to AsciiMath") do |_v| - Coradoc::Input::HTML.config.mathml2asciimath = true + input_options[:mathml2asciimath] = true end opts.on("-oFILENAME", "--output=FILENAME", "Output file to write to") do |v| - Coradoc::Input::HTML.config.destination = File.expand_path(v) + destination = File.expand_path(v) # puts "output goes to #{Coradoc::Input::HTML.config.destination}" end opts.on("-e", "--external-images", "Export images if data URI") do |_v| - Coradoc::Input::HTML.config.external_images = true + input_options[:external_images] = true end opts.on("-u", "--unknown_tags [pass_through, drop, bypass, raise]", "Unknown tag handling (default: pass_through)") do |v| - Coradoc::Input::HTML.config.unknown_tags = v + input_options[:unknown_tags] = v end opts.on("-r", "--require RUBYMODULE", "Require additional Ruby file") do |v| @@ -33,15 +42,15 @@ OptionParser.new do |opts| end opts.on("--track-time", "Track time spent on each step") do - Coradoc::Input::HTML.config.track_time = true + input_options[:track_time] = true end opts.on("--split-sections LEVEL", "Split sections up to LEVEL") do |i| - Coradoc::Input::HTML.config.split_sections = i.to_i + input_options[:split_sections] = i.to_i end opts.on("-v", "--version", "Version information") do |_v| - puts "reverse_adoc: v#{Coradoc::Input::HTML::VERSION}" + puts "Coradoc: v#{Coradoc::VERSION}" exit end @@ -52,40 +61,21 @@ OptionParser.new do |opts| end.parse! if filename = ARGV.pop - input_content = IO.read(filename) - Coradoc::Input::HTML.config.sourcedir = File.dirname(File.expand_path(filename)) + input_content = filename else - if Coradoc::Input::HTML.config.external_images + if input_options[:external_images] raise "The -e | --external-images feature cannot be used with STDIN input. Exiting." end - input_content = ARGF.read + input_content = ARGF end -if Coradoc::Input::HTML.config.external_images && Coradoc::Input::HTML.config.destination.nil? +if input_options[:external_images] && destination.nil? raise "The -e | --external-images feature must be used with -o | --output. Exiting." end -if Coradoc::Input::HTML.config.split_sections && Coradoc::Input::HTML.config.destination.nil? +if input_options[:split_sections] && destination.nil? raise "The --split_sections feature must be used with -o | --output. Exiting." end -# Read from STDIN -adoc_content = Coradoc::Input::HTML.convert(input_content) - -# Print to STDOUT -unless Coradoc::Input::HTML.config.destination - puts adoc_content - exit -end - -# Write output to Coradoc::Input::HTML.config.destination -adoc_content = {nil => adoc_content} unless adoc_content.is_a? Hash - -adoc_content.each do |file, content| - destination = Coradoc::Input::HTML.config.destination - destdir = File.dirname(destination) - filename = file ? "#{destdir}/#{file}" : destination - FileUtils.mkdir_p(File.dirname(filename)) - File.write(filename, content) -end +Coradoc::Converter.(input_content, destination, **config) diff --git a/exe/w2a b/exe/w2a index 71f4c47..f62bd80 100755 --- a/exe/w2a +++ b/exe/w2a @@ -6,27 +6,35 @@ require "bundler/setup" require "word-to-markdown" require "optparse" -require "coradoc/input/html" +require "coradoc" ARGV.push("-h") if ARGV.empty? +config = { + input_options: input_options = {}, + input_processor: :docx, + output_options: output_options = {}, + output_processor: :adoc, +} +destination = nil + OptionParser.new do |opts| opts.banner = "Usage: w2a [options] " opts.on("-m", "--mathml2asciimath", "Convert MathML to AsciiMath") do |_v| - Coradoc::Input::HTML.config.mathml2asciimath = true + input_options[:mathml2asciimath] = true end opts.on("-oFILENAME", "--output=FILENAME", "Output file to write to") do |v| - Coradoc::Input::HTML.config.destination = File.expand_path(v) + destination = File.expand_path(v) # puts "output goes to #{Coradoc::Input::HTML.config.destination}" end opts.on("-e", "--external-images", "Export images if data URI") do |_v| - Coradoc::Input::HTML.config.external_images = true + input_options[:external_images] = true end opts.on("-v", "--version", "Version information") do |_v| - puts "reverse_adoc: v#{Coradoc::Input::HTML::VERSION}" + puts "Coradoc: v#{Coradoc::VERSION}" puts "[dependency] WordToMarkdown: v#{WordToMarkdown::VERSION}" if Gem.win_platform? puts "[dependency] LibreOffice: version not available on Windows" @@ -45,28 +53,8 @@ end.parse! filename = ARGV.pop raise "Please provide an input file to process. Exiting." unless filename -if Coradoc::Input::HTML.config.external_images && Coradoc::Input::HTML.config.destination.nil? +if input_options[:external_images] && destination.nil? raise "The -e | --external-images feature must be used with -o | --output. Exiting." end -Coradoc::Input::HTML.config.sourcedir = Dir.mktmpdir - -doc = WordToMarkdown.new(filename, Coradoc::Input::HTML.config.sourcedir) -# File.open("test.html", "w:UTF-8") { |f| f.write doc.document.html } -adoc_content = Coradoc::Input::HTML.convert( - Coradoc::Input::HTML.cleaner.preprocess_word_html(doc.document.html), - WordToMarkdown::REVERSE_MARKDOWN_OPTIONS, -) -# puts scrub_whitespace(doc.document.html) - -# Print to STDOUT -unless Coradoc::Input::HTML.config.destination - puts adoc_content - exit -end - -# Write output to Coradoc::Input::HTML.config.destination -FileUtils.mkdir_p(File.dirname(Coradoc::Input::HTML.config.destination)) -File.open(Coradoc::Input::HTML.config.destination, "w") do |file| - file.write(adoc_content) -end +Coradoc::Converter.(filename, destination, **config) diff --git a/lib/coradoc.rb b/lib/coradoc.rb index ee0f56d..ad337bb 100644 --- a/lib/coradoc.rb +++ b/lib/coradoc.rb @@ -3,11 +3,14 @@ require "pathname" require "parslet" -require_relative "coradoc/version" -require_relative "coradoc/util" -require_relative "coradoc/parser" -require_relative "coradoc/transformer" -require_relative "coradoc/generator" +require "coradoc/version" +require "coradoc/util" +require "coradoc/parser" +require "coradoc/transformer" +require "coradoc/generator" +require "coradoc/converter" +require "coradoc/input" +require "coradoc/output" module Coradoc class Error < StandardError; end diff --git a/lib/coradoc/converter.rb b/lib/coradoc/converter.rb new file mode 100644 index 0000000..42372c8 --- /dev/null +++ b/lib/coradoc/converter.rb @@ -0,0 +1,139 @@ +require "fileutils" + +module Coradoc + class Converter + attr_accessor :input, :output, :config + + def initialize(input = nil, output = nil, **config) + @input = input || $stdin + @output = output || $stdout + + @config = { + input_options: {}, + output_options: {}, + }.merge(config) + + yield if block_given? + end + + def self.call(*args, **kwargs, &block) + new(*args, **kwargs, &block).convert + end + + def input_processor + if config[:input_processor] + Input[config[:input_processor]] + else + Input.select_processor(input) + end + end + + def output_processor + if config[:output_processor] + Output[config[:output_processor]] + else + Output.select_processor(output) + end + end + + def convert(data = nil) + input_id = input_processor.processor_id + output_id = output_processor.processor_id + + unless data + input = self.input + input = File.open(input, "rb") if input.is_a? String + data = input.read + input_path = input.path if input.respond_to? :path + end + + # Some input processors may prefer filenames + if input_processor.respond_to? :processor_wants_filenames + unless input.respond_to? :path + raise NoInputPathError, + "no input path given, but #{input_processor} wants that " + + "form. Ensure you don't read from standard input." + end + + data = input.path + end + + # We may need to configure destination path. + output = self.output + if output.is_a? String + FileUtils.mkdir_p(File.dirname(output)) + output = File.open(output, "wb") + end + output_path = output.path if output.respond_to?(:path) + + input_options = config[:input_options] + input_options = input_options.merge(destination: output_path) if output_path + input_options = input_options.merge(sourcedir: File.dirname(input_path)) if input_path + + data = input_processor.processor_execute(data, input_options) + + # Two options are possible at this point: + # Either we have a document we want to write to some output, or + # we have a Hash, that contains a list of files and their + # documents (where a nil key denotes the main file). Let's normalize + # those cases. + data = { nil => data } unless data.is_a? Hash + + # Let's check an edge case of non-nil keys and no output path + if !output_path && data.keys.any? { |i| !i.nil? } + raise NoOutputPathError, + "no output path given, while wanting to write multiple files" + end + + data = output_processor.processor_execute(data, config[:output_options]) + + if input_processor.respond_to?(:processor_postprocess) + data = input_processor.processor_postprocess( + data, input_options.merge(output_processor: output_id) + ) + end + + # Now we have all, let's write. + data.each do |filename, content| + if filename.nil? + file = output + else + dirname = File.dirname(output_path) + file = "#{dirname}/#{filename}" + FileUtils.mkdir_p(File.dirname(file)) + file = File.open(file, "wb") + end + + file.write(content) + file.close + end + end + + class NoInputPathError < StandardError; end + class NoOutputPathError < StandardError; end + class NoProcessorError < StandardError; end + + module CommonInputOutputMethods + def define(const) + @processors[const.processor_id] = const + end + + def [](id) + @processors[id.to_sym] + end + + def select_processor(filename) + filename = filename.path if filename.respond_to? :path + unless filename.is_a? String + raise Converter::NoProcessorError, + "can't find a path for #{filename}. You must manually select the processor." + end + + @processors.values.find do |i| + i.processor_match?(filename) + end or raise Converter::NoProcessorError, + "you must manually select the processor for #{filename}" + end + end + end +end diff --git a/lib/coradoc/input.rb b/lib/coradoc/input.rb index 800111f..2e7979d 100644 --- a/lib/coradoc/input.rb +++ b/lib/coradoc/input.rb @@ -1,6 +1,12 @@ -require_relative "../coradoc" +require "coradoc" module Coradoc module Input + @processors = {} + extend Converter::CommonInputOutputMethods end end + +require "coradoc/input/adoc" +require "coradoc/input/docx" +require "coradoc/input/html" diff --git a/lib/coradoc/input/adoc.rb b/lib/coradoc/input/adoc.rb new file mode 100644 index 0000000..b55b5d2 --- /dev/null +++ b/lib/coradoc/input/adoc.rb @@ -0,0 +1,19 @@ +require "coradoc/input" + +module Coradoc + module Input::Adoc + def self.processor_id + :adoc + end + + def self.processor_match?(filename) + %w[.adoc].any? { |i| filename.downcase.end_with?(i) } + end + + def self.processor_execute(input, _options = {}) + Coradoc::Parser.parse(input) + end + + Coradoc::Input.define(self) + end +end diff --git a/lib/coradoc/input/docx.rb b/lib/coradoc/input/docx.rb new file mode 100644 index 0000000..8c5e52c --- /dev/null +++ b/lib/coradoc/input/docx.rb @@ -0,0 +1,35 @@ +require "word-to-markdown" +require "coradoc/input/html" +require "fileutils" + +module Coradoc + module Input::Docx + def self.processor_id + :docx + end + + def self.processor_match?(filename) + %w[.docx .doc].any? { |i| filename.downcase.end_with?(i) } + end + + def self.processor_execute(input, options = {}) + image_dir = Dir.mktmpdir + options = options.merge(sourcedir: image_dir) + doc = WordToMarkdown.new(input, image_dir) + doc = Coradoc::Input::HTML.cleaner.preprocess_word_html(doc.document.html) + options = WordToMarkdown::REVERSE_MARKDOWN_OPTIONS.merge(options) + Coradoc::Input::HTML.to_coradoc(doc, options) + ensure + FileUtils.rm_rf(image_dir) + end + + def self.processor_postprocess(data, options) + Coradoc::Input::HTML.processor_postprocess(data, options) + end + + # This processor prefers to work on original files. + def self.processor_wants_filenames; true; end + + Coradoc::Input.define(self) + end +end diff --git a/lib/coradoc/input/html.rb b/lib/coradoc/input/html.rb index cea1dc4..4dc7ac3 100644 --- a/lib/coradoc/input/html.rb +++ b/lib/coradoc/input/html.rb @@ -2,7 +2,7 @@ require "digest" require "nokogiri" -require_relative "../input" +require "coradoc/input" require_relative "html/errors" require_relative "html/cleaner" require_relative "html/config" @@ -18,6 +18,10 @@ def self.convert(input, options = {}) Coradoc::Input::HTML::HtmlConverter.convert(input, options) end + def self.to_coradoc(input, options = {}) + Input::HTML::HtmlConverter.to_coradoc(input, options) + end + def self.config @config ||= Config.new yield @config if block_given? @@ -27,5 +31,29 @@ def self.config def self.cleaner @cleaner ||= Cleaner.new end + + def self.processor_id + :html + end + + def self.processor_match?(filename) + %w[.html .htm].any? { |i| filename.downcase.end_with?(i) } + end + + def self.processor_execute(input, options = {}) + to_coradoc(input, options) + end + + def self.processor_postprocess(data, options) + if options[:output_processor] == :adoc + data.transform_values do |v| + Input::HTML::HtmlConverter.cleanup_result(v, options) + end + else + data + end + end + + Coradoc::Input.define(self) end end diff --git a/lib/coradoc/input/html/cleaner.rb b/lib/coradoc/input/html/cleaner.rb index d6823f5..6177d43 100644 --- a/lib/coradoc/input/html/cleaner.rb +++ b/lib/coradoc/input/html/cleaner.rb @@ -1,6 +1,10 @@ module Coradoc::Input::HTML class Cleaner def tidy(string) + if string.is_a? Hash + return string.transform_values { |i| tidy(i) } + end + result = HtmlConverter.track_time "Removing inner whitespace" do remove_inner_whitespaces(String.new(string)) end diff --git a/lib/coradoc/input/html/html_converter.rb b/lib/coradoc/input/html/html_converter.rb index bc5042b..0bb320d 100644 --- a/lib/coradoc/input/html/html_converter.rb +++ b/lib/coradoc/input/html/html_converter.rb @@ -40,9 +40,8 @@ module Coradoc module Input::HTML class HtmlConverter def self.to_coradoc(input, options = {}) - plugin_instances = options.delete(:plugin_instances) Input::HTML.config.with(options) do - plugin_instances ||= Coradoc::Input::HTML.config.plugins.map(&:new) + plugin_instances = prepare_plugin_instances(options) root = track_time "Loading input HTML document" do case input @@ -85,47 +84,63 @@ def self.to_coradoc(input, options = {}) end end + options[:plugin_instances] = plugin_instances unless options.frozen? + coradoc end end def self.convert(input, options = {}) Input::HTML.config.with(options) do - plugin_instances = Coradoc::Input::HTML.config.plugins.map(&:new) - - options = options.merge(plugin_instances: plugin_instances) + plugin_instances = prepare_plugin_instances(options) coradoc = to_coradoc(input, options) if coradoc.is_a?(Hash) coradoc.to_h do |file, tree| track_time "Converting file #{file || 'main'}" do - [file, convert_single_coradoc_to_adoc(file, tree, plugin_instances)] + [file, + convert_single_coradoc_to_adoc(file, tree, options)] end end else - convert_single_coradoc_to_adoc(nil, coradoc, plugin_instances) + convert_single_coradoc_to_adoc(nil, coradoc, options) end end end - def self.convert_single_coradoc_to_adoc(_file, coradoc, plugin_instances) + def self.convert_single_coradoc_to_adoc(_file, coradoc, options) + plugin_instances = prepare_plugin_instances(options) + result = track_time "Converting Coradoc tree into Asciidoc" do Coradoc::Generator.gen_adoc(coradoc) end - result = track_time "Cleaning up the result" do - Input::HTML.cleaner.tidy(result) - end - plugin_instances.each do |plugin| - if plugin.respond_to?(:postprocess_asciidoc_string) - plugin.asciidoc_string = result - track_time "Postprocessing AsciiDoc string with #{plugin.name} plugin" do - plugin.postprocess_asciidoc_string + cleanup_result(result, options) + end + + def self.cleanup_result(result, options) + Input::HTML.config.with(options) do + plugin_instances = prepare_plugin_instances(options) + + result = track_time "Cleaning up the result" do + Input::HTML.cleaner.tidy(result) + end + plugin_instances.each do |plugin| + if plugin.respond_to?(:postprocess_asciidoc_string) + plugin.asciidoc_string = result + track_time "Postprocessing AsciiDoc string with #{plugin.name} plugin" do + plugin.postprocess_asciidoc_string + end + result = plugin.asciidoc_string end - result = plugin.asciidoc_string end + + result end - result + end + + def self.prepare_plugin_instances(options) + options[:plugin_instances] || Coradoc::Input::HTML.config.plugins.map(&:new) end @track_time_indentation = 0 diff --git a/lib/coradoc/output.rb b/lib/coradoc/output.rb new file mode 100644 index 0000000..5f48d33 --- /dev/null +++ b/lib/coradoc/output.rb @@ -0,0 +1,10 @@ +require "coradoc" + +module Coradoc + module Output + @processors = {} + extend Converter::CommonInputOutputMethods + end +end + +require "coradoc/output/adoc" diff --git a/lib/coradoc/output/adoc.rb b/lib/coradoc/output/adoc.rb new file mode 100644 index 0000000..8d6da0c --- /dev/null +++ b/lib/coradoc/output/adoc.rb @@ -0,0 +1,17 @@ +module Coradoc + module Output::Adoc + def self.processor_id + :adoc + end + + def self.processor_match?(filename) + %w[.adoc].any? { |i| filename.downcase.end_with?(i) } + end + + def self.processor_execute(input, _options = {}) + input.transform_values { |i| Coradoc::Generator.gen_adoc(i) } + end + + Coradoc::Output.define(self) + end +end