From f2e3c877d2d551dd769bd9eae7fc895fac072fea Mon Sep 17 00:00:00 2001
From: Corey Gillen
Date: Thu, 2 May 2024 10:26:36 -0700
Subject: [PATCH] Hack to query separate SOLR instance for text-extraction

---
 lib/hydra/derivatives/processors/full_text.rb | 92 +++++++++++++++++++
 1 file changed, 92 insertions(+)
 create mode 100644 lib/hydra/derivatives/processors/full_text.rb

diff --git a/lib/hydra/derivatives/processors/full_text.rb b/lib/hydra/derivatives/processors/full_text.rb
new file mode 100644
index 000000000..f114e6349
--- /dev/null
+++ b/lib/hydra/derivatives/processors/full_text.rb
@@ -0,0 +1,92 @@
+# frozen_string_literal: true
+
+# rubocop:disable Metrics/AbcSize
+module Hydra::Derivatives::Processors
+  # Extract the full text from the content using Solr's extract handler.
+  # The handler is reached through the URL in the SOLR_EXTRACT_URL environment variable.
+  class FullText < Processor
+    # Run the full text extraction and save the result
+    # @return [TrueClass,FalseClass] was the process successful.
+    def process
+      output_file_service.call(extract, directives)
+    end
+
+    private
+
+    ##
+    # Extract full text from the content using Solr's extract handler.
+    # The text is read from the empty-string key of the JSON response.
+    #
+    # @return [String] The extracted text, with trailing whitespace removed
+    def extract
+      JSON.parse(fetch)[''].rstrip
+    end
+
+    # Send the request to the extract service and return the response body if it was successful.
+    # TODO: this pulls the whole file into memory. We should stream it instead.
+    # @return [String] the result of calling the extract service
+    # @raise [RuntimeError] if the extract service responds with any code other than 200
+    def fetch
+      resp = http_request
+      raise "Solr Extract service was unsuccessful. '#{uri}' returned code #{resp.code} for #{source_path}\n#{resp.body}" unless resp.code == '200'
+
+      # Tag the body with the charset named in the response's Content-Type, when there is one.
+      resp.body.force_encoding(resp.type_params['charset']) if resp.type_params['charset']
+      resp.body
+    end
+
+    # Send the request to the extract service
+    # @return [Net::HTTPResponse] the result of calling the extract service
+    def http_request
+      Net::HTTP.start(uri.host, uri.port, use_ssl: check_for_ssl) do |http|
+        req = Net::HTTP::Post.new(uri.request_uri, request_headers)
+        req.basic_auth uri.user, uri.password if uri.password
+        req.body = file_content
+        http.request req
+      end
+    end
+
+    # @return [String] the raw bytes of the source file (memoized)
+    def file_content
+      # File.binread closes the file handle and reads binary content without text-mode translation.
+      @file_content ||= File.binread(source_path)
+    end
+
+    # @return [Hash] the request headers to send to the Solr extract service
+    def request_headers
+      { Faraday::Request::UrlEncoded::CONTENT_TYPE => mime_type.to_s,
+        Faraday::Adapter::CONTENT_LENGTH => original_size.to_s }
+    end
+
+    # @return the MIME type of the source file, as reported by MimeTypeService
+    def mime_type
+      Hydra::Derivatives::MimeTypeService.mime_type(source_path)
+    end
+
+    # @return [Integer] size of the source file in bytes
+    def original_size
+      File.size(source_path)
+    end
+
+    # @return [URI] path to the extract service
+    def uri
+      @uri ||= connection_url + 'update/extract?extractOnly=true&wt=json&extractFormat=text'
+    end
+
+    # @return [Boolean] whether the extract service is reached over https
+    def check_for_ssl
+      uri.scheme == 'https'
+    end
+
+    # The path is given a trailing slash so that the URI merge in #uri appends to it
+    # instead of replacing its last segment.
+    # NOTE: the 'localhost' fallback has no scheme; set SOLR_EXTRACT_URL to an absolute URL.
+    # @return [URI] path to the solr collection
+    def connection_url
+      url = URI.parse(ENV.fetch('SOLR_EXTRACT_URL', 'localhost'))
+      url.path = "#{url.path}/" if url.path && !url.path.end_with?('/')
+      url
+    end
+  end
+end
+# rubocop:enable Metrics/AbcSize