From 50ea34cee903a554f852446b5bed7499b4df2e2a Mon Sep 17 00:00:00 2001 From: Brad Watson Date: Thu, 12 Dec 2024 17:42:32 -0600 Subject: [PATCH] Adds BlacklightIiifSearch to UniversalViewer. (#2296) * Adds BlacklightIiifSearch to UniversalViewer. * Correct error. * Adds and corrects rspec and adds hostname to URL helper. * adds require. * Moving classes to a better distributor file. * Switch concat method. * Remove invalid UTF-8 text. * Try a different tactic. * trying a different service encapsulation. * Another tactic change. * add search to config. * changes activation location. * puts search service on work level. * revert changes to config. * delivers iiif search url the most convenient way. * rubo * Adds transcript_text_tesi to default search. * Improves matching capabilities. * Try to make highlighting work. * Sanitizes solr document url. * match id structure to our manifest. * Reverts possibly superfluous config. * Reverts possibly superfluous config part 2. * Adds documentation and license information. * adds rspec. 
--- Gemfile | 1 + Gemfile.lock | 9 ++ app/controllers/catalog_controller.rb | 16 ++- app/indexers/curate/file_set_indexer.rb | 5 +- app/lib/curate/text_extraction/alto_reader.rb | 131 ++++++++++++++++++ .../text_extraction/word_coords_builder.rb | 47 +++++++ .../iiif_search_annotation_behavior.rb | 77 ++++++++++ app/models/file_set.rb | 2 +- app/models/iiif_search_builder.rb | 17 +++ app/models/solr_document.rb | 9 ++ app/views/manifest/manifest.json.jbuilder | 16 +++ config/routes.rb | 2 + spec/indexers/curate/file_set_indexer_spec.rb | 10 +- spec/models/file_set_spec.rb | 6 + .../manifest/manifest.json.jbuilder_spec.rb | 14 ++ 15 files changed, 353 insertions(+), 9 deletions(-) create mode 100644 app/lib/curate/text_extraction/alto_reader.rb create mode 100644 app/lib/curate/text_extraction/word_coords_builder.rb create mode 100644 app/models/concerns/blacklight_iiif_search/iiif_search_annotation_behavior.rb create mode 100644 app/models/iiif_search_builder.rb diff --git a/Gemfile b/Gemfile index 04fe03c4..60d79c18 100644 --- a/Gemfile +++ b/Gemfile @@ -10,6 +10,7 @@ git_source(:github) do |repo_name| end gem 'archivesspace-client' +gem 'blacklight_iiif_search' gem 'bootsnap', require: false gem 'bootstrap-sass', '~> 3.0' gem 'bulkrax' diff --git a/Gemfile.lock b/Gemfile.lock index 425a4b14..b7bf684d 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1412,6 +1412,10 @@ GEM bootstrap-sass (~> 3.0) openseadragon (>= 0.2.0) rails + blacklight_iiif_search (1.0.0) + blacklight (~> 6.0) + iiif-presentation + rails (>= 4.2, < 6) bootsnap (1.13.0) msgpack (~> 1.2) bootstrap-sass (3.4.1) @@ -1792,6 +1796,10 @@ GEM ice_nine (0.11.2) iiif-image-api (0.2.0) activesupport + iiif-presentation (1.1.0) + activesupport (>= 3.2.18) + faraday (>= 0.9) + json iiif_manifest (1.1.1) activesupport (>= 4) iso8601 (0.9.1) @@ -2373,6 +2381,7 @@ PLATFORMS DEPENDENCIES archivesspace-client bixby (~> 3.0.1) + blacklight_iiif_search bootsnap bootstrap-sass (~> 3.0) bulkrax diff --git 
a/app/controllers/catalog_controller.rb b/app/controllers/catalog_controller.rb index 0325d337..6dbf6ff5 100644 --- a/app/controllers/catalog_controller.rb +++ b/app/controllers/catalog_controller.rb @@ -17,7 +17,19 @@ def self.modified_field solr_name('system_modified', :stored_sortable, type: :date) end + # CatalogController-scope behavior and configuration for BlacklightIiifSearch + include BlacklightIiifSearch::Controller + configure_blacklight do |config| + # configuration for Blacklight IIIF Content Search + config.iiif_search = { + full_text_field: 'transcript_text_tesi', # FileSet field + object_relation_field: 'is_page_of_ssi', # FileSet field + supported_params: %w[q page], + autocomplete_handler: 'iiif_suggest', + suggester_name: 'iiifSuggester' + } + config.view.gallery.partials = [:index_header, :index] config.view.masonry.partials = [:index] config.view.slideshow.partials = [:index] @@ -35,10 +47,12 @@ def self.modified_field config.http_method = :post ## Default parameters to send to solr for all search-like requests. See also SolrHelper#solr_search_params + # NOTE: transcript_text_tesi is needed here because the `iiif_search` path utilizes the default `search` qt to + # match terms in Full-Text search-enabled FileSets. 
config.default_solr_params = { qt: "search", rows: 10, - qf: "title_tesim description_tesim creator_tesim keyword_tesim" + qf: "title_tesim description_tesim creator_tesim keyword_tesim transcript_text_tesi" } # solr field configuration for document/show views diff --git a/app/indexers/curate/file_set_indexer.rb b/app/indexers/curate/file_set_indexer.rb index 327c0005..982f4a38 100644 --- a/app/indexers/curate/file_set_indexer.rb +++ b/app/indexers/curate/file_set_indexer.rb @@ -1,5 +1,4 @@ # frozen_string_literal: true - module Curate class FileSetIndexer < Hyrax::FileSetIndexer def generate_solr_document @@ -93,9 +92,11 @@ def preservation_event_value(preservation_event) preservation_event.pluck(:value).first end + # All fields assigned here are utilized by BlacklightIiifSearch. def full_text_fields(solr_doc) - solr_doc['alto_xml_tesi'] = object.alto_xml if object.alto_xml.present? + solr_doc['alto_xml_tesi'] = Curate::TextExtraction::AltoReader.new(object.alto_xml).json if object.alto_xml.present? solr_doc['transcript_text_tesi'] = object.transcript_text if object.transcript_text.present? + solr_doc['is_page_of_ssi'] = object.parent.id if object.parent.present? end end end diff --git a/app/lib/curate/text_extraction/alto_reader.rb b/app/lib/curate/text_extraction/alto_reader.rb new file mode 100644 index 00000000..fc6dd693 --- /dev/null +++ b/app/lib/curate/text_extraction/alto_reader.rb @@ -0,0 +1,131 @@ +# frozen_string_literal: true +require 'active_support/core_ext/module/delegation' +require 'json' +require 'nokogiri' + +# NOTE: This model is largely derived from IiifPrint's (v3.0.1) +# IiifPrint::TextExtraction::AltoReader class. Minor changes have been made to bring +# the code into Rubocop compliancy. The IiifPrint Gem application is licensed under the +# Apache License 2.0. At the time of adopting this licensed work into this application, +# Commercial use, Modification, and Private use were listed under this Gem's Permissions. 
+# The referenced License can be found here: +# https://github.com/scientist-softserv/iiif_print/blob/v3.0.1/LICENSE +module Curate + # Module for text extraction + module TextExtraction + # Class to obtain plain text and JSON word-coordinates from ALTO source + class AltoReader + attr_accessor :source, :doc_stream + delegate :text, to: :doc_stream + + # SAX Document Stream class to gather text and word tokens from ALTO + class AltoDocStream < Nokogiri::XML::SAX::Document + attr_accessor :text, :words + + def initialize(image_width = nil) + super() + # scaling matters: + @image_width = image_width + @scaling = 1.0 # pt to px, if ALTO using points + # plain text buffer: + @text = '' + # list of word hash, containing word+coord: + @words = [] + end + + # Return coordinates from String element attribute hash + # + # @param attrs [Hash] hash containing ALTO `String` element attributes. + # @return [Array] Array of position x, y, width, height in px. + def s_coords(attrs) + height = scale_value((attrs['HEIGHT'] || 0).to_i) + width = scale_value((attrs['WIDTH'] || 0).to_i) + hpos = scale_value((attrs['HPOS'] || 0).to_i) + vpos = scale_value((attrs['VPOS'] || 0).to_i) + [hpos, vpos, width, height] + end + + def compute_scaling(attrs) + return if @image_width.nil? + match = attrs.find { |e| e[0].casecmp?('WIDTH') } + return if match.empty? + page_width = match[1].to_i + return if @image_width == page_width + @scaling = page_width / @image_width.to_f + end + + def scale_value(v) + (v / @scaling).to_i + end + + # Callback for element start, implementation of which ignores + # non-String elements. + # + # @param name [String] element name. + # @param attrs [Array] Array of key, value pair Arrays. 
+ def start_element(name, attrs = []) + values = attrs.to_h + compute_scaling(attrs) if name == 'Page' + return if name != 'String' + token = values['CONTENT'] + @text += token + @words << { + word: token, + coordinates: s_coords(values) + } + end + + # Callback for element end, used here to manage endings of lines and + # blocks. + # + # @param name [String] element name. + def end_element(name) + @text += " " if name == 'String' + @text += "\n" if name == 'TextBlock' + @text += "\n" if name == 'TextLine' + end + + # Callback for completion of parsing ALTO, used to normalize generated + # text content (strip unneeded whitespace incidental to output). + def end_document + # postprocess @text to remove trailing spaces on lines + @text = @text.split("\n").map(&:strip).join("\n") + # remove trailing whitespace at end of buffer + @text.strip! + end + end + + # Construct from either ALTO XML source or a path to an ALTO XML file, then parse the document + # + # @param xml [String] ALTO XML source, or the path of a file containing it + def initialize(xml, image_width = nil, image_height = nil) + @source = isxml?(xml) ? 
xml : File.read(xml) + @image_width = image_width + @image_height = image_height + @doc_stream = AltoDocStream.new(image_width) + parser = Nokogiri::XML::SAX::Parser.new(doc_stream) + parser.parse(@source) + end + + # Determine if source parameter is path or xml + # + # @param xml [String] either path to xml file or xml source + # @return [true, false] true if string appears to be XML source, not path + def isxml?(xml) + xml.lstrip.start_with?('<') + end + + # Output JSON flattened word coordinates + # + # @return [String] JSON serialization of flattened word coordinates + def json + words = @doc_stream.words + Curate::TextExtraction::WordCoordsBuilder.json_coordinates_for( + words: words, + width: @image_width, + height: @image_height + ) + end + end + end +end diff --git a/app/lib/curate/text_extraction/word_coords_builder.rb b/app/lib/curate/text_extraction/word_coords_builder.rb new file mode 100644 index 00000000..cb1800cd --- /dev/null +++ b/app/lib/curate/text_extraction/word_coords_builder.rb @@ -0,0 +1,47 @@ +# frozen_string_literal: true + +# NOTE: This model is largely derived from IiifPrint's (v3.0.1) +# IiifPrint::TextExtraction::WordCoordsBuilder class. Minor changes have been made to bring +# the code into Rubocop compliancy. The IiifPrint Gem application is licensed under the +# Apache License 2.0. At the time of adopting this licensed work into this application, +# Commercial use, Modification, and Private use were listed under this Gem's Permissions. +# The referenced License can be found here: +# https://github.com/scientist-softserv/iiif_print/blob/v3.0.1/LICENSE +module Curate + # Module for text extraction (OCR or otherwise) + module TextExtraction + class WordCoordsBuilder + # @param words [Array] an array of hash objects that have the keys `:word` and `:coordinates`. + # @param width [Integer] the width of the "canvas" on which the words appear. + # @param height [Integer] the height of the "canvas" on which the words appear. 
+ # @return [String] a JSON encoded string. + def self.json_coordinates_for(words:, width: nil, height: nil) + new(words, width, height).to_json + end + + def initialize(words, width = nil, height = nil) + @words = words + @width = width + @height = height + end + + # Output JSON flattened word coordinates + # + # @return [String] JSON serialization of flattened word coordinates + def to_json + coordinates = {} + @words.each do |w| + word_chars = w[:word] + word_coords = w[:coordinates] + if coordinates[word_chars] + coordinates[word_chars] << word_coords + else + coordinates[word_chars] = [word_coords] + end + end + payload = { width: @width, height: @height, coords: coordinates } + JSON.generate(payload) + end + end + end +end diff --git a/app/models/concerns/blacklight_iiif_search/iiif_search_annotation_behavior.rb b/app/models/concerns/blacklight_iiif_search/iiif_search_annotation_behavior.rb new file mode 100644 index 00000000..e1170cf4 --- /dev/null +++ b/app/models/concerns/blacklight_iiif_search/iiif_search_annotation_behavior.rb @@ -0,0 +1,77 @@ +# frozen_string_literal: true + +# Blacklight IIIF Search v1.0.0 Override: per this application's instructions, +# this module must be overridden if coordinates will be provided within the results +# of this Gem's search API. It was also necessary to override #annotation_id and +# #canvas_uri_for_annotation so that we can match the format of each canvas' @id +# url value. 
+ +# customizable behavior for IiifSearchAnnotation +module BlacklightIiifSearch + module AnnotationBehavior + ## + # Create a URL for the annotation + # @return [String] + def annotation_id + "#{emory_iiif_id_url}/canvas/#{document[:id]}/annotation/#{hl_index}" + end + + ## + # Create a URL for the canvas that the annotation refers to + # @return [String] + def canvas_uri_for_annotation + "#{emory_iiif_id_url}/canvas/#{document[:id]}" + coordinates + end + + # NOTE: The methods #coordinates, #fetch_and_parse_coords, and #default_coords below are largely derived + # from IiifPrint's (v3.0.1) IiifPrint::BlacklightIiifSearch::AnnotationDecorator module methods of the + # same name. The methods have been refactored to function according to our expectations. The IiifPrint Gem + # application is licensed under the Apache License 2.0. At the time of adopting this licensed work into + # this application, Commercial use, Modification, and Private use were listed under this Gem's Permissions. + # The referenced License can be found here: + # https://github.com/scientist-softserv/iiif_print/blob/v3.0.1/LICENSE + + ## + # return a string like "#xywh=100,100,250,20" + # corresponding to coordinates of query term on image + # @return [String] + def coordinates + coords_json = fetch_and_parse_coords + return default_coords unless coords_json.present? && coords_json['coords'].present? && query.present? + + query_terms = query.split(' ').map(&:downcase) + matches = coords_json['coords'].select do |k, _v| + k.downcase =~ /(#{query_terms.join('|')})/ + end + coords_array = matches&.values&.flatten(1)&.[](hl_index) + + coords_array.present? ? "#xywh=#{coords_array.join(',')}" : default_coords + end + + private + + ## + # a default set of coordinates + # @return [String] + def default_coords + '#xywh=0,0,0,0' + end + + ## + # return the JSON word-coordinates file contents + # @return [JSON] + def fetch_and_parse_coords + coords = document['alto_xml_tesi'] + return nil if coords.blank? 
+ begin + JSON.parse(coords) + rescue JSON::ParserError + nil + end + end + + def emory_iiif_id_url + "http://#{ENV['HOSTNAME'] || 'localhost:3000'}/iiif/#{parent_document[:id]}/manifest" + end + end +end diff --git a/app/models/file_set.rb b/app/models/file_set.rb index 12099c94..9fccee55 100644 --- a/app/models/file_set.rb +++ b/app/models/file_set.rb @@ -93,7 +93,7 @@ def alto_xml end def transcript_text - transcript_file&.content&.to_s if transcript_file&.file_name&.first&.include?('.txt') + transcript_file&.content&.force_encoding('UTF-8') if transcript_file&.file_name&.first&.include?('.txt') end private diff --git a/app/models/iiif_search_builder.rb b/app/models/iiif_search_builder.rb new file mode 100644 index 00000000..fcbbb922 --- /dev/null +++ b/app/models/iiif_search_builder.rb @@ -0,0 +1,17 @@ +# frozen_string_literal: true + +# SearchBuilder for full-text searches with highlighting and snippets +class IiifSearchBuilder < Blacklight::SearchBuilder + include Blacklight::Solr::SearchBuilderBehavior + + self.default_processor_chain += [:ocr_search_params] + + # set params for ocr field searching + def ocr_search_params(solr_parameters = {}) + solr_parameters[:facet] = false + solr_parameters[:hl] = true + solr_parameters[:'hl.fl'] = blacklight_config.iiif_search[:full_text_field] + solr_parameters[:'hl.fragsize'] = 100 + solr_parameters[:'hl.snippets'] = 10 + end +end diff --git a/app/models/solr_document.rb b/app/models/solr_document.rb index 574f79c2..766c6757 100644 --- a/app/models/solr_document.rb +++ b/app/models/solr_document.rb @@ -162,4 +162,13 @@ def human_readable_visibility def source_collection_title self['source_collection_title_ssim'] end + + # Added here since the SolrDocument is easily available within app/views/manifest/manifest.json.jbuilder partial. + def work_iiif_search_url + return ('http://localhost:3000/catalog/' + self['id'] + '/iiif_search') if ENV['IIIF_SERVER_URL'].blank? 
+ parsed_iiif_url = URI.parse(ENV['IIIF_SERVER_URL']) + base_path = parsed_iiif_url.to_s[/\A.*(?=#{parsed_iiif_url.path}\z)/] + + base_path + '/catalog/' + self['id'] + '/iiif_search' + end end diff --git a/app/views/manifest/manifest.json.jbuilder b/app/views/manifest/manifest.json.jbuilder index 16b81de8..a1731dd1 100644 --- a/app/views/manifest/manifest.json.jbuilder +++ b/app/views/manifest/manifest.json.jbuilder @@ -10,6 +10,22 @@ json.metadata @manifest_metadata do |child| json.value child['value'] end +# The code block below activates the IIIF Search tools within the +# Universal Viewer. This will use the presence of all_text_tsimv values +# within the Work to activate, but each text-optimized FileSet's alto_xml_tesi, +# transcript_text_tesi, and is_page_of_ssi fields must also be indexed for normal +# searching functions. +if @solr_doc['all_text_tsimv'].present? + json.service do + json.child! do + json.set! :@context, 'http://iiif.io/api/search/0/context.json' + json.set! :@id, @solr_doc.work_iiif_search_url + json.profile 'http://iiif.io/api/search/0/search' + json.label 'Search within this item' + end + end +end + json.sequences [''] do json.set! :@type, 'sc:Sequence' json.set! 
:@id, "#{@root_url}/sequence/normal" diff --git a/config/routes.rb b/config/routes.rb index c2f5ed71..861e91b9 100644 --- a/config/routes.rb +++ b/config/routes.rb @@ -3,6 +3,7 @@ require 'sidekiq/web' Rails.application.routes.draw do + concern :iiif_search, BlacklightIiifSearch::Routes.new mount Bulkrax::Engine, at: '/' get '/concern/curate_generic_works/:id/event_details', to: 'event_details#event_details', as: :event_details get '/iiif/:identifier/manifest', to: 'iiif#manifest', as: :iiif_manifest @@ -52,6 +53,7 @@ resources :solr_documents, only: [:show], path: '/catalog', controller: 'catalog' do concerns :exportable + concerns :iiif_search end resources :bookmarks do diff --git a/spec/indexers/curate/file_set_indexer_spec.rb b/spec/indexers/curate/file_set_indexer_spec.rb index 7d3e4e0f..28b81027 100644 --- a/spec/indexers/curate/file_set_indexer_spec.rb +++ b/spec/indexers/curate/file_set_indexer_spec.rb @@ -31,16 +31,16 @@ # rubocop:disable RSpec/MessageChain describe 'alto_xml_tesi' do + let(:alto_json) { JSON.parse(indexer['alto_xml_tesi']) } before do allow(file_set).to receive(:extracted).and_call_original allow(file_set).to receive_message_chain(:extracted, :file_name, :first, :include?).and_return(true) end - it 'returns the expected text' do - expect(indexer['alto_xml_tesi']).to include( - 'String ID="P10_S00002" HPOS="538" VPOS="3223" WIDTH="112" HEIGHT="1005" ' \ - 'STYLEREFS="StyleId-0" CONTENT="incorporation" WC="0.4776923" CC="5723770778406"' - ) + it('returns the expected text') { expect(indexer['alto_xml_tesi']).to include('"incorporation":[[538,3223,112,1005]]') } + it 'can be parsed into JSON' do + expect(alto_json.keys).to match_array(['width', 'height', 'coords']) + expect(alto_json['coords'].keys.size).to eq(228) end end diff --git a/spec/models/file_set_spec.rb b/spec/models/file_set_spec.rb index 9bf39d35..6b7bf634 100644 --- a/spec/models/file_set_spec.rb +++ b/spec/models/file_set_spec.rb @@ -228,6 +228,12 @@ 'thousand dollars was 
added to the endowment of the college. The' ) end + + it 'forces UTF-8 encoding to eliminate Solr persistence errors' do + expect(file_set.transcript_file.content).to receive(:force_encoding).with('UTF-8') + + file_set.transcript_text + end end end # rubocop:enable RSpec/MessageChain diff --git a/spec/views/manifest/manifest.json.jbuilder_spec.rb b/spec/views/manifest/manifest.json.jbuilder_spec.rb index b378c35e..38b1fbb5 100644 --- a/spec/views/manifest/manifest.json.jbuilder_spec.rb +++ b/spec/views/manifest/manifest.json.jbuilder_spec.rb @@ -95,4 +95,18 @@ expect(JSON.parse(rendered)).to eq(JSON.parse(doc)) expect(work.file_sets.count).to eq 5 end + + context 'when all_text_tsimv is present' do + let(:solr_document) { SolrDocument.new(attributes.merge('all_text_tsimv' => 'So much text!')) } + + it 'renders a IIIF Search service' do + render + parsed_rendered_manifest = JSON.parse(rendered) + + expect(parsed_rendered_manifest['service']).to be_present + expect(parsed_rendered_manifest['service'].first['@context']).to eq('http://iiif.io/api/search/0/context.json') + expect(parsed_rendered_manifest['service'].first['profile']).to eq('http://iiif.io/api/search/0/search') + expect(parsed_rendered_manifest['service'].first['@id']).to eq("/catalog/#{identifier}/iiif_search") + end + end end