Skip to content

Commit

Permalink
Use solr's graph query builder for nested collection queries
Browse files Browse the repository at this point in the history
This can replace the expensive nested indexing using samvera-nesting_indexer
  • Loading branch information
cjcolvar committed Sep 28, 2022
1 parent b23a24e commit f23358d
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 268 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,16 @@ class NestedCollectionsSearchBuilder < ::Hyrax::CollectionSearchBuilder
# @param access [Symbol] :edit, :read, :discover - With the given :access what all can
# @param collection [::Collection]
# @param scope [Object] Typically a controller that responds to #current_ability, #blackligh_config
# @param nesting_attributes [NestingAttributes] an object encapsulating nesting attributes of the collection
# @param nesting_attributes - no longer used
# @param nest_direction [Symbol] (:as_parent or :as_child) the direction we are adding nesting to this collection
# rubocop:disable Lint/UnusedMethodArgument
def initialize(access:, collection:, scope:, nesting_attributes:, nest_direction:)
super(scope)
@collection = collection
@discovery_permissions = extract_discovery_permissions(access)
@nesting_attributes = nesting_attributes
@nest_direction = nest_direction
end
# rubocop:enable Lint/UnusedMethodArgument

# Override for Hydra::AccessControlsEnforcement
attr_reader :discovery_permissions
Expand All @@ -24,74 +25,16 @@ def with_pagination(solr_parameters)
solr_parameters[:rows] = 1000
end

# Solr can do graph traversal without the need of special indexing with the Graph query parser so
# use this to compute both the parents and children of the current collection then exclude them
# See https://solr.apache.org/guide/solr/latest/query-guide/other-parsers.html#graph-query-parser
def show_only_other_collections_of_the_same_collection_type(solr_parameters)
solr_parameters[:fq] ||= []
solr_parameters[:fq] += [
"-" + Hyrax::SolrQueryBuilderService.construct_query_for_ids([limit_ids]),
Hyrax::SolrQueryBuilderService.construct_query(Hyrax.config.collection_type_index_field => @collection.collection_type_gid)
Hyrax::SolrQueryBuilderService.construct_query(Hyrax.config.collection_type_index_field => @collection.collection_type_gid),
"-{!graph from=id to=member_of_collection_ids_ssim}id:#{@collection.id}",
"-{!graph to=id from=member_of_collection_ids_ssim}id:#{@collection.id}"
]
solr_parameters[:fq] += limit_clause if limit_clause # add limits to prevent illegal nesting arrangements
end

private

def limit_ids
# exclude current collection from returned list
limit_ids = [@collection.id.to_s]
# cannot add a parent that is already a parent
limit_ids += @nesting_attributes.parents if @nesting_attributes.parents && @nest_direction == :as_parent
limit_ids
end

# remove collections from list in order to to prevent illegal nesting arrangements
def limit_clause
case @nest_direction
when :as_parent
eligible_to_be_a_parent
when :as_child
eligible_to_be_a_child
end
end

# To be eligible to be a parent collection of child "Collection G":
# 1) cannot have any pathnames containing Collection G's ID
# 2) cannot already be Collection G's direct parent
# => this is handled through limit_ids method
def eligible_to_be_a_parent
# Using a !lucene query allows us to get items using a wildcard query, a feature not supported via AF query builder.
["-_query_:\"{!lucene df=#{Samvera::NestingIndexer.configuration.solr_field_name_for_storing_pathnames}}*#{@collection.id}*\""]
end

# To be eligible to be a child collection of parent "Collection F":
# 1) Cannot have any pathnames containing any of Collection F's pathname or ancestors
# 2) cannot already be Collection F's direct child
def eligible_to_be_a_child
exclude_path = []
exclude_path << exclude_if_paths_contain_collection
exclude_path << exclude_if_already_parent
end

def exclude_if_paths_contain_collection
# 1) Exclude any pathnames containing any of Collection F's pathname or ancestors
array_to_exclude = [] + @nesting_attributes.pathnames unless @nesting_attributes.pathnames.nil?
array_to_exclude += @nesting_attributes.ancestors unless @nesting_attributes.ancestors.nil?
# build a unique string containing all of Collection F's pathnames and ancestors
exclude_list = ""
array_to_exclude&.uniq&.each do |element|
exclude_list += ' ' unless exclude_list.empty?
exclude_list += element.to_s
end
# Using a !lucene query allows us to get items which match any individual element
# from the list. Building the query via the AF builder created a !field query which
# only searches the field for an exact string and doesn't allow an "OR" connection
# between the elements.
return "-_query_:\"{!lucene q.op=OR df=#{Samvera::NestingIndexer.configuration.solr_field_name_for_storing_pathnames}}#{exclude_list}\"" unless exclude_list.empty?
""
end

def exclude_if_already_parent
# 2) Exclude any of Collection F's direct children
"-" + ActiveFedora::SolrQueryBuilder.construct_query(Samvera::NestingIndexer.configuration.solr_field_name_for_storing_parent_ids => @collection.id.to_s)
end
end
end
Expand Down
103 changes: 5 additions & 98 deletions app/services/hyrax/collections/nested_collection_query_service.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,26 +4,6 @@ module Collections
##
# A query service handling nested collection queries.
module NestedCollectionQueryService
##
# @api private
#
# an encapsulation of a collection's nesting index attributes
class NestingAttributes
attr_accessor :parents, :pathnames, :ancestors, :depth, :id

def initialize(id:, scope:)
query_builder = Hyrax::CollectionSearchBuilder.new(scope).where(id: id.to_s)
query = Hyrax::Collections::NestedCollectionQueryService.clean_lucene_error(builder: query_builder)
response = scope.repository.search(query)
collection_doc = response.documents.first
@id = id.to_s
@parents = collection_doc[Samvera::NestingIndexer.configuration.solr_field_name_for_storing_parent_ids]
@pathnames = collection_doc[Samvera::NestingIndexer.configuration.solr_field_name_for_storing_pathnames]
@ancestors = collection_doc[Samvera::NestingIndexer.configuration.solr_field_name_for_storing_ancestors]
@depth = collection_doc[Samvera::NestingIndexer.configuration.solr_field_name_for_deepest_nested_depth]
end
end

##
# @api public
#
Expand Down Expand Up @@ -71,8 +51,7 @@ def self.available_parent_collections(child:, scope:, limit_to_id: nil)
def self.parent_collections(child:, scope:, page: 1)
return [] unless nestable?(collection: child)
query_builder = Hyrax::NestedCollectionsParentSearchBuilder.new(scope: scope, child: child, page: page)
query = clean_lucene_error(builder: query_builder)
scope.repository.search(query)
scope.repository.search(query_builder.query)
end

##
Expand All @@ -86,39 +65,19 @@ def self.parent_collections(child:, scope:, page: 1)
# id is in the response. Useful for validation.
# @param nest_direction [Symbol] :as_child or :as_parent
def self.query_solr(collection:, access:, scope:, limit_to_id:, nest_direction:)
nesting_attributes = NestingAttributes.new(id: collection.id.to_s, scope: scope)
query_builder = Hyrax::Dashboard::NestedCollectionsSearchBuilder.new(
access: access,
collection: collection,
scope: scope,
nesting_attributes: nesting_attributes,
nesting_attributes: nil,
nest_direction: nest_direction
)

query_builder.where(id: limit_to_id.to_s) if limit_to_id
query = clean_lucene_error(builder: query_builder)
scope.repository.search(query)
scope.repository.search(query_builder.query)
end
private_class_method :query_solr

##
# @api private
#
# clean query for +{!lucene}+ error
#
# @param builder [SearchBuilder]
# @return [Blacklight::Solr::Request] cleaned and functional query
def self.clean_lucene_error(builder:)
# TODO: Need to investigate further to understand why these particular queries
# using the where cause fail when others in the app apparently work
#
# Perhaps see <https://github.com/projectblacklight/blacklight/blob/064302f73eee4baae4d2abf863c68317d3efb5b7/lib/blacklight/solr/search_builder_behavior.rb#L84-L102>.
# This can be averted by using #with in at least some cases?
query = builder.query.to_hash
query['q'] = query['q'].gsub('{!lucene}', '') if query.key?('q')
query
end

##
# @api public
#
Expand Down Expand Up @@ -154,63 +113,11 @@ def self.parent_and_child_can_nest?(parent:, child:, scope:)
# @param scope [Object] Typically a controller object that responds
# to +repository+, +can?+, +blacklight_config+, +current_ability+
#
# @return [Boolean] true if the parent can nest the child; false otherwise
def self.valid_combined_nesting_depth?(parent:, child: nil, scope:)
# We limit the total depth of collections to the size specified in the samvera-nesting_indexer configuration.
child_depth = child_nesting_depth(child: child, scope: scope)
parent_depth = parent_nesting_depth(parent: parent, scope: scope)
return false if parent_depth + child_depth > Samvera::NestingIndexer.configuration.maximum_nesting_depth
# @return [Boolean] true Always return true because we are no longer limited
def self.valid_combined_nesting_depth?(*)
true
end

# @api private
#
# Get the child collection's nesting depth
#
# @param child [::Collection]
# @return [Fixnum] the largest number of collections in a path nested
# under this collection (including this collection)
def self.child_nesting_depth(child:, scope:)
return 1 unless child
# The nesting depth of a child collection is found by finding the largest nesting depth
# among all collections and works which have the child collection in the paths, and
# subtracting the nesting depth of the child collection itself.
# => 1) First we find all the collections with this child in the path, sort the results in descending order, and take the first result.
# note: We need to include works in this search. They are included in the depth validations in
# the indexer, so we do NOT use collection search builder here.
builder = Hyrax::SearchBuilder.new(scope).with({
q: "#{Samvera::NestingIndexer.configuration.solr_field_name_for_storing_pathnames}:/.*#{child.id}.*/",
sort: "#{Samvera::NestingIndexer.configuration.solr_field_name_for_deepest_nested_depth} desc"
})
builder.rows = 1
query = clean_lucene_error(builder: builder)
response = scope.repository.search(query).documents.first

# Now we have the largest nesting depth for all paths containing this collection
descendant_depth = response[Samvera::NestingIndexer.configuration.solr_field_name_for_deepest_nested_depth]

# => 2) Then we get the stored depth of the child collection itself to eliminate the collections above this one from our count, and add 1 to add back in this collection itself
child_depth = NestingAttributes.new(id: child.id.to_s, scope: scope).depth
nesting_depth = descendant_depth - child_depth + 1

# this should always be positive, but just being safe
nesting_depth.positive? ? nesting_depth : 1
end
private_class_method :child_nesting_depth

# @api private
#
# Get the parent collection's nesting depth
#
# @param parent [::Collection]
# @return [Fixnum] the largest number of collections above
# this collection (includes this collection)
def self.parent_nesting_depth(parent:, scope:)
return 1 if parent.nil?
NestingAttributes.new(id: parent.id.to_s, scope: scope).depth
end
private_class_method :parent_nesting_depth

# @api private
#
# @param collection [Hyrax::PcdmCollection,::Collection]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,9 @@
it 'will exclude the given collection and its parents' do
subject
expect(solr_params.fetch(:fq)).to contain_exactly(
"-{!terms f=id}#{collection_id},#{nesting_attributes.parents.first},#{nesting_attributes.parents.last}",
"_query_:\"{!field f=collection_type_gid_ssim}#{collection.collection_type_gid}\"",
"-_query_:\"{!lucene df=nesting_collection__pathnames_ssim}*#{collection_id}*\""
"-{!graph to=id from=member_of_collection_ids_ssim}id:#{collection.id}",
"-{!graph from=id to=member_of_collection_ids_ssim}id:#{collection.id}"
)
end
end
Expand All @@ -70,14 +70,11 @@

it 'will build the search for valid children' do
subject
# rubocop:disable Layout/LineLength
expect(solr_params.fetch(:fq)).to contain_exactly(
"-{!terms f=id}#{collection_id}",
"_query_:\"{!field f=collection_type_gid_ssim}#{collection.collection_type_gid}\"",
"-_query_:\"{!lucene q.op=OR df=nesting_collection__pathnames_ssim}#{nesting_attributes.pathnames.first} #{nesting_attributes.pathnames.last} #{nesting_attributes.ancestors.first} #{nesting_attributes.ancestors.last}\"",
"-_query_:\"{!field f=nesting_collection__parent_ids_ssim}#{collection_id}\""
"-{!graph to=id from=member_of_collection_ids_ssim}id:#{collection.id}",
"-{!graph from=id to=member_of_collection_ids_ssim}id:#{collection.id}"
)
# rubocop:enable Layout/LineLength
end
end
end
Expand Down
Loading

0 comments on commit f23358d

Please sign in to comment.