Skip to content

Commit

Permalink
Big Files Go Straight To S3 (#631)
Browse files Browse the repository at this point in the history
* Instead of uploading large files through fcrepo, upload them straight to S3, then create an external file in fcrepo, keeping the data consistent and making sure Hyrax always uses the right file

* additional fixes
  • Loading branch information
orangewolf authored Jun 18, 2024
1 parent 8c0e31f commit 0c50471
Show file tree
Hide file tree
Showing 6 changed files with 63 additions and 3 deletions.
2 changes: 1 addition & 1 deletion Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ gem 'blacklight_oai_provider', '~> 6.1', '>= 6.1.1'
gem 'blacklight_range_limit', '~> 7.0'
gem 'bolognese', '>= 1.9.10'
gem 'bootstrap-datepicker-rails'
gem 'bulkrax', github: 'samvera/bulkrax', branch: 'download_imported_file' # '~> 7.0'
gem 'bulkrax', github: 'samvera/bulkrax', branch: 'main' # '~> 8.1.0'
gem 'byebug', group: %i[development test]
gem 'capybara', group: %i[test]
gem 'capybara-screenshot', '~> 1.0', group: %i[test]
Expand Down
4 changes: 2 additions & 2 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ GIT

GIT
remote: https://github.com/samvera/bulkrax.git
revision: 31da660e766c66073375df1848c8e45b03675652
branch: download_imported_file
revision: 74b8d7eb426e4b37b35b86ce7abff084fa64783a
branch: main
specs:
bulkrax (8.1.0)
bagit (~> 0.4.6)
Expand Down
37 changes: 37 additions & 0 deletions app/actors/hyrax/actors/file_actor_decorator.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
require 'digest'

module Hyrax
  module Actors
    # Decorator for Hyrax::Actors::FileActor that changes how a file is
    # attached to a file_set: files at or above DIRECT_S3_UPLOAD_THRESHOLD are
    # uploaded straight to S3 and registered in fcrepo as an external file;
    # smaller files are written to fcrepo as before.
    # @note Spawns asynchronous jobs
    module FileActorDecorator
      # Size in bytes (3 GiB, the same value as ActiveSupport's +3.gigabytes+)
      # at or above which a file is uploaded directly to S3. The original
      # author's note: these are files too big to send to S3 w/o streaming.
      DIRECT_S3_UPLOAD_THRESHOLD = 3 * (1024**3)

      # Attaches +io+ to the file_set, saves it, creates a version and queues
      # characterization.
      #
      # @param io [#size, #path] wrapper around the file being ingested
      # @return [false] when the file_set could not be saved
      # @return [Object] otherwise, whatever CharacterizeJob.perform_later returns
      # @raise [KeyError] on the direct-to-S3 path when ENV['AWS_BUCKET'] is unset
      def perform_ingest_file_through_active_fedora(io)
        # NOTE(review): these progress messages are logged at error level,
        # presumably so they show up in production logs -- confirm before lowering.
        Rails.logger.error("[FileActor] starting write for #{file_set.id}")
        if io.size.to_i >= DIRECT_S3_UPLOAD_THRESHOLD
          attach_via_direct_s3_upload(io)
        else
          Rails.logger.error("[FileActor] writing to fcrepo #{file_set.id}")
          # Skip versioning because versions will be minted by VersionCommitter as necessary during save_characterize_and_record_committer.
          Hydra::Works::AddFileToFileSet.call(file_set,
                                              io,
                                              relation,
                                              versioning: false)
        end
        return false unless file_set.save

        repository_file = related_file
        create_version(repository_file, user)
        CharacterizeJob.perform_later(file_set, repository_file.id, pathhint(io))
      end

      private

      # Uploads the file at +io.path+ to S3 under its SHA1 digest (skipping the
      # upload when an object with that key already exists) and attaches the
      # object's public URL to the file_set as an external file.
      #
      # The digest is computed in-process with Digest::SHA1.file rather than by
      # shelling out to +sha1sum+, so paths containing spaces or shell
      # metacharacters are handled safely and a failure raises instead of
      # silently producing a nil digest.
      #
      # @param io [#path] wrapper around the file being ingested
      def attach_via_direct_s3_upload(io)
        Rails.logger.error("[FileActor] Uploading directly to S3 for file_set #{file_set.id}")
        digest = Digest::SHA1.file(io.path).hexdigest
        # The digest stored here is what FileSetIndexerDecorator writes to
        # Solr as digest_ssim, since fcrepo only holds a placeholder body.
        file_set.s3_only = digest
        s3_object = Aws::S3::Object.new(ENV.fetch('AWS_BUCKET'), digest)
        s3_object.upload_file(io.path) unless s3_object.exists?
        Hydra::Works::AddExternalFileToFileSet.call(file_set, s3_object.public_url, relation)
      end
    end
  end
end

# Prepend the decorator so its perform_ingest_file_through_active_fedora takes
# precedence over the method defined on Hyrax::Actors::FileActor itself.
Hyrax::Actors::FileActor.prepend(Hyrax::Actors::FileActorDecorator)
2 changes: 2 additions & 0 deletions app/indexers/hyrax/file_set_indexer_decorator.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ module Hyrax
module FileSetIndexerDecorator
def generate_solr_document
super.tap do |solr_doc|
# digest of the original file if we are using an external file due to s3 direct upload
solr_doc['digest_ssim'] = "urn:sha1:#{object.s3_only}" if object.s3_only.present?
solr_doc['rdf_type_ssim'] = object.parent_works.first.rdf_type if attachment?
solr_doc['all_text_tesimv'] = solr_doc['all_text_tsimv'] if solr_doc['all_text_tsimv'].present?
end
Expand Down
6 changes: 6 additions & 0 deletions app/models/file_set.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@ class FileSet < ActiveFedora::Base
index.as :stored_searchable, :facetable
end

# SHA1 digest of the original file, set by the file actor decorator when the
# file was uploaded directly to S3 and attached as an external file. The file
# set indexer decorator uses it to populate digest_ssim in the Solr document.
property :s3_only,
predicate: ::RDF::URI("https://hykucommons.org/terms/s3_only"),
multiple: false do |index|
index.as :stored_searchable, :facetable
end

include ::Hyrax::FileSetBehavior
# @return [String] the Attachment's rdf type for the given FileSet
# @return [NilClass] when there is no rdf_type for the parent work (Attachment).
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# OVERRIDE Hydra-works 2.0.0 to work around the inability of fcrepo + S3 to upload empty files

module Hydra
  module Works
    # Decorator for Hydra::Works::AddExternalFileToFileSet::Updater that gives
    # the external file a non-empty placeholder body, because an empty file
    # cannot be uploaded when fcrepo is backed by S3.
    module UpdaterDecorator
      # Populates the current file so that it points at an external URL.
      #
      # @param external_file_url [#to_s] URL of the externally stored content
      # @param filename [String, nil] original name to record on the file
      # @return [String] the mime type string that was assigned
      def attach_attributes(external_file_url, filename = nil)
        placeholder_body = StringIO.new('-') # anything but blank
        external_body_type = "message/external-body; access-type=URL; URL=\"#{external_file_url}\""

        target = current_file
        target.content = placeholder_body
        target.original_name = filename
        target.mime_type = external_body_type
      end
    end
  end
end

# Prepend the decorator so its attach_attributes takes precedence over the
# method defined on the Updater class itself.
Hydra::Works::AddExternalFileToFileSet::Updater.prepend(Hydra::Works::UpdaterDecorator)

0 comments on commit 0c50471

Please sign in to comment.