From 2d7eec78bbaf8e370b0edc0a92632a937013eba9 Mon Sep 17 00:00:00 2001 From: Omar Rodriguez Arenas Date: Wed, 30 Oct 2024 11:19:14 -0600 Subject: [PATCH 1/3] WIP Add report generation jobs This is a first pass at creating the era audit reports through an active job. This first approach seems too long as it generates all the reports in the same job. We also iterate through all Item and Thesis multiple times. Should we split the report generation through multiple jobs? --- Gemfile | 1 + Gemfile.lock | 50 +++++++++++ app/jobs/generate_reports_job.rb | 144 +++++++++++++++++++++++++++++++ 3 files changed, 195 insertions(+) create mode 100644 app/jobs/generate_reports_job.rb diff --git a/Gemfile b/Gemfile index 726de393e..379c98b59 100644 --- a/Gemfile +++ b/Gemfile @@ -38,6 +38,7 @@ gem 'bcrypt', '>= 3.1.13' gem 'omniauth', '~> 2.1' gem 'omniauth-rails_csrf_protection', '~> 1.0' gem 'omniauth-saml', '~> 2.1' +gem 'omniauth_openid_connect', '~> 0.8' # Authorization gem 'pundit', '2.3.2' diff --git a/Gemfile.lock b/Gemfile.lock index 2b5834a28..bccd371fc 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -126,7 +126,9 @@ GEM tzinfo (~> 2.0) addressable (2.8.7) public_suffix (>= 2.0.2, < 7.0) + aes_key_wrap (1.1.0) ast (2.4.2) + attr_required (1.0.2) base64 (0.2.0) bcp47_spec (0.2.1) bcrypt (3.1.20) @@ -142,6 +144,7 @@ GEM parser (>= 2.4) smart_properties bigdecimal (3.1.8) + bindata (2.5.0) bindex (0.8.1) binding_of_caller (1.0.1) debug_inspector (>= 1.2.0) @@ -214,6 +217,8 @@ GEM unicode-types (~> 1.8) edtf (3.1.1) activesupport (>= 3.0, < 8.0) + email_validator (2.2.4) + activemodel erb_lint (0.5.0) activesupport better_html (>= 2.0.1) @@ -228,6 +233,8 @@ GEM i18n (>= 1.8.11, < 2) faraday (2.9.0) faraday-net_http (>= 2.0, < 3.2) + faraday-follow_redirects (0.3.0) + faraday (>= 1, < 3) faraday-http-cache (2.5.1) faraday (>= 0.8) faraday-net_http (3.1.0) @@ -300,6 +307,13 @@ GEM jsbundling-rails (1.3.0) railties (>= 6.0.0) json (2.7.2) + json-jwt (1.16.6) + activesupport (>= 
4.2)
+      aes_key_wrap
+      base64
+      bindata
+      faraday (~> 2.0)
+      faraday-follow_redirects
     json-schema (4.3.1)
       addressable (>= 2.8)
     jwt (2.7.1)
@@ -378,7 +392,23 @@ GEM
     omniauth-saml (2.1.0)
       omniauth (~> 2.0)
       ruby-saml (~> 1.12)
+    omniauth_openid_connect (0.8.0)
+      omniauth (>= 1.9, < 3)
+      openid_connect (~> 2.2)
     open4 (1.3.4)
+    openid_connect (2.3.0)
+      activemodel
+      attr_required (>= 1.0.0)
+      email_validator
+      faraday (~> 2.0)
+      faraday-follow_redirects
+      json-jwt (>= 1.16)
+      mail
+      rack-oauth2 (~> 2.2)
+      swd (~> 2.0)
+      tzinfo
+      validate_url
+      webfinger (~> 2.0)
     os (1.1.4)
     paper_trail (15.1.0)
       activerecord (>= 6.1)
@@ -398,6 +428,13 @@ GEM
     raabro (1.4.0)
     racc (1.8.0)
     rack (2.2.9)
+    rack-oauth2 (2.2.1)
+      activesupport
+      attr_required
+      faraday (~> 2.0)
+      faraday-follow_redirects
+      json-jwt (>= 1.11.0)
+      rack (>= 2.1.0)
     rack-protection (3.2.0)
       base64 (>= 0.1.0)
       rack (~> 2.2, >= 2.2.4)
@@ -588,6 +625,11 @@ GEM
     strong_migrations (2.0.0)
       activerecord (>= 6.1)
     strscan (3.1.0)
+    swd (2.0.3)
+      activesupport (>= 3)
+      attr_required (>= 0.0.5)
+      faraday (~> 2.0)
+      faraday-follow_redirects
     sxp (1.3.0)
       matrix (~> 0.4)
       rdf (~> 3.3)
@@ -603,6 +645,9 @@ GEM
     unicode-types (1.8.0)
     uri (0.13.0)
     uuidtools (2.2.0)
+    validate_url (1.0.15)
+      activemodel (>= 3.0.0)
+      public_suffix
     vcr (5.0.0)
     voight_kampff (2.0.0)
       rack (>= 1.4)
@@ -611,6 +656,10 @@ GEM
       activemodel (>= 6.0.0)
       bindex (>= 0.4.0)
       railties (>= 6.0.0)
+    webfinger (2.1.3)
+      activesupport
+      faraday (~> 2.0)
+      faraday-follow_redirects
     webmock (3.23.1)
       addressable (>= 2.8.0)
       crack (>= 0.3.2)
@@ -669,6 +718,7 @@ DEPENDENCIES
   omniauth (~> 2.1)
   omniauth-rails_csrf_protection (~> 1.0)
   omniauth-saml (~> 2.1)
+  omniauth_openid_connect (~> 0.8)
   paper_trail (~> 15.1.0)
   pg (~> 1.5.6)
   puma (~> 6.4)
diff --git a/app/jobs/generate_reports_job.rb b/app/jobs/generate_reports_job.rb
new file mode 100644
index 000000000..e48395286
--- /dev/null
+++ b/app/jobs/generate_reports_job.rb
@@ -0,0 +1,171 @@
+require 'csv'
+require 'fileutils'
+
+# Generates the ERA audit reports as timestamped CSV files.
+#
+# Reports, all written to REPORTS_DIRECTORY:
+#   1. Item/Thesis records without attached files (metadata only)
+#   2. content types of the attached files and how often each one occurs
+#   3. Item/Thesis records with at least one compressed attachment
+#   4. Item/Thesis records with more than one attachment
+#
+# The reports share only the small private helpers at the bottom of the class,
+# so any of them can be moved into a job of its own without touching the rest.
+class GenerateReportsJob < ApplicationJob
+
+  # Directory the CSV files are written to, relative to the working directory.
+  REPORTS_DIRECTORY = './era_audit'.freeze
+
+  # Content types that mark an attachment as a compressed archive.
+  COMPRESSED_FILE_TYPES = [
+    'application/zip',
+    'application/x-7z-compressed',
+    'application/gzip',
+    'application/x-xz',
+    'application/x-rar-compressed;version=5',
+    'application/x-tar',
+    'application/x-rar'
+  ].freeze
+
+  queue_as :default
+
+  # Writes every audit report.
+  #
+  # @param _args [Array] accepted so existing callers keep working; not used
+  def perform(*_args)
+    @root_directory = REPORTS_DIRECTORY
+    @time_of_start = Time.now.utc.strftime('%Y%m%d%H%M%S')
+
+    # CSV.open does not create missing directories.
+    FileUtils.mkdir_p(@root_directory)
+
+    generate_reports
+  end
+
+  private
+
+  # Helper methods to get URLs.
+
+  def get_entity_url(entity)
+    # URL example: https://era.library.ualberta.ca/items/864711f5-3021-455d-9483-9ce956ee4e78
+    Rails.application.routes.url_helpers.item_url(entity)
+  end
+
+  def get_community_url(community)
+    # URL example: https://era.library.ualberta.ca/communities/d1640714-da95-4963-9242-68065fece5f4
+    Rails.application.routes.url_helpers.community_url(community)
+  end
+
+  def get_collection_url(community, collection)
+    # URL example: https://era.library.ualberta.ca/communities/34de6895-e488-440b-b05c-75efe26c4971/collections/67e0ecb3-05b7-4c9a-bf82-31611e2dc0ce
+    Rails.application.routes.url_helpers.community_collection_url(community, collection)
+  end
+
+  # Runs the reports one after the other; none depends on another's output.
+  def generate_reports
+    report_metadata_only_records
+    report_file_types
+    report_records_with_compressed_files
+    report_multifile_records
+  end
+
+  # Report 1: Metadata only records
+  def report_metadata_only_records
+    audited_models.each do |klass|
+      attribute_names = klass.attribute_names
+      headers = csv_headers(klass, attribute_names) + ['URL']
+
+      write_csv("#{klass.name.underscore}_with_metadata_only", headers) do |csv|
+        # Let the database pick the records that have no attachment rows
+        # instead of issuing one COUNT query per record.
+        klass.where.missing(:files_attachments).find_each do |entity|
+          csv << (entity.values_at(attribute_names) + [get_entity_url(entity)])
+        end
+      end
+    end
+  end
+
+  # Report 2: List of file types
+  def report_file_types
+    file_type_counts = Hash.new(0)
+
+    audited_models.each do |klass|
+      each_entity_with_files(klass) do |entity|
+        entity.files.each { |file| file_type_counts[file.content_type] += 1 }
+      end
+    end
+
+    write_csv('entity_file_types', ['File types', 'Count']) do |csv|
+      file_type_counts.each { |content_type, count| csv << [content_type, count] }
+    end
+  end
+
+  # Report 3: List of records containing compressed files
+  def report_records_with_compressed_files
+    audited_models.each do |klass|
+      write_files_metadata_report(klass, 'with_compressed_file') do |files|
+        files.select { |file| COMPRESSED_FILE_TYPES.include?(file.content_type) }
+      end
+    end
+  end
+
+  # Report 4: List of all multi file records
+  def report_multifile_records
+    audited_models.each do |klass|
+      write_files_metadata_report(klass, 'with_multiple_files') do |files|
+        files.size > 1 ? files : []
+      end
+    end
+  end
+
+  # Shared helpers.
+
+  # Models covered by the reports.
+  def audited_models
+    [Item, Thesis]
+  end
+
+  # Yields every record of klass, loaded in batches with its attachments and
+  # blobs preloaded so that reading the files costs no extra query per record.
+  def each_entity_with_files(klass, &block)
+    klass.includes(files_attachments: :blob).find_each(&block)
+  end
+
+  # Maps attribute names to CSV headers: the pname of the attribute's first
+  # RDF annotation predicate when it has one, the plain name otherwise.
+  def csv_headers(klass, attribute_names)
+    attribute_names.map do |name|
+      annotation = klass.rdf_annotation_for_attr(name)
+      annotation.present? ? RDF::URI(annotation.first.predicate).pname.to_s : name
+    end
+  end
+
+  # Opens "<report_name>_<start time>.csv" inside the report directory with
+  # the given headers and yields the CSV so the caller can append the rows.
+  def write_csv(report_name, headers, &block)
+    path = File.join(@root_directory, "#{report_name}_#{@time_of_start}.csv")
+    CSV.open(path, 'wb', write_headers: true, headers: headers, &block)
+  end
+
+  # Writes the "<model>_<suffix>" report: one row per record for which the
+  # block returns a non-empty list of files. The block receives the record's
+  # attachments as an array and returns the ones to describe in the
+  # 'Files metadata' column.
+  def write_files_metadata_report(klass, suffix)
+    attribute_names = klass.attribute_names
+    headers = csv_headers(klass, attribute_names) + ['URL', 'Files metadata']
+
+    write_csv("#{klass.name.underscore}_#{suffix}", headers) do |csv|
+      each_entity_with_files(klass) do |entity|
+        reported_files = yield(entity.files.to_a)
+        next if reported_files.empty?
+
+        # NOTE(review): the array of JSON strings ends up in a single cell, so
+        # CSV has to stringify it; confirm the audit consumers can parse that.
+        files_metadata = reported_files.map { |file| file.blob.to_json }
+        csv << (entity.values_at(attribute_names) + [get_entity_url(entity), files_metadata])
+      end
+    end
+  end
+
+end
From 3f153dac7e61dba75f07603d38f4f9242b6f302c Mon Sep 17 00:00:00 2001
From: Omar Rodriguez Arenas
Date: Wed, 30 Oct 2024 11:22:44 -0600
Subject: [PATCH 2/3] Add missing test template

---
 test/jobs/generate_reports_job_test.rb | 7 +++++++
 1 file changed, 7 insertions(+)
 create mode 100644 test/jobs/generate_reports_job_test.rb

diff --git a/test/jobs/generate_reports_job_test.rb b/test/jobs/generate_reports_job_test.rb
new file mode 100644
index 000000000..df8206566 --- /dev/null +++ b/test/jobs/generate_reports_job_test.rb @@ -0,0 +1,7 @@ +require "test_helper" + +class GenerateReportsJobTest < ActiveJob::TestCase + # Generator placeholder: no tests are defined for GenerateReportsJob yet. + # TODO(review): replace the sample below with real assertions on the reports. + # test "the truth" do assert true end +end From c823c51a3cdd93c7151205c2b73cafcdf15008c8 Mon Sep 17 00:00:00 2001 From: Omar Rodriguez Arenas Date: Wed, 30 Oct 2024 13:55:54 -0600 Subject: [PATCH 3/3] Revert changes to Gemfile and Gemfile.lock --- Gemfile | 1 - Gemfile.lock | 50 -------------------------------------------------- 2 files changed, 51 deletions(-) diff --git a/Gemfile b/Gemfile index 379c98b59..726de393e 100644 --- a/Gemfile +++ b/Gemfile @@ -38,7 +38,6 @@ gem 'bcrypt', '>= 3.1.13' gem 'omniauth', '~> 2.1' gem 'omniauth-rails_csrf_protection', '~> 1.0' gem 'omniauth-saml', '~> 2.1' -gem 'omniauth_openid_connect', '~> 0.8' # Authorization gem 'pundit', '2.3.2' diff --git a/Gemfile.lock b/Gemfile.lock index bccd371fc..2b5834a28 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -126,9 +126,7 @@ GEM tzinfo (~> 2.0) addressable (2.8.7) public_suffix (>= 2.0.2, < 7.0) - aes_key_wrap (1.1.0) ast (2.4.2) - attr_required (1.0.2) base64 (0.2.0) bcp47_spec (0.2.1) bcrypt (3.1.20) @@ -144,7 +142,6 @@ GEM parser (>= 2.4) smart_properties bigdecimal (3.1.8) - bindata (2.5.0) bindex (0.8.1) binding_of_caller (1.0.1) debug_inspector (>= 1.2.0) @@ -217,8 +214,6 @@ GEM unicode-types (~> 1.8) edtf (3.1.1) activesupport (>= 3.0, < 8.0) - email_validator (2.2.4) - activemodel erb_lint (0.5.0) activesupport better_html (>= 2.0.1) @@ -233,8 +228,6 @@ GEM i18n (>= 1.8.11, < 2) faraday (2.9.0) faraday-net_http (>= 2.0, < 3.2) - faraday-follow_redirects (0.3.0) - faraday (>= 1, < 3) faraday-http-cache (2.5.1) faraday (>= 0.8) faraday-net_http (3.1.0) @@ -307,13 +300,6 @@ GEM jsbundling-rails (1.3.0) railties (>= 6.0.0) json (2.7.2) - json-jwt (1.16.6) - activesupport (>= 4.2) - aes_key_wrap - base64 - bindata - faraday (~> 2.0) - faraday-follow_redirects json-schema (4.3.1) addressable (>= 
2.8) jwt (2.7.1) @@ -392,23 +378,7 @@ GEM omniauth-saml (2.1.0) omniauth (~> 2.0) ruby-saml (~> 1.12) - omniauth_openid_connect (0.8.0) - omniauth (>= 1.9, < 3) - openid_connect (~> 2.2) open4 (1.3.4) - openid_connect (2.3.0) - activemodel - attr_required (>= 1.0.0) - email_validator - faraday (~> 2.0) - faraday-follow_redirects - json-jwt (>= 1.16) - mail - rack-oauth2 (~> 2.2) - swd (~> 2.0) - tzinfo - validate_url - webfinger (~> 2.0) os (1.1.4) paper_trail (15.1.0) activerecord (>= 6.1) @@ -428,13 +398,6 @@ GEM raabro (1.4.0) racc (1.8.0) rack (2.2.9) - rack-oauth2 (2.2.1) - activesupport - attr_required - faraday (~> 2.0) - faraday-follow_redirects - json-jwt (>= 1.11.0) - rack (>= 2.1.0) rack-protection (3.2.0) base64 (>= 0.1.0) rack (~> 2.2, >= 2.2.4) @@ -625,11 +588,6 @@ GEM strong_migrations (2.0.0) activerecord (>= 6.1) strscan (3.1.0) - swd (2.0.3) - activesupport (>= 3) - attr_required (>= 0.0.5) - faraday (~> 2.0) - faraday-follow_redirects sxp (1.3.0) matrix (~> 0.4) rdf (~> 3.3) @@ -645,9 +603,6 @@ GEM unicode-types (1.8.0) uri (0.13.0) uuidtools (2.2.0) - validate_url (1.0.15) - activemodel (>= 3.0.0) - public_suffix vcr (5.0.0) voight_kampff (2.0.0) rack (>= 1.4) @@ -656,10 +611,6 @@ GEM activemodel (>= 6.0.0) bindex (>= 0.4.0) railties (>= 6.0.0) - webfinger (2.1.3) - activesupport - faraday (~> 2.0) - faraday-follow_redirects webmock (3.23.1) addressable (>= 2.8.0) crack (>= 0.3.2) @@ -718,7 +669,6 @@ DEPENDENCIES omniauth (~> 2.1) omniauth-rails_csrf_protection (~> 1.0) omniauth-saml (~> 2.1) - omniauth_openid_connect (~> 0.8) paper_trail (~> 15.1.0) pg (~> 1.5.6) puma (~> 6.4)