From aae485109efe981a43f6f6009d743d8b30a40172 Mon Sep 17 00:00:00 2001 From: Monique Rio Date: Wed, 14 Aug 2024 16:28:57 -0400 Subject: [PATCH] adds translation map generator for sh remediation Translation map is a bit of a misnomer since this is more than a one-to-one mapping. It's only the same in that it's some config that's generated before doing indexing. --- .gitignore | 1 + umich_catalog_indexing/env.example | 1 + .../lib/jobs/translation_map_generator.rb | 6 +- .../subject_heading_remediation.rb | 89 ++++ umich_catalog_indexing/lib/services.rb | 4 + umich_catalog_indexing/lib/services/paths.rb | 8 + .../fixtures/subjects/authority_record.json | 23 ++ .../fixtures/subjects/authority_record2.json | 23 ++ .../spec/fixtures/subjects/authority_set.json | 9 + .../umich/subject_heading_remediation.json | 390 ++++++++++++++++++ .../subject_heading_remediation_spec.rb | 169 ++++++++ 11 files changed, 721 insertions(+), 2 deletions(-) create mode 100644 umich_catalog_indexing/lib/jobs/translation_map_generator/subject_heading_remediation.rb create mode 100644 umich_catalog_indexing/spec/fixtures/subjects/authority_record.json create mode 100644 umich_catalog_indexing/spec/fixtures/subjects/authority_record2.json create mode 100644 umich_catalog_indexing/spec/fixtures/subjects/authority_set.json create mode 100644 umich_catalog_indexing/spec/fixtures/translation_maps/umich/subject_heading_remediation.json create mode 100644 umich_catalog_indexing/spec/jobs/translation_map_generator/subject_heading_remediation_spec.rb diff --git a/.gitignore b/.gitignore index 28ecd3ae..3f35b145 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,7 @@ umich_catalog_indexing/coverage/ umich_catalog_indexing/lib/translation_maps/hlb.json.gz umich_catalog_indexing/lib/translation_maps/umich/libLocInfo.yaml +umich_catalog_indexing/lib/translation_maps/umich/subject_heading_remediation.json umich_catalog_indexing/scratch/* !umich_catalog_indexing/scratch/.keep diff --git 
module Jobs
  module TranslationMapGenerator
    # Generates the subject heading remediation config that is written to
    # umich/subject_heading_remediation.json before indexing.
    #
    # The output is a JSON array with one entry per authority record in the
    # Alma set identified by S.subject_heading_remediation_set_id. Each entry
    # has the shape
    #   {"150" => {code => [values]}, "450" => [{code => [values]}, ...]}
    # where "150" is the remediated heading and "450" lists the deprecated
    # headings it replaces.
    module SubjectHeadingRemediation
      class << self
        include FileWriter

        # @return [String] human-readable name of this generator
        def name
          "Subject Headings mapping"
        end

        # @return [String] where in the translation map directory the file
        #   should go
        def file_path
          File.join("umich", "subject_heading_remediation.json")
        end

        # Fetches the configured Alma set and every authority record in it.
        #
        # @return [String] JSON string of mapping
        # @raise [StandardError] if Alma can't be reached or a record is unusable
        def generate
          JSON.pretty_generate(Set.for(S.subject_heading_remediation_set_id).to_a)
        end
      end

      # The members of an Alma set of authority records.
      #
      # NOTE: inside this namespace the bare constant `Set` resolves to this
      # class, not to Ruby's ::Set.
      class Set
        # Fetches the members of an Alma set.
        #
        # @param id [String] Alma set id
        # @return [Set]
        # @raise [StandardError] when Alma does not answer with a 200
        def self.for(id)
          # The member list lives under "member" (see #ids), so that is the
          # key handed to the client as the per-page records key.
          # NOTE(review): the original passed "members"; confirm against
          # AlmaRestClient#get_all that "member" is what it expects.
          resp = AlmaRestClient.client.get_all(url: "conf/sets/#{id}/members", record_key: "member")
          raise StandardError, "Couldn't retrieve authority set data for #{id}" unless resp.status == 200

          new(resp.body)
        end

        # @param data [Hash] parsed body of the set members response
        def initialize(data)
          @data = data
        end

        # @return [Array<String>] ids of the authority records in the set;
        #   empty when the response carries no "member" list
        def ids
          (@data["member"] || []).map { |member| member["id"] }
        end

        # Fetches every member from Alma (one request per id).
        #
        # @return [Array<Authority>]
        def authority_records
          ids.map { |id| Authority.for(id) }
        end

        # @return [Array<Hash>] Authority#to_h for every member of the set
        def to_a
          authority_records.map(&:to_h)
        end
      end

      # A single authority record: one remediated heading (150) plus the
      # deprecated headings (450) that should be replaced by it.
      class Authority
        # Subfield codes that are kept; every other subfield is ignored.
        SUBFIELDS = ["a", "v", "x", "y", "z"].freeze

        # Fetches one authority record from Alma.
        #
        # @param authority_record_id [String]
        # @return [Authority]
        # @raise [StandardError] when Alma does not answer with a 200
        def self.for(authority_record_id)
          resp = AlmaRestClient.client.get("bibs/authorities/#{authority_record_id}", query: {view: "full"})
          raise StandardError, "Couldn't retrieve authority data for #{authority_record_id}" unless resp.status == 200

          new(resp.body)
        end

        # @param data [Hash] parsed Alma authority response; the MARC XML is
        #   read from the first element of data["anies"]
        # @raise [StandardError] when the response carries no MARC XML
        def initialize(data)
          xml = data["anies"]&.first
          # Fail with a readable message instead of the TypeError that
          # StringIO.new(nil) would raise.
          raise StandardError, "Authority record has no MARC XML in \"anies\"" if xml.nil?

          @record = MARC::XMLReader.new(StringIO.new(xml)).first
        end

        # @return [Hash{String => Array<String>}] kept subfields of the first
        #   150 field, grouped by subfield code
        # @raise [StandardError] when the record has no 150 field
        def remediated_term
          field = @record.fields("150").first
          raise StandardError, "Authority record has no 150 field" if field.nil?

          subfield_hash(field)
        end

        # Terms with more subfield codes come first. Ties keep their order in
        # the record: the index tie-breaker makes that explicit because
        # Ruby's sort is not guaranteed to be stable.
        #
        # @return [Array<Hash{String => Array<String>}>] one hash per 450 field
        def deprecated_terms
          @record.fields("450")
            .map { |field| subfield_hash(field) }
            .each_with_index
            .sort_by { |term, index| [-term.size, index] }
            .map(&:first)
        end

        # @return [Hash] {"150" => remediated_term, "450" => deprecated_terms}
        def to_h
          {
            "150" => remediated_term,
            "450" => deprecated_terms
          }
        end

        private

        # Groups the values of the kept subfields of a field by subfield code,
        # preserving the order in which they appear. Returns a plain Hash (no
        # default proc) so reading a missing key never mutates the result.
        def subfield_hash(field)
          field.subfields.each_with_object({}) do |sf, out|
            (out[sf.code] ||= []) << sf.value if SUBFIELDS.include?(sf.code)
          end
        end
      end
    end
  end
end
S.register(:marc_record_batch_size) { ENV.fetch("MARC_RECORD_BATCH_SIZE", 200_000) } +S.register(:subject_heading_remediation_set_id) { ENV["SUBJECT_HEADING_REMEDIATION_SET_ID"] } + +S.register(:app_env) { ENV["APP_ENV"] || "development" } + require_relative "services/paths" require_relative "services/logger" require_relative "services/dbs" diff --git a/umich_catalog_indexing/lib/services/paths.rb b/umich_catalog_indexing/lib/services/paths.rb index 82f8aa01..b0750cd6 100644 --- a/umich_catalog_indexing/lib/services/paths.rb +++ b/umich_catalog_indexing/lib/services/paths.rb @@ -3,3 +3,11 @@ end S.register(:scratch_dir) { File.join(S.project_root, "scratch") } + +S.register(:translation_map_dir) do + if S.app_env == "test" + File.join(S.project_root, "spec", "fixtures", "translation_maps") + else + File.join(S.project_root, "lib", "translation_maps") + end +end diff --git a/umich_catalog_indexing/spec/fixtures/subjects/authority_record.json b/umich_catalog_indexing/spec/fixtures/subjects/authority_record.json new file mode 100644 index 00000000..984a197e --- /dev/null +++ b/umich_catalog_indexing/spec/fixtures/subjects/authority_record.json @@ -0,0 +1,23 @@ +{ + "mms_id": 98187481368106380, + "record_format": "marc21_authority", + "title": "Undocumented immigrants", + "created_by": "System", + "created_date": "2021-11-17Z", + "last_modified_by": "rednaal", + "last_modified_date": "2023-03-29Z", + "originating_system": "LIBRARY_OF_CONGRESS", + "originating_system_id": "98184898010106381", + "cataloging_level": { + "value": "00", + "desc": "Default Level" + }, + "vocabulary": { + "value": "MIUSH", + "desc": "miush" + }, + "anies": [ + "01200cz a2200301n 450020230329130030.0030627i| anannbabn |a ana 98187481368106381sh 85003553(DLC)sh 85003553(LIBRARY_OF_CONGRESS)98171057700000041DLCDLCDLCWaUUndocumented immigrantsUndocumented foreign nationalsIllegal aliensAliensLegal status, laws, etc.nneAliens, IllegalIllegal aliensLegal status, laws, etc.Illegal 
immigrantsUndocumented noncitizensgAliensImmigrant detention centersHuman smugglingNoncitizensIllegal immigrationWork cat.: 2007017970: Illegal immigration, 2007:eCIP data sheet (Illegal immigrants)sla-lab updated based on DEIA Catalog Working Group changes May 2021; \"Undocumented immigrants\" term borrowed from Sears ; \"undocumented foreign national\" term from Bill H.R. 3776 (116th Congress)sla-lab updated to include 550s for LCSH headings as references March 2023" + ], + "link": "https://api-na.hosted.exlibrisgroup.com/almaws/v1/bibs/authorities/98187481368106381" +} diff --git a/umich_catalog_indexing/spec/fixtures/subjects/authority_record2.json b/umich_catalog_indexing/spec/fixtures/subjects/authority_record2.json new file mode 100644 index 00000000..2d722259 --- /dev/null +++ b/umich_catalog_indexing/spec/fixtures/subjects/authority_record2.json @@ -0,0 +1,23 @@ +{ + "mms_id": 999, + "record_format": "marc21_authority", + "title": "Whatever", + "created_by": "System", + "created_date": "2021-11-17Z", + "last_modified_by": "rednaal", + "last_modified_date": "2023-03-29Z", + "originating_system": "LIBRARY_OF_CONGRESS", + "originating_system_id": "98184898010106381", + "cataloging_level": { + "value": "00", + "desc": "Default Level" + }, + "vocabulary": { + "value": "MIUSH", + "desc": "miush" + }, + "anies": [ + "01200cz a2200301n 450020230329130030.0030627i| anannbabn |a ana 98187481368106381sh 85003553(DLC)sh 85003553(LIBRARY_OF_CONGRESS)98171057700000041DLCDLCDLCWaUWhateverFirst x fieldSecond x fieldFirst v fieldSecond v fieldFirst y fieldSecond y fieldFirst z fieldSecond z fieldStuffFirst deprecated x fieldSecond deprecated x fieldFirst deprecated v fieldSecond deprecated v fieldFirst deprecated y fieldSecond deprecated y fieldFirst deprecated z fieldSecond deprecated z fieldWork cat.: 2007017970: Illegal immigration, 2007:eCIP data sheet (Illegal immigrants)sla-lab updated based on DEIA Catalog Working Group changes May 2021; \"Whatever\" term borrowed 
from Sears ; \"undocumented foreign national\" term from Bill H.R. 3776 (116th Congress)sla-lab updated to include 550s for LCSH headings as references March 2023" + ], + "link": "https://api-na.hosted.exlibrisgroup.com/almaws/v1/bibs/authorities/98187481368106381" +} diff --git a/umich_catalog_indexing/spec/fixtures/subjects/authority_set.json b/umich_catalog_indexing/spec/fixtures/subjects/authority_set.json new file mode 100644 index 00000000..4aa75dbf --- /dev/null +++ b/umich_catalog_indexing/spec/fixtures/subjects/authority_set.json @@ -0,0 +1,9 @@ +{ + "member": [ + { + "id": "98187481368106381", + "description": "Undocumented immigrants" + } + ], + "total_record_count": 1 +} diff --git a/umich_catalog_indexing/spec/fixtures/translation_maps/umich/subject_heading_remediation.json b/umich_catalog_indexing/spec/fixtures/translation_maps/umich/subject_heading_remediation.json new file mode 100644 index 00000000..5b5c9bee --- /dev/null +++ b/umich_catalog_indexing/spec/fixtures/translation_maps/umich/subject_heading_remediation.json @@ -0,0 +1,390 @@ +[ + { + "150": { + "a": [ + "Immigrant detention centers" + ] + }, + "450": [ + { + "a": [ + "Alien detention centers" + ] + }, + { + "a": [ + "Detention centers, Alien" + ] + }, + { + "a": [ + "Detention centers, Noncitizen" + ] + }, + { + "a": [ + "Detention centers, Immigration" + ] + }, + { + "a": [ + "Immigration detention centers" + ] + }, + { + "a": [ + "Undocumented immigrant detention centers" + ] + } + ] + }, + { + "150": { + "a": [ + "Undocumented immigrants" + ] + }, + "450": [ + { + "a": [ + "Aliens" + ], + "x": [ + "Legal status, laws, etc." + ] + }, + { + "a": [ + "Illegal aliens" + ], + "x": [ + "Legal status, laws, etc." 
+ ] + }, + { + "a": [ + "Undocumented foreign nationals" + ] + }, + { + "a": [ + "Illegal aliens" + ] + }, + { + "a": [ + "Aliens, Illegal" + ] + }, + { + "a": [ + "Illegal immigrants" + ] + }, + { + "a": [ + "Undocumented noncitizens" + ] + } + ] + }, + { + "150": { + "a": [ + "Children of undocumented immigrants" + ] + }, + "450": [ + { + "a": [ + "Children of undocumented foreign nationals" + ] + }, + { + "a": [ + "Children of illegal aliens" + ] + }, + { + "a": [ + "First generation children" + ] + }, + { + "a": [ + "Illegal aliens' children" + ] + }, + { + "a": [ + "Second generation children" + ] + }, + { + "a": [ + "Noncitizens' children" + ] + } + ] + }, + { + "150": { + "a": [ + "Children of undocumented immigrants" + ], + "x": [ + "Education" + ] + }, + "450": [ + { + "a": [ + "Children of undocumented foreign nationals" + ], + "x": [ + "Education" + ] + }, + { + "a": [ + "Children of illegal aliens" + ], + "x": [ + "Education" + ] + } + ] + }, + { + "150": { + "a": [ + "Children of undocumented immigrants" + ], + "x": [ + "Education", + "Law and legislation" + ] + }, + "450": [ + { + "a": [ + "Children of undocumented foreign nationals" + ], + "x": [ + "Education", + "Law and legislation" + ] + }, + { + "a": [ + "Children of illegal aliens" + ], + "x": [ + "Education", + "Law and legislation" + ] + } + ] + }, + { + "150": { + "a": [ + "Undocumented immigrants" + ], + "x": [ + "Government policy" + ], + "z": [ + "United States" + ] + }, + "450": [ + { + "a": [ + "Illegal aliens" + ], + "x": [ + "Goverment policy" + ], + "z": [ + "United States" + ] + }, + { + "a": [ + "Undocumented foreign nationals" + ], + "z": [ + "United States" + ] + } + ] + }, + { + "150": { + "a": [ + "Undocumented immigrants" + ], + "z": [ + "United States" + ] + }, + "450": [ + { + "a": [ + "Undocumented foreign nationals" + ], + "z": [ + "United States" + ] + }, + { + "a": [ + "Illegal aliens" + ], + "z": [ + "United States" + ] + } + ] + }, + { + "150": { + "a": [ + 
require_relative "../../spec_helper"
require "jobs"

# Expected values for spec/fixtures/subjects/authority_record.json, shared by
# both example groups below. They live in a module that the groups include
# rather than in top-level `def`s, which would add private methods to Object
# for the whole suite (with the same names as the methods under test).
module SubjectHeadingRemediationExpectations
  # Expected Authority#remediated_term for the fixture record.
  def remediated_term
    {"a" => ["Undocumented immigrants"]}
  end

  # Expected Authority#deprecated_terms for the fixture record: terms with
  # more subfield codes first, otherwise in record order.
  def deprecated_terms
    [
      {
        "a" => ["Aliens"],
        "x" => ["Legal status, laws, etc."]
      },
      {
        "a" => ["Illegal aliens"],
        "x" => ["Legal status, laws, etc."]
      },
      {
        "a" => ["Undocumented foreign nationals"]
      },
      {
        "a" => ["Illegal aliens"]
      },
      {
        "a" => ["Aliens, Illegal"]
      },
      {
        "a" => ["Illegal immigrants"]
      },
      {
        "a" => ["Undocumented noncitizens"]
      }
    ]
  end
end

describe Jobs::TranslationMapGenerator::SubjectHeadingRemediation::Set do
  include SubjectHeadingRemediationExpectations

  # Kept as an instance variable (not a `let`) because the #to_a example
  # replaces it with an extended member list before `subject` is built.
  before(:each) do
    @data = fixture("subjects/authority_set.json")
  end
  let(:set_id) { "1234" }
  let(:authority_record_id) { "98187481368106381" }
  let(:authority_record) { fixture("subjects/authority_record.json") }
  let(:second_authority_record_id) { "999" }
  let(:second_authority_record) { fixture("subjects/authority_record2.json") }
  # The stub_* lets are referenced by name inside examples purely for their
  # side effect of registering the request stub.
  let(:stub_set_request) {
    stub_alma_get_request(
      url: "conf/sets/#{set_id}/members",
      query: {limit: 100, offset: 0},
      output: @data
    )
  }
  let(:stub_authority_request) {
    stub_alma_get_request(
      url: "bibs/authorities/#{authority_record_id}",
      query: {view: "full"},
      output: authority_record
    )
  }
  let(:stub_second_authority_request) {
    stub_alma_get_request(
      url: "bibs/authorities/#{second_authority_record_id}",
      query: {view: "full"},
      output: second_authority_record
    )
  }
  subject do
    described_class.new(JSON.parse(@data))
  end
  context "#ids" do
    it "returns an array of ids" do
      expect(subject.ids).to contain_exactly(authority_record_id)
    end
  end

  context "#authority_records" do
    it "returns an array of Authority objects" do
      stub_authority_request
      expect(subject.authority_records.first.remediated_term).to eq({"a" => ["Undocumented immigrants"]})
    end
  end

  context "#to_a" do
    it "returns an array with one 150/450 hash per authority record" do
      # Add an extra member to json
      d = JSON.parse(@data)
      d["member"].push({"id" => "999", "description" => "string"})
      @data = d.to_json
      stub_authority_request
      stub_second_authority_request

      expect(subject.to_a).to eq(
        [
          {
            "150" => remediated_term,
            "450" => deprecated_terms
          },
          {
            "150" => {
              "a" => ["Whatever"],
              "x" => ["First x field", "Second x field"],
              "v" => ["First v field", "Second v field"],
              "y" => ["First y field", "Second y field"],
              "z" => ["First z field", "Second z field"]
            },
            "450" => [
              {
                "a" => ["Stuff"],
                "x" => ["First deprecated x field", "Second deprecated x field"],
                "v" => ["First deprecated v field", "Second deprecated v field"],
                "y" => ["First deprecated y field", "Second deprecated y field"],
                "z" => ["First deprecated z field", "Second deprecated z field"]
              }
            ]
          }
        ]
      )
    end
  end
  context ".for" do
    it "returns a Set from the Alma Set id" do
      stub_set_request
      expect(described_class.for(set_id).ids.first).to eq(authority_record_id)
    end
    it "errors out if it can't talk to alma" do
      stub_alma_get_request(
        url: "conf/sets/#{set_id}/members",
        query: {limit: 100, offset: 0},
        no_return: true
      ).to_timeout
      expect { described_class.for(set_id) }.to raise_error(StandardError, /#{set_id}/)
    end
  end
end
describe Jobs::TranslationMapGenerator::SubjectHeadingRemediation::Authority do
  include SubjectHeadingRemediationExpectations

  before(:each) do
    @data = JSON.parse(fixture("subjects/authority_record.json"))
  end
  subject do
    described_class.new(@data)
  end
  let(:authority_record_id) { "12345" }
  context ".for" do
    it "errors out if it can't talk to Alma" do
      stub_alma_get_request(
        url: "bibs/authorities/#{authority_record_id}",
        query: {view: "full"},
        status: 500
      )
      expect { described_class.for(authority_record_id) }.to raise_error(StandardError, /#{authority_record_id}/)
    end
  end
  context "#remediated_term" do
    it "returns the remediated term" do
      expect(subject.remediated_term).to eq(remediated_term)
    end
  end
  context "#deprecated_terms" do
    it "returns the deprecated terms from the 450 field" do
      expect(subject.deprecated_terms).to contain_exactly(*deprecated_terms)
    end
  end
  context "#to_h" do
    # `eq` (not contain_exactly) on purpose: this pins the order of the 450
    # terms as well as their content.
    it "returns the remediated term under 150 and the ordered deprecated terms under 450" do
      expect(subject.to_h).to eq({
        "150" => remediated_term,
        "450" => deprecated_terms
      })
    end
  end
end