Skip to content

Commit

Permalink
adds translation map generator for sh remediation
Browse files Browse the repository at this point in the history
"Translation map" is a bit of a misnomer, since this is more than a one-to-one
mapping. It's only the same in that it's some config that's generated
before indexing.
  • Loading branch information
niquerio committed Aug 14, 2024
1 parent d70b161 commit aae4851
Show file tree
Hide file tree
Showing 11 changed files with 721 additions and 2 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ umich_catalog_indexing/coverage/

umich_catalog_indexing/lib/translation_maps/hlb.json.gz
umich_catalog_indexing/lib/translation_maps/umich/libLocInfo.yaml
umich_catalog_indexing/lib/translation_maps/umich/subject_heading_remediation.json

umich_catalog_indexing/scratch/*
!umich_catalog_indexing/scratch/.keep
Expand Down
1 change: 1 addition & 0 deletions umich_catalog_indexing/env.example
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
ALMA_API_KEY='YOUR_ALMA_API_KEY'
NODB=1
SUPERVISOR_ON='true'
SUBJECT_HEADING_REMEDIATION_SET_ID="YOUR_SET_ID"
6 changes: 4 additions & 2 deletions umich_catalog_indexing/lib/jobs/translation_map_generator.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,13 @@ def all
[
HighLevelBrowse,
ElectronicCollections,
LibLocInfo
LibLocInfo,
SubjectHeadingRemediation
]
end

# Default directory that translation maps are generated into.
#
# The location is owned by the +translation_map_dir+ service; this method
# only delegates to it, so nothing else is computed here.
#
# @return [String] path of the translation map directory
def translation_map_directory
  S.translation_map_dir
end

def generate_all(dir: translation_map_directory)
Expand Down Expand Up @@ -54,3 +55,4 @@ def write_to_file(path)
require_relative "translation_map_generator/electronic_collections"
require_relative "translation_map_generator/lib_loc_info"
require_relative "translation_map_generator/high_level_browse"
require_relative "translation_map_generator/subject_heading_remediation"
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
module Jobs
module TranslationMapGenerator
module SubjectHeadingRemediation
class << self
  include FileWriter

  # Human-readable label of this generator.
  #
  # @return [String]
  def name
    "Subject Headings mapping"
  end

  # Location of the generated file, relative to the translation map
  # directory.
  #
  # @return [String]
  def file_path
    File.join("umich", "subject_heading_remediation.json")
  end

  # Builds the mapping from the set whose id is configured in
  # S.subject_heading_remediation_set_id.
  #
  # @return [String] pretty-printed JSON string of the mapping
  def generate
    set_id = S.subject_heading_remediation_set_id
    mapping = Set.for(set_id).to_a
    JSON.pretty_generate(mapping)
  end
end

# The members of an Alma set of authority records.
#
# NOTE: inside this namespace the constant +Set+ resolves to this class,
# not to Ruby's built-in ::Set.
class Set
  # Fetches the members of the Alma set with the given id.
  #
  # @param id [String] id of the Alma set
  # @return [Set]
  # @raise [StandardError] when the response status is not 200
  def self.for(id)
    # NOTE(review): #ids reads the members from the "member" key, while
    # get_all is given record_key "members" -- confirm which one the client
    # expects.
    resp = AlmaRestClient.client.get_all(url: "conf/sets/#{id}/members", record_key: "members")
    raise StandardError, "Couldn't retrieve authority set data for #{id}" if resp.status != 200
    new(resp.body)
  end

  # @param data [Hash] parsed response body of the set-members request
  def initialize(data)
    @data = data
  end

  # Ids of the set members.
  #
  # @return [Array] one id per member; empty when the data carries no
  #   "member" list
  def ids
    members = @data["member"] || []
    members.map { |member| member["id"] }
  end

  # Fetches every member of the set as an Authority (one request per id).
  #
  # @return [Array<Authority>]
  def authority_records
    ids.map { |id| Authority.for(id) }
  end

  # @return [Array<Hash>] the hash form of every authority record
  def to_a
    authority_records.map(&:to_h)
  end
end

# A single authority record, reduced to the heading data that goes into the
# subject heading remediation mapping.
class Authority
  # Subfield codes that are copied into the mapping; all others are ignored.
  SUBFIELDS = %w[a v x y z].freeze

  # Fetches the authority record with the given id.
  #
  # @param authority_record_id [String] id of the authority record
  # @return [Authority]
  # @raise [StandardError] when the response status is not 200
  def self.for(authority_record_id)
    resp = AlmaRestClient.client.get("bibs/authorities/#{authority_record_id}", query: {view: "full"})
    raise StandardError, "Couldn't retrieve authority data for #{authority_record_id}" if resp.status != 200
    new(resp.body)
  end

  # @param data [Hash] parsed response body; the first entry of its "anies"
  #   list is parsed as MARC-XML
  # @raise [StandardError] when there is no "anies" entry to parse
  def initialize(data)
    marc_xml = data["anies"]&.first
    # Fail with a clear message instead of handing nil to StringIO.
    raise StandardError, "Authority data has no MARC-XML in 'anies'" if marc_xml.nil?
    @record = MARC::XMLReader.new(StringIO.new(marc_xml)).first
  end

  # Subfields of the record's first 150 field, grouped by code.
  #
  # @return [Hash{String => Array<String>}]
  # @raise [StandardError] when the record has no 150 field
  def remediated_term
    heading = @record.fields("150").first
    raise StandardError, "Authority record has no 150 field" if heading.nil?
    collect_subfields(heading)
  end

  # Subfields of every 450 field, each grouped by code. Terms with more
  # distinct subfield codes come first.
  # NOTE(review): presumably so the most specific terms are tried first --
  # confirm with the consumer of the mapping.
  #
  # @return [Array<Hash{String => Array<String>}>]
  def deprecated_terms
    @record.fields("450")
      .map { |field| collect_subfields(field) }
      .sort { |a, b| b.size <=> a.size }
  end

  # @return [Hash] the remediated term under "150" and the deprecated terms
  #   under "450"
  def to_h
    {
      "150" => remediated_term,
      "450" => deprecated_terms
    }
  end

  private

  # Groups the values of a field's subfields by subfield code, keeping only
  # the codes listed in SUBFIELDS.
  #
  # @param field [#subfields] a MARC data field
  # @return [Hash{String => Array<String>}]
  def collect_subfields(field)
    field.subfields.each_with_object(Hash.new { |h, code| h[code] = [] }) do |sf, grouped|
      grouped[sf.code].push(sf.value) if SUBFIELDS.include?(sf.code)
    end
  end
end
end
end
end
4 changes: 4 additions & 0 deletions umich_catalog_indexing/lib/services.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@
# Number of records per file when MARC records from zephir are split into
# smaller files. Read from MARC_RECORD_BATCH_SIZE, falling back to 200_000.
# NOTE(review): this yields a String when the variable is set and the Integer
# default otherwise -- confirm callers handle both.
S.register(:marc_record_batch_size) do
  ENV.fetch("MARC_RECORD_BATCH_SIZE", 200_000)
end

# Value of SUBJECT_HEADING_REMEDIATION_SET_ID; nil when the variable is unset.
S.register(:subject_heading_remediation_set_id) do
  ENV["SUBJECT_HEADING_REMEDIATION_SET_ID"]
end

# Application environment from APP_ENV; "development" when the variable is
# unset.
S.register(:app_env) do
  ENV.fetch("APP_ENV", "development")
end

require_relative "services/paths"
require_relative "services/logger"
require_relative "services/dbs"
Expand Down
8 changes: 8 additions & 0 deletions umich_catalog_indexing/lib/services/paths.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,11 @@
end

# The "scratch" directory directly under the project root.
S.register(:scratch_dir) do
  File.join(S.project_root, "scratch")
end

# Directory holding the translation maps: spec/fixtures/translation_maps when
# app_env is "test", lib/translation_maps otherwise.
S.register(:translation_map_dir) do
  parent_segments = (S.app_env == "test") ? ["spec", "fixtures"] : ["lib"]
  File.join(S.project_root, *parent_segments, "translation_maps")
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{
"mms_id": 98187481368106380,
"record_format": "marc21_authority",
"title": "Undocumented immigrants",
"created_by": "System",
"created_date": "2021-11-17Z",
"last_modified_by": "rednaal",
"last_modified_date": "2023-03-29Z",
"originating_system": "LIBRARY_OF_CONGRESS",
"originating_system_id": "98184898010106381",
"cataloging_level": {
"value": "00",
"desc": "Default Level"
},
"vocabulary": {
"value": "MIUSH",
"desc": "miush"
},
"anies": [
"<?xml version=\"1.0\" encoding=\"UTF-16\"?><record><leader>01200cz a2200301n 4500</leader><controlfield tag=\"005\">20230329130030.0</controlfield><controlfield tag=\"008\">030627i| anannbabn |a ana </controlfield><controlfield tag=\"001\">98187481368106381</controlfield><datafield ind1=\" \" ind2=\" \" tag=\"010\"><subfield code=\"a\">sh 85003553</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"035\"><subfield code=\"a\">(DLC)sh 85003553</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"035\"><subfield code=\"a\">(LIBRARY_OF_CONGRESS)98171057700000041</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"040\"><subfield code=\"a\">DLC</subfield><subfield code=\"c\">DLC</subfield><subfield code=\"d\">DLC</subfield><subfield code=\"d\">WaU</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"150\"><subfield code=\"a\">Undocumented immigrants</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"450\"><subfield code=\"a\">Undocumented foreign nationals</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"450\"><subfield code=\"a\">Illegal aliens</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"450\"><subfield code=\"a\">Aliens</subfield><subfield code=\"x\">Legal status, laws, etc.</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"450\"><subfield code=\"w\">nne</subfield><subfield code=\"a\">Aliens, Illegal</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"450\"><subfield code=\"a\">Illegal aliens</subfield><subfield code=\"x\">Legal status, laws, etc.</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"450\"><subfield code=\"a\">Illegal immigrants</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"450\"><subfield code=\"a\">Undocumented noncitizens</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"550\"><subfield code=\"w\">g</subfield><subfield code=\"a\">Aliens</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"550\"><subfield 
code=\"a\">Immigrant detention centers</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"550\"><subfield code=\"a\">Human smuggling</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"550\"><subfield code=\"a\">Noncitizens</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"550\"><subfield code=\"a\">Illegal immigration</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"670\"><subfield code=\"a\">Work cat.: 2007017970: Illegal immigration, 2007:</subfield><subfield code=\"b\">eCIP data sheet (Illegal immigrants)</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"690\"><subfield code=\"a\">sla-lab updated based on DEIA Catalog Working Group changes May 2021; \"Undocumented immigrants\" term borrowed from Sears ; \"undocumented foreign national\" term from Bill H.R. 3776 (116th Congress)</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"690\"><subfield code=\"a\">sla-lab updated to include 550s for LCSH headings as references March 2023</subfield></datafield></record>"
],
"link": "https://api-na.hosted.exlibrisgroup.com/almaws/v1/bibs/authorities/98187481368106381"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{
"mms_id": 999,
"record_format": "marc21_authority",
"title": "Whatever",
"created_by": "System",
"created_date": "2021-11-17Z",
"last_modified_by": "rednaal",
"last_modified_date": "2023-03-29Z",
"originating_system": "LIBRARY_OF_CONGRESS",
"originating_system_id": "98184898010106381",
"cataloging_level": {
"value": "00",
"desc": "Default Level"
},
"vocabulary": {
"value": "MIUSH",
"desc": "miush"
},
"anies": [
"<?xml version=\"1.0\" encoding=\"UTF-16\"?><record><leader>01200cz a2200301n 4500</leader><controlfield tag=\"005\">20230329130030.0</controlfield><controlfield tag=\"008\">030627i| anannbabn |a ana </controlfield><controlfield tag=\"001\">98187481368106381</controlfield><datafield ind1=\" \" ind2=\" \" tag=\"010\"><subfield code=\"a\">sh 85003553</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"035\"><subfield code=\"a\">(DLC)sh 85003553</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"035\"><subfield code=\"a\">(LIBRARY_OF_CONGRESS)98171057700000041</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"040\"><subfield code=\"a\">DLC</subfield><subfield code=\"c\">DLC</subfield><subfield code=\"d\">DLC</subfield><subfield code=\"d\">WaU</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"150\"><subfield code=\"a\">Whatever</subfield><subfield code=\"x\">First x field</subfield><subfield code=\"x\">Second x field</subfield><subfield code=\"v\">First v field</subfield><subfield code=\"v\">Second v field</subfield><subfield code=\"y\">First y field</subfield><subfield code=\"y\">Second y field</subfield><subfield code=\"z\">First z field</subfield><subfield code=\"z\">Second z field</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"450\"><subfield code=\"a\">Stuff</subfield><subfield code=\"x\">First deprecated x field</subfield><subfield code=\"x\">Second deprecated x field</subfield><subfield code=\"v\">First deprecated v field</subfield><subfield code=\"v\">Second deprecated v field</subfield><subfield code=\"y\">First deprecated y field</subfield><subfield code=\"y\">Second deprecated y field</subfield><subfield code=\"z\">First deprecated z field</subfield><subfield code=\"z\">Second deprecated z field</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"670\"><subfield code=\"a\">Work cat.: 2007017970: Illegal immigration, 2007:</subfield><subfield code=\"b\">eCIP data sheet (Illegal 
immigrants)</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"690\"><subfield code=\"a\">sla-lab updated based on DEIA Catalog Working Group changes May 2021; \"Whatever\" term borrowed from Sears ; \"undocumented foreign national\" term from Bill H.R. 3776 (116th Congress)</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"690\"><subfield code=\"a\">sla-lab updated to include 550s for LCSH headings as references March 2023</subfield></datafield></record>"
],
"link": "https://api-na.hosted.exlibrisgroup.com/almaws/v1/bibs/authorities/98187481368106381"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"member": [
{
"id": "98187481368106381",
"description": "Undocumented immigrants"
}
],
"total_record_count": 1
}
Loading

0 comments on commit aae4851

Please sign in to comment.