-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathht_item.rb
292 lines (233 loc) · 7.15 KB
/
ht_item.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
require 'traject'
require 'match_map'
require 'ht_traject/ht_constants'
require 'ht_traject/ht_macros'
require "services"
require 'json'
module HathiTrust
module Traject
# An ItemSet is just, basically, a set of items that knows something
# about its constituents as a whole
class ItemSet
# Enumerable supplies map, flat_map, any?, etc. on top of the #each defined
# further down in this class.
include Enumerable
# Set up class-level translation maps so we don't have to mess with getting
# them over and over again
class << self
# Class-level accessors. Only ht_avail_us and ht_avail_intl are assigned in
# the visible code. NOTE(review): ht_ns is never set here -- confirm it is
# still needed.
attr_accessor :ht_ns, :ht_avail_us, :ht_avail_intl
end
# Both maps are built once, while the class body is being evaluated.
# Item#us_availability and Item#intl_availability look rights up in them.
self.ht_avail_us = ::Traject::TranslationMap.new('ht/availability_map_ht')
self.ht_avail_intl = ::Traject::TranslationMap.new('ht/availability_map_ht_intl')
# Read-only access to the backing array of items.
attr_reader :items
# Start out with no items and an empty print-holdings lookup; the lookup is
# replaced by #fill_print_holdings!.
def initialize
  @ph = {}
  @items = []
end
# Append an item to the set.
#
# Note that the memoized aggregates (ht_ids, rights_list, ...) are not reset
# here, so items added after they were first read will not show up in them.
#
# @param item [Object] the item to append
# @return [Array] the backing items array
def add(item)
  @items.push(item)
end
# Make it easy to get the size
# @return [Integer] number of items currently held
def size
  @items.length
end
# Basic iterator
# Yields every item in insertion order; without a block, hands back an
# Enumerator over the same sequence.
#
# @yieldparam member [Object] one item of the set
# @return [Array, Enumerator] the items array when a block is given
def each
  if block_given?
    @items.each { |member| yield member }
  else
    enum_for(:each)
  end
end
# Some aggregate data
# Lower-cased htid of every item, in item order. Computed on the first call
# and cached in @ids afterwards.
#
# @return [Array<String>]
def ht_ids
  return @ids if @ids

  @ids = map { |member| member.htid.downcase }
end
# Distinct rights codes across all items, computed once and cached.
#
# If the only code present is 'nobody', the list becomes ['tombstone'].
#
# Item.new_from_974 appends Item#newly_open to each item's rights, and that
# value is nil for every item that is not newly open. Those nils are dropped
# here; otherwise a record whose items are all 'nobody' yields
# ['nobody', nil] and the tombstone substitution can never apply.
#
# @return [Array<String>]
def rights_list
  return @rights_list if @rights_list

  codes = flat_map(&:rights).compact.uniq
  @rights_list = codes == ['nobody'] ? ['tombstone'] : codes
end
# Distinct last_update_date values across the items, cached after the
# first call.
#
# @return [Array]
def last_update_dates
  return @last_update_dates if @last_update_dates

  dates = map { |member| member.last_update_date }
  @last_update_dates = dates.uniq
end
# Distinct collection_code values across the items, cached after the first
# call.
#
# @return [Array]
def collection_codes
  return @collection_codes if @collection_codes

  codes = map { |member| member.collection_code }
  @collection_codes = codes.uniq
end
# Distinct #collection values across the items, cached after the first call.
#
# NOTE(review): Item as defined in the visible source exposes
# collection_code but no #collection reader -- confirm that one is added
# elsewhere before relying on this method.
#
# @return [Array]
def collections
  return @collections if @collections

  names = map { |member| member.collection }
  @collections = names.uniq
end
# Distinct per-item us_availability values, cached in @us after the first
# call.
#
# @return [Array]
def us_availability
  return @us if @us

  statuses = map { |member| member.us_availability }
  @us = statuses.uniq
end
# Distinct per-item intl_availability values, cached in @intl after the
# first call.
#
# @return [Array]
def intl_availability
  return @intl if @intl

  statuses = map { |member| member.intl_availability }
  @intl = statuses.uniq
end
# Ask the :print_holdings service for the holdings of every htid in the set,
# remember the returned hash in @ph, and copy each item's entry onto the item.
#
# An item whose htid is missing from the returned hash gets whatever the
# hash yields for a missing key.
# NOTE(review): presumably the hash maps htid => array of holding
# institutions; confirm against the service implementation.
#
# @return [Array] the items array (the result of #each)
def fill_print_holdings!
  @ph = HathiTrust::Services[:print_holdings].get_print_holdings_hash(ht_ids.flatten)
  each { |member| member.print_holdings = @ph[member.htid] }
end
# Every distinct value found in the print-holdings hash, flattened into a
# single list. Empty until #fill_print_holdings! has stored a hash in @ph.
#
# @return [Array]
def print_holdings
  all_holdings = @ph.values.flatten
  all_holdings.uniq
end
# Turn this item into the sort of json object
# we want to store in solr
# One hash is produced per item. 'enumcron', 'enum_pubdate' and
# 'enum_pubdate_range' are only present when the item has the corresponding
# value, and 'dig_source' is only emitted for platform :ht. As soon as any
# item carries an enumcron the whole list is passed through sortHathiJSON.
#
# @param platform [Symbol] target platform; only :ht is treated specially
# @return [String] JSON array with one object per item
def to_json(platform)
  include_dig_source = (platform == :ht)
  sort_needed = false
  records = []

  each do |item|
    record = {
      'htid' => item.htid,
      'newly_open' => item.newly_open,
      'ingest' => item.last_update_date,
      'rights' => item.rights,
      'heldby' => item.print_holdings,
      'collection_code' => item.collection_code
    }

    enumcron = item.enum_chron
    if enumcron
      record['enumcron'] = enumcron
      sort_needed = true
    end

    pubdate = item.enum_pubdate
    if pubdate
      record['enum_pubdate'] = pubdate
      record['enum_pubdate_range'] = HathiTrust::Traject::Macros::HTMacros.compute_date_range(pubdate.to_i)
    end

    record['dig_source'] = item.dig_source if include_dig_source && item.dig_source

    records << record
  end

  records = sortHathiJSON(records) if sort_needed
  records.to_json
end
# Comparator used by sortHathiJSON: orders two record hashes by the
# temporary :sortstring entry that sortHathiJSON attaches to them.
#
# An earlier variant compared the first four-digit run of each enumcron
# before falling back to the sortstring. That comparison had been disabled,
# so the regex matches it left behind were dead code and have been removed.
#
# @param a [Hash] record carrying a :sortstring entry
# @param b [Hash] record carrying a :sortstring entry
# @return [Integer, nil] result of <=> on the two sortstrings
def enumcronSort(a, b)
  a[:sortstring] <=> b[:sortstring]
end
# Create a sortable string based on the digit strings present in an
# enumcron string
# The key starts with '0'; every run of digits then contributes its length
# followed by the digits themselves, so longer (larger) numbers sort after
# shorter ones under plain string comparison. Example: 'v.12 1999' gives
# '0' + '212' + '41999'.
#
# @param str [String] enumcron text
# @return [String] the sort key ('0' when str contains no digits)
def enumcronSortString(str)
  str.scan(/\d+/).inject('0') { |key, digits| key + digits.size.to_s + digits }
end
# Sort the record hashes in place by their enumcron-derived sort key.
#
# Each record temporarily receives a :sortstring entry (the enumcron sort
# string, or '0' when the record has no 'enumcron' key). The array is then
# sorted with enumcronSort and the temporary entries are removed again.
#
# @param arr [Array<Hash>] records as built by to_json
# @return [Array<Hash>] the same array object, now ordered
def sortHathiJSON(arr)
  # Nothing to order with zero or one record
  return arr unless arr.size > 1

  # Attach the temporary sort key
  arr.each do |rec|
    rec[:sortstring] = rec.key?('enumcron') ? enumcronSortString(rec['enumcron']) : '0'
  end

  # Order in place, then strip the temporary key again
  arr.sort! { |left, right| enumcronSort(left, right) }
  arr.each { |rec| rec.delete(:sortstring) }
  arr
end
# The whole set (record) is considered Full Text iff there is at
# least one item whose status is fulltext
# Stops at the first item whose us_availability equals
# HathiTrust::Constants::FT.
#
# @return [Boolean] false for an empty set
def us_fulltext?
  each { |member| return true if member.us_availability == HathiTrust::Constants::FT }
  false
end
# International counterpart of us_fulltext?: true as soon as one item's
# intl_availability equals HathiTrust::Constants::FT.
#
# @return [Boolean] false for an empty set
def intl_fulltext?
  each { |member| return true if member.intl_availability == HathiTrust::Constants::FT }
  false
end
end
# end of ItemSet
# An individual item
class Item
# Fallback last_update_date, used by new_from_974 when the field has no 'd'
# value.
DEFAULT_DATE = '00000000'.freeze
# Plain read/write attributes.
attr_accessor :rights, :enum_chron, :last_update_date, :print_holdings,
:collection_code, :dig_source
# Reader-only declarations: htid and enum_pubdate have hand-written writers
# in this class, and enum_pubdate_range is assigned by enum_pubdate=.
# NOTE(review): nothing in the visible code assigns @set -- confirm it is used.
attr_reader :htid, :set, :enum_pubdate, :enum_pubdate_range
# Sort keys; not assigned anywhere in the visible code, presumably set by
# callers.
attr_accessor :title_sortkey, :author_sortkey
# A fresh item has no rights and no print holdings; both start as empty
# arrays so callers can append to them directly.
def initialize
  @rights = []
  @print_holdings = []
end
# Build an Item from a 974-style field.
#
# Values read from the field: 'r' (rights), 'u' (htid), 'd' (last update
# date, falling back to DEFAULT_DATE), 'z' (enumchron), 'y' (enum pubdate),
# 'c' (collection code, lower-cased; falls back to the htid namespace) and
# 's' (digitization source, lower-cased).
#
# @param f [#[]] field-like object returning the value (or nil) for a code
# @return [Item] the populated item
def self.new_from_974(f)
  new.tap do |item|
    item.rights << f['r']
    # The htid has to be in place before the namespace fallback and
    # newly_open are consulted further down.
    item.htid = f['u']
    item.last_update_date = f['d'] || DEFAULT_DATE
    item.enum_chron = f['z']
    item.enum_pubdate = f['y']

    collection = f['c']
    item.collection_code = collection ? collection.downcase : item.namespace

    source = f['s']
    item.dig_source = source ? source.downcase : nil

    # newly_open is nil unless the htid is listed as newly open, so rights
    # normally ends up with a trailing nil.
    item.rights << item.newly_open
  end
end
# Store the lower-cased htid and refresh the cached namespace from it.
# A nil (or false) value is ignored, leaving any previous htid untouched.
#
# @param s [String, nil] the HathiTrust id
def htid=(s)
  if s
    @htid = s.downcase
    @namespace = namespace_for(@htid)
  end
end
# Sortable key derived from the digit runs in enum_chron.
#
# Each run of digits contributes its length followed by the digits, and the
# pieces are concatenated, so 'v.12 1999' yields '21241999'. When enum_chron
# is nil or holds no digits the key is '0000'.
#
# The key is always a String. Previously the digit-bearing case returned the
# Array of pieces while the other cases returned '0000', and those mixed
# types cannot be compared with each other when sorting.
#
# @return [String]
def enumchron_sortstring
  return '0000' if enum_chron.nil?

  key = enum_chron.scan(/\d+/).map { |digits| "#{digits.size}#{digits}" }.join
  key.empty? ? '0000' : key
end
# Set the enum pubdate and the date range derived from it.
#
# A value containing at least one digit is converted with to_i and stored
# zero-padded to four places; the range is then obtained from
# HTMacros.compute_date_range. Anything else (nil, or text without digits)
# clears both attributes.
#
# @param e [String, nil] raw publication date
def enum_pubdate=(e)
  has_digit = e && (e =~ /\d/)

  if has_digit
    @enum_pubdate = format('%04d', e.to_i)
    @enum_pubdate_range = HathiTrust::Traject::Macros::HTMacros.compute_date_range(@enum_pubdate)
  else
    @enum_pubdate = nil
    @enum_pubdate_range = nil
  end
end
# Look this item's rights up in the class-level US availability map and
# take the first value it yields.
def us_availability
  matches = ItemSet.ht_avail_us[rights]
  matches.first
end
# Look this item's rights up in the class-level international availability
# map and take the first value it yields.
def intl_availability
  matches = ItemSet.ht_avail_intl[rights]
  matches.first
end
# @return [String, nil] 'newly_open' when this item's htid is included in
#   HathiTrust::Constants::NewlyOpen, otherwise nil
def newly_open
  HathiTrust::Constants::NewlyOpen.include?(htid) ? 'newly_open' : nil
end
# Namespace of the current htid, computed with namespace_for on first use
# and cached in @namespace (htid= refreshes the cache as well).
#
# @return [String, Symbol] the namespace, or :malformed_htid
def namespace
  return @namespace if @namespace

  @namespace = namespace_for(@htid)
end
# Extract the namespace of an htid: everything before the first '.'.
#
# @param htid [String, nil] a HathiTrust id
# @return [String, Symbol] the namespace, or :malformed_htid when the id is
#   nil or contains no '.'
def namespace_for(htid)
  prefix = /^(.*?)\./.match(htid)
  prefix ? prefix[1] : :malformed_htid
end
# @return [Boolean] true when no namespace could be extracted from the htid,
#   i.e. namespace is the :malformed_htid marker
def malformed?
  current = namespace
  current == :malformed_htid
end
# Pipe-separated summary of the item: htid, last update date, enumchron,
# enum pubdate, enum pubdate range, title sort key and author sort key, in
# that order. Unset (nil) attributes appear as empty fields.
#
# @return [String]
def display_string
  fields = [
    htid,
    last_update_date,
    enum_chron,
    enum_pubdate,
    enum_pubdate_range,
    title_sortkey,
    author_sortkey
  ]
  fields.join('|')
end
end # end of Item
end # end of Modules
end