Skip to content

Commit 57bfdcc

Browse files
authored
Merge pull request #42 from tk3369/tk/user-specified-column-type
Ability to convert column to specific types given by the user
2 parents 4c4f11d + 563b321 commit 57bfdcc

File tree

5 files changed

+200
-6
lines changed

5 files changed

+200
-6
lines changed

README.md

+19
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,25 @@ julia> A[1:5,:]
317317
656.0 646.0
318318
```
319319

320+
### Column Type Conversion
321+
322+
Often, you want a column to be an integer, but SAS7BDAT files store all numeric values as Float64. Specifying the `column_types` argument does the conversion for you.
323+
324+
```
325+
julia> rs = readsas("productsales.sas7bdat", column_types=Dict(:ACTUAL=>Int))
326+
Read productsales.sas7bdat with size 1440 x 10 in 0.08043 seconds
327+
SASLib.ResultSet (1440 rows x 10 columns)
328+
Columns 1:ACTUAL, 2:PREDICT, 3:COUNTRY, 4:REGION, 5:DIVISION, 6:PRODTYPE, 7:PRODUCT, 8:QUARTER, 9:YEAR, 10:MONTH
329+
1: 925, 850.0, CANADA, EAST, EDUCATION, FURNITURE, SOFA, 1.0, 1993.0, 1993-01-01
330+
2: 999, 297.0, CANADA, EAST, EDUCATION, FURNITURE, SOFA, 1.0, 1993.0, 1993-02-01
331+
3: 608, 846.0, CANADA, EAST, EDUCATION, FURNITURE, SOFA, 1.0, 1993.0, 1993-03-01
332+
4: 642, 533.0, CANADA, EAST, EDUCATION, FURNITURE, SOFA, 2.0, 1993.0, 1993-04-01
333+
5: 656, 646.0, CANADA, EAST, EDUCATION, FURNITURE, SOFA, 2.0, 1993.0, 1993-05-01
334+
335+
julia> typeof(rs[:ACTUAL])
336+
Array{Int64,1}
337+
```
338+
320339
### File Metadata
321340

322341
You may obtain meta data for a SAS data file using the `metadata` function.

src/CIDict.jl

+53
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# Case insensitive Dict - a simple wrapper over Dict
2+
3+
# Wrapper type holding a plain `Dict{K, T}` in the field `dct`.
# `K` must be a `Symbol` or `AbstractString` subtype; both constructors
# enforce this and throw an `ArgumentError` otherwise.
struct CIDict{K, T}

    dct::Dict{K, T}

    # type checking: evaluates to `true` for an accepted key type, throws otherwise
    check(K) = (K <: AbstractString || K <: Symbol) ||
        throw(ArgumentError("Key must be Symbol or String type"))

    # constructors

    # Empty dictionary.
    function CIDict{K, T}() where {K, T}
        check(K)
        return new(Dict{K, T}())
    end

    # Copy of `d` in which every key has been passed through `lcase`.
    function CIDict{K, T}(d::Dict{K, T}) where {K, T}
        check(K)
        lowered = Dict{K, T}()
        for (key, value) in d
            lowered[lcase(key)] = value
        end
        return new(lowered)
    end
end
22+
23+
# Lower-case a key. A Symbol comes back as a Symbol, a string as a string.
function lcase(s::Symbol)
    return Symbol(lowercase(string(s)))
end

function lcase(s::AbstractString)
    return lowercase(s)
end
25+
26+
# Lookup, assignment and membership all normalise the key with `lcase` before
# touching the wrapped Dict, so keys differing only in case address the same
# entry. The string methods accept any `AbstractString` (e.g. `SubString`),
# matching the key types allowed by the constructor check and by `lcase`.
Base.getindex(d::CIDict, s::Symbol) = d.dct[lcase(s)]
Base.getindex(d::CIDict, s::AbstractString) = d.dct[lcase(s)]

Base.setindex!(d::CIDict, v, s::Symbol) = d.dct[lcase(s)] = v
Base.setindex!(d::CIDict, v, s::AbstractString) = d.dct[lcase(s)] = v

Base.haskey(d::CIDict, s::Symbol) = haskey(d.dct, lcase(s))
Base.haskey(d::CIDict, s::AbstractString) = haskey(d.dct, lcase(s))
34+
35+
# The remaining collection interface is forwarded unchanged to the wrapped Dict.

function Base.keys(d::CIDict)
    return keys(d.dct)
end

function Base.values(d::CIDict)
    return values(d.dct)
end

# Iteration protocol (start/next/done), delegated to the wrapped Dict.
function Base.start(d::CIDict)
    return start(d.dct)
end

function Base.next(d::CIDict, i::Int)
    return next(d.dct, i)
end

function Base.done(d::CIDict, i::Int)
    return done(d.dct, i)
end

function Base.length(d::CIDict)
    return length(d.dct)
end
43+
44+
# True when `x` is a Symbol.
issym(x) = isa(x, Symbol)
45+
46+
# Print as `CIDict(k1 => v1, k2 => v2)`; Symbol keys get a leading colon.
function Base.show(io::IO, d::SASLib.CIDict)
    print(io, "CIDict(")
    needs_separator = false
    for (key, value) in d.dct
        if needs_separator
            print(io, ", ")
        end
        prefix = issym(key) ? ":" : ""
        print(io, prefix, key, " => ", value)
        needs_separator = true
    end
    print(io, ")")
end

src/SASLib.jl

+43-3
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import Base: show, size
1212
include("constants.jl")
1313
include("utils.jl")
1414
include("ObjectPool.jl")
15+
include("CIDict.jl")
1516
include("Types.jl")
1617
include("ResultSet.jl")
1718
include("Metadata.jl")
@@ -32,6 +33,7 @@ function _open(config::ReaderConfig)
3233
handler.current_page = 0
3334
_get_properties(handler)
3435
_parse_metadata(handler)
36+
_post_metadata_handler(handler)
3537
return handler
3638
end
3739

@@ -43,6 +45,7 @@ open(filename::AbstractString;
4345
exclude_columns::Vector = [],
4446
string_array_fn::Dict = Dict(),
4547
number_array_fn::Dict = Dict(),
48+
column_types::Dict = Dict{Symbol,Type}(),
4649
verbose_level::Int64 = 1)
4750
4851
Open a SAS7BDAT data file. Returns a `SASLib.Handler` object that can be used in
@@ -55,9 +58,11 @@ function open(filename::AbstractString;
5558
exclude_columns::Vector = [],
5659
string_array_fn::Dict = Dict(),
5760
number_array_fn::Dict = Dict(),
61+
column_types::Dict = Dict{Symbol,Type}(),
5862
verbose_level::Int64 = 1)
5963
return _open(ReaderConfig(filename, encoding, default_chunk_size, convert_dates,
60-
include_columns, exclude_columns, string_array_fn, number_array_fn, verbose_level))
64+
include_columns, exclude_columns, string_array_fn, number_array_fn,
65+
column_types, verbose_level))
6166
end
6267

6368
"""
@@ -97,6 +102,7 @@ readsas(filename::AbstractString;
97102
exclude_columns::Vector = [],
98103
string_array_fn::Dict = Dict(),
99104
number_array_fn::Dict = Dict(),
105+
column_types::Dict = Dict{Symbol,Type}(),
100106
verbose_level::Int64 = 1)
101107
102108
Read a SAS7BDAT file.
@@ -135,6 +141,9 @@ For numeric columns, you may specify your own array constructors using
135141
the `number_array_fn` parameter. Perhaps you have a different kind of
136142
array to store the values e.g. SharedArray.
137143
144+
Specify the `column_types` argument if any conversion is required. It should
145+
be a Dict, mapping column symbol to a data type.
146+
138147
For debugging purpose, `verbose_level` may be set to a value higher than 1.
139148
Verbose level 0 will output nothing to the console, essentially a total quiet
140149
option.
@@ -146,11 +155,13 @@ function readsas(filename::AbstractString;
146155
exclude_columns::Vector = [],
147156
string_array_fn::Dict = Dict(),
148157
number_array_fn::Dict = Dict(),
158+
column_types::Dict = Dict{Symbol,Type}(),
149159
verbose_level::Int64 = 1)
150160
handler = nothing
151161
try
152162
handler = _open(ReaderConfig(filename, encoding, default_chunk_size, convert_dates,
153-
include_columns, exclude_columns, string_array_fn, number_array_fn, verbose_level))
163+
include_columns, exclude_columns, string_array_fn, number_array_fn,
164+
column_types, verbose_level))
154165
return read(handler)
155166
finally
156167
isdefined(handler, :string_decoder) && Base.close(handler.string_decoder)
@@ -390,6 +401,20 @@ function _parse_metadata(handler)
390401
end
391402
end
392403

404+
# Runs once metadata has been parsed and before any data is read.
# Stores the user's column type requests on the handler in a case insensitive
# dict and warns about requested columns that are not in the file.
function _post_metadata_handler(handler)

    requested = handler.config.column_types

    # save a copy of column types in a case insensitive dict
    handler.column_types_dict = CIDict{Symbol,Type}(requested)

    # check column_types: unknown symbols only produce a warning
    for k in keys(requested)
        case_insensitive_in(k, handler.column_symbols) && continue
        Compat.@warn("Unknown column symbol ($k) in column_types. Ignored.")
    end
end
417+
393418
function _process_page_meta(handler)
394419
# println3(handler, "IN: _process_page_meta")
395420
_read_page_header(handler)
@@ -1006,7 +1031,7 @@ function _chunk_to_dataframe(handler, nrows)
10061031
rslt[name] = datetime_from_float(rslt[name])
10071032
end
10081033
end
1009-
1034+
convert_column_type_if_needed!(handler, rslt, name)
10101035
elseif ty == column_type_string
10111036
# println(" String: size=$(size(handler.string_chunk))")
10121037
# println(" String: column $j, name $name, size=$(size(handler.string_chunk[js, :]))")
@@ -1018,6 +1043,21 @@ function _chunk_to_dataframe(handler, nrows)
10181043
return rslt
10191044
end
10201045

1046+
# Convert column `name` of `rslt` in place when the user asked for a type.
# Nothing happens when no type was requested or when Float64 was requested.
# A failed conversion is reported as a warning and the column is left as it was.
function convert_column_type_if_needed!(handler, rslt, name)
    haskey(handler.column_types_dict, name) || return
    type_wanted = handler.column_types_dict[name]
    type_wanted == Float64 && return
    try
        rslt[name] = convert(Vector{type_wanted}, rslt[name])
    catch ex
        Compat.@warn("Unable to convert column to type $type_wanted, error=$ex")
    end
end
1060+
10211061
# Simple loop that reads data row-by-row.
10221062
function read_data(handler, nrows)
10231063
# println("IN: read_data, nrows=$nrows")

src/Types.jl

+3
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ struct ReaderConfig
1515
exclude_columns::Vector
1616
string_array_fn::Dict{Symbol, Function}
1717
number_array_fn::Dict{Symbol, Function}
18+
column_types::Dict{Symbol, Type}
1819
verbose_level::Int64
1920
end
2021

@@ -108,6 +109,8 @@ mutable struct Handler
108109
string_decoder_buffer::IOBuffer
109110
string_decoder::StringDecoder
110111

112+
column_types_dict::CIDict{Symbol,Type}
113+
111114
Handler(config::ReaderConfig) = new(
112115
Base.open(config.filename),
113116
config)

test/runtests.jl

+82-3
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@ readfile(dir, file; kwargs...) = readsas(getpath(dir, file); kwargs...)
1010
# Open a test data file; keyword arguments are forwarded as keywords (note the `;`).
openfile(dir, file; kwargs...) = SASLib.open(getpath(dir, file); kwargs...)
1111
# Fetch metadata for a test data file; keyword arguments are forwarded as keywords.
getmetadata(dir, file; kwargs...) = metadata(getpath(dir, file); kwargs...)
1212

13+
# Struct used for column type conversion test case below
14+
struct YearStr
    year::String
end
15+
# Round the float to an integer and keep its decimal text, e.g. 1993.0 -> "1993".
function Base.convert(::Type{YearStr}, v::Float64)
    whole_year = round(Int, v)
    return YearStr(string(whole_year))
end
16+
1317
@testset "SASLib" begin
1418

1519
@testset "object pool" begin
@@ -48,6 +52,47 @@ getmetadata(dir, file; kwargs...) = metadata(getpath(dir, file), kwargs...)
4852
@test_throws BoundsError z[1:300] = 1:300
4953
end
5054

55+
@testset "case insensitive dict" begin
    # Exercise CIDict for one key type. Arguments: a lower-case key, the same
    # key written in a different case, and a second, distinct lower-case key.
    function testdict(lowercase_key, mixedcase_key, second_lowercase_key)

        T = typeof(lowercase_key)
        d = SASLib.CIDict{T,Int}()

        # getindex/setindex!
        d[lowercase_key] = 99
        @test d[lowercase_key] == 99
        @test d[mixedcase_key] == 99
        d[mixedcase_key] = 88    # should replace original value
        @test length(d) == 1     # still 1 element
        @test d[lowercase_key] == 88
        @test d[mixedcase_key] == 88

        # haskey
        @test haskey(d, lowercase_key) == true
        @test haskey(d, mixedcase_key) == true

        # iteration
        # Dict iteration order is unspecified, so collect the pairs into a
        # Dict and compare contents rather than relying on a particular order.
        d[second_lowercase_key] = 77
        expected = Dict{T,Int}(lowercase_key => 88, second_lowercase_key => 77)
        seen = Dict{T,Int}()
        visited = 0
        for (k, v) in d
            seen[k] = v
            visited += 1
        end
        @test visited == 2
        @test seen == expected

        # keys/values (order independent for the same reason)
        @test Set(keys(d)) == Set(keys(expected))
        @test sort(collect(values(d))) == [77, 88]

        # show
        @test show(d) === nothing
    end
    testdict(:abc, :ABC, :def)
    testdict("abc", "ABC", "def")
end
95+
5196
@testset "open and close" begin
5297
handler = openfile("data_pandas", "test1.sas7bdat")
5398
@test typeof(handler) == SASLib.Handler
@@ -170,7 +215,7 @@ getmetadata(dir, file; kwargs...) = metadata(getpath(dir, file), kwargs...)
170215
@test rs[1,:ACTUAL] 200.0
171216

172217
# display related
173-
@test typeof(show(rs)) == Void
218+
@test show(rs) == nothing
174219
@test SASLib.sizestr(rs) == "1440 rows x 10 columns"
175220
end
176221

@@ -188,7 +233,7 @@ getmetadata(dir, file; kwargs...) = metadata(getpath(dir, file), kwargs...)
188233
@test md.columnsinfo[1] == Pair(:Column1, Float64)
189234

190235
md = getmetadata("data_pandas", "productsales.sas7bdat")
191-
@test typeof(show(md)) == Void
236+
@test show(md) == nothing
192237
println()
193238

194239
# Deal with v0.6/v0.7 difference
@@ -226,7 +271,7 @@ getmetadata(dir, file; kwargs...) = metadata(getpath(dir, file), kwargs...)
226271
handler = openfile("data_AHS2013", "topical.sas7bdat")
227272
rs = SASLib.read(handler, 1000)
228273
@test size(rs) == (1000, 114)
229-
@test typeof(show(handler)) == Void
274+
@test show(handler) == nothing
230275
SASLib.close(handler)
231276
# @test result[:page_count] == 10
232277
# @test result[:page_length] == 16384
@@ -301,6 +346,40 @@ getmetadata(dir, file; kwargs...) = metadata(getpath(dir, file), kwargs...)
301346

302347
end
303348

349+
# column type conversion
@testset "user specified column types" begin

    # Quietly read the product sales data set with the given type requests.
    sales(types) = readfile("data_pandas", "productsales.sas7bdat";
        verbose_level = 0, column_types = types)

    # normal use case
    rs = sales(Dict(:YEAR => Int16, :QUARTER => Int8))
    @test eltype(rs[:YEAR]) == Int16
    @test eltype(rs[:QUARTER]) == Int8

    # error handling - warn() when a column cannot be converted
    rs = sales(Dict(:YEAR => Int8, :QUARTER => Int8))
    @test eltype(rs[:YEAR]) == Float64
    @test eltype(rs[:QUARTER]) == Int8
    #TODO expect warning for :YEAR conversion

    # case insensitive column symbol
    rs = sales(Dict(:Quarter => Int8))
    @test eltype(rs[:QUARTER]) == Int8

    # conversion to custom types
    rs = sales(Dict(:Year => YearStr))
    @test eltype(rs[:YEAR]) == YearStr

    # test Union type
    let T = Union{Int,Missing}
        rs = sales(Dict(:Year => T))
        @test eltype(rs[:YEAR]) == T
    end
end
382+
304383
# see output; keep this for coverage reason
305384
@testset "verbosity" begin
306385
rs = readfile("data_pandas", "test1.sas7bdat"; verbose_level = 2)

0 commit comments

Comments
 (0)