Skip to content

Commit 2b30222

Browse files
authored
Merge pull request #24 from tk3369/tk/numarrayfn
Implementation of number_array_fn
2 parents 61aee0d + 8f27cad commit 2b30222

11 files changed

+571
-9
lines changed

src/SASLib.jl

+30-6
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ struct ReaderConfig
2828
include_columns::Vector
2929
exclude_columns::Vector
3030
string_array_fn::Dict{Symbol, Function}
31+
number_array_fn::Dict{Symbol, Function}
3132
verbose_level::Int64
3233
end
3334

@@ -152,6 +153,7 @@ open(filename::AbstractString;
152153
include_columns::Vector = [],
153154
exclude_columns::Vector = [],
154155
string_array_fn::Dict = Dict(),
156+
number_array_fn::Dict = Dict(),
155157
verbose_level::Int64 = 1)
156158
157159
Open a SAS7BDAT data file. Returns a `SASLib.Handler` object that can be used in
@@ -163,9 +165,10 @@ function open(filename::AbstractString;
163165
include_columns::Vector = [],
164166
exclude_columns::Vector = [],
165167
string_array_fn::Dict = Dict(),
168+
number_array_fn::Dict = Dict(),
166169
verbose_level::Int64 = 1)
167170
return _open(ReaderConfig(filename, encoding, default_chunk_size, convert_dates,
168-
include_columns, exclude_columns, string_array_fn, verbose_level))
171+
include_columns, exclude_columns, string_array_fn, number_array_fn, verbose_level))
169172
end
170173

171174
"""
@@ -205,6 +208,7 @@ readsas(filename::AbstractString;
205208
include_columns::Vector = [],
206209
exclude_columns::Vector = [],
207210
string_array_fn::Dict = Dict(),
211+
number_array_fn::Dict = Dict(),
208212
verbose_level::Int64 = 1)
209213
210214
Read a SAS7BDAT file.
@@ -239,6 +243,10 @@ For examples,
239243
or
240244
`string_array_fn = Dict(:column1 => REGULAR_STR_ARRAY)`.
241245
246+
For numeric columns, you may specify your own array constructors using
247+
the `number_array_fn` parameter. Perhaps you have a different kind of
248+
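array to store the values, e.g. `SharedArray`. Each function is called with the number of rows and must return an `AbstractVector{Float64}`; use the key `:_all_` to apply one constructor to every numeric column.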
249+
242250
For debugging purposes, `verbose_level` may be set to a value higher than 1.
243251
Verbose level 0 will output nothing to the console, essentially a total quiet
244252
option.
@@ -249,11 +257,12 @@ function readsas(filename::AbstractString;
249257
include_columns::Vector = [],
250258
exclude_columns::Vector = [],
251259
string_array_fn::Dict = Dict(),
260+
number_array_fn::Dict = Dict(),
252261
verbose_level::Int64 = 1)
253262
handler = nothing
254263
try
255264
handler = _open(ReaderConfig(filename, encoding, default_chunk_size, convert_dates,
256-
include_columns, exclude_columns, string_array_fn, verbose_level))
265+
include_columns, exclude_columns, string_array_fn, number_array_fn, verbose_level))
257266
return read(handler)
258267
finally
259268
isdefined(handler, :string_decoder) && Base.close(handler.string_decoder)
@@ -982,7 +991,7 @@ function read_chunk(handler, nrows=0)
982991
perf_read_data = toq()
983992

984993
tic()
985-
rslt = _chunk_to_dataframe(handler)
994+
rslt = _chunk_to_dataframe(handler, nrows)
986995
perf_chunk_to_data_frame = toq()
987996

988997
# here column symbols contains only ones for columns that are actually read
@@ -1015,7 +1024,8 @@ function read_chunk(handler, nrows=0)
10151024
:column_info => column_info,
10161025
:compression => compressionstring(handler),
10171026
:perf_read_data => perf_read_data,
1018-
:perf_type_conversion => perf_chunk_to_data_frame
1027+
:perf_type_conversion => perf_chunk_to_data_frame,
1028+
:process_id => myid()
10191029
)
10201030
end
10211031

@@ -1039,6 +1049,17 @@ function createstrarray(handler, column_symbol, nrows)
10391049
end
10401050
end
10411051

1052+
# create numeric array
1053+
# Pick the array that will receive the values of one numeric column.
#
# Arguments:
#   handler       - object whose `config.number_array_fn` dictionary is consulted
#   column_symbol - key of the column being read
#   nrows         - forwarded unchanged to whichever constructor is selected
#
# Lookup order:
#   1. an entry keyed by `column_symbol`
#   2. the catch-all entry keyed by `:_all_`
#   3. otherwise a zero-filled `Float64` vector of length `nrows`
#
# The value of the selected branch is the function's result (implicit return).
function createnumarray(handler, column_symbol, nrows)
1054+
if haskey(handler.config.number_array_fn, column_symbol)
1055+
handler.config.number_array_fn[column_symbol](nrows)
1056+
elseif haskey(handler.config.number_array_fn, :_all_)
1057+
handler.config.number_array_fn[:_all_](nrows)
1058+
else
1059+
zeros(Float64, nrows)
1060+
end
1061+
end
1062+
10421063
function nullresult(filename)
10431064
Dict(
10441065
:data => Dict(),
@@ -1121,7 +1142,7 @@ end
11211142
# Construct Dict object that holds the columns.
11221143
# For date or datetime columns, convert from numeric value to Date/DateTime type column.
11231144
# The resulting dictionary uses column symbols as the key.
1124-
function _chunk_to_dataframe(handler)
1145+
function _chunk_to_dataframe(handler, nrows)
11251146
# println("IN: _chunk_to_dataframe")
11261147

11271148
n = handler.current_row_in_chunk_index
@@ -1137,7 +1158,8 @@ function _chunk_to_dataframe(handler)
11371158
#if j == 1 && length(bytes) < 100 #debug only
11381159
# println(" bytes=$bytes")
11391160
#end
1140-
values = convertfloat64f(bytes, handler.file_endianness)
1161+
values = createnumarray(handler, name, nrows)
1162+
convertfloat64f!(values, bytes, handler.file_endianness)
11411163
#println(length(bytes))
11421164
#rslt[name] = bswap(rslt[name])
11431165
rslt[name] = values
@@ -1149,6 +1171,7 @@ function _chunk_to_dataframe(handler)
11491171
rslt[name] = datetime_from_float(rslt[name])
11501172
end
11511173
end
1174+
11521175
elseif ty == column_type_string
11531176
# println(" String: size=$(size(handler.string_chunk))")
11541177
# println(" String: column $j, name $name, size=$(size(handler.string_chunk[js, :]))")
@@ -1714,6 +1737,7 @@ function Base.show(io::IO, h::Handler)
17141737
println(io, " page size: $(h.page_length)")
17151738
println(io, " pages: $(h.page_count)")
17161739
println(io, " rows: $(h.row_count)")
1740+
println(io, " cols: $(h.column_count)")
17171741
end
17181742

17191743

src/utils.jl

+3-2
Original file line numberDiff line numberDiff line change
@@ -161,10 +161,11 @@ end
161161
# Version f. Best one so far!
162162
# julia> @btime convertfloat64f(r, :LittleEndian)
163163
# 35.132 μs (2 allocations: 78.20 KiB)
164-
function convertfloat64f(bytes::Vector{UInt8}, endianess::Symbol)
164+
#
165+
# Results are written directly into the provided array `r`.
166+
function convertfloat64f!(r::AbstractVector{Float64}, bytes::Vector{UInt8}, endianess::Symbol)
165167
L = length(bytes)
166168
n = div(L, 8) # numbers to convert
167-
r = zeros(Float64, n) # results
168169
j = 1 # result index
169170
@inbounds for i in 1:8:L
170171
if endianess == :LittleEndian
+78
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
# Julia/Python Performance Test Result
2+
3+
## Summary
4+
5+
Julia is ~10.7x faster than Python/Pandas
6+
7+
## Test File
8+
9+
Iterations: 50
10+
11+
Filename|Size|Rows|Columns|Numeric Columns|String Columns
12+
--------|----|----|-------|---------------|--------------
13+
homimp.sas7bdat|1.2 MB|46641|6|1|5
14+
15+
## Python
16+
```
17+
$ python -V
18+
Python 3.6.3 :: Anaconda custom (64-bit)
19+
20+
$ python perf_test1.py data_AHS2013/homimp.sas7bdat 50
21+
Minimum: 0.2720 seconds
22+
Median: 0.3014 seconds
23+
Mean: 0.3140 seconds
24+
Maximum: 0.4728 seconds
25+
26+
```
27+
28+
## Julia (ObjectPool)
29+
```
30+
Julia Version 0.6.2
31+
Commit d386e40c17 (2017-12-13 18:08 UTC)
32+
Platform Info:
33+
OS: macOS (x86_64-apple-darwin14.5.0)
34+
CPU: Intel(R) Core(TM) i5-4258U CPU @ 2.40GHz
35+
WORD_SIZE: 64
36+
BLAS: libopenblas (USE64BITINT DYNAMIC_ARCH NO_AFFINITY Haswell)
37+
LAPACK: libopenblas64_
38+
LIBM: libopenlibm
39+
LLVM: libLLVM-3.9.1 (ORCJIT, haswell)
40+
41+
BenchmarkTools.Trial:
42+
memory estimate: 20.56 MiB
43+
allocs estimate: 513299
44+
--------------
45+
minimum time: 47.109 ms (0.00% GC)
46+
median time: 56.312 ms (11.21% GC)
47+
mean time: 57.920 ms (10.72% GC)
48+
maximum time: 78.471 ms (9.23% GC)
49+
--------------
50+
samples: 50
51+
evals/sample: 1
52+
```
53+
54+
## Julia (Regular String Array)
55+
```
56+
Julia Version 0.6.2
57+
Commit d386e40c17 (2017-12-13 18:08 UTC)
58+
Platform Info:
59+
OS: macOS (x86_64-apple-darwin14.5.0)
60+
CPU: Intel(R) Core(TM) i5-4258U CPU @ 2.40GHz
61+
WORD_SIZE: 64
62+
BLAS: libopenblas (USE64BITINT DYNAMIC_ARCH NO_AFFINITY Haswell)
63+
LAPACK: libopenblas64_
64+
LIBM: libopenlibm
65+
LLVM: libLLVM-3.9.1 (ORCJIT, haswell)
66+
67+
BenchmarkTools.Trial:
68+
memory estimate: 19.37 MiB
69+
allocs estimate: 512178
70+
--------------
71+
minimum time: 25.528 ms (0.00% GC)
72+
median time: 39.970 ms (33.88% GC)
73+
mean time: 41.932 ms (35.10% GC)
74+
maximum time: 113.933 ms (76.81% GC)
75+
--------------
76+
samples: 50
77+
evals/sample: 1
78+
```
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
# Julia/Python Performance Test Result
2+
3+
## Summary
4+
5+
Julia is ~11.8x faster than Python/Pandas
6+
7+
## Test File
8+
9+
Iterations: 100
10+
11+
Filename|Size|Rows|Columns|Numeric Columns|String Columns
12+
--------|----|----|-------|---------------|--------------
13+
numeric_1000000_2.sas7bdat|16.3 MB|1000000|2|2|0
14+
15+
## Python
16+
```
17+
$ python -V
18+
Python 3.6.3 :: Anaconda custom (64-bit)
19+
20+
$ python perf_test1.py data_misc/numeric_1000000_2.sas7bdat 100
21+
Minimum: 1.7937 seconds
22+
Median: 1.8426 seconds
23+
Mean: 1.8485 seconds
24+
Maximum: 2.0821 seconds
25+
26+
```
27+
28+
## Julia
29+
```
30+
Julia Version 0.6.2
31+
Commit d386e40c17 (2017-12-13 18:08 UTC)
32+
Platform Info:
33+
OS: macOS (x86_64-apple-darwin14.5.0)
34+
CPU: Intel(R) Core(TM) i5-4258U CPU @ 2.40GHz
35+
WORD_SIZE: 64
36+
BLAS: libopenblas (USE64BITINT DYNAMIC_ARCH NO_AFFINITY Haswell)
37+
LAPACK: libopenblas64_
38+
LIBM: libopenlibm
39+
LLVM: libLLVM-3.9.1 (ORCJIT, haswell)
40+
41+
BenchmarkTools.Trial:
42+
memory estimate: 153.16 MiB
43+
allocs estimate: 1002737
44+
--------------
45+
minimum time: 152.629 ms (3.05% GC)
46+
median time: 231.873 ms (35.79% GC)
47+
mean time: 203.540 ms (23.47% GC)
48+
maximum time: 257.027 ms (38.52% GC)
49+
--------------
50+
samples: 25
51+
evals/sample: 1
52+
```
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
# Julia/Python Performance Test Result
2+
3+
## Summary
4+
5+
Julia is ~21.1x faster than Python/Pandas
6+
7+
## Test File
8+
9+
Iterations: 100
10+
11+
Filename|Size|Rows|Columns|Numeric Columns|String Columns
12+
--------|----|----|-------|---------------|--------------
13+
productsales.sas7bdat|148.5 kB|1440|10|5|5
14+
15+
## Python
16+
```
17+
$ python -V
18+
Python 3.6.3 :: Anaconda custom (64-bit)
19+
20+
$ python perf_test1.py data_pandas/productsales.sas7bdat 100
21+
Minimum: 0.0292 seconds
22+
Median: 0.0316 seconds
23+
Mean: 0.0325 seconds
24+
Maximum: 0.0572 seconds
25+
26+
```
27+
28+
## Julia (ObjectPool)
29+
```
30+
Julia Version 0.6.2
31+
Commit d386e40c17 (2017-12-13 18:08 UTC)
32+
Platform Info:
33+
OS: macOS (x86_64-apple-darwin14.5.0)
34+
CPU: Intel(R) Core(TM) i5-4258U CPU @ 2.40GHz
35+
WORD_SIZE: 64
36+
BLAS: libopenblas (USE64BITINT DYNAMIC_ARCH NO_AFFINITY Haswell)
37+
LAPACK: libopenblas64_
38+
LIBM: libopenlibm
39+
LLVM: libLLVM-3.9.1 (ORCJIT, haswell)
40+
41+
BenchmarkTools.Trial:
42+
memory estimate: 1.07 MiB
43+
allocs estimate: 18583
44+
--------------
45+
minimum time: 2.084 ms (0.00% GC)
46+
median time: 2.188 ms (0.00% GC)
47+
mean time: 2.408 ms (3.88% GC)
48+
maximum time: 5.143 ms (47.78% GC)
49+
--------------
50+
samples: 100
51+
evals/sample: 1
52+
```
53+
54+
## Julia (Regular String Array)
55+
```
56+
Julia Version 0.6.2
57+
Commit d386e40c17 (2017-12-13 18:08 UTC)
58+
Platform Info:
59+
OS: macOS (x86_64-apple-darwin14.5.0)
60+
CPU: Intel(R) Core(TM) i5-4258U CPU @ 2.40GHz
61+
WORD_SIZE: 64
62+
BLAS: libopenblas (USE64BITINT DYNAMIC_ARCH NO_AFFINITY Haswell)
63+
LAPACK: libopenblas64_
64+
LIBM: libopenlibm
65+
LLVM: libLLVM-3.9.1 (ORCJIT, haswell)
66+
67+
BenchmarkTools.Trial:
68+
memory estimate: 1.05 MiB
69+
allocs estimate: 18510
70+
--------------
71+
minimum time: 1.382 ms (0.00% GC)
72+
median time: 1.430 ms (0.00% GC)
73+
mean time: 1.608 ms (7.05% GC)
74+
maximum time: 5.258 ms (65.43% GC)
75+
--------------
76+
samples: 100
77+
evals/sample: 1
78+
```

0 commit comments

Comments
 (0)