
Commit 9208234

Merge pull request #16 from tk3369/develop
Bug Fixes/Performance Improvements Bundle

2 parents a865410 + 68cbba4


51 files changed (+1358 -553)

.gitignore (+2 -1)

@@ -2,4 +2,5 @@
 *.jl.*.cov
 *.jl.mem
 **/.ipynb_checkpoints/*
-**/*.swp
+**/*.swp
+**/*.log

LICENSE_READSTAT.md (+19)

@@ -0,0 +1,19 @@
+Copyright (c) 2013-2016 Evan Miller (except where otherwise noted)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.

README.md (+117 -28)
@@ -3,46 +3,51 @@
 [![Build Status](https://travis-ci.org/tk3369/SASLib.jl.svg?branch=master)](https://travis-ci.org/tk3369/SASLib.jl)
 [![codecov.io](http://codecov.io/github/tk3369/SASLib.jl/coverage.svg?branch=master)](http://codecov.io/github/tk3369/SASLib.jl?branch=master)
 
-This is a port of Pandas' read_sas function.
+This project started out as a port of Pandas' read_sas function. Since the first public release, several bugs have been fixed and additional features have been added, e.g. reading a subset of columns. The goal is to have a fast reader that allows greater interoperability of Julia with the SAS ecosystem.
 
-Only `sas7bdat` format is supported, however. If anyone needs to read `xport` formatted files, please create an issue or contribute/send me a pull request.
+Only the `sas7bdat` format is supported. If anyone needs to read `xport` files, please submit an issue. Pull requests are welcome as well.
 
 ## Installation
 
 ```
 Pkg.add("SASLib")
 ```
 
-## Examples
+## Read Performance
+
+I did benchmarking mostly on my MacBook Pro laptop. In general, the Julia implementation is somewhere between 7-25x faster than the Python counterpart. Test results are documented in the `test/perf_results_<version>` folders.
+
+## User Guide
+
+### Basic Use Case
 
-Use the `readsas` function to read the file. The result is a dictionary of various information about the file as well as the data itself.
+Use the `readsas` function to read a SAS7BDAT file. The result is a dictionary of various information about the file as well as the data itself.
 
 ```julia
 julia> using SASLib
 
 julia> x = readsas("productsales.sas7bdat")
-Read data set of size 1440 x 10 in 2.0 seconds
-Dict{Symbol,Any} with 16 entries:
+Read productsales.sas7bdat with size 1440 x 10 in 1.05315 seconds
+Dict{Symbol,Any} with 17 entries:
 :filename => "productsales.sas7bdat"
 :page_length => 8192
 :file_encoding => "US-ASCII"
 :system_endianness => :LittleEndian
 :ncols => 10
-:column_types => Type[Float64, Float64, Union{AbstractString, Missings.Missing}, Union{AbstractString, Missings.Missing}, Union{AbstractString,
-:data => Dict{Any,Any}(Pair{Any,Any}(:QUARTER, [1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0 … 1.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0,
-:perf_type_conversion => 0.0262305
+:column_types => Type[Float64, Float64, String, String, String, String, String, Float64, Float64, Union{Date, Missings.Missing}]
+:column_info => Tuple{Int64,Symbol,Symbol,Type,DataType}[(1, :ACTUAL, :Number, Float64, Array{Float64,1}), (2, :PREDICT, :Number, Float64, A
+:data => Dict{Any,Any}(Pair{Any,Any}(:QUARTER, [1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0 … 1.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.
+:perf_type_conversion => 0.0399293
 :page_count => 18
-:column_names => String["QUARTER", "YEAR", "COUNTRY", "DIVISION", "REGION", "MONTH", "PREDICT", "ACTUAL", "PRODTYPE", "PRODUCT"]
-:column_symbols => Symbol[:QUARTER, :YEAR, :COUNTRY, :DIVISION, :REGION, :MONTH, :PREDICT, :ACTUAL, :PRODTYPE, :PRODUCT]
+:column_names => String["ACTUAL", "PREDICT", "COUNTRY", "REGION", "DIVISION", "PRODTYPE", "PRODUCT", "QUARTER", "YEAR", "MONTH"]
+:column_symbols => Symbol[:ACTUAL, :PREDICT, :COUNTRY, :REGION, :DIVISION, :PRODTYPE, :PRODUCT, :QUARTER, :YEAR, :MONTH]
 :column_lengths => [8, 8, 10, 10, 10, 10, 10, 8, 8, 8]
 :file_endianness => :LittleEndian
 :nrows => 1440
-:perf_read_data => 0.00639309
+:perf_read_data => 0.035717
 :column_offsets => [0, 8, 40, 50, 60, 70, 80, 16, 24, 32]
 ```
 
-Number of columns and rows are returned as in `:ncols` and `:nrows` respectively.
-
 The data, referenced by the `:data` key, is represented as a Dict object with the column symbol as the key.
 
 ```julia
@@ -55,16 +60,34 @@ julia> x[:data][:ACTUAL]
 656.0
 948.0
 612.0
-114.0
-685.0
-657.0
-608.0
-353.0
-107.0
 
+
 ```
 
-If you really like DataFrame, you can easily convert as such:
+Additional metadata are available as follows:
+
+Key |Type |Description
+-----------------|---------------|-------------------------------
+:nrows | Int | Number of rows in the result
+:ncols | Int | Number of columns in the result
+:filename | String | Filename for which data was read
+:file_encoding | String | Character encoding used in the file
+:file_endianness | Symbol | Either :LittleEndian or :BigEndian
+:column_symbols | Array{Symbol} | Column symbols
+:column_names | Array{String} | Column names
+:column_types | Array{Type} | Column types e.g. Float64, String
+:column_info | Array{Tuple} | Tuple (column#, symbol, Num/Str, eltype, array type)
+:column_lengths | Array{Int} | Column lengths as in the SAS file format
+:column_offsets | Array{Int} | Column offsets as in the SAS file format
+:page_length | Int | Page length as in the SAS file format
+:page_count | Int | Number of pages as in the SAS file format
+:perf\_read\_data | Float | Performance stat: seconds used to read data into memory
+:perf\_type\_conversion | Float | Performance stat: seconds used to convert data to proper types e.g. Date/DateTime
+:system_endianness | Symbol | Either :LittleEndian or :BigEndian
+
+### Conversion to DataFrame
+
+Since the data is just a Dict of array columns, it's easy to convert into a DataFrame:
 
 ```julia
 julia> using DataFrames
@@ -82,7 +105,25 @@ julia> head(df, 5)
 │ 5   │ 656.0  │ CANADA  │ EDUCATION │ 1993-05-01 │ 646.0   │ FURNITURE │ SOFA    │ 2.0     │ EAST   │ 1993.0 │
 ```
 
-If you only need to read few columns, just pass an `include_columns` argument:
+You may find the mixed-up column order a bit annoying, since a regular Dict does not have any concept of order and DataFrame just sorts the columns alphabetically. To work around that issue, you can leverage the `:column_symbols` array, which has the _natural order_ from the file:
+
+```
+julia> df = DataFrame(((c => x[:data][c]) for c in x[:column_symbols])...);
+
+julia> head(df,5)
+5×10 DataFrames.DataFrame
+│ Row │ ACTUAL │ PREDICT │ COUNTRY │ REGION │ DIVISION  │ PRODTYPE  │ PRODUCT │ QUARTER │ YEAR   │ MONTH      │
+├─────┼────────┼─────────┼─────────┼────────┼───────────┼───────────┼─────────┼─────────┼────────┼────────────┤
+│ 1   │ 925.0  │ 850.0   │ CANADA  │ EAST   │ EDUCATION │ FURNITURE │ SOFA    │ 1.0     │ 1993.0 │ 1993-01-01 │
+│ 2   │ 999.0  │ 297.0   │ CANADA  │ EAST   │ EDUCATION │ FURNITURE │ SOFA    │ 1.0     │ 1993.0 │ 1993-02-01 │
+│ 3   │ 608.0  │ 846.0   │ CANADA  │ EAST   │ EDUCATION │ FURNITURE │ SOFA    │ 1.0     │ 1993.0 │ 1993-03-01 │
+│ 4   │ 642.0  │ 533.0   │ CANADA  │ EAST   │ EDUCATION │ FURNITURE │ SOFA    │ 2.0     │ 1993.0 │ 1993-04-01 │
+│ 5   │ 656.0  │ 646.0   │ CANADA  │ EAST   │ EDUCATION │ FURNITURE │ SOFA    │ 2.0     │ 1993.0 │ 1993-05-01 │
+```
+
+### Inclusion/Exclusion of Columns
+
+It is always faster to read only the columns that you need. The `include_columns` argument comes in handy:
 
 ```
 julia> head(DataFrame(readsas("productsales.sas7bdat", include_columns=[:YEAR, :MONTH, :PRODUCT, :ACTUAL])[:data]))
@@ -114,7 +155,9 @@ Read data set of size 1440 x 6 in 0.031 seconds
 │ 6   │ CANADA  │ EDUCATION │ 486.0  │ FURNITURE │ 2.0     │ EAST   │
 ```
 
-If you need to read files incrementally:
+### Incremental Reading
+
+If you need to read files incrementally, you can do so as such:
 
 ```julia
 handler = SASLib.open("productsales.sas7bdat")
@@ -123,18 +166,64 @@ results = SASLib.read(handler, 4) # read next 4 rows
 SASLib.close(handler) # remember to close the handler when done
 ```
 
-## Read Performance
+Note that there is no facility at the moment to jump and read a subset of rows. Currently, SASLib always reads from the beginning.
 
-I don't have too much performance test results but initial comparison between SASLib.jl and Pandas on my Macbook Pro is encouraging. In general, the Julia implementation is somewhere between 4x to 7x faster than the Python counterpart. See the perf\_results\_* folders for test results related to the version being published.
+### String Columns
+
+By default, string columns are read into a special AbstractArray structure called ObjectPool in order to conserve memory space that might otherwise be wasted on duplicate string values. SASLib tries to be smart -- when it encounters too many unique values (> 10%) in a large array (> 2000 rows), it falls back to a regular Julia array.
+
+You can use a different array type (e.g. [CategoricalArray](https://github.com/JuliaData/CategoricalArrays.jl) or [PooledArray](https://github.com/JuliaComputing/PooledArrays.jl)) for any column as you wish by specifying a `string_array_fn` parameter when reading the file. This argument must be a Dict that maps a column symbol to a function that takes an integer argument and returns an array of that size.
+
+Here's the normal case:
+
+```
+julia> x = readsas("productsales.sas7bdat", include_columns=[:COUNTRY, :REGION]);
+Read productsales.sas7bdat with size 1440 x 2 in 0.00277 seconds
+
+julia> typeof.(collect(values(x[:data])))
+2-element Array{DataType,1}:
+ SASLib.ObjectPool{String,UInt16}
+ SASLib.ObjectPool{String,UInt16}
+```
+
+Now, you can force SASLib to use a regular array as such:
+
+```
+julia> x = readsas("productsales.sas7bdat", include_columns=[:COUNTRY, :REGION],
+                   string_array_fn=Dict(:COUNTRY => (n)->fill("",n)));
+Read productsales.sas7bdat with size 1440 x 2 in 0.05009 seconds
+
+julia> typeof.(collect(values(x[:data])))
+2-element Array{DataType,1}:
+ Array{String,1}
+ SASLib.ObjectPool{String,UInt16}
+```
+
+For convenience, `SASLib.REGULAR_STR_ARRAY` can be used instead. In addition, if you need all columns to be configured that way, the key of the `string_array_fn` dict may simply be the symbol `:_all_`:
+
+```
+julia> x = readsas("productsales.sas7bdat", include_columns=[:COUNTRY, :REGION],
+                   string_array_fn=Dict(:_all_ => REGULAR_STR_ARRAY));
+Read productsales.sas7bdat with size 1440 x 2 in 0.01005 seconds
+
+julia> typeof.(collect(values(x[:data])))
+2-element Array{DataType,1}:
+ Array{String,1}
+ Array{String,1}
+```
 
 ## Why another package?
 
-At first, I was just going to use ReadStat. However, ReadStat does not support reading files with compressed binary data. I could have chosen to contribute to that project instead but I would rather learn and code in Julia ;-) The implementation in Pandas is fairly straightforward, making it a relatively easy porting project.
+At first, I was just going to use [ReadStat.jl](https://github.com/davidanthoff/ReadStat.jl), which uses the [ReadStat C library](https://github.com/WizardMac/ReadStat). However, ReadStat does not support reading RDC-compressed binary files. I could have chosen to contribute to that project, but I would rather learn and code in Julia instead ;-) The implementation in Pandas is fairly straightforward, making it a relatively easy porting project.
 
 ## Porting Notes
 
-I chose to copy the code from Pandas and made minimal changes so I can have a working version quickly. Hence, the code isn't very Julia-friendly e.g. variable and function naming are all mixed up. It is not a priority at this point but I would think some major refactoring would be required to make it more clean & performant.
+I chose to copy the code from Pandas and make minimal changes so I could have a working version quickly. Hence, the code isn't very Julia-friendly, e.g. variable and function naming conventions are all mixed up. It is not a priority at this point, but some major refactoring will likely be required to clean up the code.
 
 ## Credits
 
-Many thanks to Jared Hobbs, the original author of the SAS I/O code from Python Pandas. See LICENSE_SAS7BDAT.md for license details.
+- Jared Hobbs, the author of the SAS reader code from Python Pandas. See LICENSE_SAS7BDAT.md.
+- [Evan Miller](https://github.com/evanmiller), the author of the ReadStat C/C++ library. See LICENSE_READSTAT.md.
+- [David Anthoff](https://github.com/davidanthoff), who provided many valuable ideas at the early stage of development.
+
+I also want to thank all the active members of the [Julia Discourse community](https://discourse.julialang.org). This project wouldn't be possible without all the help I got from the community. That's the beauty of open-source development.
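The fallback heuristic described under String Columns (pool a column only when its unique-value ratio stays low) can be sketched outside Julia. The helper below is a hypothetical Python illustration of the stated > 10% unique / > 2000 rows thresholds, not SASLib's actual code; the function name and parameters are assumptions for this sketch:

```python
def use_object_pool(values, max_unique_ratio=0.10, min_rows=2000):
    """Decide whether pooling is worthwhile for a string column.

    Small columns are pooled unconditionally; for large columns, pooling
    is used only when the share of unique values is at or below the
    ratio threshold (otherwise the index overhead buys nothing).
    """
    n = len(values)
    if n <= min_rows:
        return True
    unique_ratio = len(set(values)) / n
    return unique_ratio <= max_unique_ratio

# A large, highly repetitive column pools well...
assert use_object_pool(["CANADA"] * 5000)
# ...while a large column of mostly distinct values does not.
assert not use_object_pool([str(i) for i in range(5000)])
```

The design intuition: pooling trades a per-assignment dictionary lookup for deduplicated storage, which only pays off when values repeat heavily.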

src/ObjectPool.jl (+91)

@@ -0,0 +1,91 @@
+"""
+ObjectPool is a fixed-size one-dimensional array that does not store
+any duplicate copies of the same object. So the benefit is space-efficiency.
+The tradeoff is the time used to maintain the index.
+This is useful for denormalized data frames where string values
+may be repeated many times.
+
+An ObjectPool must be initialized with a default value and a fixed
+array size. If your requirement does not fit such assumptions,
+you may want to look into using the `PooledArrays` or
+`CategoricalArrays` package instead.
+
+The implementation is very primitive and is tailored for applications
+that know exactly how much memory to allocate.
+"""
+mutable struct ObjectPool{T, S <: Unsigned} <: AbstractArray{T, 1}
+    pool::Array{T}              # maintains the pool of unique things
+    idx::Array{S}               # index references into `pool`
+    indexcache::Dict{T, S}      # dict for fast lookups (K=object, V=index)
+    uniqueitemscount::Int64     # how many items in `pool`; always starts with 1
+    itemscount::Int64           # how many items perceived in this array
+    capacity::Int64             # max number of items in the pool
+end
+
+# Initially, there is only one item in the pool and the `idx` array has
+# elements all pointing to that one default value. The dictionary `indexcache`
+# also has one item that points to that one value. Hence `uniqueitemscount`
+# would be 1 and `itemscount` would be `n`.
+function ObjectPool{T, S}(val::T, n::Integer) where {T, S <: Unsigned}
+    # Note: 64-bit case is constrained by the Int64 type (for convenience)
+    maxsize = ifelse(S == UInt8,  2 << 7 - 1,
+              ifelse(S == UInt16, 2 << 15 - 1,
+              ifelse(S == UInt32, 2 << 31 - 1,
+                                  2 << 62 - 1)))
+    ObjectPool{T, S}([val], fill(1, n), Dict(val => 1), 1, n, maxsize)
+end
+
+# If the value already exists in the pool then just the index value is stored.
+function Base.setindex!(op::ObjectPool, val::T, i::Integer) where {T}
+    if haskey(op.indexcache, val)
+        # The value `val` already exists in the cache.
+        # Just set the array element to the index value from the cache.
+        op.idx[i] = op.indexcache[val]
+    else
+        if op.uniqueitemscount >= op.capacity
+            throw(BoundsError("Exceeded pool capacity $(op.capacity). Consider using a larger index type e.g. UInt32."))
+        end
+        # Encountered a new value `val`:
+        # 1. add it to the object pool array
+        # 2. increment the number of unique items
+        # 3. store the new index in the cache
+        # 4. set the array element with the new index value
+        push!(op.pool, val)
+        op.uniqueitemscount += 1
+        op.indexcache[val] = op.uniqueitemscount
+        op.idx[i] = op.uniqueitemscount
+    end
+    op
+end
+
+# AbstractArray trait
+# Base.IndexStyle(::Type{<:ObjectPool}) = IndexLinear()
+
+# single indexing
+Base.getindex(op::ObjectPool, i::Number) = op.pool[op.idx[convert(Int, i)]]
+
+# general sizes
+Base.size(op::ObjectPool) = (op.itemscount, )
+# Base.length(op::ObjectPool) = op.itemscount
+# Base.endof(op::ObjectPool) = op.itemscount
+
+# typing
+# Base.eltype(op::ObjectPool) = eltype(op.pool)
+
+# make it iterable
+# Base.start(op::ObjectPool) = 1
+# Base.next(op::ObjectPool, state) = (op.pool[op.idx[state]], state + 1)
+# Base.done(op::ObjectPool, state) = state > op.itemscount
+
+# custom printing
+# function Base.show(io::IO, op::ObjectPool)
+#     L = op.itemscount
+#     print(io, "$L-element ObjectPool with $(op.uniqueitemscount) unique items:\n")
+#     if L > 20
+#         for i in 1:10 print(io, " ", op[i], "\n") end
+#         print(io, " ⋮\n")
+#         for i in L-9:L print(io, " ", op[i], "\n") end
+#     else
+#         for i in 1:L print(io, " ", op[i], "\n") end
+#     end
+# end
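For readers who don't follow Julia, the pooled-index idea in ObjectPool above can be sketched in Python. This is an illustrative analog only (the class name, the default capacity, and the method set are assumptions for the sketch), not part of the package:

```python
class ObjectPool:
    """Fixed-size sequence that stores each distinct value only once.

    Mirrors the structure above: `pool` holds unique values, `idx` holds
    small integer references into `pool`, and `cache` maps a value to its
    pool index for fast duplicate detection.
    """

    def __init__(self, default, n, capacity=2**16 - 1):
        self.pool = [default]      # unique values; slot 0 is the default
        self.idx = [0] * n         # every element starts as the default
        self.cache = {default: 0}  # value -> index into `pool`
        self.capacity = capacity

    def __setitem__(self, i, val):
        if val not in self.cache:
            if len(self.pool) >= self.capacity:
                raise IndexError("pool capacity exceeded")
            # New value: append it once and remember its slot.
            self.cache[val] = len(self.pool)
            self.pool.append(val)
        self.idx[i] = self.cache[val]

    def __getitem__(self, i):
        # Indirect lookup: small index -> shared value.
        return self.pool[self.idx[i]]

    def __len__(self):
        return len(self.idx)


op = ObjectPool("", 5)
op[0] = "CANADA"
op[1] = "CANADA"   # duplicate: reuses the existing pool slot
op[2] = "US"
# op now behaves like ["CANADA", "CANADA", "US", "", ""] while
# storing only three distinct strings in `pool`.
```

As in the Julia version, reads cost one extra indirection and writes cost one dictionary lookup; the win is that a column with millions of rows but few distinct strings stores each string once.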
