diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a37d6f5
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,10 @@
+# ignore the cmake-configured file.
+include/gaenari/gaenari-config.h
+
+# ignore downloaded sqlite sources.
+extern/sqlite/sqlite/*
+
+# the cmake build directory is ignored by writing a '*' .gitignore into it.
+# that is done in the root CMakeLists.txt.
+# so nothing more to do here.
+
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..d2c35b1
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,158 @@
+######################
+# gaenari build & test
+######################
+# mkdir build
+# cd build
+# cmake ..
+# cmake --build . --config release
+# cmake --install . --prefix install
+# ctest --verbose
+# (find executables in build/tests/* and execute them directly; output is displayed in color.)
+
+####################
+# build with gaenari
+####################
+# -------------------
+# wrapper/wrapper.cpp
+# -------------------
+# #include "gaenari/gaenari.hpp"
+# int main(void) {
+#     gaenari::logger::init1("/temp/_log.txt");
+#     using supul_t = supul::supul::supul_t;
+#     supul_t::api::project::create("/temp/supul_dir");
+#     supul_t::api::project::add_field("/temp/supul_dir", "x1", "REAL");
+#     supul_t::api::project::add_field("/temp/supul_dir", "x2", "INTEGER");
+#     supul_t::api::project::add_field("/temp/supul_dir", "x3", "TEXT_ID");
+#     supul_t::api::project::add_field("/temp/supul_dir", "y0", "TEXT_ID");
+#     supul_t::api::project::x("/temp/supul_dir", {"x1", "x2", "x3"});
+#     supul_t::api::project::y("/temp/supul_dir", "y0");
+#     supul_t::api::project::set_property("/temp/supul_dir", "db.type", "sqlite");
+#     supul_t supul;
+#     supul.api.lifetime.open("/temp/supul_dir");
+#     supul.api.model.insert_chunk_csv("/temp/dataset.csv");
+#     supul.api.model.update();
+#     supul.api.model.deinit();
+#     // ...
+#     supul.api.model.rebuild();
+#     supul.api.lifetime.close();
+#     return 0;
+# }
+#
+# ----------------------
+# wrapper/CMakeLists.txt
+# ----------------------
+# cmake_minimum_required(VERSION 3.6)
+# project(wrapper)
+#
+# # call order is important.
+#
+# add_subdirectory()
+# check_cpp17_gaenari()
+#
+# add_executable(wrapper wrapper.cpp)
+# add_gaenari(wrapper)
+#
+# -----
+# build
+# -----
+# wrapper/build$ cmake ..
+# wrapper/build$ cmake --build . --config release
+
+cmake_minimum_required(VERSION 3.6)
+
+##########
+# settings
+##########
+# project.
+project(gaenari VERSION 1.0.0)
+
+#################################
+# auto git ignore build directory
+#################################
+if(NOT EXISTS ${PROJECT_BINARY_DIR}/.gitignore)
+    file(WRITE ${PROJECT_BINARY_DIR}/.gitignore "*")
+endif()
+
+#########
+# include
+#########
+include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/gaenari_util.cmake")
+
+#########
+# options
+#########
+set(_SQLITE_SHA256_ "999826FE4C871F18919FDB8ED7EC9DD8217180854DD1FE21EEA96AED36186729")
+set(SQLITE_VERSION "3360000" CACHE STRING "sqlite version to build. minimum 3350400(=3.35.4), required for `returning` support.")
+set(SQLITE_RELEASE_YEAR "2021" CACHE STRING "release year of SQLITE_VERSION.")
+set(SQLITE_SHA256 ${_SQLITE_SHA256_} CACHE STRING "sha256 value of sqlite.zip.")
+
+##################
+# global variables
+##################
+# download directory.
+set(DOWNLOAD_DIR "${CMAKE_CURRENT_BINARY_DIR}/download")
+
+# gaenari include directory.
+set(GAENARI_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/include/")
+
+# sqlite include directory.
+set(SQLITE_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/extern/sqlite")
+
+# gaenari hpp files.
+# gaenari has only header files.
+# this list is used to build a source_group for IDE tools; it is not compiled.
+file(GLOB_RECURSE GAENARI_HPP_FILES
+    "${GAENARI_INCLUDE_DIR}/*.hpp"
+    "${GAENARI_INCLUDE_DIR}/*.h"
+)
+
+#######
+# c++17
+#######
+check_cpp17_gaenari()
+
+###########
+# configure
+###########
+configure_file("${GAENARI_INCLUDE_DIR}/gaenari/gaenari-config.h.in" "${GAENARI_INCLUDE_DIR}/gaenari/gaenari-config.h")
+
+############
+# definition
+############
+# compile definitions.
+set(GAENARI_DEFINITIONS "")
+add_compile_definitions(${GAENARI_DEFINITIONS})
+
+#########
+# targets
+#########
+# add the sqlite target to the build.
+add_subdirectory("${CMAKE_CURRENT_SOURCE_DIR}/extern/sqlite")
+
+# add the tests target to the build.
+add_subdirectory("${CMAKE_CURRENT_SOURCE_DIR}/tests")
+
+# add a files target so some files can be edited in the IDE.
+add_custom_target(files SOURCES
+    "${CMAKE_CURRENT_SOURCE_DIR}/README.md"
+    "${CMAKE_CURRENT_SOURCE_DIR}/TODO.md"
+)
+
+#########################
+# for add_gaenari() macro
+#########################
+set(GAENARI_INCLUDE_DIR_FOR_BUILD ${GAENARI_INCLUDE_DIR} ${SQLITE_INCLUDE_DIR} CACHE INTERNAL "_GAENARI_INCLUDE_DIR_FOR_BUILD")
+set(GAEANRI_DEFINITION_FOR_BUILD ${GAENARI_DEFINITIONS} CACHE INTERNAL "_GAEANRI_DEFINITION_FOR_BUILD")
+set(GAENARI_LINK_FOR_BUILD sqlite3 CACHE INTERNAL "_GAENARI_LINK_FOR_BUILD")
+
+######
+# test
+######
+# tests are called from the tests sub-directory.
+enable_testing()
+
+#########
+# install
+#########
+# add gaenari header files to the install directory.
+install(DIRECTORY "${GAENARI_INCLUDE_DIR}" DESTINATION include)
diff --git a/LICENSE b/LICENSE
index b09cd78..0032521 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,4 +1,4 @@
-Apache License
+                                 Apache License
                            Version 2.0, January 2004
                         http://www.apache.org/licenses/
 
@@ -186,7 +186,7 @@
       same "printed page" as the copyright notice for easier
       identification within third-party archives.
 
-   Copyright [yyyy] [name of copyright owner]
+   Copyright 2022 greenfish77
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
diff --git a/NOTICE b/NOTICE
new file mode 100644
index 0000000..bbd2db4
--- /dev/null
+++ b/NOTICE
@@ -0,0 +1,23 @@
+Apache gaenari
+Copyright 2022 The Apache Software Foundation
+
+================================
+sqlite (https://www.sqlite.org/)
+================================
+public domain
+https://www.sqlite.org/copyright.html
+
+=============
+forsythia.jpg
+=============
+public domain
+https://cdn.pixabay.com/photo/2020/02/28/21/42/forsythia-4888681__340.jpg
+(compressed)
+
+==========
+apples.gif
+==========
+public domain
+https://cdn.pixabay.com/photo/2019/02/04/06/45/apple-3974055_1280.jpg
+(resized, compressed, and animated)
+
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..4a20478
--- /dev/null
+++ b/README.md
@@ -0,0 +1,668 @@
+gaenari
+=======
+
+`gaenari` is the Korean name for the spring-blooming `forsythia` of East Asia. it is a plant with small yellow flowers.\
+![forsythia](./doc/img/forsythia.jpg)
+
+here, `gaenari` means:
+- the full project: a set of C++17 header-only libraries.
+- a single `decision tree`, like [`ID3`](https://en.wikipedia.org/wiki/ID3_algorithm) and [`C4.5`](https://en.wikipedia.org/wiki/C4.5_algorithm).
+
+when `machine learning` works with real-world data, its **accuracy decreases over time**. this is quite different from AI industry news or the success stories in academic journals. what is the cause of the problem?
+
+### concept drift
+
+[**`concept drift`**](https://en.wikipedia.org/wiki/Concept_drift) is one of the biggest obstacles to machine learning in real-world AI.
+
+can data from before the `COVID-19` pandemic predict the future? not easily. data trends are always changing, so a current model cannot easily predict even one step ahead.
+
+> we live in an incomplete `real world` rather than a `toy world` that separates dog and cat images.
+
+### solutions
+
+- assume that perfect modeling is impossible.\
+  rely on `heuristic techniques` instead of mathematical and statistical algorithms.
+- update the model through [**`incremental learning`**](https://en.wikipedia.org/wiki/Incremental_learning).
+- accumulate data and provide insights.
+- minimize the use of complex hyper-parameters.
+
+[**this is an example**](#chunk_history_trend) of resolving the decrease in accuracy caused by a data trend change by calling `rebuild()`.
+
+### supul
+
+**`supul`** means forest in Korean, and it is a metaphor for multiple decision trees.
+`supul` is the other library in `gaenari`.
+
+- supports `incremental learning` through `inserts`, `updates`, and `rebuilds`.
+- combines multiple `decision trees`.
+- manages a database.
+
+### generation
+
+the `supul` expands by `generation`. `rebuild` trains a single `decision tree` on the weak parts and then combines it.
+
+![legend](./doc/img/legend.png)\
+![generation](./doc/img/generation.gif)
+
+the goal is to increase (or keep) accuracy through rule segmentation,
+similar to the effect of sharpening a photo as shown in the picture below.
+
+![apples](./doc/img/apple.gif)
+
+### i learned that overfitting is bad.
+
+as shown above, the tree seems to overfit over time, so it is natural for negative thoughts to arise.
+
+in traditional machine learning, the training data is a sampled subset of the population. since there is sampling error (the training data is not representative of all cases in the population), overfitting is something to avoid.
+
+however, the goal of the `supul` is to learn the whole data, not a sample.
+in this case, it adapts to new data through continuous `incremental learning`, which reduces the risk of overfitting.
+
+### library design
+
+![design](./doc/img/design.gif)
+
+a single `decision tree` and `dataset` are implemented in `gaenari`.
+`supul` implements the public supul methods that can be called externally.
+database and model processing for incremental learning are the key parts.
+
+the dataframe repository is implemented as an interface and can easily be extended. the same goes for databases:
+databases other than sqlite are also possible.
+
+### build
+
+`gaenari` is a header-only library, so only an include is needed.
+but some external libraries, such as sqlite, require c/c++ compilation for linking.
+
+#### build `gaenari`
+
+```bash
+$ cd /path/to/gaenari
+$ mkdir build
+$ cd build
+$ cmake ..
+$ cmake --build . --config release
+```
+
+#### test
+
+```bash
+$ ctest --verbose
+```
+
+you can also find the executables in build/tests/* and run them directly; output is displayed in color.
+
+![log](./doc/img/log.gif)
+
+#### build with `gaenari`
+
+##### wrapper/wrapper.cpp
+
+```c++
+#include "gaenari/gaenari.hpp"
+int main(void) {
+    gaenari::logger::init1("/temp/_log.txt");
+    using supul_t = supul::supul::supul_t;
+    supul_t::api::project::create("/temp/supul_dir");
+    supul_t::api::project::add_field("/temp/supul_dir", "x1", "REAL");
+    supul_t::api::project::add_field("/temp/supul_dir", "x2", "INTEGER");
+    supul_t::api::project::add_field("/temp/supul_dir", "x3", "TEXT_ID");
+    supul_t::api::project::add_field("/temp/supul_dir", "y0", "TEXT_ID");
+    supul_t::api::project::x("/temp/supul_dir", {"x1", "x2", "x3"});
+    supul_t::api::project::y("/temp/supul_dir", "y0");
+    supul_t::api::project::set_property("/temp/supul_dir", "db.type", "sqlite");
+    supul_t supul;
+    supul.api.lifetime.open("/temp/supul_dir");
+    supul.api.model.insert_chunk_csv("/temp/dataset.csv");
+    supul.api.model.update();
+    // ...
+    supul.api.model.rebuild();
+    supul.api.lifetime.close();
+    return 0;
+}
+```
+
+##### wrapper/CMakeLists.txt
+
+```cmake
+cmake_minimum_required(VERSION 3.6)
+project(wrapper)
+
+# call order is important.
+
+add_subdirectory()
+check_cpp17_gaenari()
+
+add_executable(wrapper wrapper.cpp)
+add_gaenari(wrapper)
+```
+
+##### build
+
+```bash
+wrapper/build$ cmake ..
+wrapper/build$ cmake --build . --config release
+```
+
+### walkthrough
+
+you can call supul.api.\<*category*>.\<*function*>(...).
+the apis are implemented as noexcept, so check for errors by
+testing the return value (`false` or `std::nullopt`). for convenience, the return value checks are omitted below.
+see the comments in the code for more details.
+
+#### walkthrough :: ready
+
+since `gaenari` has only headers, a single include is all that is required.
+
+```c++
+#include "gaenari/gaenari.hpp"
+```
+
+initialize the log:
+```c++
+gaenari::logger::init1("/temp/log_gaenari.log");
+```
+
+#### walkthrough :: project
+
+`supul` runs as a project in a directory unit.
+the project directory contains the configuration and sqlite database files.
+project creation looks like this:
+```c++
+supul::supul::supul_t::api::project::create("/temp/my_project");
+```
+
+all functions in the project category are static, so they can be called directly without an object.
+the main files in the project directory are:
+
+|file name      |note                 |
+|---------------|---------------------|
+|property.txt   |project configuration|
+|attributes.json|schema definition    |
+|*.db           |sqlite database file |
+
+##### /temp/my_project/property.txt
+
+```ini
+# supul configuration.
+ver = 1.0.0
+# supported db type : sqlite.
+db.type = sqlite
+# set the default database name.
+db.dbname = supul
+# set the table name prefix.
+db.tablename.prefix =
+# if a treenode is less accurate(<=) than this value, it is weak. the higher the value, the more aggressive the rebuild, and the more complex the tree.
+model.weak_treenode_condition.accuracy = 0.8
+# a treenode is weak when the number of its instances is greater(>=) than this. the lower the value, the more aggressive the rebuild, and the more complex the tree.
+model.weak_treenode_condition.total_count = 5
+```
+
+you must choose **`db.type`** after `project::create()`.
+for example, select `sqlite`.
+you can edit it manually or set it using the following function.
+```c++
+supul::supul::supul_t::api::project::set_property("/temp/my_project",
+                                                  "db.type",
+                                                  "sqlite");
+```
+
+##### attributes.json
+
+```json
+{
+  "revision": 0,
+  "fields": {
+    "salary": "REAL",
+    "commission": "REAL",
+    "age": "INTEGER",
+    "elevel": "TEXT_ID",
+    "car": "TEXT_ID",
+    "zipcode": "TEXT_ID",
+    "hvalue": "REAL",
+    "hyears": "INTEGER",
+    "loan": "REAL",
+    "group": "TEXT_ID"
+  },
+  "x": [
+    "salary",
+    "commission",
+    "age",
+    "elevel",
+    "car",
+    "zipcode",
+    "hvalue",
+    "hyears",
+    "loan"
+  ],
+  "y": "group"
+}
+```
+
+the above json is an example of the [`agrawal dataset`](https://weka.sourceforge.io/doc.dev/weka/datagenerators/classifiers/classification/Agrawal.html).
+it was created with the dataset generator provided by [`weka`](https://www.cs.waikato.ac.nz/ml/weka/) and divided into two groups.
+there are 9 `function`s, and the same `function` has the same data trend. this is used for `concept drift` experiments.
+
+`INTEGER`, `REAL`, and `TEXT_ID` are supported as data types.
+`TEXT_ID` uses an index stored in a `string table`;
+it is `nominal` data.
+
+these fields should be included in the header of the csv where the instances are stored.
+not all fields need to be included in x (e.g., internal id values needed only for tracking).
+explicitly choose the x items from the fields, and choose one y item as well.
+
+you can edit the json manually or use the functions below.
+
+```c++
+using supul_t = supul::supul::supul_t;
+std::string base_dir = "/temp/my_project";
+
+supul_t::api::project::add_field(base_dir, "salary", "REAL");
+supul_t::api::project::add_field(base_dir, "commission", "REAL");
+...
+supul_t::api::project::add_field(base_dir, "group", "TEXT_ID");
+
+supul_t::api::project::x(base_dir, {"salary", "commission", ..., "loan"});
+supul_t::api::project::y(base_dir, "group");
+```
+
+#### walkthrough :: create a supul object
+
+after project creation, create a `supul` object.
+
+```c++
+supul::supul::supul_t supul;
+```
+
+if you want to return a supul object from a function, you can use `unique_ptr`.
+
+```c++
+auto supul = std::make_unique<supul::supul::supul_t>();
+...
+return supul;
+```
+
+we can get `supul` api hints from ide tools (e.g., visual studio).
+
+![intellisense](./doc/img/intellisense.gif)
+
+you can use the lifetime api to open and close your project.
+
+```c++
+supul.api.lifetime.open("/temp/my_project");
+```
+
+#### walkthrough :: insert a csv file
+
+`supul` supports `incremental learning`: it trains on a continuous series of datasets, and one dataset is called a **`chunk`**.
+
+prepare the csv in the same format as the definition in attributes.json.
+
+salary|commission|age|elevel|car|zipcode|hvalue|hyears|loan|group
+------|----------|---|------|---|-------|------|------|----|-----
+111811.9025|0|50|L2|C16|Z2|135000|9|374566.1561|G1
+62308.5782|33338.59959|52|L3|C3|Z0|135000|6|64557.41339|G1
+...|...|...|...|...|...|...|...|...|...
+> when creating a csv with weka, elevel, car, zipcode, and group are expressed
+only as numbers (they are actually nominal).
+
+we can create the agrawal dataset.csv as below.
+
+```bash
+$ java -classpath weka.jar weka.datagenerators.classifiers.classification.Agrawal -r temp -S 0 -n 100 -F 0 -P 0.005 > dataset.arff
+$ java -classpath weka.jar weka.core.converters.CSVSaver -i dataset.arff -o dataset.csv
+```
+(see the create_agrawal_dataset() function.)
+
+insert the instances in the csv into the database.
+```c++
+supul.api.model.insert_chunk_csv("/temp/dataset.csv");
+```
+
+> `supul` inserts all new incoming data into the database. therefore,
+the database size increases continuously. it requires techniques to keep it
+on a limited scale; this is in the TO-DO list.
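+
+as a typical flow, each inserted chunk is paired with an `update()` call (described in the next section). a minimal sketch, where the chunk file paths are hypothetical placeholders:
+
+```c++
+// feed chunks as they arrive. the paths below are placeholders.
+for (const auto& path: {"/temp/chunk_0.csv", "/temp/chunk_1.csv"}) {
+    supul.api.model.insert_chunk_csv(path);
+    supul.api.model.update(); // evaluate/train per chunk, see the next section.
+}
+```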
+
+#### walkthrough :: update
+
+`insert` stores only instance data, so we need to call `update()` for the next step.
+the update information includes things like the evaluation results for the current model.
+so, unlike `insert`, `update` requires extra time.
+
+```c++
+supul.api.model.update();
+```
+
+> when update is called, the first model training proceeds automatically if the model has not been built yet.
+
+update also stores statistical data (accuracy, etc.) for each `chunk`. this allows you to see how well the currently trained model reflects the new `chunk`.
+
+#### walkthrough :: rebuild
+
+when the trend in the data changes, the accuracy of the `chunks` decreases.
+`rebuild()` finds weak instances, re-trains only those parts, and combines
+them with the existing tree to overcome the loss of accuracy.
+
+```c++
+supul.api.model.rebuild();
+```
+
+> if the `rebuild` result is somewhat less accurate, it is rolled back to the previous state.
+
+> `rebuild` increases the size of the model because it is a continuous method
+of combining models. a way to maintain a limited scale is included in the TO-DO list.
+
+> `rebuild` is not yet invoked automatically by a trigger;
+calling `rebuild` under certain conditions is not yet implemented.
+
+#### walkthrough :: predict
+
+predict the y value for the x parameters given to the current model.
+while changes (insert, update, rebuild, etc.) are in progress, the previous model
+is used via database transactions.
+a map of (key, value) pairs is used for the x parameter, where key
+and value are strings. values are converted automatically according to attributes.json.
+
+```c++
+std::unordered_map<std::string, std::string> x;
+x = {{"salary",     "1000.0"},
+     {"commission", "0.0"},
+     {"age",        "25"},
+     {"elevel",     "3"},
+     {"car",        "1"},
+     {"zipcode",    "1"},
+     {"hvalue",     "132000"},
+     {"hyears",     "3"}};
+auto ret = supul.api.model.predict(x);
+```
+
+returned information of predict:
+
+```c++
+// predict result.
+struct predict_result {
+    bool error = false;
+    std::string errormsg;
+    int64_t label_index = 0;
+    std::string label;
+    int64_t correct_count = 0;
+    int64_t total_count = 0;
+    double accuracy = 0.0;
+};
+```
+
+the `label` value is the predicted y value. `label_index` is the string table index of the `label`.
+`correct_count`, `total_count`, and `accuracy` are information from the leaf tree node
+classified in the decision tree.
+> these three values can be used as confidence information
+for the prediction.
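+
+for example, a minimal sketch of using these values as a rough confidence check. the 0.8 threshold is only an illustration (it mirrors the default `model.weak_treenode_condition.accuracy`), not a library constant:
+
+```c++
+auto ret = supul.api.model.predict(x);
+if (not ret.error) {
+    std::cout << "predicted: " << ret.label
+              << " (" << ret.correct_count << '/' << ret.total_count
+              << ", leaf accuracy " << ret.accuracy << ")\n";
+    if (ret.accuracy < 0.8) {
+        // low-confidence leaf; handle as your application requires.
+    }
+}
+```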
+``` + +chart.png: + + + +> `chunk_history` allows you to see the current accuracy trend of the model and +call `rebuild` if necessary for better accuracy. + + +the `chunk_history` above is the result of \_develop.hpp::report(). +it processed in the following order(agrawal dataset). +1. insert and update 10 chunks (**func=1**) +2. insert and update 10 chunks (**func=2**) +3. rebuild +4. insert and update 10 chunks (**func=2**) +5. rebuild +6. insert and update 10 chunks (**func=2**) +7. insert and update 10 chunks (**func=1**) + + + +gnuplot_script.plt + +``` +# ${SET_TERMINAL} + +# common + +# data block +$data_block_chunk_history << EOD +0 0.99 100 +... +# multiplot: chunk_history +set origin 0, 0.67 +set size 1, 0.34 +... +reset +unset key +unset multiplot +``` + +> `gnuplot`'s script leaves the `terminal` configuration blank to use the system +defaults. set it yourself if necessary. + +#### walkthrough :: wrap-up + +##### stage 1: create a project that calls only once at the start. + +```c++ +#include "gaenari/gaenari.hpp" +... +gaenari::logger::init1("/temp/log_gaenari.log"); +std::string base_dir = "/temp/my_project"; +supul::supul::supul_t::api::project::create(base_dir); +supul::supul::supul_t::api::project::set_property(base_dir, "db.type", "sqlite"); +supul::supul::supul_t::api::project::add_field(base_dir, "salary", "REAL"); +supul::supul::supul_t::api::project::add_field(base_dir, "commission", "REAL"); +supul::supul::supul_t::api::project::add_field(base_dir, "group", "TEXT_ID"); +// ... omit ... +supul::supul::supul_t::api::project::x(base_dir, {"salary", "commission", ..., "loan"}); +supul::supul::supul_t::api::project::y(base_dir, "group"); +``` + +##### stage 2: continuous insert of new data(`chunks`). + +```c++ +#include "gaenari/gaenari.hpp" +... +gaenari::logger::init1("/temp/log_gaenari.log"); +supul::supul::supul_t supul; +supul.api.lifetime.open("/temp/my_project"); +supul.api.model.insert_chunk_csv("/temp/dataset1.csv"); +supul.api.model.update(); +supul.api.model.insert_chunk_csv("/temp/dataset2.csv"); +supul.api.model.update(); +supul.api.model.insert_chunk_csv("/temp/dataset3.csv"); +supul.api.model.update(); +``` + +##### stage 3: predict a instance. + +```c++ +#include "gaenari/gaenari.hpp" +... +std::unordered_map instance = {{"salary":"3"}, ...}; +... +gaenari::logger::init1("/temp/log_gaenari.log"); +supul::supul::supul_t supul; +supul.api.lifetime.open("/temp/my_project"); +auto ret = supul.api.model.predict(instance); +auto& predicted = ret.label; +``` + +##### stage 4: rebuild due to data trend change. + +```c++ +#include "gaenari/gaenari.hpp" +... +gaenari::logger::init1("/temp/log_gaenari.log"); +supul::supul::supul_t supul; +supul.api.lifetime.open("/temp/my_project"); +supul.api.model.rebuild(); +``` + +##### stage 5: analyze the report. + +```c++ +#include "gaenari/gaenari.hpp" +... +gaenari::logger::init1("/temp/log_gaenari.log"); +supul::supul::supul_t supul; +supul.api.lifetime.open("/temp/my_project"); +auto ret = supul.api.report.json(""); +if (not ret) {/* error */} +auto& json = ret.value(); +supul.api.report.gnuplot(json, { + {"terminal", "pngcairo"}, + {"terminal_option", "font `Times-New-Roman,10` size 800,800"}, + {"output_filepath", "/tmp/chart.png"}, + {"plt_filepath", "/tmp/gnuplot_script.plt"}, +}); +``` + +### database + +database is at the heart of `supul`. so, it is helpful to understand the database structure. + +er-diagram:\ + + +* the primary key for all tables is `id`. 
+* fields in the `instance` table are dynamically determined by `attributes.json`.
+* fields named `*ref_*` are references to another table's `id`.
+* implementations that rely on a specific database are prohibited.
+* fields used in `where` clauses are added to an index.
+* numerous `treenode` queries are required while running `predict`, so a `cache` is used for performance.
+* `prepared statements` are used for security and performance.
+
+a tool like [DB Browser for SQLite](https://sqlitebrowser.org) makes it easier to understand the structure.
+
+![sqlite_browser](./doc/img/sqlite_browser0.jpg)\
+the sqlite database file with the .db extension is located under the project directory.
+the agrawal instances are in the `instance` table.
+
+#### let's look for misclassified instances.
+
+execute this query:
+```sql
+select instance.*
+  from instance
+  join instance_info on instance.id = instance_info.ref_instance_id
+ where instance_info.correct = 0
+```
+
+![sqlite_browser](./doc/img/sqlite_browser1.jpg)\
+5629 instances were found (id: 1, 3, 7, ...).
+
+![sqlite_browser](./doc/img/sqlite_browser2.png)\
+30000 - 24371 = 5629, which matches the calculation from the global values.
+
+### error handling
+
+the `supul` functions in the `api` category are noexcept, so no exceptions are thrown.
+check for errors with the return value.
+
+|return type  |error       |
+|-------------|------------|
+|bool         |false       |
+|std::optional|std::nullopt|
+|struct       |some bool member variable|
+
+if an error occurs, check the reason by:
+```c++
+auto msg = supul.api.misc.errmsg();
+```
+
+> errmsg() is not supported for static functions.
+
+##### example
+
+if `x` in the `predict` call does not have a required value:
+```c++
+std::unordered_map<std::string, std::string> x = {{"foobar", "1"}};
+auto result = supul.api.model.predict(x);
+if (result.error) {
+    std::cout << "* supul.api.misc.errmsg(): " << supul.api.misc.errmsg() << std::endl;
+}
+```
+![error_msg](./doc/img/err0.png)
+
+the error occurred because `x` does not have the required `age` value.
+the code location and reason are printed.
+the message is also returned by calling `errmsg()`.
+
+### api list
+
+here is the list of supported apis.\
+see the comments in the code for details.
+
+|category|static|name|
+|-|:-:|-|
+|project|O|create|
+||O|set_property|
+||O|add_field|
+||O|x|
+||O|y|
+|lifetime||open|
+|||close|
+|model||insert_chunk_csv|
+|||update|
+|||rebuild|
+|||predict|
+|report||json|
+||O|gnuplot|
+|misc|O|version|
+|||errmsg|
+|property||set_property|
+|||get_property|
+|test||verify|
diff --git a/TODO.md b/TODO.md
new file mode 100644
index 0000000..744f012
--- /dev/null
+++ b/TODO.md
@@ -0,0 +1,30 @@
+TODO
+====
+
+### Todo
+- [ ] rebuild() optimization.
+  - [ ] gridsearch (weak-treenode-condition parameter sweep).
+  - [ ] any other ideas?
+- [ ] survival of the fittest: managing the lifetime of trees.
+- [ ] rest api server.
+- [ ] dynamically adding fields to instances.
+- [ ] dynamically adding values to labels.
+- [ ] multi-threaded parallel train().
+- [ ] change the value of any field in an instance (including labels).
+- [ ] monitoring (with trigger?).
+- [ ] cache synchronization.
+- [ ] event log.
+- [ ] avoid hard-coded train options.
+- [ ] reducing the number of instances in the DB (or limiting its size).
+- [ ] reducing the size of the model in the DB (or limiting its size).
+- [ ] support other databases (postgresql, ...).
+- [ ] tree model to json.
+
+### In Progress
+
+### Done ✓
+##### 1.0.0.0
+- [X] first release.
+
+### abandoned
+- [ ] ~~nothing~~
diff --git a/cmake/gaenari_util.cmake b/cmake/gaenari_util.cmake
new file mode 100644
index 0000000..234717c
--- /dev/null
+++ b/cmake/gaenari_util.cmake
@@ -0,0 +1,23 @@
+#######
+# c++17
+#######
+macro(check_cpp17_gaenari)
+    enable_language(CXX)
+    set(CMAKE_CXX_STANDARD 17)
+    set(CMAKE_CXX_STANDARD_REQUIRED True)
+    set(CMAKE_CXX_EXTENSIONS OFF)
+    if(MSVC_VERSION GREATER_EQUAL 1910)
+        # for using `and`, `or` keyword.
+        add_compile_options(/permissive-)
+    endif()
+endmacro()
+
+#######################
+# to build with gaenari
+#######################
+macro(add_gaenari target)
+    # add include and link, ...
+    target_include_directories(${target} PRIVATE ${GAENARI_INCLUDE_DIR_FOR_BUILD})
+    set_property(TARGET ${target} APPEND PROPERTY COMPILE_DEFINITIONS ${GAEANRI_DEFINITION_FOR_BUILD})
+    set_property(TARGET ${target} APPEND PROPERTY LINK_LIBRARIES ${GAENARI_LINK_FOR_BUILD})
+endmacro()
diff --git a/doc/img/apple.gif b/doc/img/apple.gif
new file mode 100644
index 0000000..96f0ba8
Binary files /dev/null and b/doc/img/apple.gif differ
diff --git a/doc/img/design.gif b/doc/img/design.gif
new file mode 100644
index 0000000..84e6101
Binary files /dev/null and b/doc/img/design.gif differ
diff --git a/doc/img/er_diagram.png b/doc/img/er_diagram.png
new file mode 100644
index 0000000..2fe22b1
Binary files /dev/null and b/doc/img/er_diagram.png differ
diff --git a/doc/img/err0.png b/doc/img/err0.png
new file mode 100644
index 0000000..fe8570f
Binary files /dev/null and b/doc/img/err0.png differ
diff --git a/doc/img/forsythia.jpg b/doc/img/forsythia.jpg
new file mode 100644
index 0000000..9db0f0c
Binary files /dev/null and b/doc/img/forsythia.jpg differ
diff --git a/doc/img/generation.gif b/doc/img/generation.gif
new file mode 100644
index 0000000..970932e
Binary files /dev/null and b/doc/img/generation.gif differ
diff --git a/doc/img/intellisense.gif b/doc/img/intellisense.gif
new file mode 100644
index 0000000..1cd15ee
Binary files /dev/null and b/doc/img/intellisense.gif differ
diff --git a/doc/img/legend.png b/doc/img/legend.png
new file mode 100644
index 0000000..cbc1127
Binary files /dev/null and b/doc/img/legend.png differ
diff --git a/doc/img/log.gif b/doc/img/log.gif
new file mode 100644
index 0000000..a59d539
Binary files /dev/null and b/doc/img/log.gif differ
diff --git a/doc/img/report.png b/doc/img/report.png
new file mode 100644
index 0000000..26d629d
Binary files /dev/null and b/doc/img/report.png differ
diff --git a/doc/img/report_desc.png b/doc/img/report_desc.png
new file mode 100644
index 0000000..aaa1a28
Binary files /dev/null and b/doc/img/report_desc.png differ
diff --git a/doc/img/sqlite_browser0.jpg b/doc/img/sqlite_browser0.jpg
new file mode 100644
index 0000000..46758d1
Binary files /dev/null and b/doc/img/sqlite_browser0.jpg differ
diff --git a/doc/img/sqlite_browser1.jpg b/doc/img/sqlite_browser1.jpg
new file mode 100644
index 0000000..772eb7d
Binary files /dev/null and b/doc/img/sqlite_browser1.jpg differ
diff --git a/doc/img/sqlite_browser2.png b/doc/img/sqlite_browser2.png
new file mode 100644
index 0000000..36ebba2
Binary files /dev/null and b/doc/img/sqlite_browser2.png differ
diff --git a/extern/sqlite/CMakeLists.txt b/extern/sqlite/CMakeLists.txt
new file mode 100644
index 0000000..8477b74
--- /dev/null
+++ b/extern/sqlite/CMakeLists.txt
@@ -0,0 +1,79 @@
+PROJECT(sqlite3)
+cmake_minimum_required(VERSION 3.6)
+
+# download sqlite source.
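+# note: file(DOWNLOAD) below verifies the zip against SQLITE_SHA256 and aborts
+# the configure step on a hash mismatch. for offline builds, pre-place a matching
+# zip in ${DOWNLOAD_DIR}, as the message below describes.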
+set(SQLITE_URL "https://www.sqlite.org/${SQLITE_RELEASE_YEAR}/sqlite-amalgamation-${SQLITE_VERSION}.zip")
+message(STATUS "[info] download sqlite. wait...")
+message(STATUS "[info] * url: ${SQLITE_URL}")
+message(STATUS "[info] * to:  ${DOWNLOAD_DIR}/sqlite-amalgamation-${SQLITE_VERSION}.zip")
+message(STATUS "[info] if the network is blocked, copy the downloaded file to the directory. of course, the SHA256 value must match.")
+file(DOWNLOAD
+    "${SQLITE_URL}"
+    "${DOWNLOAD_DIR}/sqlite-amalgamation-${SQLITE_VERSION}.zip"
+    # SHOW_PROGRESS
+    EXPECTED_HASH SHA256=${SQLITE_SHA256}
+)
+message(STATUS "[info] download completed, and extract.")
+
+# `file(ARCHIVE_EXTRACT ...)` requires cmake 3.18.
+# use the old way.
+# file(ARCHIVE_EXTRACT
+#     INPUT ${DOWNLOAD_DIR}/sqlite-amalgamation-${SQLITE_VERSION}.zip
+#     DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/extract
+#     VERBOSE
+# )
+file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/extract")
+execute_process(COMMAND "${CMAKE_COMMAND}" -E tar xzf "${DOWNLOAD_DIR}/sqlite-amalgamation-${SQLITE_VERSION}.zip"
+    WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/extract"
+)
+
+# compare. (it doesn't compare all files in the directory, only the sqlite3.h file.)
+execute_process(COMMAND "${CMAKE_COMMAND}" -E compare_files
+    "${CMAKE_CURRENT_BINARY_DIR}/extract/sqlite-amalgamation-${SQLITE_VERSION}/sqlite3.h"
+    "${CMAKE_CURRENT_SOURCE_DIR}/sqlite/sqlite3.h"
+    RESULT_VARIABLE COMPARE_RESULT
+)
+if(NOT COMPARE_RESULT EQUAL 0)
+    message(STATUS "[info] sqlite source compare: diff -> copy.")
+    file(COPY "${CMAKE_CURRENT_BINARY_DIR}/extract/sqlite-amalgamation-${SQLITE_VERSION}/sqlite3.h"    DESTINATION "${CMAKE_CURRENT_SOURCE_DIR}/sqlite/")
+    file(COPY "${CMAKE_CURRENT_BINARY_DIR}/extract/sqlite-amalgamation-${SQLITE_VERSION}/sqlite3.c"    DESTINATION "${CMAKE_CURRENT_SOURCE_DIR}/sqlite/")
+    file(COPY "${CMAKE_CURRENT_BINARY_DIR}/extract/sqlite-amalgamation-${SQLITE_VERSION}/sqlite3ext.h" DESTINATION "${CMAKE_CURRENT_SOURCE_DIR}/sqlite/")
+else()
+    message(STATUS "[info] sqlite source compare: same.")
+endif()
+file(REMOVE_RECURSE "${CMAKE_CURRENT_BINARY_DIR}/extract")
+
+# build static sqlite3.lib.
+add_library(sqlite3 STATIC
+    ${CMAKE_CURRENT_SOURCE_DIR}/sqlite/sqlite3.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/sqlite/sqlite3.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/sqlite/sqlite3ext.h
+)
+
+# compile options.
+# check https://www.sqlite.org/compile.html,
+# "2. Recommended Compile-time Options".
+add_definitions(-DSQLITE_DQS=0)
+add_definitions(-DSQLITE_DEFAULT_MEMSTATUS=0)
+add_definitions(-DSQLITE_MAX_EXPR_DEPTH=0)
+add_definitions(-DSQLITE_OMIT_DECLTYPE)
+add_definitions(-DSQLITE_OMIT_DEPRECATED)
+add_definitions(-DSQLITE_OMIT_PROGRESS_CALLBACK)
+add_definitions(-DSQLITE_OMIT_SHARED_CACHE)
+add_definitions(-DSQLITE_USE_ALLOCA)
+
+# disable compile warnings.
+if(MSVC)
+    target_compile_options(sqlite3 PRIVATE /W1)
+else()
+    target_compile_options(sqlite3 PRIVATE -Wall -Wformat=0)
+endif()
+
+# install.
+install(FILES
+    "${SQLITE_INCLUDE_DIR}/sqlite/sqlite3.h"
+    "${SQLITE_INCLUDE_DIR}/sqlite/sqlite3ext.h"
+    DESTINATION include/sqlite
+)
+install(TARGETS sqlite3 LIBRARY DESTINATION lib ARCHIVE DESTINATION lib)
diff --git a/include/gaenari/gaenari-config-base.h b/include/gaenari/gaenari-config-base.h
new file mode 100644
index 0000000..821e01a
--- /dev/null
+++ b/include/gaenari/gaenari-config-base.h
@@ -0,0 +1,14 @@
+#ifndef HEADER_GAENARI_CONFIG_BASE_H
+#define HEADER_GAENARI_CONFIG_BASE_H
+
+// normally, gaenari-config.h, which is configured by cmake, is used.
+// this header file defines default values to avoid compilation errors when used without cmake.
+
+// the version is only meaningful through cmake.
+#define GAENARI_VERSION_MAJOR 0
+#define GAENARI_VERSION_MINOR 0
+#define GAENARI_VERSION_PATCH 0
+#define GAENARI_VERSION_TWEAK 0
+// stringify helper. (assumption: added here because without it, GAENARI_VERSION would not expand in cmake-less builds.)
+#define __QuOtE__(name) #name
+#define GAENARI_VERSION __QuOtE__(0.0.0)
+
+#endif // HEADER_GAENARI_CONFIG_BASE_H
diff --git a/include/gaenari/gaenari-config.h.in b/include/gaenari/gaenari-config.h.in
new file mode 100644
index 0000000..3a25729
--- /dev/null
+++ b/include/gaenari/gaenari-config.h.in
@@ -0,0 +1,13 @@
+#ifndef HEADER_GAENARI_CONFIG_H
+#define HEADER_GAENARI_CONFIG_H
+
+// this file(gaenari-config.h) is generated by cmake.
+
+#define GAENARI_VERSION_MAJOR @gaenari_VERSION_MAJOR@
+#define GAENARI_VERSION_MINOR @gaenari_VERSION_MINOR@
+#define GAENARI_VERSION_PATCH @gaenari_VERSION_PATCH@
+#define GAENARI_VERSION_TWEAK @gaenari_VERSION_TWEAK@
+#define __QuOtE__(name) #name
+#define GAENARI_VERSION __QuOtE__(@gaenari_VERSION@)
+
+#endif // HEADER_GAENARI_CONFIG_H
diff --git a/include/gaenari/gaenari.hpp b/include/gaenari/gaenari.hpp
new file mode 100644
index 0000000..5450468
--- /dev/null
+++ b/include/gaenari/gaenari.hpp
@@ -0,0 +1,21 @@
+#ifndef HEADER_GAENARI_HPP
+#define HEADER_GAENARI_HPP
+
+// configuration.
+// __has_include: c++17 standard.
+#if __has_include("gaenari-config.h")
+#include "gaenari-config.h"
+#else
+// no config header file.
+// it is recommended to build with `cmake ..`.
+#pragma message ("[WARNING] compiling without cmake.")
+#include "gaenari-config-base.h"
+#endif
+
+// include all gaenari header files.
+#include "gaenari/gaenari.hpp"
+
+// include all supul header files.
+#include "supul/supul.hpp"
+
+#endif // HEADER_GAENARI_HPP
diff --git a/include/gaenari/gaenari/common/cache.hpp b/include/gaenari/gaenari/common/cache.hpp
new file mode 100644
index 0000000..cef2bd1
--- /dev/null
+++ b/include/gaenari/gaenari/common/cache.hpp
@@ -0,0 +1,183 @@
+#ifndef HEADER_GAENARI_GAENARI_COMMON_CACHE_HPP
+#define HEADER_GAENARI_GAENARI_COMMON_CACHE_HPP
+
+namespace gaenari {
+namespace common {
+
+// supports memory-based (key, value) set/get.
+// stores the count of get(key) calls,
+// and when the number of keys reaches capacity, some cache entries are deleted:
+// entries whose count ranks below `survive_size` are deleted,
+// and the count value of the remaining items is reset to zero.
+//
+// K : key type
+// V : value type
+// C : count type (default int)
+template <typename K, typename V, typename C = int>
+class cache {
+public:
+    cache() = delete;
+    inline cache(_in size_t capacity, _in size_t survive_size) {
+        if ((survive_size >= capacity) or (capacity < 4)) THROW_GAENARI_INTERNAL_ERROR0;
+        this->capacity     = static_cast<C>(capacity);
+        this->survive_size = static_cast<C>(survive_size);
+    }
+    ~cache() = default;
+
+public:
+    using callback_set = std::function<void(_in const K&, _out V&)>;
+
+public:
+    // get a copy of the value for the target key.
+    // if not found, calls cb to get the value.
+    // the internal 'set' checks the capacity and deletes items if necessary.
+    // ex)
+    //   auto v = cache.get("key", [](_in auto& k, _out auto& v) {
+    //     // takes a long time.
+    //     v = "value";
+    //   });
+    inline const V get(_in const K& k, _in callback_set cb) {
+        // mutex lock.
+        std::lock_guard l(mutex);
+
+        // return a copy.
+        return _get(k, cb);
+    }
+
+    // get a reference to the value for the target key.
+    // if not found, calls cb to get the value.
+    // the internal 'set' checks the capacity and deletes items if necessary.
+    // ex)
+    //   auto& v = cache.get_ref("key", [](_in auto& k, _out auto& v) {
+    //     // takes a long time.
+    //     v = "value";
+    //   });
+    //
+    // be careful when you take the return value as a reference.
+    // if necessary, use get_mutex() to lock one more time.
+    inline const V& get_ref(_in const K& k, _in callback_set cb) {
+        // mutex lock.
+        std::lock_guard l(mutex);
+
+        // return a reference.
+        return _get(k, cb);
+    }
+
+    // to pre-lock at the caller.
+    // this is for more secure transactions.
+    inline std::recursive_mutex& get_mutex(void) {
+        return mutex;
+    }
+
+    // get mutable cache items.
+    // call after locking.
+    inline auto& get_items(void) {
+        return items;
+    }
+
+    // erase.
+    inline void erase(_in const K& k) {
+        // mutex lock.
+        std::lock_guard l(mutex);
+        items.erase(k);
+    }
+
+    // clear.
+    inline void clear(void) {
+        // mutex lock.
+        std::lock_guard l(mutex);
+        items.clear();
+    }
+
+protected:
+    inline const V& _get(_in const K& k, _in callback_set cb) {
+        // find key.
+        auto find = items.find(k);
+        if (find != items.end()) {
+            // found!
+            auto& p = find->second;
+
+            // add count.
+            p.second++;
+
+            // return value.
+            return p.first;
+        }
+
+        // not found.
+        // call the callback to get the value.
+        V v;
+        (cb)(k, v);
+
+        C size = static_cast<C>(items.size());
+        if (size < capacity) {
+            // can afford it.
+            items[k] = std::make_pair(std::move(v), 1);
+            return items[k].first;
+        }
+
+        // survive.
+        std::vector<C> counts;
+        for (const auto& item: items) {
+            const auto& p = item.second;
+            counts.push_back(p.second);
+        }
+        std::sort(counts.begin(), counts.end(), std::greater<C>());
+        C survive = 0;
+        C acc = 0;
+        for (auto count: counts) {
+            if (acc + count <= survive_size) {
+                survive = count;
+                acc += count;
+                continue;
+            }
+
+            break;
+        }
+
+        // survive run.
+        for (auto it = items.begin(); it != items.end(); ) {
+            auto& p = it->second;
+            auto& c = p.second;
+            if (c >= survive) {
+                // survived!
+                c = 0; // survived, but the count is reset to zero.
+                ++it;
+            } else {
+                // died!
+                items.erase(it++);
+            }
+        }
+
+        // log a warning.
+        gaenari::logger::warn("cache refreshed.");
+
+        // set and return the value.
+        items[k] = std::make_pair(std::move(v), 1);
+        return items[k].first;
+    }
+
+protected:
+    // cache data.
+    // (key, (value, count)) map.
+    std::unordered_map<K, std::pair<V, C>> items;
+    std::recursive_mutex mutex;
+    C capacity = 0;
+    C survive_size = 0;
+};
+
+// ex)
+// cache<std::string, std::string> c(5, 2); -> capacity 5, survive 2.
+// c.get("aaa", [](auto& k, auto& v) {v = "111";}); -> callback called.     ("aaa", ("111", 1))
+// c.get("aaa", [](auto& k, auto& v) {v = "111";}); -> callback not called. ("aaa", ("111", 2))
+// c.get("bbb", [](auto& k, auto& v) {v = "222";}); -> callback called.     ("aaa", ("111", 2)), ("bbb", ("222", 1))
+// c.get("ccc", [](auto& k, auto& v) {v = "333";}); -> callback called.
+// c.get("ddd", [](auto& k, auto& v) {v = "444";}); -> callback called.
+// c.get("eee", [](auto& k, auto& v) {v = "555";}); -> callback called.
+// c.get("fff", [](auto& k, auto& v) {v = "666";}); -> callback called. survive run. ("aaa", ("111", 0)), ("fff", ("666", 1))
+// c.get("ggg", [](auto& k, auto& v) {v = "777";});
+
+} // common
+} // gaenari
+
+#endif // HEADER_GAENARI_GAENARI_COMMON_CACHE_HPP
diff --git a/include/gaenari/gaenari/common/insert_order_map.hpp b/include/gaenari/gaenari/common/insert_order_map.hpp
new file mode 100644
index 0000000..1a43434
--- /dev/null
+++ b/include/gaenari/gaenari/common/insert_order_map.hpp
@@ -0,0 +1,259 @@
+#ifndef HEADER_GAENARI_GAENARI_COMMON_INSERT_ORDER_MAP_HPP
+#define HEADER_GAENARI_GAENARI_COMMON_INSERT_ORDER_MAP_HPP
+
+namespace gaenari {
+namespace common {
+
+// map with insertion-order iteration.
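+//
+// ex)
+//   insert_order_map<std::string, int> m;
+//   m["bbb"] = 1;
+//   m["aaa"] = 2;
+//   // iteration visits "bbb" before "aaa": insertion order, not key order.
+//   for (const auto& it: m) { /* it.first, it.second */ }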
+template <typename key_t, typename value_t, typename countable_t = size_t>
+struct insert_order_map {
+    using key_type    = key_t;
+    using mapped_type = value_t;
+protected:
+    // forward declaration.
+    struct comp;
+
+    // internal variables.
+    // - m(map with value)
+    // - o(map with order)
+    // - c(comparator of m)
+    // - i(insertion count)
+    std::map<key_t, value_t, comp>* m = nullptr;
+    std::map<std::reference_wrapper<const key_t>, countable_t, std::less<const key_t>> o;
+    comp c;
+    countable_t i = 0;
+
+public:
+    inline insert_order_map() {
+        // build a new m with comparator c.
+        // comparator c can link to this.
+        c.parent = this;
+        m = new std::map<key_t, value_t, comp>(c);
+    }
+
+    // insert_order_map m{{k1,v1}, {k2,v2}, ...}
+    inline insert_order_map(const std::initializer_list<std::pair<key_t, value_t>>& v) {
+        c.parent = this;
+        m = new std::map<key_t, value_t, comp>(c);
+
+        // hard-copy.
+        for (const auto& it: v) {
+            (*this)[it.first] = it.second;
+        }
+    }
+
+    inline insert_order_map(const insert_order_map& s) {
+        // do assign.
+        c.parent = this;
+        m = new std::map<key_t, value_t, comp>(c);
+        *this = s;
+    }
+
+    inline ~insert_order_map() {
+        if (m) delete m;
+        m = nullptr;
+    }
+
+    // used when accessing m from outside.
+    // you can call some methods of std::map (read-only).
+    // reference) auto& d = m.get_map();
+    // copy)      auto d = m.get_map();
+    inline const auto& get_map(void) const {
+        return *m;
+    }
+
+    // assign.
+    // map = map.
+    inline insert_order_map& operator=(const insert_order_map& s) {
+        // clear.
+        this->clear();
+
+        // hard-copy.
+        // if `*this->m = *s->m` is used, the comparator in the map is also copied,
+        // and then inserts are routed incorrectly.
+        // so do a hard-copy.
+        for (const auto& it: (*s.m)) {
+            (*this)[it.first] = it.second;
+        }
+
+        // count copy.
+        this->i = s.i;
+
+        return *this;
+    }
+
+    // assign.
+    // map = {{...}}
+    inline insert_order_map& operator=(const std::initializer_list<std::pair<key_t, value_t>>& v) {
+        // clear.
+        this->clear();
+
+        // hard-copy.
+        for (const auto& it: v) {
+            (*this)[it.first] = it.second;
+        }
+
+        return *this;
+    }
+
+    // move.
+    inline insert_order_map& operator=(insert_order_map&& s) noexcept {
+        // a perfect `move` is not supported (a hard-copy is done instead).
+        // if `*this->m = std::move(*s.m);` is used, the map pointer in the comparator (=comp::parent) is also copied,
+        // and then insertions are routed incorrectly.
+        // so do a hard-copy.
+        //
+        // is there a way to dynamically change the map's comparator contents?
+        // if someone finds a way, we can implement a more perfect move.
+
+        // just call `assign` (hard-copy).
+        (*this) = s;
+
+        // count copy.
+        this->i = s.i;
+
+        // after assign, clear s.
+        s.clear();
+
+        return *this;
+    }
+
+    // iterators.
+    // warning)
+    // since the map must always be managed together with o,
+    // do not use iterators to change the structure.
+    using iterator       = typename std::map<key_t, value_t, comp>::iterator;
+    using const_iterator = typename std::map<key_t, value_t, comp>::const_iterator;
+    inline auto begin(void)        { return this->m->begin(); }
+    inline auto end(void)          { return this->m->end(); }
+    inline auto begin(void) const  { return this->m->cbegin(); }
+    inline auto end(void) const    { return this->m->cend(); }
+    inline auto cbegin(void)       { return this->m->cbegin(); }
+    inline auto cend(void)         { return this->m->cend(); }
+    inline auto rbegin(void)       { return this->m->rbegin(); }
+    inline auto rend(void)         { return this->m->rend(); }
+    inline auto rbegin(void) const { return this->m->rbegin(); }
+    inline auto rend(void) const   { return this->m->rend(); }
+    inline auto crbegin(void)      { return this->m->crbegin(); }
+    inline auto crend(void)        { return this->m->crend(); }
+
+    // map api.
+
+    // clear.
+    inline void clear(void) {
+        m->clear();
+        o.clear();
+        i = 0;
+    }
+
+    // empty.
+    inline bool empty(void) const {
+        return m->empty();
+    }
+
+    // access insert_order_map[k].
+    inline value_t& operator[](key_t const& k) {
+        const auto [it, inserted] = (*m).insert({k, value_t()});
+        if (not inserted) {
+            // k existed. `it` is the found iterator.
+            return it->second;
+        }
+
+        // not found. `it` is the inserted iterator.
+        // set the key's order.
+        // if o.count() were used instead of i++, it may be duplicated in case of erase.
+        // use unique incrementing values as the count.
+        o[it->first] = static_cast<countable_t>(i++);
+        return it->second;
+    }
+
+    // insert (only one typical overload is defined).
+    // the logic is the same as assign.
+    inline std::pair<iterator, bool> insert(std::pair<key_t, value_t> p) {
+        auto ret = (*m).insert(p);
+        if (not ret.second) return ret;
+        o[ret.first->first] = static_cast<countable_t>(i++);
+        return ret;
+    }
+
+    inline iterator find(const key_t& k) {
+        return m->find(k);
+    }
+
+    inline const_iterator find(const key_t& k) const {
+        return m->find(k);
+    }
+
+    // erase one key.
+    inline size_t erase(const key_t& k) {
+        auto find = o.find(k);
+        size_t ret = m->erase(k);
+        if (find != o.end()) {
+            o.erase(find);
+        }
+        return ret;
+    }
+
+    // size.
+    inline size_t size(void) const {
+        return m->size();
+    }
+
+    // count of a key (0 or 1, as in std::map).
+    inline size_t count(const key_t& k) const {
+        return m->count(k);
+    }
+
+    // bonus 1.
+    // return the order by key.
+    // if the key is not found, return {} (std::nullopt).
+    inline std::optional<countable_t> order(const key_t& k) const {
+        auto find = o.find(k);
+        if (find == o.end()) return {};
+        return find->second;
+    }
+
+    // bonus 2.
+    // return the key by order.
+    // if the order is out-of-range, return {} (std::nullopt).
+    inline std::optional<std::reference_wrapper<const key_t>> key(size_t order) const {
+        // `o` is not a vector, so this is O(n), not O(1).
+        for (const auto& i: o) {
+            auto& k = i.first;
+            auto& v = i.second;
+            if (v == order) return k;
+        }
+        return {};
+    }
+
+    // comparator.
+protected:
+    struct comp {
+        // the comparator can link to its parent.
+        insert_order_map* parent = nullptr;
+
+        // determines the order of keys.
+        // usually `return a < b` is used.
+        // here, it is determined by the insertion order of the key, not the value of the key.
+        inline bool operator() (const key_t& a, const key_t& b) const {
+            // default to the maximum index value.
+            size_t index_a = std::numeric_limits<size_t>::max();
+            size_t index_b = std::numeric_limits<size_t>::max();
+
+            // find the order of a and b.
+            // if the key has not been inserted yet, the default (maximum) remains.
+            auto find_a = (parent)->o.find(a);
+            auto find_b = (parent)->o.find(b);
+            if (find_a != (parent)->o.end()) index_a = find_a->second;
+            if (find_b != (parent)->o.end()) index_b = find_b->second;
+
+            // ordered by insertion order, not by the value of the key.
+            return index_a < index_b;
+        }
+    };
+};
+
+} // namespace common
+} // namespace gaenari
+
+#endif // HEADER_GAENARI_GAENARI_COMMON_INSERT_ORDER_MAP_HPP
diff --git a/include/gaenari/gaenari/common/json.hpp b/include/gaenari/gaenari/common/json.hpp
new file mode 100644
index 0000000..84a7a30
--- /dev/null
+++ b/include/gaenari/gaenari/common/json.hpp
@@ -0,0 +1,2213 @@
+#ifndef HEADER_JSON_HPP
+#define HEADER_JSON_HPP
+
+namespace gaenari {
+namespace common {
+
+// forward definition.
+template