diff --git a/.gitignore b/.gitignore
index 1025392fd7..1f909369da 100644
--- a/.gitignore
+++ b/.gitignore
@@ -29,3 +29,4 @@ test/samples/
 # Sphinx and Doxygen Doc-Site
 doc/_build/*
 doc/en/docs/model_zoo/
+cmake-build-debug/*
diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/Dockerfile b/examples/model_selection/TRAILS-Database-Native-Model-Selection/Dockerfile
deleted file mode 100644
index 35cd7512e0..0000000000
--- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/Dockerfile
+++ /dev/null
@@ -1,79 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-FROM ubuntu:20.04
-
-ENV DEBIAN_FRONTEND=noninteractive
-
-# Install Python, Vim, and necessary libraries
-RUN apt-get update && \
-    apt-get install -y software-properties-common wget gnupg2 lsb-release git sudo && \
-    add-apt-repository ppa:deadsnakes/ppa && \
-    apt-get install -y python3.6 python3-pip vim && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-# Install necessary dependencies for PostgreSQL and Rust
-RUN apt-get update && \
-    apt-get install -y pkg-config libssl-dev libpq-dev libclang-dev curl && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-# Install necessary dependencies for pgrx
-RUN apt-get update && \
-    apt-get install -y bison flex libreadline-dev && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-# Create the postgres user
-USER root
-RUN adduser --disabled-password --gecos "" postgres && \
-    mkdir /project && \
-    adduser postgres sudo && \
-    chown -R postgres:postgres /project
-
-# Add PostgreSQL's repository
-RUN wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add - \
-    && sh -c 'echo "deb http://apt.postgresql.org/pub/repos/apt $(. /etc/os-release; echo $VERSION_CODENAME)-pgdg main" > /etc/apt/sources.list.d/pgdg.list'
-
-# Switch to the postgres user, install Rust, and initialize cargo
-USER postgres
-RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y && \
-    echo 'source $HOME/.cargo/env' >> $HOME/.bashrc && \
-    /bin/bash -c "source $HOME/.cargo/env && cargo install cargo-pgrx --version '0.9.7' --locked" && \
-    /bin/bash -c "source $HOME/.cargo/env && cargo pgrx init"
-
-# Set environment variables for Rust and Python
-ENV PATH="/root/.cargo/bin:${PATH}"
-ENV PYTHONPATH="${PYTHONPATH}:/project/Trails/internal/ml/model_selection"
-
-# ARG CACHEBUST=1 forces the following commands to be re-executed on each update.
-ARG CACHEBUST=1
-
-# Clone the code and install dependencies
-WORKDIR /project
-RUN git clone -b dev-postgresql https://github.com/apache/singa.git && \
-    cd ./singa/examples/model_selection/TRAILS-Database-Native-Model-Selection && \
-    pip install -r requirement.txt
-
-
-WORKDIR /project
-RUN chmod +x ./singa/examples/model_selection/init.sh
-
-# Set the entry point to your script
-ENTRYPOINT ["/project/singa/examples/model_selection/TRAILS-Database-Native-Model-Selection/init.sh"]
diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/documents/dev_guide.md b/examples/model_selection/TRAILS-Database-Native-Model-Selection/documents/dev_guide.md
deleted file mode 100644
index 3b0927a837..0000000000
--- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/documents/dev_guide.md
+++ /dev/null
@@ -1,251 +0,0 @@
-
-
-# Change the permission
-
-```bash
-chmod -R 777 internal/pg_extension
-chmod -R 777 TRAILS
-```
-
-# PSQL CMD
-
-```sql
-psql -h localhost -p 28814 -U postgres
-\c frappe
-\dt
-\d frappe_train
-DROP TABLE frappe_train;
-SELECT * FROM frappe_train LIMIT 10;
-SELECT * FROM frappe_test LIMIT 10;
-SELECT * FROM frappe_valid LIMIT 10;
-DROP DATABASE frappe;
-psql -U postgres
-```
-
-# Build and run the container
-
-```bash
-docker build -t trails .
-
-docker run -d --name trails \
-    --network="host" \
-    -v $(pwd)/TRAILS:/project/TRAILS \
-    -v /hdd1/xingnaili/exp_data/:/project/exp_data \
-    trails
-
-docker exec -it trails bash
-```
-
-# This is in the docker image already
-
-```bash
-# if these are already in the docker image, skip them.
-cargo install --locked cargo-pgrx
-# run after a package update
-cargo pgrx init
-cargo pgrx new my_extension
-# just run this after code updates.
-cargo pgrx run
-```
-
-# Develop
-
-## Load data into the database
-
-```bash
-bash /project/TRAILS/internal/ml/model_selection/scripts/database/load_data_to_db.sh /project/exp_data/data/structure_data/frappe frappe
-bash /project/TRAILS/internal/ml/model_selection/scripts/database/load_data_to_db.sh /project/exp_data/data/structure_data/uci_diabetes uci_diabetes
-bash /project/TRAILS/internal/ml/model_selection/scripts/database/load_data_to_db.sh /project/exp_data/data/structure_data/criteo_full criteo
-```
-
-## 1. Compile
-
-In shell
-
-```bash
-cd ./internal/pg_extension/
-cargo clean
-rm -r /home/postgres/.pgrx/14.9/pgrx-install/lib/pg_extension.so
-cargo pgrx run
-rm /home/postgres/.pgrx/14.9/pgrx-install/share/extension/pg_extension--0.1.0.sql
-vi /home/postgres/.pgrx/14.9/pgrx-install/share/extension/pg_extension--0.1.0.sql
-# paste the latest SQL statements, then generate the schema
-cargo pgrx schema >> /home/postgres/.pgrx/14.9/pgrx-install/share/extension/pg_extension--0.1.0.sql
-```
-
-In SQL
-
-```sql
-DROP EXTENSION IF EXISTS pg_extension;
-CREATE EXTENSION pg_extension;
-```
-
-## 2. Edit the config file
-
-Update `nfield` in the `config.ini` file; it equals the number of feature columns used, i.e., the columns excluding the label. E.g., `ARRAY['col1', 'col2', 'col3', 'label']` => `nfield` = 3.
-
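For instance, a minimal sketch of this rule in Python; the column array is the example above, and the assumption that `nfield` lives in the `DEFAULT` section of `config.ini` is illustrative, not a repository guarantee:

```python
from configparser import ConfigParser

# Columns passed to the UDF; the last entry is the label (illustrative example).
columns = ['col1', 'col2', 'col3', 'label']

# nfield counts only the feature columns, i.e., everything except the label.
nfield = len(columns) - 1  # => 3

# Write it back; the section/key layout of config.ini is assumed here.
config = ConfigParser()
config.read('config.ini')
config.set('DEFAULT', 'nfield', str(nfield))
with open('config.ini', 'w') as f:
    config.write(f)
```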
-## 3. Run it
-
-```sql
-CREATE EXTENSION pg_extension;
-
-# Test if the UDF is there or not
-SELECT * FROM pg_proc WHERE proname = 'model_selection_workloads';
-
-# micro
-select benchmark_filtering_phase_latency(4, '/project/TRAILS/internal/ml/model_selection/config.ini');
-
-select benchmark_filtering_latency_in_db(5000, 'frappe', '/project/TRAILS/internal/ml/model_selection/config.ini');
-
-select benchmark_filtering_latency_in_db(5000, 'uci_diabetes', '/project/TRAILS/internal/ml/model_selection/config.ini');
-
-select benchmark_filtering_latency_in_db(4, 'criteo', '/project/TRAILS/internal/ml/model_selection/config.ini');
-
-# Test the coordinator
-SELECT coordinator('0.08244', '168.830156', '800', false, '/project/TRAILS/internal/ml/model_selection/config.ini');
-
-# arguments: database name, columns used, time budget, batch size, and config file
-CALL model_selection_sp('dummy', ARRAY['col1', 'col2', 'col3', 'label'], '30', 32, '/project/TRAILS/internal/ml/model_selection/config.ini');
-
-# end-to-end model selection
-CALL model_selection_end2end('dummy', ARRAY['col1', 'col2', 'col3', 'label'], '15', '/project/TRAILS/internal/ml/model_selection/config.ini');
-
-# filtering & refinement with workloads
-CALL model_selection_workloads('dummy', ARRAY['col1', 'col2', 'col3', 'label'], 300, 3, '/project/TRAILS/internal/ml/model_selection/config.ini');
-
-response = requests.post(args.refinement_url, json=data).json()
-
-```
-
-# Test that the pg_extension works using plpython3u
-
-```sql
-# switch to the postgres user
-su postgres
-
-CREATE EXTENSION plpython3u;
-
-CREATE FUNCTION py_version() RETURNS text AS $$
-import sys
-return sys.version
-$$ LANGUAGE plpython3u;
-
-SELECT py_version();
-
-CREATE OR REPLACE FUNCTION test_numpy()
-  RETURNS text
-LANGUAGE plpython3u
-AS $$
-import numpy
-import torch
-import sklearn
-import torchvision
-import tqdm
-print("asdf")
-return str(numpy.__version__) + " torch: " + str(torch.__version__)
-$$;
-
-SELECT test_numpy();
-
-CREATE EXTENSION my_extension;
-SELECT hello_my_extension();
-```
-
-# Container log
-
-Each line in your output represents a different process that is currently running on your PostgreSQL server. Here's what each one is doing:
-
-1. `/bin/sh -c service postgresql start && tail -F /var/log/postgresql/postgresq` : This is the command that was used to start your PostgreSQL server. It also includes a command to continuously display new entries from the PostgreSQL log file.
-
-2. `/usr/lib/postgresql/14/bin/postgres -D /var/lib/postgresql/14/main -c config` : This is the main PostgreSQL process. All other PostgreSQL processes are children of this process.
-
-3. `postgres: 14/main: checkpointer` : The checkpointer process is responsible for making sure data changes get saved to disk regularly. This is important for database recovery in case of a crash.
-
-4. `postgres: 14/main: background writer` : The background writer process is responsible for writing buffers to disk when they become dirty. This reduces the amount of work that needs to be done when a buffer is reused.
-
-5. `postgres: 14/main: walwriter` : The walwriter process writes transaction logs (Write-Ahead Logs, or WAL) to disk. This is also important for database recovery and replication.
-
-6. `postgres: 14/main: autovacuum launcher` : The autovacuum launcher process starts autovacuum worker processes as needed. These processes automatically clean up and optimize the database.
-
-7. `postgres: 14/main: stats collector` : The stats collector process collects statistics about the server's activity. This information can be viewed using the `pg_stat` family of system views.
-
-8. `postgres: 14/main: logical replication launcher` : The logical replication launcher manages the worker processes that perform logical replication, copying data changes to other databases.
-
-9. `tail -F /var/log/postgresql/postgresql-14-main.log` : This process is displaying the end of the PostgreSQL log file and updating as more entries are added.
-
-10. `bash` : These are shell sessions, likely interactive ones you've started.
-
-11. `/usr/lib/postgresql/14/bin/psql -h localhost -p 28814 pg_extension` : These are instances of the psql command-line interface, connected to your database.
-
-12. `postgres: postgres pg_extension 127.0.0.1(52236) CALL` : This is your currently running stored procedure.
-
-13. `ps aux` : This is the command you ran to display the list of processes.
-
-Each process is part of the PostgreSQL database system and helps it to run efficiently and robustly.
-
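The same process list can also be inspected from inside the database; below is a minimal sketch using psycopg2 (which this project already uses for out-of-DB data loading). The connection parameters follow the psql examples above and may need adjusting:

```python
import psycopg2

# Connection parameters follow the psql examples in this guide; adjust as needed.
conn = psycopg2.connect(host="localhost", port=28814,
                        user="postgres", dbname="pg_extension")
with conn.cursor() as cur:
    # pg_stat_activity has one row per server process, much like `ps aux`.
    cur.execute("SELECT pid, state, query FROM pg_stat_activity;")
    for pid, state, query in cur.fetchall():
        print(pid, state, query)
conn.close()
```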
-# macOS (local)
-
-```bash
-conda activate firmest38
-export PYTHON_SYS_EXECUTABLE=/Users/kevin/opt/anaconda3/envs/firmest38/bin/python
-export DYLD_LIBRARY_PATH=/Users/kevin/opt/anaconda3/envs/firmest38/lib/:$DYLD_LIBRARY_PATH
-cargo run --features python
-```
-
-# What does `cargo run` do?
-
-Before:
-
-```
-postgres     1  0.1  0.0   2612   588 ?      Ss   14:30  0:00 /bin/sh -c service postgresql start && tail -F /var/log/postgresql/postgresql-14-main.log
-postgres    20  0.1  0.0 214688 29332 ?      Ss   14:30  0:00 /usr/lib/postgresql/14/bin/postgres -D /var/lib/postgresql/14/main -c config_file=/etc/postgresql/14/main/postgresql.conf
-postgres    22  0.0  0.0 214688  6120 ?      Ss   14:30  0:00 postgres: 14/main: checkpointer
-postgres    23  0.0  0.0 214688  6084 ?      Ss   14:30  0:00 postgres: 14/main: background writer
-postgres    24  0.0  0.0 214688 10352 ?      Ss   14:30  0:00 postgres: 14/main: walwriter
-postgres    25  0.0  0.0 215224  8864 ?      Ss   14:30  0:00 postgres: 14/main: autovacuum launcher
-postgres    26  0.0  0.0  69280  5184 ?      Ss   14:30  0:00 postgres: 14/main: stats collector
-postgres    27  0.0  0.0 215236  6972 ?      Ss   14:30  0:00 postgres: 14/main: logical replication launcher
-postgres    38  0.0  0.0   2548   512 ?      S    14:30  0:00 tail -F /var/log/postgresql/postgresql-14-main.log
-postgres    39  0.1  0.0   4112  3424 pts/0  Ss+  14:30  0:00 bash
-postgres    48  0.1  0.0   4112  3424 pts/1  Ss   14:30  0:00 bash
-postgres    59  0.0  0.0   5896  2860 pts/1  R+   14:30  0:00 ps aux
-```
-
-After:
-
-
-
-
diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/documents/image-20231020174425377.png b/examples/model_selection/TRAILS-Database-Native-Model-Selection/documents/image-20231020174425377.png
deleted file mode 100644
index 9e73b270d9..0000000000
Binary files a/examples/model_selection/TRAILS-Database-Native-Model-Selection/documents/image-20231020174425377.png and /dev/null differ
diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/documents/image-20231020174945226.png b/examples/model_selection/TRAILS-Database-Native-Model-Selection/documents/image-20231020174945226.png
deleted file mode 100644
index d7b686d2ba..0000000000
Binary files a/examples/model_selection/TRAILS-Database-Native-Model-Selection/documents/image-20231020174945226.png and /dev/null differ
diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/init.sh b/examples/model_selection/TRAILS-Database-Native-Model-Selection/init.sh
deleted file mode 100644
index 3909adae95..0000000000
--- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/init.sh
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/bin/bash
-
-# These commands are triggered after `docker run`.
-
-# Compile the code and run PostgreSQL
-cd /project/singa/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension || exit
-/bin/bash -c "source $HOME/.cargo/env && echo '\q' | cargo pgrx run --release"
-
-# Wait for PostgreSQL to become available
-until psql -h localhost -p 28814 -U postgres -d pg_extension -c '\q'; do
-    >&2 echo "Postgres is unavailable - sleeping"
-    sleep 1
-done
-
-# Run setup commands
-psql -h localhost -p 28814 -U postgres -d pg_extension -c "CREATE EXTENSION pg_extension;"
-psql -h localhost -p 28814 -U postgres -d pg_extension -f /project/singa/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/sql/model_selection_cpu.sql
-# Load the example dataset into the database
-bash /project/singa/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/TRAILS-Database-Native-Model-Selection/scripts/database/load_data_to_db.sh /project/singa/examples/model_selection/TRAILS-Database-Native-Model-Selection/dataset/frappe frappe
-
-echo "Done!"
-
-# Keep the container alive
-tail -f /dev/null
diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/README.md b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/README.md
deleted file mode 100644
index 3025139f17..0000000000
--- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/README.md
+++ /dev/null
@@ -1,290 +0,0 @@
-
-
-# TRAILS: A Database Native Model Selection System
-
-![image-20230702035806963](documents/imgs/image-20230702035806963.png)
-
-[TOC]
-
-# Config Environments
-
-```bash
-# Create a virtual env
-conda config --set ssl_verify false
-conda create -n "trails" python=3.8.10
-conda activate trails
-pip install -r requirement.txt
-
-cd TRAILS
-
-# make a dir to store all results.
-mkdir ../exp_data
-```
-
-# Reproduce the results
-
-## NAS-Bench-Tabular
-
-NAS-Bench-Tabular can either be **downloaded** or built from scratch.
-
-### Download NAS-Bench-Tabular
-
-1. **Download** the dataset using the following link, and extract it to `exp_data`
-
-```bash
-https://drive.google.com/file/d/1TGii9ymbmX81c9-GKWXbe_4Z64R8Btz1/view?usp=sharing
-```
-
-### Build NAS-Bench-Tabular
-
-2. Build the **NAS-Bench-Tabular** from scratch
-
-```bash
-# Construct NAS-Bench-Tabular:
-## 1. Training all models.
-bash internal/ml/model_selection/scripts/nas-bench-tabular/train_all_models_frappe.sh
-bash internal/ml/model_selection/scripts/nas-bench-tabular/train_all_models_diabetes.sh
-bash internal/ml/model_selection/scripts/nas-bench-tabular/train_all_models_criteo.sh
-
-## 2. Scoring all models using all TFMEMs.
-bash internal/ml/model_selection/scripts/nas-bench-tabular/score_all_modesl_frappe.sh
-bash internal/ml/model_selection/scripts/nas-bench-tabular/score_all_modesl_uci.sh
-bash internal/ml/model_selection/scripts/nas-bench-tabular/score_all_modesl_criteo.sh
-```
-
-3. Build the **NAS-Bench-Img** from scratch
-
-   To facilitate the experiments and query speed (the NASBENCH API is slow):
-
-   1. We retrieve all results from the NASBENCH API and store them as a JSON file.
-   2. We score all models in NB201 and 28K models in NB101.
-   3. We search with EA + Score and record the search process, in terms of
-      `run_id, current_explored_model, top_400 highest scored model, time_usage`,
-      to SQLite.
-
-```bash
-# 1. Record NASBENCH API data into a JSON file
-## This requires installing nats_bench: pip install nats_bench
-bash ./internal/ml/model_selection/scripts/nas-bench-img/convert_api_2_json.sh
-
-# 2. Scoring all models using all TFMEMs.
-nohup bash ./internal/ml/model_selection/scripts/nas-bench-img/score_all_models.sh &
-
-# 3. Explore with EA and the score results, and store the exploration process into SQLite
-bash ./internal/ml/model_selection/scripts/nas-bench-img/explore_all_models.sh
-
-# 4. Generate the baseline.
-bash ./internal/ml/model_selection/scripts/baseline_system_img.sh
-```
-
-The following experiments can then query filtering-phase results based on `run_id`.
-
-## SLO-Aware 2Phase-MS
-
-With the above **NAS-Bench-Tabular**, we can run various experiments.
-
-```bash
-# 1. Generate the results for drawing the figure
-## tabular data: training-based-ms
-bash internal/ml/model_selection/scripts/baseline_system_tab.sh
-## tabular data: training-free-ms, 2phase-ms
-nohup bash internal/ml/model_selection/scripts/anytime_tab.sh &
-## image data: training-based-ms, training-free-ms, 2phase-ms
-nohup bash internal/ml/model_selection/scripts/anytime_img_w_baseline.sh &
-
-# 2. Draw the figures
-python internal/ml/model_selection/exps/macro/anytime_tab_draw.py
-python internal/ml/model_selection/exps/macro/anytime_img_draw.py
-```
-
-![image-20230702035554579](documents/imgs/image-20230702035554579.png)
-
-## Micro: Benchmark TFMEMs
-
-```bash
-export PYTHONPATH=$PYTHONPATH:./internal/ml/model_selection
-conda activate trails
-python ./internal/ml/model_selection/exps/micro/benchmark_correlation.py
-```
-
-![image-20230421214835152](./documents/imgs/image-20230421214835152.png)
-
-## Micro: Benchmark Budget-Aware Algorithm
-
-```bash
-bash internal/ml/model_selection/scripts/micro_budget_aware_alg.sh
-```
-
-![image-20230724111659545](./documents/imgs/image-20230724111659545.png)
-
-## Micro: Benchmark N, K, U
-
-After ranking the models by their TFMEM scores in the filtering phase, we aim to determine (see the sketch below):
-
-1. Is it easier to find a good model by examining more models (**K**), each trained for fewer epochs (**U**), or by examining fewer models, each trained for more epochs?
-2. How many models should we explore (**N**), and how many should we keep (**K**)?
-
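The sketch below illustrates the N/K/U trade-off behind these questions; `score_model` and `train_u_epochs` are hypothetical stand-ins for the repository's TFMEM scoring and partial-training routines, not actual APIs:

```python
import random

def two_phase_selection(candidates, n, k, u, score_model, train_u_epochs):
    """Explore N models with a training-free score, then refine the top K
    with U epochs of training each; larger K or U trades breadth for depth."""
    # Phase 1 (filtering): score N randomly drawn candidates, no training.
    explored = random.sample(candidates, n)
    ranked = sorted(explored, key=score_model, reverse=True)

    # Phase 2 (refinement): briefly train only the top K and keep the best.
    top_k = ranked[:k]
    return max(top_k, key=lambda m: train_u_epochs(m, epochs=u))
```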
-```bash
-bash internal/ml/model_selection/scripts/micro_nku_tradeoff.sh
-```
-
-This is the experimental result on the UCI Diabetes dataset.
-Clearly, exploring more models in the refinement phase (a larger **K**) is more helpful for finding a better model.
-Although increasing **U** can identify a better model more accurately, it runs more training epochs, leading to a higher training cost.
-
-![image-20230722202555763](./documents/imgs/image-20230722202555763.png)
-
-We then fix **U=1** for cost efficiency and determine N/K for higher search effectiveness.
-Clearly, **N/K=100** yields better scheduling results on both the image and tabular datasets; thus, we set **N/K=100** in the coordinator.
-
-![image-20230724111325368](./documents/imgs/image-20230724111325368.png)
-
-![image-20230722205244718](./documents/imgs/image-20230722205244718.png)
-
-## Micro: Device Placement & Embedding Cache
-
-1. To measure the time usage of the filtering phase on various hardware, run the following
-
-   ```bash
-   # Without the embedding cache at the filtering phase
-   nohup bash internal/ml/model_selection/scripts/latency_phase1_cpu_gpu.sh &
-   # With the embedding cache at the filtering phase (faster)
-   nohup bash internal/ml/model_selection/scripts/latency_embedding_cache.sh &
-   # Draw the graphs
-   python ./internal/ml/model_selection/exps/micro/draw_filtering_latency.py
-   python ./internal/ml/model_selection/exps/micro/draw_filtering_memory_bar.py
-   python ./internal/ml/model_selection/exps/micro/draw_filtering_memory_line.py
-   python ./internal/ml/model_selection/exps/micro/draw_filtering_memory_cache_CPU.py
-   ```
-
-2. We further measure the end-to-end latency under CPU, GPU, and hybrid placement.
-
-   ```bash
-   nohup bash internal/ml/model_selection/scripts/latency_phase1_cpu_gpu.sh &
-   ```
-
-## Micro: In-DB vs Out-of-DB Filtering Phase
-
-```bash
-# run out-of-db, read data via psycopg2
-bash ./internal/ml/model_selection/scripts/latency_phase1_in_db.sh
-
-# run the in-db query, read data via SPI
-select benchmark_filtering_latency_in_db(5000, 'frappe', '/project/TRAILS/internal/ml/model_selection/config.ini');
-
-select benchmark_filtering_latency_in_db(5000, 'uci_diabetes', '/project/TRAILS/internal/ml/model_selection/config.ini');
-
-select benchmark_filtering_latency_in_db(5000, 'criteo', '/project/TRAILS/internal/ml/model_selection/config.ini');
-```
-
-## Micro: On-the-Fly Data Transmission, Refinement
-
-```bash
-# start the cache service
-python ./internal/cache-service/cache_service.py
-python ./internal/cache-service/trigger_cache_svc.py
-# consume from the cache-svc
-
-
-```
-
-## Reproduce Figure 7
-
-```bash
-python exps/main_v2/analysis/2.\ cost_draw.py
-python exps/main_v2/analysis/3.\ cost_train_based.py
-```
-
-![image-20230702035622198](documents/imgs/image-20230702035622198.png)
-
-## Reproduce Figure 8
-
-```bash
-# draw figure 8(a)
-python exps/main_v2/analysis/5.draw_IDMS_var_workloads.py
-# draw figure 8(b)
-python exps/main_v2/analysis/6.draw_IDMS_dataloading.py
-```
-
-![image-20230702035639502](documents/imgs/image-20230702035639502.png)
-
-# Baselines
-
-We compare with training-based MS, TabNAS, training-free MS, etc.
-
-For image data, the baselines are already generated in the NAS-Bench-Img part; see above.
-
-# Appendix
-
-Here, all experiments are on the Frappe dataset.
-
-1. Computational costs
-
-   ```bash
-   bash ./internal/ml/model_selection/exps/micro/resp/benchmark_cost.sh
-   ```
-
-2. Search cost, for multiple training-free or training-based combinations (warm-up / move proposal)
-
-   ```bash
-   # get RL, RE, RS + training-based model evaluation
-   bash ./internal/ml/model_selection/scripts/micro_search_strategy.sh
-   # this reads the previous file, runs warm-up/move proposal, and draws everything together
-   bash ./internal/ml/model_selection/exps/micro/resp/benchmark_search_cost.sh
-   ```
-
-3. How does K influence the result?
-
-   ```bash
-   python ./internal/ml/model_selection/exps/micro/resp/benchmark_k_fix_time.py
-   ```
-
-4. Noise in selecting the top-K models
-
-   ```bash
-   python ./internal/ml/model_selection/exps/micro/resp/benchmark_noisy_influence.py
-   ```
-
-5. Weight-sharing results
-
-   ```bash
-   nohup bash internal/ml/model_selection/scripts/benchmark_weight_sharing.sh &
-   ```
-
-# Run end-to-end model selection
-
-Download the dataset and put it in `exp_data/data/structure_data`.
-
-```
-python main.py --budget=100 --dataset=frappe
-```
-
-Check the logs in `logs_default`.
-
-![image-20230421220338391](./documents/imgs/image-20230421220338391.png)
-
-![image-20230421220443231](./documents/imgs/image-20230421220443231.png)
-
diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/documents/imgs/image-20230421214835152.png b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/documents/imgs/image-20230421214835152.png
deleted file mode 100644
index 06a86f9537..0000000000
Binary files a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/documents/imgs/image-20230702035622198.png and /dev/null differ diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/documents/imgs/image-20230702035639502.png b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/documents/imgs/image-20230702035639502.png deleted file mode 100644 index 0422e108b7..0000000000 Binary files a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/documents/imgs/image-20230702035639502.png and /dev/null differ diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/documents/imgs/image-20230702035806963.png b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/documents/imgs/image-20230702035806963.png deleted file mode 100644 index c33bee1859..0000000000 Binary files a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/documents/imgs/image-20230702035806963.png and /dev/null differ diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/documents/imgs/image-20230722202555763.png b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/documents/imgs/image-20230722202555763.png deleted file mode 100644 index 527d1eb848..0000000000 Binary files a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/documents/imgs/image-20230722202555763.png and /dev/null differ diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/documents/imgs/image-20230722205244718.png b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/documents/imgs/image-20230722205244718.png deleted file mode 100644 index bc1ae3af5d..0000000000 Binary files a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/documents/imgs/image-20230722205244718.png and /dev/null differ diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/documents/imgs/image-20230724111325368.png b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/documents/imgs/image-20230724111325368.png deleted file mode 100644 index 8637aaee7f..0000000000 Binary files a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/documents/imgs/image-20230724111325368.png and /dev/null differ diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/documents/imgs/image-20230724111659545.png b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/documents/imgs/image-20230724111659545.png deleted file mode 100644 index 2fb081d4e2..0000000000 Binary files a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/documents/imgs/image-20230724111659545.png and /dev/null differ diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/exps/draw_img_lib.py b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/exps/draw_img_lib.py deleted file mode 100644 index 7d4acb3e2e..0000000000 --- 
a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/exps/draw_img_lib.py
+++ /dev/null
@@ -1,724 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import os
-
-from matplotlib import pyplot as plt
-import seaborn as sns
-import numpy as np
-import palettable
-from matplotlib.ticker import MaxNLocator
-import numpy
-from src.common.constant import Config
-import matplotlib
-
-# lines' marker size
-set_marker_size = 15
-# points' marker size
-set_marker_point = 14
-# font size
-set_font_size = 40
-set_lgend_size = 15
-set_tick_size = 20
-
-frontinsidebox = 23
-
-# update tick size
-matplotlib.rc('xtick', labelsize=set_tick_size)
-matplotlib.rc('ytick', labelsize=set_tick_size)
-
-plt.rcParams['axes.labelsize'] = set_tick_size
-
-mark_list = ["o", "*", "<", "^", "s", "d", "D", ">", "h"]
-mark_size_list = [set_marker_size, set_marker_size + 1, set_marker_size + 1, set_marker_size,
-                  set_marker_size, set_marker_size, set_marker_size, set_marker_size + 1, set_marker_size + 2]
-line_shape_list = ['-.', '--', '-', ':']
-
-
-# this is for drawing figure 3 only
-def get_plot_compare_with_base_line_cfg(search_space, dataset, if_with_phase1=False):
-    if search_space == Config.NB201:
-        run_range_ = range(0, 100, 1)
-        if if_with_phase1:
-            draw_graph = draw_anytime_result_with_p1
-        else:
-            draw_graph = draw_anytime_result
-        # budgets in minutes; this is for plotting only
-        if dataset == Config.c10:
-            # C10 array
-            budget_array = [0.017, 0.083] + list(range(1, 350, 4))
-            sub_graph_y1 = [91, 94.5]
-            sub_graph_y2 = [53.5, 55]
-            sub_graph_split = 60
-        elif dataset == Config.c100:
-            # C100 array
-            budget_array = [0.017, 0.083] + list(range(1, 350, 4))
-
-            sub_graph_y1 = [64, 73.5]
-            sub_graph_y2 = [15, 16]
-            sub_graph_split = 20
-        else:
-            # ImgNet X array
-            budget_array = [0.017, 0.083] + list(range(1, 350, 4))
-            sub_graph_y1 = [33, 48]
-            sub_graph_y2 = [15.5, 17]
-            sub_graph_split = 34
-    else:
-        # this is NB101 + C10, because only NB101 has 20 runs; the others have 100 runs.
-        run_range_ = range(0, 20, 1)
-        if if_with_phase1:
-            draw_graph = draw_anytime_result_one_graph_with_p1
-            # budget_array = list(range(1, 16, 1))
-            budget_array = numpy.arange(0.02, 15, 0.02).tolist()
-        else:
-            draw_graph = draw_anytime_result_one_graph
-            budget_array = [0.017, 0.083] + list(range(1, 2000, 8))
-
-        if dataset == Config.c10:
-            # C10 array
-            # budget_array = list(range(0, 2000, 1))
-            sub_graph_y1 = [90, 94.5]
-            sub_graph_y2 = [52, 55]
-            sub_graph_split = 60
-        else:
-            raise Exception
-
-    return run_range_, budget_array, sub_graph_y1, sub_graph_y2, sub_graph_split, draw_graph
-
-
-def draw_anytime_result(result_dir, y_acc_list_arr, x_T_list,
-                        x_acc_train, y_acc_train_l, y_acc_train_m, y_acc_train_h,
-                        annotations, lv,
-                        name_img, dataset,
-                        x1_lim=[], x2_lim=[],
-                        ):
-    fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, dpi=100, gridspec_kw={'height_ratios': [4, 1]})
-    exp = np.array(y_acc_list_arr)
-    sys_acc_h = np.quantile(exp, .75, axis=0)
-    sys_acc_m = np.quantile(exp, .5, axis=0)
-    sys_acc_l = np.quantile(exp, .25, axis=0)
-
-    # plot the simulated result of the system
-    ax1.fill_between(x_T_list, sys_acc_l, sys_acc_h, alpha=0.1)
-    ax1.plot(x_T_list, sys_acc_m, mark_list[-1], label="TRAILS")
-    ax2.fill_between(x_T_list, sys_acc_l, sys_acc_h, alpha=0.1)
-
-    # plot the simulated result of the training-based line
-    ax1.fill_between(x_acc_train, y_acc_train_l, y_acc_train_h, alpha=0.3)
-    ax1.plot(x_acc_train, y_acc_train_m, mark_list[-2], label="Training-based MS")
-    ax2.fill_between(x_acc_train, y_acc_train_l, y_acc_train_h, alpha=0.3)
-
-    for i in range(len(annotations)):
-        ele = annotations[i]
-        if ele[1] < lv:
-            # convert to mins
-            ax2.plot(ele[2] / 60, ele[1], mark_list[i], label=ele[0], markersize=set_marker_size)
-        else:
-            ax1.plot(ele[2] / 60, ele[1], mark_list[i], label=ele[0], markersize=set_marker_size)
-        # ax2.scatter(ele[2]/60, ele[1]* 0.01, s=100, color="red")
-        # ax2.annotate(ele[0], (ele[2]/60, ele[1] * 0.01))
-
-    if len(x1_lim) > 0 and len(x2_lim) > 0:
-        ax1.set_ylim(x1_lim[0], x1_lim[1])  # set the y-axis range of subplot 1; show only part of the figure
-        ax2.set_ylim(x2_lim[0], x2_lim[1])  # set the y-axis range of subplot 2; show only part of the figure
-
-    ax1.spines['bottom'].set_visible(False)  # hide the bottom spine of subplot 1
-    ax2.spines['top'].set_visible(False)  # hide the top spine of subplot 2
-    ax2.set_xticks(range(0, 31, 1))
-
-    d = .85  # slope of the axis-break markers
-    # draw the markers at the axis break
-    kwargs = dict(marker=[(-1, -d), (1, d)], markersize=set_marker_size,
-                  linestyle='none', color='r', mec='r', mew=1, clip_on=False)
-    ax1.plot([0, 1], [0, 0], transform=ax1.transAxes, **kwargs)
-    ax2.plot([0, 1], [1, 1], transform=ax2.transAxes, **kwargs)
-
-    plt.tight_layout()
-    plt.xscale("symlog")
-    ax1.grid()
-    ax2.grid()
-    plt.xlabel("Time Budget given by user (min)", fontsize=set_font_size)
-    ax1.set_ylabel(f"Test accuracy on {dataset}", fontsize=set_font_size)
-    ax1.legend(ncol=1, fontsize=set_lgend_size)
-    ax2.legend(fontsize=set_lgend_size)
-    # plt.show()
-    plt.savefig(f"{result_dir}/any_time_{name_img}.pdf", bbox_inches='tight')
-
-
-def draw_anytime_result_one_graph(y_acc_list_arr, x_T_list,
-                                  x_acc_train, y_acc_train_l, y_acc_train_m, y_acc_train_h,
-                                  annotations, lv,
-                                  name_img, dataset,
-                                  x1_lim=[], x2_lim=[],
-                                  ):
-    # fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, dpi=100, gridspec_kw={'height_ratios': [5, 1]})
-    exp = np.array(y_acc_list_arr) * 100
-    sys_acc_h = np.quantile(exp, .75, axis=0)
-    sys_acc_m = np.quantile(exp, .5, axis=0)
-    sys_acc_l = np.quantile(exp, .25, axis=0)
-
-    # exp_time = np.array(real_time_used_arr)
-    # time_mean = np.quantile(exp_time, .5, axis=0)
-    time_mean = x_T_list
-
-    # plot the simulated result of the system
-    plt.fill_between(time_mean, sys_acc_l, sys_acc_h, alpha=0.1)
-    plt.plot(time_mean, sys_acc_m, "o-", label="TRAILS")
-    # plt.plot(time_mean, sys_acc_m, label="TRAILS")
-
-    # plot the simulated result of the training-based line
-    plt.fill_between(x_acc_train, y_acc_train_l, y_acc_train_h, alpha=0.3)
-    plt.plot(x_acc_train, y_acc_train_m, "o-", label="Training-based MS")
-    # plt.plot(x_acc_train, y_acc_train_m, label="Training-based MS")
-
-    if len(x1_lim) > 0:
-        plt.ylim(x1_lim[0], x1_lim[1])  # set the y-axis range; show only part of the figure
-
-    d = .85  # slope of the axis-break markers
-    # draw the markers at the axis break
-    kwargs = dict(marker=[(-1, -d), (1, d)], markersize=set_marker_size,
-                  linestyle='none', color='r', mec='r', mew=1, clip_on=False)
-    # plt.plot([0, 1], [0, 0], transform=ax1.transAxes, **kwargs)
-    # plt.plot([0, 1], [1, 1], transform=ax2.transAxes, **kwargs)
-
-    plt.tight_layout()
-    # plt.xscale("symlog")
-    plt.grid()
-    plt.xlabel("Time Budget given by user (min)", fontsize=set_font_size)
-    plt.ylabel(f"Test accuracy on {dataset}", fontsize=set_font_size)
-    plt.legend(ncol=1, fontsize=set_lgend_size)
-    plt.show()
-    # plt.savefig(f"any_time_{name_img}.pdf", bbox_inches='tight')
-
-
-# these two functions plot phase 1 and phase 2
-def draw_anytime_result_with_p1(result_dir, y_acc_list_arr, x_T_list, y_acc_list_arr_p1, x_T_list_p1,
-                                x_acc_train, y_acc_train_l, y_acc_train_m, y_acc_train_h,
-                                annotations, lv,
-                                name_img, dataset, max_value,
-                                x1_lim=[], x2_lim=[],
-                                ):
-    fig, (ax1, ax2) = plt.subplots(
-        2, 1,
-        sharex=True,
-        dpi=100,
-        gridspec_kw={'height_ratios': [6, 1]})
-
-    shade_degree = 0.2
-
-    # plot the simulated result of the training-based line
-    ax1.plot(x_acc_train, y_acc_train_m, mark_list[-3] + line_shape_list[0], label="Training-Based MS",
-             markersize=mark_size_list[-3])
-    ax1.fill_between(x_acc_train, y_acc_train_l, y_acc_train_h, alpha=shade_degree)
-    ax2.fill_between(x_acc_train, y_acc_train_l, y_acc_train_h, alpha=shade_degree)
-
-    # plot the simulated result of the training-free system (phase 1 only)
-    exp = np.array(y_acc_list_arr_p1)
-    sys_acc_p1_h = np.quantile(exp, .75, axis=0)
-    sys_acc_p1_m = np.quantile(exp, .5, axis=0)
-    sys_acc_p1_l = np.quantile(exp, .25, axis=0)
-    ax1.plot(x_T_list_p1, sys_acc_p1_m, mark_list[-2] + line_shape_list[1], label="Training-Free MS",
-             markersize=mark_size_list[-2])
-    ax1.fill_between(x_T_list_p1, sys_acc_p1_l, sys_acc_p1_h, alpha=shade_degree)
-    ax2.fill_between(x_T_list_p1, sys_acc_p1_l, sys_acc_p1_h, alpha=shade_degree)
-
-    # plot the simulated result of the two-phase system
-    exp = np.array(y_acc_list_arr)
-    sys_acc_h = np.quantile(exp, .75, axis=0)
-    sys_acc_m = np.quantile(exp, .5, axis=0)
-    sys_acc_l = np.quantile(exp, .25, axis=0)
-    ax1.plot(x_T_list, sys_acc_m, mark_list[-1] + line_shape_list[2], label="2Phase-MS", markersize=mark_size_list[-1])
-    ax1.fill_between(x_T_list, sys_acc_l, sys_acc_h, alpha=shade_degree)
-    ax2.fill_between(x_T_list, sys_acc_l, sys_acc_h, alpha=shade_degree)
-
-    print(f"speed-up on {dataset} = {x_acc_train[-1] / x_T_list[-2]}, "
-          f"t_train = {x_acc_train[-1]}, t_f = {x_T_list[-2]}")
-
-    for i in range(len(annotations)):
-        ele = annotations[i]
-        if ele[1] < lv:
-            # convert to mins
-            ax2.plot(ele[2] / 60, ele[1], mark_list[i], label=ele[0], markersize=set_marker_point)
-        else:
-            ax1.plot(ele[2] / 60, ele[1], mark_list[i], label=ele[0], markersize=set_marker_point)
-        # ax2.scatter(ele[2]/60, ele[1]* 0.01, s=100, color="red")
-        # ax2.annotate(ele[0], (ele[2]/60, ele[1] * 0.01))
-
-    if len(x1_lim) > 0 and len(x2_lim) > 0:
-        ax1.set_ylim(x1_lim[0], x1_lim[1])  # set the y-axis range of subplot 1; show only part of the figure
-        ax2.set_ylim(x2_lim[0], x2_lim[1])  # set the y-axis range of subplot 2; show only part of the figure
-
-    ax1.spines['bottom'].set_visible(False)  # hide the bottom spine of subplot 1
-    ax2.spines['top'].set_visible(False)  # hide the top spine of subplot 2
-    ax2.set_xticks(range(0, 31, 1))
-
-    d = .85  # slope of the axis-break markers
-    # draw the markers at the axis break
-    kwargs = dict(marker=[(-1, -d), (1, d)], markersize=set_marker_size,
-                  linestyle='none', color='r', mec='r', mew=1, clip_on=False)
-    ax1.plot([0, 1], [0, 0], transform=ax1.transAxes, **kwargs)
-    ax2.plot([0, 1], [1, 1], transform=ax2.transAxes, **kwargs)
-
-    plt.xscale("log")
-    ax1.grid()
-    ax2.grid()
-    plt.xlabel(r"Response Time Threshold $T_{max}$ (min)", fontsize=set_font_size)
-    ax1.set_ylabel(f"Test Acc on {'In-16'}", fontsize=set_font_size)
-    # ax1.legend(ncol=1, fontsize=set_lgend_size)
-    # ax2.legend(fontsize=set_lgend_size)
-
-    ax1.xaxis.label.set_size(set_tick_size)
-    ax1.yaxis.label.set_size(set_tick_size)
-    # ax1.set_xticks([])
-
-    ax2.xaxis.label.set_size(set_tick_size)
-    ax2.yaxis.label.set_size(set_tick_size)
-
-    ax1.yaxis.set_major_locator(MaxNLocator(nbins=4, integer=True))
-
-    ax1.axhline(max_value, color='r', linestyle='-', label='Global Best Accuracy')
-
-    tick_values = [0.01, 0.1, 1, 10, 100, 1000]
-    ax2.set_xticks(tick_values)
-    ax2.set_xticklabels([f'$10^{{{int(np.log10(val))}}}$' for val in tick_values])
-
-    # this is for a unique hash
-    export_legend(
-        fig,
-        colnum=3,
-        unique_labels=['TE-NAS (Training-Free)', 'ENAS (Weight sharing)',
-                       'KNAS (Training-Free)', 'DARTS-V1 (Weight sharing)', 'DARTS-V2 (Weight sharing)',
-                       'Training-Based MS', 'Training-Free MS', '2Phase-MS', 'Global Best Accuracy'])
-    plt.tight_layout()
-    fig.savefig(f"{result_dir}/any_time_{name_img}_p1_from_0.1_sec.pdf", bbox_inches='tight')
-
-
-def export_legend(ori_fig, filename="any_time_legend", colnum=9, unique_labels=[]):
-    fig2 = plt.figure(figsize=(5, 0.3))
-    lines_labels = [ax.get_legend_handles_labels() for ax in ori_fig.axes]
-    lines, labels = [sum(lol, []) for lol in zip(*lines_labels)]
-    # grab the unique labels
-    if len(unique_labels) == 0:
-        unique_labels = set(labels)
-    # map labels to legend handles in a dict
-    legend_dict = dict(zip(labels, lines))
-    # query the dict based on the unique labels
-    unique_lines = [legend_dict[x] for x in unique_labels]
-    fig2.legend(unique_lines, unique_labels, loc='center',
-                ncol=colnum,
-                fancybox=True,
-                shadow=True, scatterpoints=1, fontsize=set_lgend_size)
-    fig2.tight_layout()
-    fig2.savefig(f"{filename}.pdf", bbox_inches='tight')
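# A hypothetical usage sketch (not part of the original file): export_legend
# saves the legend of an existing figure as a standalone PDF, which the anytime
# plots above rely on. Assuming a figure with labeled lines already exists:
#
#     fig, ax = plt.subplots()
#     ax.plot([0, 1], [0, 1], label="TRAILS")
#     ax.plot([0, 1], [1, 0], label="Training-Based MS")
#     export_legend(fig, filename="demo_legend", colnum=2)
#
# This writes demo_legend.pdf containing only the two legend entries.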
-
-
-def draw_anytime_result_one_graph_with_p1(y_acc_list_arr, x_T_list, y_acc_list_arr_p1, x_T_list_p1,
-                                          x_acc_train, y_acc_train_l, y_acc_train_m, y_acc_train_h,
-                                          annotations, lv,
-                                          name_img, dataset,
-                                          x1_lim=[], x2_lim=[],
-                                          ):
-    # fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, dpi=100, gridspec_kw={'height_ratios': [5, 1]})
-
-    # plot the simulated result of phase 1 only
-    exp = np.array(y_acc_list_arr_p1) * 100
-    sys_acc_p1_h = np.quantile(exp, .75, axis=0)
-    sys_acc_p1_m = np.quantile(exp, .5, axis=0)
-    sys_acc_p1_l = np.quantile(exp, .25, axis=0)
-
-    plt.fill_between(x_T_list_p1, sys_acc_p1_l, sys_acc_p1_h, alpha=0.1)
-    plt.plot(x_T_list_p1, sys_acc_p1_m, "o-", label="TRAILS-P1")
-    # plt.fill_between(x_T_list_p1, sys_acc_p1_l, sys_acc_p1_h, alpha=0.1)
-
-    exp = np.array(y_acc_list_arr) * 100
-    sys_acc_h = np.quantile(exp, .75, axis=0)
-    sys_acc_m = np.quantile(exp, .5, axis=0)
-    sys_acc_l = np.quantile(exp, .25, axis=0)
-
-    # exp_time = np.array(real_time_used_arr)
-    # time_mean = np.quantile(exp_time, .5, axis=0)
-    time_mean = x_T_list
-
-    # plot the simulated result of the system
-    plt.fill_between(time_mean, sys_acc_l, sys_acc_h, alpha=0.1)
-    plt.plot(time_mean, sys_acc_m, "o-", label="TRAILS")
-    # plt.plot(time_mean, sys_acc_m, label="TRAILS")
-
-    # plot the simulated result of the training-based line
-    plt.fill_between(x_acc_train, y_acc_train_l, y_acc_train_h, alpha=0.3)
-    plt.plot(x_acc_train, y_acc_train_m, "o-", label="Training-based MS")
-    # plt.plot(x_acc_train, y_acc_train_m, label="Training-based MS")
-
-    if len(x1_lim) > 0:
-        plt.ylim(x1_lim[0], x1_lim[1])  # set the y-axis range; show only part of the figure
-
-    d = .85  # slope of the axis-break markers
-    # draw the markers at the axis break
-    kwargs = dict(marker=[(-1, -d), (1, d)], markersize=set_marker_size,
-                  linestyle='none', color='r', mec='r', mew=1, clip_on=False)
-    # plt.plot([0, 1], [0, 0], transform=ax1.transAxes, **kwargs)
-    # plt.plot([0, 1], [1, 1], transform=ax2.transAxes, **kwargs)
-
-    plt.tight_layout()
-    plt.xscale("symlog")
-    plt.grid()
-    plt.xlabel("Time Budget given by user (min)", fontsize=set_font_size)
-    plt.ylabel(f"Test accuracy on {dataset}", fontsize=set_font_size)
-    plt.legend(ncol=1, fontsize=set_lgend_size)
-    # plt.show()
-    plt.savefig(f"any_time_{name_img}.pdf", bbox_inches='tight')
-
-
-# for the K, U, N trade-off
-def draw_grid_graph_with_budget(
-        acc, bt, b1, b2,
-        img_name: str, y_array: list, x_array: list):
-    """
-    :param acc: Two array list
-    :param bt: Two array list
-    :param img_name: img name string
-    :return:
-    """
-
-    acc_new = np.array(acc)
-    acc = acc_new.tolist()
-
-    mask = np.array(acc)
-    mask[mask > 0] = 0
-    mask[mask < 0] = 1
-
-    bt = np.round(np.array(bt), 2).tolist()
-    mask2 = np.array(bt)
-    mask2[mask2 > 0] = 0
-    mask2[mask2 < 0] = 1
-
-    mask3 = np.array(b1)
-    mask3[mask3 > 0] = 0
-    mask3[mask3 < 0] = 1
-
-    mask4 = np.array(b2)
-    mask4[mask4 > 0] = 0
-    mask4[mask4 < 0] = 1
-
-    fig, ax = plt.subplots(2, 2, figsize=(15, 14))
-
-    linewidths = 0.5
-    sns.set(font_scale=3)
-    sns.heatmap(
-        data=acc,
-        vmax=99,
-        vmin=93,
-        cmap=palettable.cmocean.diverging.Curl_10.mpl_colors,
-        annot=True,
-        fmt=".2f",
-        annot_kws={'size': frontinsidebox, 'weight': 'normal', 'color': 'w', 'va': 'bottom'},
-        mask=mask,
-        square=True, linewidths=linewidths,  # draw the border of each cell; set the border width
-        cbar_kws={"shrink": .5},
-        ax=ax[0, 0]
-    )
-
-    sns.heatmap(
-        data=bt,
-        # vmax=,
-        vmin=-9,
-        cmap=palettable.cmocean.diverging.Curl_10.mpl_colors,
-        annot=True,
-        fmt=".2f",
-        annot_kws={'size': frontinsidebox, 'weight': 'normal', 'color': 'w', 'va': 'top'},
-        mask=mask2,
-        square=True, linewidths=linewidths,  # draw the border of each cell; set the border width
-        cbar_kws={"shrink": .5},
-        ax=ax[0, 1]
-    )
-
-    sns.heatmap(
-        data=b1,
-        vmax=17000,
-        vmin=15000,
-        cmap=palettable.cmocean.diverging.Curl_10.mpl_colors,
-        annot=True,
-        fmt=".0f",
-        annot_kws={'size': frontinsidebox, 'weight': 'normal', 'color': 'w', 'va': 'top'},
-        mask=mask4,
-        square=True, linewidths=linewidths,  # draw the border of each cell; set the border width
-        cbar_kws={"shrink": .5},
-        ax=ax[1, 0]
-    )
-
-    sns.heatmap(
-        data=b2,
-        # vmax=,
-        # vmin=-9,
-        cmap=palettable.cmocean.diverging.Curl_10.mpl_colors,
-        annot=True,
-        fmt=".0f",
-        annot_kws={'size': frontinsidebox, 'weight': 'normal', 'color': 'w', 'va': 'top'},
-        mask=mask4,
-        square=True, linewidths=linewidths,  # draw the border of each cell; set the border width
-        cbar_kws={"shrink": .5},
-        ax=ax[1, 1]
-    )
-
-    plt.tight_layout()
-    plt.xlabel("U (epoch)", fontsize=set_font_size)
-    plt.ylabel("K (# models)", fontsize=set_font_size)
-
-    for i in [0, 1]:
-        for j in [0, 1]:
-            ax[i, j].set_xticklabels(x_array, fontsize=set_font_size)
-            ax[i, j].set_yticklabels(y_array, fontsize=set_font_size)
-            ax[i, j].set_xlabel("U (# epoch)", fontsize=set_font_size)
-            ax[i, j].set_ylabel("K (# models)", fontsize=set_font_size)
-
-    ax[0, 0].set_title('Test Accuracy (%)', fontsize=set_font_size)
-    ax[0, 1].set_title(r'Time Budget $T$ (min)', fontsize=set_font_size)
-    ax[1, 0].set_title(r'$N$', fontsize=set_font_size)
-    ax[1, 1].set_title(r"$K \cdot U \cdot \log_{\eta}K$", fontsize=set_font_size)
-
-    plt.tight_layout()
-    fig.subplots_adjust(wspace=0.001, hspace=0.3)
-
-    # plt.show()
-    base_dr = os.getcwd()
-    path_gra = os.path.join(base_dr, f"{img_name}.pdf")
-    fig.savefig(path_gra, bbox_inches='tight')
-
-
-def draw_grid_graph_with_budget_only_Acc_and_T(
-        acc, bt, b1, b2,
-        img_name: str, y_array: list, x_array: list):
-    """
-    :param acc: Two array list
-    :param bt: Two array list
-    :param img_name: img name string
-    :return:
-    """
-
-    acc_new = np.array(acc)
-    acc = acc_new.tolist()
-
-    mask = np.array(acc)
-    mask[mask > 0] = 0
-    mask[mask < 0] = 1
-
-    bt = np.round(np.array(bt), 2).tolist()
-    mask2 = np.array(bt)
-    mask2[mask2 > 0] = 0
-    mask2[mask2 < 0] = 1
-
-    mask3 = np.array(b1)
-    mask3[mask3 > 0] = 0
-    mask3[mask3 < 0] = 1
-
-    mask4 = np.array(b2)
-    mask4[mask4 > 0] = 0
-    mask4[mask4 < 0] = 1
-
-    fig, ax = plt.subplots(1, 2, figsize=(15, 14))
-
-    linewidths = 0.5
-    sns.set(font_scale=2)
-    sns.heatmap(
-        data=acc,
-        vmax=99,
-        vmin=93,
-        cmap=palettable.cmocean.diverging.Curl_10.mpl_colors,
-        annot=True,
-        fmt=".2f",
-        annot_kws={'size': frontinsidebox, 'weight': 'normal', 'color': 'w', 'va': 'bottom'},
-        mask=mask,
-        square=True,
-        linewidths=linewidths,  # draw the border of each cell; set the border width
-        cbar_kws={"shrink": .4},
-        ax=ax[0]
-    )
-
-    sns.heatmap(
-        data=bt,
-        vmax=600,
-        # vmin=-9,
-        cmap=palettable.cmocean.diverging.Curl_10.mpl_colors,
-        annot=True,
-        fmt=".2f",
-        annot_kws={'size': frontinsidebox, 'weight': 'normal', 'color': 'w', 'va': 'top'},
-        mask=mask2,
-        square=True,
-        linewidths=linewidths,  # draw the border of each cell; set the border width
-        cbar_kws={"shrink": .4},
-        ax=ax[1]
-    )
-
-    plt.tight_layout()
-    plt.xlabel("U (epoch)", fontsize=set_font_size)
-    plt.ylabel("K (# models)", fontsize=set_font_size)
-
-    for j in [0, 1]:
-        ax[j].set_xticklabels(x_array, fontsize=set_font_size)
-        ax[j].set_yticklabels(y_array, fontsize=set_font_size)
-        ax[j].set_xlabel("U (# epoch)", fontsize=set_font_size)
-        ax[j].set_ylabel("K (# models)", fontsize=set_font_size)
-
-    ax[0].set_title('Test Accuracy (%)', fontsize=set_font_size)
-    ax[1].set_title(r'Time Budget $T$ (min)', fontsize=set_font_size)
-
-    plt.tight_layout()
-    fig.subplots_adjust(wspace=0.3, hspace=0.3)
-
-    # plt.show()
-    base_dr = os.getcwd()
-    path_gra = os.path.join(base_dr, f"{img_name}.pdf")
-    fig.savefig(path_gra, bbox_inches='tight')
-
-
-def draw_grid_graph_with_budget_only_Acc(
-        acc, bt, b1, b2,
-        img_name: str, y_array: list, x_array: list):
-    """
-    :param acc: Two array list
-    :param bt: Two array list
-    :param img_name: img name string
-    :return:
-    """
-
-    acc_new = np.array(acc)
-    acc = acc_new.tolist()
-
-    mask = np.array(acc)
-    mask[mask > 0] = 0
-    mask[mask < 0] = 1
-
-    # create a figure with a single axes; heatmap() expects an Axes, not a Figure
-    fig, ax = plt.subplots(figsize=(7, 14))
-
-    linewidths = 0.5
-    sns.set(font_scale=2)
-    sns.heatmap(
-        data=acc,
-        vmax=99,
-        vmin=93,
-        cmap=palettable.cmocean.diverging.Curl_10.mpl_colors,
-        annot=True,
-        fmt=".2f",
-        annot_kws={'size': frontinsidebox, 'weight': 'normal', 'color': 'w', 'va': 'bottom'},
-        mask=mask,
-        square=True,
-        linewidths=linewidths,  # draw the border of each cell; set the border width
-        cbar_kws={"shrink": .4},
-        ax=ax
-    )
-
-    plt.tight_layout()
-    plt.xlabel("U (epoch)", fontsize=set_font_size)
-    plt.ylabel("K (# models)", fontsize=set_font_size)
-
-    plt.xticks(x_array, fontsize=set_font_size)
-    plt.yticks(y_array, fontsize=set_font_size)
-
-    plt.title('Test Accuracy (%)', fontsize=set_font_size)
-    plt.tight_layout()
-    # fig.subplots_adjust(wspace=0.3, hspace=0.3)
-    # plt.show()
-    base_dr = os.getcwd()
-    path_gra = os.path.join(base_dr, f"{img_name}.pdf")
-    fig.savefig(path_gra, bbox_inches='tight')
-
-
-def draw_grid_graph_with_budget_only_T(
-        acc, bt, b1, b2,
-        img_name: str, y_array: list, x_array: list):
-    """
-    :param acc: Two array list
-    :param bt: Two array list
-    :param img_name: img name string
-    :return:
-    """
-
-    acc_new = np.array(acc)
-    acc = acc_new.tolist()
-
-    mask = np.array(acc)
-    mask[mask > 0] = 0
-    mask[mask < 0] = 1
-
-    bt = np.round(np.array(bt), 2).tolist()
-    mask2 = np.array(bt)
-    mask2[mask2 > 0] = 0
-    mask2[mask2 < 0] = 1
-
-    mask3 = np.array(b1)
-    mask3[mask3 > 0] = 0
-    mask3[mask3 < 0] = 1
-
-    mask4 = np.array(b2)
-    mask4[mask4 > 0] = 0
-    mask4[mask4 < 0] = 1
-
-    fig, ax = plt.subplots(1, 2, figsize=(15, 14))
-
-    linewidths = 0.5
-    sns.set(font_scale=2)
-    sns.heatmap(
-        data=acc,
-        vmax=99,
-        vmin=93,
-        cmap=palettable.cmocean.diverging.Curl_10.mpl_colors,
-        annot=True,
-        fmt=".2f",
-        annot_kws={'size': frontinsidebox, 'weight': 'normal', 'color': 'w', 'va': 'bottom'},
-        mask=mask,
-        square=True,
-        linewidths=linewidths,  # draw the border of each cell; set the border width
-        cbar_kws={"shrink": .4},
-        ax=ax[0]
-    )
-
-    sns.heatmap(
-        data=bt,
-        vmax=600,
-        # vmin=-9,
-        cmap=palettable.cmocean.diverging.Curl_10.mpl_colors,
-        annot=True,
-        fmt=".2f",
-        annot_kws={'size': frontinsidebox, 'weight': 'normal', 'color': 'w', 'va': 'top'},
-        mask=mask2,
-        square=True,
-        linewidths=linewidths,  # draw the border of each cell; set the border width
-        cbar_kws={"shrink": .4},
-        ax=ax[1]
-    )
-
-    plt.tight_layout()
-    plt.xlabel("U (epoch)", fontsize=set_font_size)
-    plt.ylabel("K (# models)", fontsize=set_font_size)
-
-    for j in [0, 1]:
-        ax[j].set_xticklabels(x_array, fontsize=set_font_size)
-        ax[j].set_yticklabels(y_array, fontsize=set_font_size)
-        ax[j].set_xlabel("U (# epoch)", fontsize=set_font_size)
-        ax[j].set_ylabel("K (# models)", fontsize=set_font_size)
-
-    ax[0].set_title('Test Accuracy (%)', fontsize=set_font_size)
-    ax[1].set_title(r'Time Budget $T$ (min)', fontsize=set_font_size)
-
-    plt.tight_layout()
-    fig.subplots_adjust(wspace=0.3, hspace=0.3)
-
-    # plt.show()
-    base_dr = os.getcwd()
-    path_gra = os.path.join(base_dr, f"{img_name}.pdf")
-    fig.savefig(path_gra, bbox_inches='tight')
diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/exps/draw_tab_lib.py b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/exps/draw_tab_lib.py
deleted file mode 100644
index 6c30cc06b4..0000000000
--- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/exps/draw_tab_lib.py
+++ /dev/null
@@ -1,215 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and -# limitations under the License. -# - -from typing import List - -import numpy as np -from matplotlib import pyplot as plt -from matplotlib.ticker import MaxNLocator -import warnings -import matplotlib.cbook - -warnings.filterwarnings("ignore", category=matplotlib.cbook.mplDeprecation) - -# lines' mark size -set_marker_size = 1 -# points' mark size -set_marker_point = 14 -# points' mark size -set_font_size = 20 -set_lgend_size = 15 -set_tick_size = 20 - -frontinsidebox = 23 - -# update tick size -matplotlib.rc('xtick', labelsize=set_tick_size) -matplotlib.rc('ytick', labelsize=set_tick_size) - -plt.rcParams['axes.labelsize'] = set_tick_size - -mark_list = ["o", "*", "<", "^", "s", "d", "D", ">", "h"] -mark_size_list = [set_marker_size, set_marker_size + 1, set_marker_size + 1, set_marker_size, - set_marker_size, set_marker_size, set_marker_size, set_marker_size + 1, set_marker_size + 2] -line_shape_list = ['-.', '--', '-', ':'] -shade_degree = 0.2 - - -def Add_one_line(x_time_array: list, y_twod_budget: List[List], namespace: str, index, ax): - # training-based - x_ = x_time_array - y_ = y_twod_budget - - if all(isinstance(item, list) for item in x_): - expx = np.array(x_) - x_m = np.quantile(expx, .5, axis=0) - else: - x_m = x_ - - exp = np.array(y_) - exp = np.where(exp > 10, exp, exp * 100) - - y_h = np.quantile(exp, .75, axis=0) - y_m = np.quantile(exp, .5, axis=0) - y_l = np.quantile(exp, .25, axis=0) - - ax.plot(x_m, y_m, - mark_list[int(index % len(mark_list))] + line_shape_list[int(index % len(line_shape_list))], - label=namespace, - markersize=mark_size_list[int(index % len(mark_list))], - linewidth=3 - ) - - ax.fill_between(x_m, y_l, y_h, alpha=shade_degree) - return x_m - - -def draw_structure_data_anytime( - all_lines: List, - dataset: str, name_img: str, max_value, - figure_size=(6.4, 4.5), - annotations=[], - x_ticks=None, y_ticks=None, unique_labels=None): - fig, ax = plt.subplots(figsize=figure_size) - - # draw all lines - time_usage = [] - for i, each_line_info in enumerate(all_lines): - _x_array = each_line_info[0] - _y_2d_array = each_line_info[1] - _name_space = each_line_info[2] - time_arr = Add_one_line(_x_array, _y_2d_array, _name_space, i, ax) - time_usage.append(time_arr) - - # print(f"speed-up on {dataset} = {time_usage[0][-1] / time_usage[2][-2]}, " - # f"t_train = {time_usage[0][-1]}, t_f = {time_usage[2][-2]}") - - # plt.xscale("log") - # plt.grid() - # plt.xlabel(r"Time Budget $T$ (min)", fontsize=set_font_size) - # plt.ylabel(f"AUC on {dataset.upper()}", fontsize=set_font_size) - - plt.xscale("log") - ax.grid() - ax.set_xlabel(r"Response Time Threshold $T_{max}$ (min)", fontsize=set_font_size) - ax.set_ylabel(f"AUC on {dataset.upper()}", fontsize=set_font_size) - # ax.set_xscale("log") - # ax.set_xlim(0.001, 10e4) - # ax.set_ylim(x1_lim[0], x1_lim[1]) - - if y_ticks is not None: - if y_ticks[0] is not None: - ax.set_ylim(bottom=y_ticks[0]) - if y_ticks[1] is not None: - ax.set_ylim(top=y_ticks[1]) - # ax.set_ylim(y_ticks[0], y_ticks[1]) - # ax.set_yticks(y_ticks) - # ax.set_yticklabels(y_ticks) - if x_ticks is not None: - if x_ticks[0] is not None: - ax.set_xlim(left=x_ticks[0]) - if x_ticks[1] is not None: - ax.set_xlim(right=x_ticks[1]) - - ax.yaxis.set_major_locator(MaxNLocator(nbins=6, integer=False)) - - if max_value > 0: - plt.axhline(max_value, color='r', linestyle='-', label='Global Best AUC') - - for i in range(len(annotations)): - ele = annotations[i] - ax.plot(ele[2], ele[1], 
mark_list[i], label=ele[0], markersize=set_marker_point) - - # export_legend(fig, filename="any_time_legend", unique_labels=["Training-Based MS", "Training-Free MS", "2Phase-MS", 'Global Best AUC']) - export_legend(ori_fig=fig, colnum=5, unique_labels=unique_labels) - plt.tight_layout() - - fig.savefig(f"{name_img}.pdf", bbox_inches='tight') - - -def export_legend(ori_fig, filename="any_time_legend", colnum=9, unique_labels=None): - if unique_labels is None: - unique_labels = [] - fig2 = plt.figure(figsize=(5, 0.3)) - lines_labels = [ax.get_legend_handles_labels() for ax in ori_fig.axes] - lines, labels = [sum(lol, []) for lol in zip(*lines_labels)] - # grab unique labels - if len(unique_labels) == 0: - unique_labels = set(labels) - # assign labels and legends in dict - legend_dict = dict(zip(labels, lines)) - # query dict based on unique labels - unique_lines = [legend_dict[x] for x in unique_labels] - fig2.legend(unique_lines, unique_labels, loc='center', - ncol=colnum, - fancybox=True, - shadow=True, scatterpoints=1, fontsize=set_lgend_size) - fig2.tight_layout() - fig2.savefig(f"{filename}.pdf", bbox_inches='tight') - - -import seaborn as sns -import matplotlib.pyplot as plt - - -def plot_heatmap(data: List, fontsize: int, - x_array_name: str, y_array_name: str, - title: str, output_file: str, - decimal_places: int, - u_ticks, k_ticks, - ): - labelsize = fontsize - # Convert the data to a NumPy array - data_array = np.array(data) - - # Custom annotation function - def custom_annot(val): - return "{:.{}f}".format(val, decimal_places) if val > 0 else "" - - # Convert the custom annotations to a 2D array - annot_array = np.vectorize(custom_annot)(data_array) - - # Create a masked array to hide the cells with values less than or equal to 0 - masked_data = np.ma.masked_array(data_array, data_array <= 0) - - # Set the figure size (width, height) in inches - fig, ax = plt.subplots(figsize=(8, 4)) - - # Use the "viridis" colormap - cmap = "viridis" - - # Create a heatmap - sns.heatmap(masked_data, annot=annot_array, fmt='', cmap=cmap, mask=masked_data.mask, ax=ax, - annot_kws={"size": fontsize, "ha": "center", "va": "center"}, - xticklabels=u_ticks, yticklabels=k_ticks) - - # Set axis labels - ax.set_xlabel(x_array_name, fontsize=fontsize) - ax.set_ylabel(y_array_name, fontsize=fontsize) - - # Set x/y-axis tick size - ax.tick_params(axis='both', which='major', labelsize=labelsize) - - # Set the title - # ax.set_title(title, fontsize=fontsize) - - # Set tight layout - plt.tight_layout() - - # Save the plot to a PDF file - plt.savefig(output_file) diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/exps/nas_bench_tabular/2.seq_train_dist_online.py b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/exps/nas_bench_tabular/2.seq_train_dist_online.py deleted file mode 100644 index e515647ecb..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/exps/nas_bench_tabular/2.seq_train_dist_online.py +++ /dev/null @@ -1,163 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import argparse -import calendar -import json -import logging -import os -import time - -from exps.shared_args import parse_arguments - - -def partition_list_by_worker_id(lst, num_workers=15): - partitions = [] - for i in range(num_workers): - partitions.append([]) - for idx, item in enumerate(lst): - worker_id = idx % num_workers - partitions[worker_id].append(item) - return partitions - - -def start_one_worker(queue, args, worker_id, my_partition, search_space_ins, res): - from src.tools.io_tools import write_json, read_json - gmt = time.gmtime() - ts = calendar.timegm(gmt) - - os.environ.setdefault("log_file_name", f"{args.log_name}_{args.dataset}_wkid_{worker_id}_{ts}.log") - # import logging - logger = logging.getLogger(f"{args.dataset}_wkid_{worker_id}_{ts}") - if not os.path.exists(f"./{args.log_folder}"): - os.makedirs(f"./{args.log_folder}") - handler = logging.FileHandler(f"./{args.log_folder}/{args.log_name}_{args.dataset}_wkid_{worker_id}_{ts}.log") - formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') - handler.setFormatter(formatter) - logger.addHandler(handler) - from src.eva_engine.phase2.algo.trainer import ModelTrainer - - if args.total_models_per_worker is None: - logger.info( - f" ---- begin exploring, current worker have " - f"{len(my_partition)} models. explore all those models ") - else: - logger.info(f" ---- begin exploring, current worker have " - f"{len(my_partition)} models. but explore {args.total_models_per_worker} models ") - - train_loader, val_loader, test_loader = queue.get() - - checkpoint_file_name = f"./base_line_res_{args.dataset}/train_baseline_{args.dataset}_wkid_{worker_id}.json" - visited = read_json(checkpoint_file_name) - if visited == {}: - visited = {args.dataset: {}} - logger.info(f" ---- initialize checkpointing with {visited} . ") - else: - logger.info(f" ---- recovery from checkpointing with {len(visited[args.dataset])} model. ") - - explored_arch_num = 0 - for arch_index in my_partition: - print(f"begin to train the {arch_index}") - model = search_space_ins.new_architecture(res[arch_index]).to(args.device) - valid_auc, total_run_time, train_log = ModelTrainer.fully_train_arch( - model=model, - use_test_acc=False, - epoch_num=args.epoch, - train_loader=train_loader, - val_loader=val_loader, - test_loader=test_loader, - args=args, logger=logger) - - logger.info(f' ----- model id: {res[arch_index]}, Val_AUC : {valid_auc} Total running time: ' - f'{total_run_time}-----') - - # update the shared model eval res - logger.info(f" ---- exploring {explored_arch_num} model. 
") - logger.info(f" ---- info: {json.dumps({res[arch_index]: train_log})}") - visited[args.dataset][res[arch_index]] = train_log - explored_arch_num += 1 - - if args.total_models_per_worker is not None and explored_arch_num > args.total_models_per_worker: - break - - logger.info(f" Saving result to: {checkpoint_file_name}") - write_json(checkpoint_file_name, visited) - - -if __name__ == "__main__": - mp.set_start_method('spawn', force=True) - args = parse_arguments() - - # set the log name - gmt = time.gmtime() - ts = calendar.timegm(gmt) - - os.environ.setdefault("log_file_name", f"{args.log_name}_{args.dataset}_main_{ts}.log") - os.environ.setdefault("base_dir", args.base_dir) - - from src.search_space.init_search_space import init_search_space - from src.dataset_utils.structure_data_loader import libsvm_dataloader - from src.tools.io_tools import write_json, read_json - import torch.multiprocessing as mp - - search_space_ins = init_search_space(args) - search_space_ins.load() - - # 1. main process partition data and group results, - res = read_json(args.pre_partitioned_file) - - total_workers = args.worker_each_gpu * args.gpu_num - all_partition = partition_list_by_worker_id(list(res.keys()), total_workers) - - train_loader, val_loader, test_loader = libsvm_dataloader( - args=args, - data_dir=os.path.join(args.base_dir, "data", "structure_data", args.dataset), - nfield=args.nfield, - batch_size=args.batch_size) - - # 2. put the shared dataloader into the queue, - queue = mp.Queue() - - # 3. Create a list of processes to train the models - processes = [] - worker_id = 0 - for gpu_id in range(args.gpu_num): - for _ in range(args.worker_each_gpu): - if args.device != "cpu": - args.device = f"cuda:{gpu_id}" - print(f"running process {[args.device, worker_id, len(all_partition[worker_id])]}") - p = mp.Process( - target=start_one_worker, - args=(queue, args, worker_id, all_partition[worker_id], search_space_ins, res, - ) - ) - p.start() - processes.append(p) - worker_id += 1 - - # 4. send to the queue - for gpu_id in range(args.gpu_num): - for _ in range(args.worker_each_gpu): - print("putting to queue ....") - queue.put((train_loader, val_loader, test_loader)) - - print("All processing are running, waiting all to finish....") - for p in processes: - p.join() - - diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/exps/nas_bench_tabular/2.seq_train_online.py b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/exps/nas_bench_tabular/2.seq_train_online.py deleted file mode 100644 index cc1b44481a..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/exps/nas_bench_tabular/2.seq_train_online.py +++ /dev/null @@ -1,118 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# - -import calendar -import json -import os -import time - -from exps.shared_args import parse_arguments - - -def partition_list_by_worker_id(lst, num_workers=15): - partitions = [] - for i in range(num_workers): - partitions.append([]) - for idx, item in enumerate(lst): - worker_id = idx % num_workers - partitions[worker_id].append(item) - return partitions - - -if __name__ == "__main__": - - args = parse_arguments() - - # set the log name - gmt = time.gmtime() - ts = calendar.timegm(gmt) - - os.environ.setdefault("log_logger_folder_name", f"{args.log_folder}") - os.environ.setdefault("log_file_name", f"{args.log_name}_{args.dataset}_wkid_{args.worker_id}_{ts}.log") - os.environ.setdefault("base_dir", args.base_dir) - - from src.logger import logger - from src.eva_engine.phase2.algo.trainer import ModelTrainer - from src.search_space.init_search_space import init_search_space - from src.dataset_utils.structure_data_loader import libsvm_dataloader - from src.tools.io_tools import write_json, read_json - - search_space_ins = init_search_space(args) - search_space_ins.load() - - # 1. data loader - logger.info(f" Loading data....") - train_loader, val_loader, test_loader = libsvm_dataloader( - args=args, - data_dir=os.path.join(args.base_dir, "data", "structure_data", args.dataset), - nfield=args.nfield, - batch_size=args.batch_size) - - res = read_json(args.pre_partitioned_file) - - all_partition = partition_list_by_worker_id(list(res.keys()), args.total_workers) - - if args.total_models_per_worker == -1: - logger.info( - f" ---- begin exploring, current worker have " - f"{len(all_partition[args.worker_id])} models. explore all those models ") - else: - logger.info(f" ---- begin exploring, current worker have " - f"{len(all_partition[args.worker_id])} models. but explore {args.total_models_per_worker} models ") - - # read the checkpoint - checkpoint_file_name = f"{args.result_dir}/train_baseline_{args.dataset}_wkid_{args.worker_id}.json" - visited = read_json(checkpoint_file_name) - if visited == {}: - visited = {args.dataset: {}} - logger.info(f" ---- initialize checkpointing with {visited} . ") - else: - logger.info(f" ---- recovery from checkpointing with {len(visited[args.dataset])} model. ") - - explored_arch_num = 0 - for arch_index in all_partition[args.worker_id]: - print(f"begin to train the {arch_index}") - if res[arch_index] in visited[args.dataset]: - logger.info(f" ---- model {res[arch_index]} already visited") - continue - model = search_space_ins.new_architecture(res[arch_index]) - model.init_embedding(requires_grad=True) - model.to(args.device) - valid_auc, total_run_time, train_log = ModelTrainer.fully_train_arch( - model=model, - use_test_acc=False, - epoch_num=args.epoch, - train_loader=train_loader, - val_loader=val_loader, - test_loader=test_loader, - args=args) - - logger.info(f' ----- model id: {res[arch_index]}, Val_AUC : {valid_auc} Total running time: ' - f'{total_run_time}-----') - - # update the shared model eval res - logger.info(f" ---- exploring {explored_arch_num} model. 
") - logger.info(f" ---- info: {json.dumps({res[arch_index]: train_log})}") - visited[args.dataset][res[arch_index]] = train_log - explored_arch_num += 1 - - if args.total_models_per_worker != -1 and explored_arch_num > args.total_models_per_worker: - break - - logger.info(f" Saving result to: {checkpoint_file_name}") - write_json(checkpoint_file_name, visited) diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/exps/nas_bench_tabular/measure_ecdf.py b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/exps/nas_bench_tabular/measure_ecdf.py deleted file mode 100644 index 645f72f437..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/exps/nas_bench_tabular/measure_ecdf.py +++ /dev/null @@ -1,136 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import numpy as np -import matplotlib -import matplotlib.pyplot as plt -import os -from src.tools.io_tools import read_json - -# lines' mark size -set_marker_size = 15 -# points' mark size -set_marker_point = 14 -# points' mark size -set_font_size = 25 -set_lgend_size = 15 -set_tick_size = 20 - -frontinsidebox = 23 - -# update tick size -matplotlib.rc('xtick', labelsize=set_tick_size) -matplotlib.rc('ytick', labelsize=set_tick_size) - -plt.rcParams['axes.labelsize'] = set_tick_size - -mark_list = ["o", "*", "<", "^", "s", "d", "D", ">", "h"] -mark_size_list = [set_marker_size, set_marker_size + 1, set_marker_size + 1, set_marker_size, - set_marker_size, set_marker_size, set_marker_size, set_marker_size + 1, set_marker_size + 2] -line_shape_list = ['-.', '--', '-', ':'] -shade_degree = 0.2 -base_dir = "../exp_data/" - - -def export_legend(ori_fig, filename="any_time_legend", colnum=9, unique_labels=None): - if unique_labels is None: - unique_labels = [] - fig2 = plt.figure(figsize=(5, 0.3)) - lines_labels = [ax.get_legend_handles_labels() for ax in ori_fig.axes] - lines, labels = [sum(lol, []) for lol in zip(*lines_labels)] - # grab unique labels - if len(unique_labels) == 0: - unique_labels = set(labels) - # assign labels and legends in dict - legend_dict = dict(zip(labels, lines)) - # query dict based on unique labels - unique_lines = [legend_dict[x] for x in unique_labels] - fig2.legend(unique_lines, unique_labels, loc='center', - ncol=colnum, - fancybox=True, - shadow=True, scatterpoints=1, fontsize=set_lgend_size) - fig2.tight_layout() - fig2.savefig(f"{filename}.pdf", bbox_inches='tight') - - -def draw_edcf(): - # extract train_auc and valid_auc into separate lists - for dataset, architectures in data_dict.items(): - - fig, ax = plt.subplots(figsize=(6.4, 3.5)) - print(dataset) - train_auc = [] - valid_auc = [] - for architecture, epochs in 
architectures.items(): - for epoch, metrics in epochs.items(): - if str(epoch_sampled[dataset]) == epoch: - train_auc.append(metrics["train_auc"]) - valid_auc.append(metrics["valid_auc"]) - break - - # calculate and plot ECDF for train_auc - sorted_train_auc = np.sort(train_auc) - y_train = np.arange(1, len(sorted_train_auc) + 1) / len(sorted_train_auc) - plt.plot(sorted_train_auc, y_train, label='Training AUC', linewidth=3, linestyle='--') - - # calculate and plot ECDF for valid_auc - sorted_valid_auc = np.sort(valid_auc) - y_valid = np.arange(1, len(sorted_valid_auc) + 1) / len(sorted_valid_auc) - plt.plot(sorted_valid_auc, y_valid, label='Validation AUC', linewidth=3, linestyle='-') - - y_m = np.quantile(sorted_valid_auc, .5, axis=0) - print("medium", y_m, "best", max(sorted_valid_auc)) - # plt.xlim(left=0.45) - - plt.grid() - plt.xlabel('Accuracy') - plt.ylabel('ECDF') - # plt.legend(loc='upper left', fontsize=set_lgend_size) - plt.tight_layout() - export_legend(ori_fig=fig, colnum=5) - fig.savefig(f"space_{dataset}.pdf", bbox_inches='tight') - - -# dataset_used = "frappe" -dataset_used = "uci_diabetes" -# dataset_used = "criteo" - - -epoch_sampled = {"frappe": 19, "uci_diabetes": 35, "criteo": 9} - -if dataset_used == "frappe": - mlp_train_frappe = os.path.join( - base_dir, - "tab_data/frappe/all_train_baseline_frappe.json") - data_dict = read_json(mlp_train_frappe) -elif dataset_used == "uci_diabetes": - mlp_train_uci_diabetes = os.path.join( - base_dir, - "tab_data/uci_diabetes/all_train_baseline_uci_160k_40epoch.json") - - data_dict = read_json(mlp_train_uci_diabetes) -elif dataset_used == "criteo": - mlp_train_criteo = os.path.join( - base_dir, - "tab_data/criteo/all_train_baseline_criteo.json") - - data_dict = read_json(mlp_train_criteo) -else: - print("err") - -draw_edcf() diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/exps/nas_bench_tabular/measure_param_auc.py b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/exps/nas_bench_tabular/measure_param_auc.py deleted file mode 100644 index 0ff6748231..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/exps/nas_bench_tabular/measure_param_auc.py +++ /dev/null @@ -1,144 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
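measure_ecdf.py builds each curve with the standard sort-and-rank ECDF idiom: sort the sample, then pair the k-th smallest value with probability k/n. A minimal sketch, assuming a plain list of AUC scores:

```python
import numpy as np

def ecdf(sample):
    """Return sorted values and cumulative probabilities, ready to plot."""
    x = np.sort(sample)
    y = np.arange(1, len(x) + 1) / len(x)
    return x, y

x, y = ecdf([0.71, 0.68, 0.74, 0.70])
# x -> [0.68, 0.70, 0.71, 0.74]; y -> [0.25, 0.5, 0.75, 1.0]
```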
-# - -import numpy as np -import matplotlib -import matplotlib.pyplot as plt -import os -from src.tools.io_tools import read_json - -# lines' mark size -set_marker_size = 15 -# points' mark size -set_marker_point = 14 -# points' mark size -set_font_size = 25 -set_lgend_size = 15 -set_tick_size = 20 -import matplotlib.ticker as ticker - -frontinsidebox = 23 - -# update tick size -matplotlib.rc('xtick', labelsize=set_tick_size) -matplotlib.rc('ytick', labelsize=set_tick_size) - -plt.rcParams['axes.labelsize'] = set_tick_size - -mark_list = ["o", "*", "<", "^", "s", "d", "D", ">", "h"] -mark_size_list = [set_marker_size, set_marker_size + 1, set_marker_size + 1, set_marker_size, - set_marker_size, set_marker_size, set_marker_size, set_marker_size + 1, set_marker_size + 2] -line_shape_list = ['-.', '--', '-', ':'] -shade_degree = 0.2 -base_dir = "../exp_data/" - - -def export_legend(ori_fig, filename="any_time_legend", colnum=9, unique_labels=None): - if unique_labels is None: - unique_labels = [] - fig2 = plt.figure(figsize=(5, 0.3)) - lines_labels = [ax.get_legend_handles_labels() for ax in ori_fig.axes] - lines, labels = [sum(lol, []) for lol in zip(*lines_labels)] - # grab unique labels - if len(unique_labels) == 0: - unique_labels = set(labels) - # assign labels and legends in dict - legend_dict = dict(zip(labels, lines)) - # query dict based on unique labels - unique_lines = [legend_dict[x] for x in unique_labels] - fig2.legend(unique_lines, unique_labels, loc='center', - ncol=colnum, - fancybox=True, - shadow=True, scatterpoints=1, fontsize=set_lgend_size) - fig2.tight_layout() - fig2.savefig(f"{filename}.pdf", bbox_inches='tight') - - -# Function to compute number of parameters for an architecture -def compute_params(architecture): - layers = [int(layer) for layer in architecture.split('-')] - params = 0 - for i in range(len(layers) - 1): - params += layers[i] * layers[i + 1] - # Add bias terms - params += sum(layers[1:]) - return params - - -# Function to convert large number into a string with 'k' for thousands -def func(x, pos): # formatter function takes tick label and tick position - if x == 0: - return f"0" - else: - s = f'{x / 1000000}M' - return s - - -def draw_parameter_performance(): - # extract train_auc and valid_auc into separate lists - for dataset, architectures in data_dict.items(): - fig, ax = plt.subplots(figsize=(6.4, 4)) - print(dataset) - param_sizes = [] - valid_auc = [] - for architecture, epochs in architectures.items(): - for epoch, metrics in epochs.items(): - if str(epoch_sampled[dataset]) == epoch: - param_sizes.append(compute_params(architecture)) - valid_auc.append(metrics["valid_auc"]) - break - - plt.scatter(param_sizes, valid_auc) - y_format = ticker.FuncFormatter(func) - ax.xaxis.set_major_formatter(y_format) - plt.grid() - plt.xlabel('Parameter Size') - plt.ylabel('Validation AUC') - # plt.legend(loc='upper left', fontsize=set_lgend_size) - plt.tight_layout() - export_legend(ori_fig=fig, colnum=5) - fig.savefig(f"para_{dataset}.jpg", bbox_inches='tight') - - -dataset_used = "frappe" -# dataset_used = "uci_diabetes" -# dataset_used = "criteo" - -epoch_sampled = {"frappe": 19, "uci_diabetes": 35, "criteo": 9} - -if dataset_used == "frappe": - mlp_train_frappe = os.path.join( - base_dir, - "tab_data/frappe/all_train_baseline_frappe.json") - data_dict = read_json(mlp_train_frappe) -elif dataset_used == "uci_diabetes": - mlp_train_uci_diabetes = os.path.join( - base_dir, - "tab_data/uci_diabetes/all_train_baseline_uci_160k_40epoch.json") - - data_dict = 
read_json(mlp_train_uci_diabetes) -elif dataset_used == "criteo": - mlp_train_criteo = os.path.join( - base_dir, - "tab_data/criteo/all_train_baseline_criteo.json") - - data_dict = read_json(mlp_train_criteo) -else: - print("err") - -draw_parameter_performance() diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/init_env b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/init_env deleted file mode 100644 index b3204ea062..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/init_env +++ /dev/null @@ -1,12 +0,0 @@ - - - - -export PYTHONPATH=$PYTHONPATH:/project/TRAILS/internal/ml/model_selection -conda activate trails - - - -export PYTHONPATH=$PYTHONPATH:./internal/ml/model_selection -conda activate trails - diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/anytime_img_w_baseline.sh b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/anytime_img_w_baseline.sh deleted file mode 100644 index aef3810537..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/anytime_img_w_baseline.sh +++ /dev/null @@ -1,58 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
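compute_params in measure_param_auc.py above reads an architecture string such as "128-64-32" as a chain of layer widths, then counts the weights between consecutive widths plus one bias per unit after the input layer (any embedding or output layers fall outside this count). A worked example:

```python
def compute_params(architecture):
    layers = [int(layer) for layer in architecture.split('-')]
    # Weight matrices between consecutive layers...
    params = sum(layers[i] * layers[i + 1] for i in range(len(layers) - 1))
    # ...plus one bias per unit in every non-input layer.
    return params + sum(layers[1:])

# 128*64 + 64*32 weights, plus 64 + 32 biases = 8192 + 2048 + 96 = 10336
assert compute_params("128-64-32") == 10336
```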
-# - -export PYTHONPATH=$PYTHONPATH:./internal/ml/model_selection -conda activate trails - - - -############## c10 dataset ############## -# run both 2phase-MS and training-free MS -python internal/ml/model_selection/exps/macro/anytime_img.py \ - --search_space nasbench201 \ - --api_loc NAS-Bench-201-v1_1-096897.pth \ - --epoch 200 \ - --dataset cifar10 \ - --num_labels 10 \ - --base_dir ../exp_data/ \ - --result_dir ./internal/ml/model_selection/exp_result/ - - -############## c100 dataset ############## -python internal/ml/model_selection/exps/macro/anytime_img.py \ - --search_space nasbench201 \ - --api_loc NAS-Bench-201-v1_1-096897.pth \ - --epoch 200 \ - --dataset cifar100 \ - --num_labels 100 \ - --base_dir ../exp_data/ \ - --result_dir ./internal/ml/model_selection/exp_result/ - - -############## imageNet dataset ############## -python internal/ml/model_selection/exps/macro/anytime_img.py \ - --search_space nasbench201 \ - --api_loc NAS-Bench-201-v1_1-096897.pth \ - --epoch 200 \ - --dataset ImageNet16-120 \ - --num_labels 120 \ - --base_dir ../exp_data/ \ - --result_dir ./internal/ml/model_selection/exp_result/ - - - diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/anytime_tab.sh b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/anytime_tab.sh deleted file mode 100644 index 3bfb947d5e..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/anytime_tab.sh +++ /dev/null @@ -1,142 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -export PYTHONPATH=$PYTHONPATH:./internal/ml/model_selection - - -############## frappe dataset ############## - -# run the 2phase-MS -python internal/ml/model_selection/exps/macro/anytime_simulate.py \ - --search_space mlp_sp \ - --num_layers 4 \ - --hidden_choice_len 20 \ - --batch_size 128 \ - --nfeat 5500 \ - --nfield 10 \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --dataset frappe \ - --num_labels 2 \ - --only_phase1 False \ - --is_simulate True \ - --device cpu \ - --log_folder any_time_frappe \ - --result_dir ./internal/ml/model_selection/exp_result/ \ - --num_points 5 - - -# run the training-free MS -python internal/ml/model_selection/exps/macro/anytime_simulate.py \ - --search_space mlp_sp \ - --num_layers 4 \ - --hidden_choice_len 20 \ - --batch_size 128 \ - --nfeat 5500 \ - --nfield 10 \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --dataset frappe \ - --num_labels 2 \ - --only_phase1 True \ - --is_simulate True \ - --device cpu \ - --log_folder any_time_frappe \ - --result_dir ./internal/ml/model_selection/exp_result/ \ - --num_points 5 - - -############## uci dataset ############## - -# run the 2phase-MS -python internal/ml/model_selection/exps/macro/anytime_simulate.py \ - --search_space mlp_sp \ - --num_layers 4 \ - --hidden_choice_len 20 \ - --batch_size 128 \ - --nfeat 369 \ - --nfield 43 \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --dataset uci_diabetes \ - --num_labels 2 \ - --only_phase1 False \ - --is_simulate True \ - --device cpu \ - --log_folder any_time_uci_diabetes \ - --result_dir ./internal/ml/model_selection/exp_result/ \ - --num_points 5 - - -# run the training-free MS -python internal/ml/model_selection/exps/macro/anytime_simulate.py \ - --search_space mlp_sp \ - --num_layers 4 \ - --hidden_choice_len 20 \ - --batch_size 128 \ - --nfeat 369 \ - --nfield 43 \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --dataset uci_diabetes \ - --num_labels 2 \ - --only_phase1 True \ - --is_simulate True \ - --device cpu \ - --log_folder any_time_uci_diabetes \ - --result_dir ./internal/ml/model_selection/exp_result/ \ - --num_points 5 - - -############## criteo dataset ############## - -# run the 2phase-MS -python internal/ml/model_selection/exps/macro/anytime_simulate.py \ - --search_space mlp_sp \ - --num_layers 4 \ - --hidden_choice_len 10 \ - --batch_size 128 \ - --nfeat 2100000 \ - --nfield 39 \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --dataset criteo \ - --num_labels 2 \ - --only_phase1 False \ - --is_simulate True \ - --device cpu \ - --log_folder any_time_criteo \ - --result_dir ./internal/ml/model_selection/exp_result/ \ - --num_points 5 - - -# run the training-free MS -python internal/ml/model_selection/exps/macro/anytime_simulate.py \ - --search_space mlp_sp \ - --num_layers 4 \ - --hidden_choice_len 10 \ - --batch_size 128 \ - --nfeat 2100000 \ - --nfield 39 \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --dataset criteo \ - --num_labels 2 \ - --only_phase1 True \ - --is_simulate True \ - --device cpu \ - --log_folder any_time_criteo \ - --result_dir ./internal/ml/model_selection/exp_result/ \ - --num_points 5 - - - diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/baseline_system_img.sh b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/baseline_system_img.sh deleted file mode 100644 index 20d6679177..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/baseline_system_img.sh +++ 
/dev/null @@ -1,61 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -export PYTHONPATH=$PYTHONPATH:./internal/ml/model_selection -conda activate trails - - - -# run both training-based MS -############## c10 dataset ############## -python internal/ml/model_selection/exps/baseline/train_with_ea.py \ - --search_space nasbench201 \ - --api_loc NAS-Bench-201-v1_1-096897.pth \ - --epoch 200 \ - --dataset cifar10 \ - --num_labels 10 \ - --base_dir ../exp_data/ \ - --log_folder log_baseline_c10 \ - --result_dir ./internal/ml/model_selection/exp_result/ - - -############## c100 dataset ############## -python internal/ml/model_selection/exps/baseline/train_with_ea.py \ - --search_space nasbench201 \ - --api_loc NAS-Bench-201-v1_1-096897.pth \ - --epoch 200 \ - --dataset cifar100 \ - --num_labels 100 \ - --base_dir ../exp_data/ \ - --log_folder log_baseline_c100 \ - --result_dir ./internal/ml/model_selection/exp_result/ - - -############## ImgNet dataset ############## -python internal/ml/model_selection/exps/baseline/train_with_ea.py \ - --search_space nasbench201 \ - --api_loc NAS-Bench-201-v1_1-096897.pth \ - --epoch 200 \ - --dataset ImageNet16-120 \ - --num_labels 120 \ - --base_dir ../exp_data/ \ - --log_folder log_baseline_imgnet \ - --result_dir ./internal/ml/model_selection/exp_result/ - - - diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/baseline_system_tab.sh b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/baseline_system_tab.sh deleted file mode 100644 index 41a2d00560..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/baseline_system_tab.sh +++ /dev/null @@ -1,83 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -export PYTHONPATH=$PYTHONPATH:./internal/ml/model_selection -conda activate trails - - - -# run both training-based MS -############## frappe dataset ############## -python internal/ml/model_selection/exps/baseline/train_with_ea.py \ - --search_space mlp_sp \ - --num_layers 4 \ - --hidden_choice_len 20 \ - --epoch 19 \ - --batch_size=512 \ - --lr=0.001 \ - --iter_per_epoch=200 \ - --nfeat=5500 \ - --nfield=10 \ - --nemb=10 \ - --base_dir ../exp_data/ \ - --dataset frappe \ - --num_labels 2 \ - --device=cpu \ - --log_folder baseline_frappe \ - --result_dir ./internal/ml/model_selection/exp_result/ - - -############## uci dataset ############## -python internal/ml/model_selection/exps/baseline/train_with_ea.py \ - --search_space mlp_sp \ - --num_layers 4 \ - --hidden_choice_len 20 \ - --epoch 0 \ - --batch_size=1024 \ - --lr=0.001 \ - --iter_per_epoch=200 \ - --nfeat=369 \ - --nfield=43 \ - --nemb=10 \ - --base_dir ../exp_data/ \ - --dataset uci_diabetes \ - --num_labels 2 \ - --device=cpu \ - --log_folder baseline_uci_diabetes \ - --result_dir ./internal/ml/model_selection/exp_result/ - - -############## criteo dataset ############## -python internal/ml/model_selection/exps/baseline/train_with_ea.py \ - --search_space mlp_sp \ - --num_layers 4 \ - --hidden_choice_len 10 \ - --epoch 9 \ - --batch_size=1024 \ - --lr=0.001 \ - --iter_per_epoch=2000 \ - --nfeat=2100000 \ - --nfield=39 \ - --nemb=10 \ - --base_dir ../exp_data/ \ - --dataset criteo \ - --num_labels 2 \ - --device=cpu \ - --log_folder baseline_criteo \ - --result_dir ./internal/ml/model_selection/exp_result/ - diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/benchmark_weight_sharing.sh b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/benchmark_weight_sharing.sh deleted file mode 100644 index d1c2db5b2a..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/benchmark_weight_sharing.sh +++ /dev/null @@ -1,41 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -export PYTHONPATH=$PYTHONPATH:./internal/ml/model_selection -conda activate trails - - -python ./internal/ml/model_selection/exps/micro/resp/benchmark_weight_sharing.py \ - --log_name=baseline_train_based \ - --search_space=mlp_sp \ - --num_layers=4 \ - --hidden_choice_len=20 \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --num_labels=2 \ - --device=cuda:0 \ - --batch_size=512 \ - --lr=0.001 \ - --epoch=20 \ - --iter_per_epoch=200 \ - --dataset=frappe \ - --nfeat=5500 \ - --nfield=10 \ - --nemb=10 \ - --log_folder=log_frappe \ - --total_models_per_worker=-1 \ - --result_dir=./internal/ml/model_selection/exp_result/ \ No newline at end of file diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/latency_embedding_cache.sh b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/latency_embedding_cache.sh deleted file mode 100644 index e9068a2bdd..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/latency_embedding_cache.sh +++ /dev/null @@ -1,138 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -export PYTHONPATH=$PYTHONPATH:./internal/ml/model_selection - -# frappe -python3 ./internal/ml/model_selection/exps/micro/benchmark_filtering_latency.py \ - --embedding_cache_filtering=True \ - --tfmem=express_flow \ - --models_explore=5000 \ - --search_space=mlp_sp \ - --num_layers=4 \ - --hidden_choice_len=20 \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --num_labels=2 \ - --device=cuda:0 \ - --batch_size=32 \ - --dataset=frappe \ - --nfeat=5500 \ - --nfield=10 \ - --nemb=10 \ - --workers=0 \ - --result_dir=./internal/ml/model_selection/exp_result/ - -#criteo -python3 ./internal/ml/model_selection/exps/micro/benchmark_filtering_latency.py \ - --embedding_cache_filtering=True \ - --tfmem=express_flow \ - --models_explore=5000 \ - --search_space=mlp_sp \ - --num_layers=4 \ - --hidden_choice_len=10 \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --num_labels=2 \ - --device=cuda:0 \ - --batch_size=32 \ - --dataset=criteo \ - --nfeat=2100000 \ - --nfield=39 \ - --nemb=10 \ - --workers=0 \ - --result_dir=./internal/ml/model_selection/exp_result/ - -# uci -python3 ./internal/ml/model_selection/exps/micro/benchmark_filtering_latency.py \ - --embedding_cache_filtering=True \ - --tfmem=express_flow \ - --models_explore=5000 \ - --search_space=mlp_sp \ - --num_layers=4 \ - --hidden_choice_len=20 \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --num_labels=2 \ - --device=cuda:0 \ - --batch_size=32 \ - --dataset=uci_diabetes \ - --nfeat=369 \ - --nfield=43 \ - --nemb=10 \ - --workers=0 \ - --result_dir=./internal/ml/model_selection/exp_result/ - -########################## CPU ############################## -# this is run on cpu, only change the device==cpu for all above - -# frappe -python3 ./internal/ml/model_selection/exps/micro/benchmark_filtering_latency.py \ - --embedding_cache_filtering=True \ - --tfmem=express_flow \ - --models_explore=5000 \ - --search_space=mlp_sp \ - --num_layers=4 \ - --hidden_choice_len=20 \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --num_labels=2 \ - --device=cpu \ - --batch_size=32 \ - --dataset=frappe \ - --nfeat=5500 \ - --nfield=10 \ - --nemb=10 \ - --workers=0 \ - --result_dir=./internal/ml/model_selection/exp_result/ - -#criteo -python3 ./internal/ml/model_selection/exps/micro/benchmark_filtering_latency.py \ - --embedding_cache_filtering=True \ - --tfmem=express_flow \ - --models_explore=5000 \ - --search_space=mlp_sp \ - --num_layers=4 \ - --hidden_choice_len=10 \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --num_labels=2 \ - --device=cpu \ - --batch_size=32 \ - --dataset=criteo \ - --nfeat=2100000 \ - --nfield=39 \ - --nemb=10 \ - --workers=0 \ - --result_dir=./internal/ml/model_selection/exp_result/ - -# uci -python3 ./internal/ml/model_selection/exps/micro/benchmark_filtering_latency.py \ - --embedding_cache_filtering=True \ - --tfmem=express_flow \ - --models_explore=5000 \ - --search_space=mlp_sp \ - --num_layers=4 \ - --hidden_choice_len=20 \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --num_labels=2 \ - --device=cpu \ - --batch_size=32 \ - --dataset=uci_diabetes \ - --nfeat=369 \ - --nfield=43 \ - --nemb=10 \ - --workers=0 \ - --result_dir=./internal/ml/model_selection/exp_result/ - - diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/latency_embedding_cache_concurrent.sh b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/latency_embedding_cache_concurrent.sh deleted file mode 100644 index f7eb18954e..0000000000 --- 
a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/latency_embedding_cache_concurrent.sh +++ /dev/null @@ -1,155 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -export PYTHONPATH=$PYTHONPATH:./internal/ml/model_selection -conda activate trails - - - -########################## CPU ############################## -# this is run on cpu, only change the device==cpu for all above - -# frappe -python3 ./internal/ml/model_selection/exps/micro/benchmark_filtering_latency_concurrent.py \ - --concurrency=8 \ - --embedding_cache_filtering=True \ - --tfmem=express_flow \ - --models_explore=5000 \ - --log_name=score_based \ - --search_space=mlp_sp \ - --num_layers=4 \ - --hidden_choice_len=20 \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --num_labels=2 \ - --device=cpu \ - --batch_size=32 \ - --dataset=frappe \ - --nfeat=5500 \ - --nfield=10 \ - --nemb=10 \ - --workers=0 \ - --result_dir=./internal/ml/model_selection/exp_current_filter_cache/ \ - --log_folder=log_score_time_frappe_cache - -#criteo -python3 ./internal/ml/model_selection/exps/micro/benchmark_filtering_latency_concurrent.py \ - --embedding_cache_filtering=True \ - --tfmem=express_flow \ - --models_explore=5000 \ - --log_name=score_based \ - --search_space=mlp_sp \ - --num_layers=4 \ - --hidden_choice_len=10 \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --num_labels=2 \ - --device=cpu \ - --batch_size=32 \ - --dataset=criteo \ - --nfeat=2100000 \ - --nfield=39 \ - --nemb=10 \ - --workers=0 \ - --result_dir=./internal/ml/model_selection/exp_current_filter_cache/ \ - --log_folder=log_score_time_frappe_cache - -# uci -python3 ./internal/ml/model_selection/exps/micro/benchmark_filtering_latency_concurrent.py \ - --embedding_cache_filtering=True \ - --tfmem=express_flow \ - --models_explore=5000 \ - --log_name=score_based \ - --search_space=mlp_sp \ - --num_layers=4 \ - --hidden_choice_len=20 \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --num_labels=2 \ - --device=cpu \ - --batch_size=32 \ - --dataset=uci_diabetes \ - --nfeat=369 \ - --nfield=43 \ - --nemb=10 \ - --workers=0 \ - --result_dir=./internal/ml/model_selection/exp_current_filter_cache/ \ - --log_folder=log_score_time_frappe_cache - - -# here is concurrent run but no embedding cache -####################################################################################### - -# frappe -python3 ./internal/ml/model_selection/exps/micro/benchmark_filtering_latency_concurrent.py \ - --tfmem=express_flow \ - --models_explore=5000 \ - --log_name=score_based \ - --search_space=mlp_sp \ - --num_layers=4 \ - --hidden_choice_len=20 \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --num_labels=2 \ - --device=cpu \ - --batch_size=32 \ - --dataset=frappe \ - --nfeat=5500 \ - --nfield=10 \ - --nemb=10 \ - 
--workers=0 \ - --result_dir=./internal/ml/model_selection/exp_current_filter_no_cache/ \ - --log_folder=log_score_time_frappe_cache - -#criteo -python3 ./internal/ml/model_selection/exps/micro/benchmark_filtering_latency_concurrent.py \ - --tfmem=express_flow \ - --models_explore=5000 \ - --log_name=score_based \ - --search_space=mlp_sp \ - --num_layers=4 \ - --hidden_choice_len=10 \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --num_labels=2 \ - --device=cpu \ - --batch_size=32 \ - --dataset=criteo \ - --nfeat=2100000 \ - --nfield=39 \ - --nemb=10 \ - --workers=0 \ - --result_dir=./internal/ml/model_selection/exp_current_filter_no_cache/ \ - --log_folder=log_score_time_frappe_cache - -# uci -python3 ./internal/ml/model_selection/exps/micro/benchmark_filtering_latency_concurrent.py \ - --tfmem=express_flow \ - --models_explore=5000 \ - --log_name=score_based \ - --search_space=mlp_sp \ - --num_layers=4 \ - --hidden_choice_len=20 \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --num_labels=2 \ - --device=cpu \ - --batch_size=32 \ - --dataset=uci_diabetes \ - --nfeat=369 \ - --nfield=43 \ - --nemb=10 \ - --workers=0 \ - --result_dir=./internal/ml/model_selection/exp_current_filter_no_cache/ \ - --log_folder=log_score_time_frappe_cache - - diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/latency_phase1_cpu_gpu.sh b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/latency_phase1_cpu_gpu.sh deleted file mode 100644 index 65aabd3c85..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/latency_phase1_cpu_gpu.sh +++ /dev/null @@ -1,227 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
-# - -export PYTHONPATH=$PYTHONPATH:./internal/ml/model_selection - - -# frappe -python3 ./internal/ml/model_selection/exps/micro/benchmark_filtering_latency.py \ - --embedding_cache_filtering=False \ - --tfmem=express_flow \ - --models_explore=5000 \ - --search_space=mlp_sp \ - --num_layers=4 \ - --hidden_choice_len=20 \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --num_labels=2 \ - --device=cuda:0 \ - --batch_size=32 \ - --dataset=frappe \ - --nfeat=5500 \ - --nfield=10 \ - --nemb=10 \ - --workers=0 \ - --result_dir=./internal/ml/model_selection/exp_result_sever_wo_cache/ - -#criteo -python3 ./internal/ml/model_selection/exps/micro/benchmark_filtering_latency.py \ - --embedding_cache_filtering=False \ - --tfmem=express_flow \ - --models_explore=5000 \ - --search_space=mlp_sp \ - --num_layers=4 \ - --hidden_choice_len=10 \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --num_labels=2 \ - --device=cuda:0 \ - --batch_size=32 \ - --dataset=criteo \ - --nfeat=2100000 \ - --nfield=39 \ - --nemb=10 \ - --workers=0 \ - --result_dir=./internal/ml/model_selection/exp_result_sever_wo_cache/ - -# uci -python3 ./internal/ml/model_selection/exps/micro/benchmark_filtering_latency.py \ - --embedding_cache_filtering=False \ - --tfmem=express_flow \ - --models_explore=5000 \ - --search_space=mlp_sp \ - --num_layers=4 \ - --hidden_choice_len=20 \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --num_labels=2 \ - --device=cuda:0 \ - --batch_size=32 \ - --dataset=uci_diabetes \ - --nfeat=369 \ - --nfield=43 \ - --nemb=10 \ - --workers=0 \ - --result_dir=./internal/ml/model_selection/exp_result_sever_wo_cache/ - - -# cifar 10 -python3 ./internal/ml/model_selection/exps/micro/benchmark_filtering_latency.py \ - --embedding_cache_filtering=False \ - --tfmem=express_flow \ - --models_explore=5000 \ - --search_space=nasbench201 \ - --api_loc=NAS-Bench-201-v1_1-096897.pth \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --num_labels=10 \ - --device=cuda:0 \ - --batch_size=32 \ - --dataset=cifar10 \ - --result_dir=./internal/ml/model_selection/exp_result_sever_wo_cache/ - - -# cifar 100 -python3 ./internal/ml/model_selection/exps/micro/benchmark_filtering_latency.py \ - --embedding_cache_filtering=False \ - --tfmem=express_flow \ - --models_explore=5000 \ - --search_space=nasbench201 \ - --api_loc=NAS-Bench-201-v1_1-096897.pth \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --num_labels=100 \ - --device=cuda:0 \ - --batch_size=32 \ - --dataset=cifar100 \ - --result_dir=./internal/ml/model_selection/exp_result_sever_wo_cache/ - - -# imageNet -python3 ./internal/ml/model_selection/exps/micro/benchmark_filtering_latency.py \ - --embedding_cache_filtering=False \ - --tfmem=express_flow \ - --models_explore=5000 \ - --search_space=nasbench201 \ - --api_loc=NAS-Bench-201-v1_1-096897.pth \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --num_labels=120 \ - --device=cuda:0 \ - --batch_size=32 \ - --dataset=ImageNet16-120 \ - --result_dir=./internal/ml/model_selection/exp_result_sever_wo_cache/ - -########################## CPU ############################## -# this is run on cpu, only change the device==cpu for all above - -# frappe -python3 ./internal/ml/model_selection/exps/micro/benchmark_filtering_latency.py \ - --embedding_cache_filtering=False \ - --tfmem=express_flow \ - --models_explore=5000 \ - --search_space=mlp_sp \ - --num_layers=4 \ - --hidden_choice_len=20 \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --num_labels=2 \ - --device=cpu \ - --batch_size=32 \ - --dataset=frappe \ - --nfeat=5500 \ - --nfield=10 \ - --nemb=10 \ - 
--workers=0 \ - --result_dir=./internal/ml/model_selection/exp_result_sever_wo_cache/ - -# criteo -python3 ./internal/ml/model_selection/exps/micro/benchmark_filtering_latency.py \ - --embedding_cache_filtering=False \ - --tfmem=express_flow \ - --models_explore=5000 \ - --search_space=mlp_sp \ - --num_layers=4 \ - --hidden_choice_len=10 \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --num_labels=2 \ - --device=cpu \ - --batch_size=32 \ - --dataset=criteo \ - --nfeat=2100000 \ - --nfield=39 \ - --nemb=10 \ - --workers=0 \ - --result_dir=./internal/ml/model_selection/exp_result_sever_wo_cache/ - -# uci -python3 ./internal/ml/model_selection/exps/micro/benchmark_filtering_latency.py \ - --embedding_cache_filtering=False \ - --tfmem=express_flow \ - --models_explore=5000 \ - --search_space=mlp_sp \ - --num_layers=4 \ - --hidden_choice_len=20 \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --num_labels=2 \ - --device=cpu \ - --batch_size=32 \ - --dataset=uci_diabetes \ - --nfeat=369 \ - --nfield=43 \ - --nemb=10 \ - --workers=0 \ - --result_dir=./internal/ml/model_selection/exp_result_sever_wo_cache/ - - -# cifar 10 -python3 ./internal/ml/model_selection/exps/micro/benchmark_filtering_latency.py \ - --embedding_cache_filtering=False \ - --tfmem=express_flow \ - --models_explore=5000 \ - --search_space=nasbench201 \ - --api_loc=NAS-Bench-201-v1_1-096897.pth \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --num_labels=10 \ - --device=cpu \ - --batch_size=32 \ - --dataset=cifar10 \ - --result_dir=./internal/ml/model_selection/exp_result_sever_wo_cache/ - - -# cifar 100 -python3 ./internal/ml/model_selection/exps/micro/benchmark_filtering_latency.py \ - --embedding_cache_filtering=False \ - --tfmem=express_flow \ - --models_explore=5000 \ - --search_space=nasbench201 \ - --api_loc=NAS-Bench-201-v1_1-096897.pth \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --num_labels=100 \ - --device=cpu \ - --batch_size=32 \ - --dataset=cifar100 \ - --result_dir=./internal/ml/model_selection/exp_result_sever_wo_cache/ - - -# imageNet -python3 ./internal/ml/model_selection/exps/micro/benchmark_filtering_latency.py \ - --embedding_cache_filtering=False \ - --tfmem=express_flow \ - --models_explore=5000 \ - --search_space=nasbench201 \ - --api_loc=NAS-Bench-201-v1_1-096897.pth \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --num_labels=120 \ - --device=cpu \ - --batch_size=32 \ - --dataset=ImageNet16-120 \ - --result_dir=./internal/ml/model_selection/exp_result_sever_wo_cache/ diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/latency_phase1_in_db.sh b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/latency_phase1_in_db.sh deleted file mode 100644 index 84406263d0..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/latency_phase1_in_db.sh +++ /dev/null @@ -1,77 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -export PYTHONPATH=$PYTHONPATH:./internal/ml/model_selection - - -# frappe -python3 ./internal/ml/model_selection/exps/micro/benchmark_filtering_latency_sql.py \ - --embedding_cache_filtering=True \ - --tfmem=express_flow \ - --models_explore=5000 \ - --search_space=mlp_sp \ - --num_layers=4 \ - --hidden_choice_len=20 \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --num_labels=2 \ - --device=cpu \ - --batch_size=32 \ - --dataset=frappe \ - --nfeat=5500 \ - --nfield=10 \ - --nemb=10 \ - --workers=0 \ - --result_dir=./internal/ml/model_selection/exp_result_sever_cache_sql/ - -#criteo -python3 ./internal/ml/model_selection/exps/micro/benchmark_filtering_latency_sql.py \ - --embedding_cache_filtering=True \ - --tfmem=express_flow \ - --models_explore=5000 \ - --search_space=mlp_sp \ - --num_layers=4 \ - --hidden_choice_len=10 \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --num_labels=2 \ - --device=cpu \ - --batch_size=32 \ - --dataset=criteo \ - --nfeat=2100000 \ - --nfield=39 \ - --nemb=10 \ - --workers=0 \ - --result_dir=./internal/ml/model_selection/exp_result_sever_cache_sql/ - -# uci -python3 ./internal/ml/model_selection/exps/micro/benchmark_filtering_latency_sql.py \ - --embedding_cache_filtering=True \ - --tfmem=express_flow \ - --models_explore=5000 \ - --search_space=mlp_sp \ - --num_layers=4 \ - --hidden_choice_len=20 \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --num_labels=2 \ - --device=cpu \ - --batch_size=32 \ - --dataset=uci_diabetes \ - --nfeat=369 \ - --nfield=43 \ - --nemb=10 \ - --workers=0 \ - --result_dir=./internal/ml/model_selection/exp_result_sever_cache_sql/ diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/latency_phase2.sh b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/latency_phase2.sh deleted file mode 100644 index 84406263d0..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/latency_phase2.sh +++ /dev/null @@ -1,77 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -export PYTHONPATH=$PYTHONPATH:./internal/ml/model_selection - - -# frappe -python3 ./internal/ml/model_selection/exps/micro/benchmark_filtering_latency_sql.py \ - --embedding_cache_filtering=True \ - --tfmem=express_flow \ - --models_explore=5000 \ - --search_space=mlp_sp \ - --num_layers=4 \ - --hidden_choice_len=20 \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --num_labels=2 \ - --device=cpu \ - --batch_size=32 \ - --dataset=frappe \ - --nfeat=5500 \ - --nfield=10 \ - --nemb=10 \ - --workers=0 \ - --result_dir=./internal/ml/model_selection/exp_result_sever_cache_sql/ - -#criteo -python3 ./internal/ml/model_selection/exps/micro/benchmark_filtering_latency_sql.py \ - --embedding_cache_filtering=True \ - --tfmem=express_flow \ - --models_explore=5000 \ - --search_space=mlp_sp \ - --num_layers=4 \ - --hidden_choice_len=10 \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --num_labels=2 \ - --device=cpu \ - --batch_size=32 \ - --dataset=criteo \ - --nfeat=2100000 \ - --nfield=39 \ - --nemb=10 \ - --workers=0 \ - --result_dir=./internal/ml/model_selection/exp_result_sever_cache_sql/ - -# uci -python3 ./internal/ml/model_selection/exps/micro/benchmark_filtering_latency_sql.py \ - --embedding_cache_filtering=True \ - --tfmem=express_flow \ - --models_explore=5000 \ - --search_space=mlp_sp \ - --num_layers=4 \ - --hidden_choice_len=20 \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --num_labels=2 \ - --device=cpu \ - --batch_size=32 \ - --dataset=uci_diabetes \ - --nfeat=369 \ - --nfield=43 \ - --nemb=10 \ - --workers=0 \ - --result_dir=./internal/ml/model_selection/exp_result_sever_cache_sql/ diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/micro_budget_aware_alg.sh b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/micro_budget_aware_alg.sh deleted file mode 100644 index f91ae3ce1f..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/micro_budget_aware_alg.sh +++ /dev/null @@ -1,60 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -export PYTHONPATH=$PYTHONPATH:./internal/ml/model_selection -conda activate trails - - - -############## c10 dataset ############## -# run both 2phase-MS and training-free MS -python internal/ml/model_selection/exps/micro/benchmark_budget_aware_alg.py \ - --search_space nasbench201 \ - --api_loc NAS-Bench-201-v1_1-096897.pth \ - --dataset cifar10 \ - --epoch 200 \ - --base_dir ../exp_data/ \ - --log_name logs_default \ - --result_dir ./internal/ml/model_selection/exp_result/ - - -############## c100 dataset ############## -python internal/ml/model_selection/exps/micro/benchmark_budget_aware_alg.py \ - --search_space nasbench201 \ - --api_loc NAS-Bench-201-v1_1-096897.pth \ - --dataset cifar100 \ - --epoch 200 \ - --base_dir ../exp_data/ \ - --log_name logs_default \ - --result_dir ./internal/ml/model_selection/exp_result/ - - -############## imageNet dataset ############## -python internal/ml/model_selection/exps/micro/benchmark_budget_aware_alg.py \ - --search_space nasbench201 \ - --api_loc NAS-Bench-201-v1_1-096897.pth \ - --dataset ImageNet16-120 \ - --epoch 200 \ - --base_dir ../exp_data/ \ - --log_name logs_default \ - --result_dir ./internal/ml/model_selection/exp_result/ - - - -############## draw graphs ############## -python internal/ml/model_selection/exps/micro/draw_budget_aware_alg.py diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/micro_nku_tradeoff.sh b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/micro_nku_tradeoff.sh deleted file mode 100644 index 6aec7c1959..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/micro_nku_tradeoff.sh +++ /dev/null @@ -1,179 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
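For reference, the budget-aware runs above reduce to a simple two-phase loop: phase 1 ranks every candidate with a cheap training-free score, and phase 2 trains only the top-K survivors under the remaining budget. A minimal Python sketch of that control flow, with score_model and train_model as hypothetical stand-ins rather than functions from this repository:

import random

def two_phase_select(candidates, k, score_model, train_model):
    # Phase 1: rank all candidates by a training-free proxy score.
    ranked = sorted(candidates, key=score_model, reverse=True)
    # Phase 2: fully evaluate only the K highest-scoring survivors.
    return max(ranked[:k], key=train_model)

# Toy usage with random stand-ins for both evaluators.
best = two_phase_select(list(range(100)), k=10,
                        score_model=lambda a: random.random(),
                        train_model=lambda a: random.random())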
-# - -export PYTHONPATH=$PYTHONPATH:./internal/ml/model_selection -conda activate trails - - - - -# ==================================== -# ==================================== -# determine the K and U tradeoff -# ==================================== -# ==================================== -# frappe -python internal/ml/model_selection/exps/micro/benchmark_ku.py \ - --search_space mlp_sp \ - --epoch 20 \ - --hidden_choice_len 20 \ - --dataset frappe \ - --base_dir ../exp_data/ \ - --only_phase1 True \ - --is_simulate True \ - --log_folder log_ku_tradeoff - - -# uci -python internal/ml/model_selection/exps/micro/benchmark_ku.py \ - --search_space mlp_sp \ - --hidden_choice_len 20 \ - --epoch 5 \ - --dataset uci_diabetes \ - --base_dir ../exp_data/ \ - --only_phase1 True \ - --is_simulate True \ - --log_folder log_ku_tradeoff - -# criteo -python internal/ml/model_selection/exps/micro/benchmark_ku.py \ - --search_space mlp_sp \ - --hidden_choice_len 10 \ - --epoch 10 \ - --dataset criteo \ - --base_dir ../exp_data/ \ - --only_phase1 True \ - --is_simulate True \ - --log_folder log_ku_tradeoff - - -# c10 -python internal/ml/model_selection/exps/micro/benchmark_ku.py \ - --search_space nasbench201 \ - --api_loc NAS-Bench-201-v1_1-096897.pth \ - --epoch 200 \ - --dataset cifar10 \ - --base_dir ../exp_data/ \ - --only_phase1 True \ - --is_simulate True \ - --log_folder log_ku_tradeoff - - -# c100 -python internal/ml/model_selection/exps/micro/benchmark_ku.py \ - --search_space nasbench201 \ - --api_loc NAS-Bench-201-v1_1-096897.pth \ - --epoch 200 \ - --dataset cifar100 \ - --base_dir ../exp_data/ \ - --only_phase1 True \ - --is_simulate True \ - --log_folder log_ku_tradeoff - - -# imageNet -python internal/ml/model_selection/exps/micro/benchmark_ku.py \ - --search_space nasbench201 \ - --api_loc NAS-Bench-201-v1_1-096897.pth \ - --epoch 200 \ - --dataset ImageNet16-120 \ - --base_dir ../exp_data/ \ - --only_phase1 True \ - --is_simulate True \ - --log_folder log_ku_tradeoff - - - -# ==================================== -# ==================================== -# determine the N and K tradeoff -# ==================================== -# ==================================== - - -# frappe -python internal/ml/model_selection/exps/micro/benchmark_nk.py \ - --search_space mlp_sp \ - --epoch 20 \ - --hidden_choice_len 20 \ - --dataset frappe \ - --base_dir ../exp_data/ \ - --only_phase1 True \ - --is_simulate True \ - --log_folder log_ku_tradeoff - - -# uci -python internal/ml/model_selection/exps/micro/benchmark_nk.py \ - --search_space mlp_sp \ - --hidden_choice_len 20 \ - --epoch 5 \ - --dataset uci_diabetes \ - --base_dir ../exp_data/ \ - --only_phase1 True \ - --is_simulate True \ - --log_folder log_ku_tradeoff - - -# criteo -python internal/ml/model_selection/exps/micro/benchmark_nk.py \ - --search_space mlp_sp \ - --hidden_choice_len 10 \ - --epoch 10 \ - --dataset criteo \ - --base_dir ../exp_data/ \ - --only_phase1 True \ - --is_simulate True \ - --log_folder log_ku_tradeoff - - - -# c10 -python internal/ml/model_selection/exps/micro/benchmark_nk.py \ - --search_space nasbench201 \ - --api_loc NAS-Bench-201-v1_1-096897.pth \ - --epoch 200 \ - --dataset cifar10 \ - --base_dir ../exp_data/ \ - --only_phase1 True \ - --is_simulate True \ - --log_folder log_ku_tradeoff - - -# c100 -python internal/ml/model_selection/exps/micro/benchmark_nk.py \ - --search_space nasbench201 \ - --api_loc NAS-Bench-201-v1_1-096897.pth \ - --epoch 200 \ - --dataset cifar100 \ - --base_dir ../exp_data/ \ - --only_phase1 True \ -
--is_simulate True \ - --log_folder log_ku_tradeoff - - -# imageNet -python internal/ml/model_selection/exps/micro/benchmark_nk.py \ - --search_space nasbench201 \ - --api_loc NAS-Bench-201-v1_1-096897.pth \ - --epoch 200 \ - --dataset ImageNet16-120 \ - --base_dir ../exp_data/ \ - --only_phase1 True \ - --is_simulate True \ - --log_folder log_ku_tradeoff - diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/micro_score_metrics_relation.sh b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/micro_score_metrics_relation.sh deleted file mode 100644 index 3e55e9e3d8..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/micro_score_metrics_relation.sh +++ /dev/null @@ -1,54 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -export PYTHONPATH=$PYTHONPATH:./internal/ml/model_selection - - -############## Frappe ############## -# run both 2phase-MS and training-free MS -python ./internal/ml/model_selection/exps/micro/benchmark_score_metrics.py \ - --tfmem=express_flow \ - --search_space mlp_sp \ - --dataset frappe \ - --base_dir ../exp_data/ \ - --log_name logs_default \ - --result_dir ./internal/ml/model_selection/exp_result/ - - -############## criteo dataset ############## -python ./internal/ml/model_selection/exps/micro/benchmark_score_metrics.py \ - --tfmem=express_flow \ - --search_space mlp_sp \ - --dataset criteo \ - --base_dir ../exp_data/ \ - --log_name logs_default \ - --result_dir ./internal/ml/model_selection/exp_result/ - - -############## Uci dataset ############## -python ./internal/ml/model_selection/exps/micro/benchmark_score_metrics.py \ - --tfmem=express_flow \ - --search_space=mlp_sp \ - --dataset uci_diabetes \ - --base_dir ../exp_data/ \ - --log_name logs_default \ - --result_dir ./internal/ml/model_selection/exp_result/ - - -############## draw graphs ############## -python ./internal/ml/model_selection/exps/micro/draw_score_metric_relation.py diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/micro_search_strategy.sh b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/micro_search_strategy.sh deleted file mode 100644 index 4b9f375666..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/micro_search_strategy.sh +++ /dev/null @@ -1,70 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -export PYTHONPATH=$PYTHONPATH:./internal/ml/model_selection - -# rs -python internal/ml/model_selection/exps/baseline/train_with_random.py \ - --search_space mlp_sp \ - --num_layers 4 \ - --hidden_choice_len 20 \ - --epoch 19 \ - --batch_size=512 \ - --lr=0.001 \ - --iter_per_epoch=200 \ - --nfeat=5500 \ - --nfield=10 \ - --nemb=10 \ - --base_dir ../exp_data/ \ - --dataset frappe \ - --num_labels 2 \ - --device=cpu \ - --log_folder baseline_frappe \ - --result_dir ./internal/ml/model_selection/exp_result/ - - -# rl -python internal/ml/model_selection/exps/baseline/train_with_rl.py - - -# re -python internal/ml/model_selection/exps/baseline/train_with_ea.py \ - --search_space mlp_sp \ - --num_layers 4 \ - --hidden_choice_len 20 \ - --epoch 19 \ - --batch_size=512 \ - --lr=0.001 \ - --iter_per_epoch=200 \ - --nfeat=5500 \ - --nfield=10 \ - --nemb=10 \ - --base_dir ../exp_data/ \ - --dataset frappe \ - --num_labels 2 \ - --device=cpu \ - --log_folder baseline_frappe \ - --result_dir ./internal/ml/model_selection/exp_result/ - -# bohb -python internal/ml/model_selection/exps/baseline/train_bohb.py - -############## draw the graph ############## -python internal/ml/model_selection/exps/baseline/draw_benchmark_train_based.py --dataset frappe - - diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/nas-bench-img/convert_api_2_json.sh b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/nas-bench-img/convert_api_2_json.sh deleted file mode 100644 index 8d71ff283b..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/nas-bench-img/convert_api_2_json.sh +++ /dev/null @@ -1,29 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
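The train_with_random.py baseline above boils down to sampling architectures uniformly and keeping the best one evaluated so far. A minimal sketch of that loop, assuming hypothetical sample_arch and evaluate callables:

import random

def random_search(sample_arch, evaluate, n_trials=100, seed=0):
    rng = random.Random(seed)
    best_arch, best_acc = None, float("-inf")
    for _ in range(n_trials):
        arch = sample_arch(rng)   # draw one architecture at random
        acc = evaluate(arch)      # train/evaluate it under the fixed budget
        if acc > best_acc:
            best_arch, best_acc = arch, acc
    return best_arch, best_acc

# Toy usage with random stand-ins for the sampler and the evaluator.
best, acc = random_search(lambda rng: rng.randrange(500),
                          lambda a: random.random())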
-# - -export PYTHONPATH=$PYTHONPATH:./internal/ml/model_selection -conda activate trails - - -# pip install nats_bench - -python internal/ml/model_selection/exps/nas_bench_img/0_characterize_gt.py -python internal/ml/model_selection/exps/nas_bench_img/0_parse_testacc_101.py -python internal/ml/model_selection/exps/nas_bench_img/0_parse_testacc_201.py - - diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/nas-bench-img/explore_all_models.sh b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/nas-bench-img/explore_all_models.sh deleted file mode 100644 index aea5ff9e79..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/nas-bench-img/explore_all_models.sh +++ /dev/null @@ -1,77 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -export PYTHONPATH=$PYTHONPATH:./internal/ml/model_selection -conda activate trails - - -# cifar10 + nb101 -python ./internal/ml/model_selection/exps/nas_bench_img/1_explore_models_100_run.py \ - --search_space=nasbench101 \ - --api_loc=nasbench_only108.pkl \ - --base_dir=../exp_data/ \ - --dataset=cifar10 \ - --num_labels=10 \ - --device=cpu \ - --log_folder=log_img_explore_ea \ - --result_dir=./internal/ml/model_selection/exp_result/ - - -# cifar10 + nb201 -python ./internal/ml/model_selection/exps/nas_bench_img/1_explore_models_100_run.py \ - --search_space=nasbench201 \ - --api_loc=NAS-Bench-201-v1_1-096897.pth \ - --base_dir=../exp_data/ \ - --dataset=cifar10 \ - --init_channels=16 \ - --num_stacks=3 \ - --num_modules_per_stack=3 \ - --num_labels=10 \ - --device=cpu \ - --log_folder=log_img_explore_ea \ - --result_dir=./internal/ml/model_selection/exp_result/ - - -# cifar100 + nb201 -python ./internal/ml/model_selection/exps/nas_bench_img/1_explore_models_100_run.py \ - --search_space=nasbench201 \ - --api_loc=NAS-Bench-201-v1_1-096897.pth \ - --base_dir=../exp_data/ \ - --dataset=cifar100 \ - --init_channels=16 \ - --num_stacks=3 \ - --num_modules_per_stack=3 \ - --num_labels=100 \ - --device=cpu \ - --log_folder=log_img_explore_ea \ - --result_dir=./internal/ml/model_selection/exp_result/ - - -# imgnet + nb201 -python ./internal/ml/model_selection/exps/nas_bench_img/1_explore_models_100_run.py \ - --search_space=nasbench201 \ - --api_loc=NAS-Bench-201-v1_1-096897.pth \ - --base_dir=../exp_data/ \ - --dataset=ImageNet16-120 \ - --init_channels=16 \ - --num_stacks=3 \ - --num_modules_per_stack=3 \ - --num_labels=120 \ - --device=cpu \ - --log_folder=log_img_explore_ea \ - --result_dir=./internal/ml/model_selection/exp_result/ diff --git 
a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/nas-bench-img/score_all_models.sh b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/nas-bench-img/score_all_models.sh deleted file mode 100644 index 1e7e9bf3b1..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/nas-bench-img/score_all_models.sh +++ /dev/null @@ -1,75 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -export PYTHONPATH=$PYTHONPATH:./internal/ml/model_selection - - -for i in {1..4} -do - # cifar10 + nb101 -# /home/xingnaili/miniconda3/envs/trails/bin/python ./internal/ml/model_selection/exps/nas_bench_tabular/4.seq_score_online.py \ -# --models_explore=1200 \ -# --search_space=nasbench101 \ -# --api_loc=nasbench_only108.pkl \ -# --base_dir=/hdd1/xingnaili/exp_data/ \ -# --dataset=cifar10 \ -# --batch_size=32 \ -# --num_labels=10 \ -# --device=cuda:0 \ -# --log_folder=log_score_all_img10_101 \ -# --result_dir=./internal/ml/model_selection/exp_result/ - - # cifar10 + nb201 - /home/xingnaili/miniconda3/envs/trails/bin/python ./internal/ml/model_selection/exps/nas_bench_tabular/4.seq_score_online.py \ - --models_explore=1200 \ - --search_space=nasbench201 \ - --api_loc=NAS-Bench-201-v1_1-096897.pth \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --dataset=cifar10 \ - --batch_size=32 \ - --num_labels=10 \ - --device=cpu \ - --log_folder=log_score_all_img10 \ - --result_dir=./internal/ml/model_selection/exp_result/ - - # cifar100 + nb201 - /home/xingnaili/miniconda3/envs/trails/bin/python ./internal/ml/model_selection/exps/nas_bench_tabular/4.seq_score_online.py \ - --models_explore=1200 \ - --search_space=nasbench201 \ - --api_loc=NAS-Bench-201-v1_1-096897.pth \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --dataset=cifar100 \ - --batch_size=32 \ - --num_labels=100 \ - --device=cpu \ - --log_folder=log_score_all_img100 \ - --result_dir=./internal/ml/model_selection/exp_result/ - - # imgnet + nb201 - /home/xingnaili/miniconda3/envs/trails/bin/python ./internal/ml/model_selection/exps/nas_bench_tabular/4.seq_score_online.py \ - --models_explore=1200 \ - --search_space=nasbench201 \ - --api_loc=NAS-Bench-201-v1_1-096897.pth \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --dataset=ImageNet16-120 \ - --batch_size=32 \ - --num_labels=120 \ - --device=cpu \ - --log_folder=log_score_all_img_imgnet \ - --result_dir=./internal/ml/model_selection/exp_result/ -done \ No newline at end of file diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/nas-bench-tabular/score_all_modesl_criteo.sh 
b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/nas-bench-tabular/score_all_modesl_criteo.sh deleted file mode 100644 index 004ecb1a6e..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/nas-bench-tabular/score_all_modesl_criteo.sh +++ /dev/null @@ -1,43 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -export PYTHONPATH=$PYTHONPATH:./internal/ml/model_selection - - -nohup python ./internal/ml/model_selection/exps/nas_bench_tabular/4.seq_score_online.py \ - --embedding_cache_filtering=True \ - --models_explore=9999 \ - --tfmem=express_flow \ - --log_name=score_based \ - --search_space=mlp_sp \ - --num_layers=4 \ - --hidden_choice_len=10 \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --num_labels=2 \ - --device=cpu \ - --batch_size=32 \ - --dataset=criteo \ - --nfeat=2100000 \ - --nfield=39 \ - --nemb=10 \ - --workers=0 \ - --result_dir=./internal/ml/model_selection/exp_result/ \ - --log_folder=log_score_time_criteo > outputCriScorAll.log& - - - diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/nas-bench-tabular/score_all_modesl_frappe.sh b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/nas-bench-tabular/score_all_modesl_frappe.sh deleted file mode 100644 index 81d4ff12a1..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/nas-bench-tabular/score_all_modesl_frappe.sh +++ /dev/null @@ -1,44 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -export PYTHONPATH=$PYTHONPATH:./internal/ml/model_selection -conda activate trails - - -nohup python ./internal/ml/model_selection/exps/nas_bench_tabular/4.seq_score_online.py \ - --embedding_cache_filtering=True \ - --models_explore=159999 \ - --tfmem=express_flow \ - --log_name=score_based \ - --search_space=mlp_sp \ - --num_layers=4 \ - --hidden_choice_len=20 \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --num_labels=2 \ - --device=cpu \ - --batch_size=32 \ - --dataset=frappe \ - --nfeat=5500 \ - --nfield=10 \ - --nemb=10 \ - --workers=0 \ - --result_dir=./internal/ml/model_selection/exp_result/ \ - --log_folder=log_score_time_frappe > output.log& - - - diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/nas-bench-tabular/train_all_models_criteo.sh b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/nas-bench-tabular/train_all_models_criteo.sh deleted file mode 100644 index 3d11671c8d..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/nas-bench-tabular/train_all_models_criteo.sh +++ /dev/null @@ -1,63 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -export PYTHONPATH=$PYTHONPATH:./internal/ml/model_selection -conda activate trails - - -worker_id=0 -GPU_NUM=9 -worker_each_gpu=6 -total_workers=$((worker_each_gpu*GPU_NUM)) - -for((gpu_id=0; gpu_id < GPU_NUM; ++gpu_id)); do -# echo "GPU id is $gpu_id" - for((i=0; i < worker_each_gpu; ++i)); do - echo "Assign task to worker id is $worker_id" - echo "nohup python ./internal/ml/model_selection/exps/nas_bench_tabular/2.seq_train_online.py \ - --log_name=baseline_train_based \ - --search_space=mlp_sp \ - --num_layers=4 \ - --hidden_choice_len=10 \ - --base_dir=../exp_data/ \ - --num_labels=2 \ - --device=cuda:$gpu_id \ - --batch_size=1024 \ - --lr=0.001 \ - --epoch=10 \ - --iter_per_epoch=2000 \ - --dataset=criteo \ - --nfeat=2100000 \ - --nfield=39 \ - --nemb=10 \ - --worker_id=$worker_id \ - --total_workers=$total_workers \ - --workers=0 \ - --log_folder=log_train_criteo \ - --total_models_per_worker=-1 \ - --result_dir=./internal/ml/model_selection/exp_result/ \ - --pre_partitioned_file=./internal/ml/model_selection/exps/nas_bench_tabular/sampled_models_10000_models.json & ">> train_all_models_criteo_seq.sh - -# sleep 1 - worker_id=$((worker_id+1)) - done -done - - -# pkill -9 -f 2.seq_train_online.py -# run with bash internal/ml/model_selection/scripts/nas-bench-tabular/train_all_models_criteo.sh >criteobash & diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/nas-bench-tabular/train_all_models_criteo_distirbuted.sh b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/nas-bench-tabular/train_all_models_criteo_distirbuted.sh deleted file mode 100644 index 39e7701240..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/nas-bench-tabular/train_all_models_criteo_distirbuted.sh +++ /dev/null @@ -1,64 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
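The generator loop above deals worker ids to GPUs in blocks of worker_each_gpu, so every emitted command pins one worker to one device. The same mapping written out in Python for clarity, with the values copied from the script:

GPU_NUM = 9
worker_each_gpu = 6
total_workers = GPU_NUM * worker_each_gpu  # 54 workers overall

# worker 0..5 -> cuda:0, worker 6..11 -> cuda:1, ..., worker 48..53 -> cuda:8
assignments = [(worker_id, "cuda:%d" % (worker_id // worker_each_gpu))
               for worker_id in range(total_workers)]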
-# - -# frappe -python exps/main_v2/ground_truth/2.seq_train_dist_online.py \ - --log_name=baseline_train_based \ - --search_space=mlp_sp \ - --num_layers=4 \ - --hidden_choice_len=20 \ - --base_dir=../exp_data/ \ - --num_labels=1 \ - --device=gpu \ - --batch_size=1024 \ - --lr=0.001 \ - --epoch=10 \ - --iter_per_epoch=100 \ - --dataset=frappe \ - --nfeat=5500 \ - --nfield=10 \ - --nemb=10 \ - --total_models_per_worker=10 \ - --workers=0 \ - --worker_each_gpu=1 \ - --gpu_num=8 \ - --log_folder=LogFrappee \ - --pre_partitioned_file=./exps/main_v2/ground_truth/sampled_models_10000_models.json & - -# criteo -python exps/main_v2/ground_truth/2.seq_train_dist_online.py \ - --log_name=baseline_train_based \ - --search_space=mlp_sp \ - --num_layers=4 \ - --hidden_choice_len=10 \ - --base_dir=../exp_data/ \ - --num_labels=1 \ - --device=gpu \ - --batch_size=1024 \ - --lr=0.001 \ - --epoch=10 \ - --iter_per_epoch=2000 \ - --dataset=criteo \ - --nfeat=2100000 \ - --nfield=39 \ - --nemb=10 \ - --workers=0 \ - --worker_each_gpu=9 \ - --gpu_num=8 \ - --log_folder=LogCriteo \ - --pre_partitioned_file=./exps/main_v2/ground_truth/sampled_models_10000_models.json & diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/nas-bench-tabular/train_all_models_diabetes.sh b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/nas-bench-tabular/train_all_models_diabetes.sh deleted file mode 100644 index 397836405e..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/nas-bench-tabular/train_all_models_diabetes.sh +++ /dev/null @@ -1,63 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -export PYTHONPATH=$PYTHONPATH:./internal/ml/model_selection -conda activate trails - - -worker_id=0 -GPU_NUM=8 -worker_each_gpu=4 -total_workers=$((worker_each_gpu*GPU_NUM)) - -for((gpu_id=0; gpu_id < GPU_NUM; ++gpu_id)); do - for((i=0; i < worker_each_gpu; ++i)); do - - echo "nohup python ./internal/ml/model_selection/exps/nas_bench_tabular/2.seq_train_online.py \ - --log_name=baseline_train_based \ - --search_space=mlp_sp \ - --num_layers=4 \ - --hidden_choice_len=20 \ - --base_dir=../exp_data/ \ - --num_labels=2 \ - --device=cuda:$gpu_id \ - --batch_size=1024 \ - --lr=0.001 \ - --epoch=40 \ - --iter_per_epoch=200 \ - --dataset=uci_diabetes \ - --nfeat=369 \ - --nfield=43 \ - --nemb=10 \ - --worker_id=$worker_id \ - --total_workers=$total_workers \ - --workers=0 \ - --log_folder=log_train_uci \ - --total_models_per_worker=-1 \ - --result_dir=./internal/ml/model_selection/exp_result/ \ - --pre_partitioned_file=./internal/ml/model_selection/exps/nas_bench_tabular/uci_left_8k_models.json > outputuci.log& ">> train_all_models_diabetes_seq.sh - - worker_id=$((worker_id+1)) - done -done - - -# pkill -9 -f ./internal/ml/model_selection/exps/nas_bench_tabular//2.seq_train_online.py -# pkill -9 -f /home/naili/miniconda3/envs/firmest_torch11/bin/python - -# run with bash internal/ml/model_selection/scripts/nas-bench-tabular/train_all_models_diabetes.sh >ucibash & diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/nas-bench-tabular/train_all_models_frappe.sh b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/nas-bench-tabular/train_all_models_frappe.sh deleted file mode 100644 index 8d4af9eac6..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/nas-bench-tabular/train_all_models_frappe.sh +++ /dev/null @@ -1,61 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -export PYTHONPATH=$PYTHONPATH:./internal/ml/model_selection -conda activate trails - - -worker_id=0 -GPU_NUM=8 -worker_each_gpu=16 -total_workers=$((worker_each_gpu*GPU_NUM)) - -for((gpu_id=0; gpu_id < GPU_NUM; ++gpu_id)); do -# echo "GPU id is $gpu_id" - for((i=0; i < worker_each_gpu; ++i)); do - echo "nohup python ./internal/ml/model_selection/exps/nas_bench_tabular/2.seq_train_online.py \ - --log_name=baseline_train_based \ - --search_space=mlp_sp \ - --num_layers=4 \ - --hidden_choice_len=20 \ - --base_dir=/home/shaofeng/naili/firmest_data/ \ - --num_labels=2 \ - --device=cuda:$gpu_id \ - --batch_size=512 \ - --lr=0.001 \ - --epoch=20 \ - --iter_per_epoch=200 \ - --dataset=frappe \ - --nfeat=5500 \ - --nfield=10 \ - --nemb=10 \ - --worker_id=$worker_id \ - --total_workers=$total_workers \ - --workers=0 \ - --log_folder=log_frappe \ - --total_models_per_worker=-1 \ - --result_dir=./internal/ml/model_selection/exp_result/ \ - --pre_partitioned_file=./internal/ml/model_selection/exps/nas_bench_tabular/sampled_models_all.json & ">> train_all_models_frappe_seq.sh - - sleep 1 - worker_id=$((worker_id+1)) - done -done - - -# pkill -9 -f internal/ml/model_selection/scripts/nas-bench-tabular/train_all_models_frappe.sh diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/nas-bench-tabular/train_one_model_dev.sh b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/nas-bench-tabular/train_one_model_dev.sh deleted file mode 100644 index 86e36c2f51..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/nas-bench-tabular/train_one_model_dev.sh +++ /dev/null @@ -1,41 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -export PYTHONPATH=$PYTHONPATH:./internal/ml/model_selection - -python ./internal/ml/model_selection/exps/nas_bench_tabular/0.train_one_model.py \ - --log_name=baseline_train_based \ - --search_space=mlp_sp \ - --num_layers=4 \ - --hidden_choice_len=20 \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --num_labels=2 \ - --device=cuda:0 \ - --batch_size=512 \ - --lr=0.001 \ - --epoch=20 \ - --iter_per_epoch=200 \ - --dataset=frappe \ - --nfeat=5500 \ - --nfield=10 \ - --nemb=10 \ - --worker_id=0 \ - --total_workers=1 \ - --workers=1 \ - --result_dir=./internal/ml/model_selection/exp_result/ \ - --log_folder=log_frappe \ No newline at end of file diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/nas-bench-tabular/train_params_tune_criteo.sh b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/nas-bench-tabular/train_params_tune_criteo.sh deleted file mode 100644 index a3ea087907..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/nas-bench-tabular/train_params_tune_criteo.sh +++ /dev/null @@ -1,162 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -export PYTHONPATH=$PYTHONPATH:./internal/ml/model_selection -conda activate trails - - - -# default setting. 
-python ./internal/ml/model_selection/exps/nas_bench_tabular/0.train_one_model.py \ - --log_name=baseline_train_based \ - --search_space=mlp_sp \ - --base_dir=../exp_data/ \ - --num_labels=2 \ - --device=cuda:0 \ - --batch_size=1024 \ - --lr=0.001 \ - --epoch=5 \ - --iter_per_epoch=2000 \ - --dataset=criteo \ - --nfeat=2100000 \ - --nfield=39 \ - --nemb=10 \ - --workers=0 \ - --result_dir=./internal/ml/model_selection/exp_result/ \ - --log_folder=log_criteo_train_tune >criteo_5.log & - - -python ./internal/ml/model_selection/exps/nas_bench_tabular/0.train_one_model.py \ - --log_name=baseline_train_based \ - --search_space=mlp_sp \ - --base_dir=../exp_data/ \ - --num_labels=2 \ - --device=cuda:0 \ - --batch_size=1024 \ - --lr=0.001 \ - --epoch=10 \ - --iter_per_epoch=2000 \ - --dataset=criteo \ - --nfeat=2100000 \ - --nfield=39 \ - --nemb=10 \ - --workers=0 \ - --result_dir=./internal/ml/model_selection/exp_result/ \ - --log_folder=log_criteo_train_tune >criteo_10.log & - - - -python ./internal/ml/model_selection/exps/nas_bench_tabular/0.train_one_model.py \ - --log_name=baseline_train_based \ - --search_space=mlp_sp \ - --base_dir=../exp_data/ \ - --num_labels=2 \ - --device=cuda:1 \ - --batch_size=1024 \ - --lr=0.001 \ - --epoch=20 \ - --iter_per_epoch=2000 \ - --dataset=criteo \ - --nfeat=2100000 \ - --nfield=39 \ - --nemb=10 \ - --workers=0 \ - --result_dir=./internal/ml/model_selection/exp_result/ \ - --log_folder=log_criteo_train_tune >criteo_20.log & - - - - -python ./internal/ml/model_selection/exps/nas_bench_tabular/0.train_one_model.py \ - --log_name=baseline_train_based \ - --search_space=mlp_sp \ - --base_dir=../exp_data/ \ - --num_labels=2 \ - --device=cuda:2 \ - --batch_size=1024 \ - --lr=0.001 \ - --epoch=40 \ - --iter_per_epoch=2000 \ - --dataset=criteo \ - --nfeat=2100000 \ - --nfield=39 \ - --nemb=10 \ - --workers=0 \ - --result_dir=./internal/ml/model_selection/exp_result/ \ - --log_folder=log_criteo_train_tune >criteo_40.log & - - - -python ./internal/ml/model_selection/exps/nas_bench_tabular/0.train_one_model.py \ - --log_name=baseline_train_based \ - --search_space=mlp_sp \ - --base_dir=../exp_data/ \ - --num_labels=2 \ - --device=cuda:3 \ - --batch_size=1024 \ - --lr=0.001 \ - --epoch=60 \ - --iter_per_epoch=2000 \ - --dataset=criteo \ - --nfeat=2100000 \ - --nfield=39 \ - --nemb=10 \ - --workers=0 \ - --result_dir=./internal/ml/model_selection/exp_result/ \ - --log_folder=log_criteo_train_tune >criteo_60.log & - - - -python ./internal/ml/model_selection/exps/nas_bench_tabular/0.train_one_model.py \ - --log_name=baseline_train_based \ - --search_space=mlp_sp \ - --base_dir=../exp_data/ \ - --num_labels=2 \ - --device=cuda:4 \ - --batch_size=1024 \ - --lr=0.001 \ - --epoch=80 \ - --iter_per_epoch=2000 \ - --dataset=criteo \ - --nfeat=2100000 \ - --nfield=39 \ - --nemb=10 \ - --workers=0 \ - --result_dir=./internal/ml/model_selection/exp_result/ \ - --log_folder=log_criteo_train_tune >criteo_80.log & - - - -python ./internal/ml/model_selection/exps/nas_bench_tabular/0.train_one_model.py \ - --log_name=baseline_train_based \ - --search_space=mlp_sp \ - --base_dir=../exp_data/ \ - --num_labels=2 \ - --device=cuda:5 \ - --batch_size=1024 \ - --lr=0.001 \ - --epoch=100 \ - --iter_per_epoch=2000 \ - --dataset=criteo \ - --nfeat=2100000 \ - --nfield=39 \ - --nemb=10 \ - --workers=0 \ - --result_dir=./internal/ml/model_selection/exp_result/ \ - --log_folder=log_criteo_train_tune >criteo_100.log & - diff --git 
a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/nas-bench-tabular/train_params_tune_diabetes.sh b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/nas-bench-tabular/train_params_tune_diabetes.sh deleted file mode 100644 index 697816e241..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/nas-bench-tabular/train_params_tune_diabetes.sh +++ /dev/null @@ -1,86 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -export PYTHONPATH=$PYTHONPATH:./internal/ml/model_selection -conda activate trails - - -nohup python ./internal/ml/model_selection/exps/nas_bench_tabular/0.train_one_model.py \ - --log_name=baseline_train_based \ - --search_space=mlp_sp \ - --num_layers=4 \ - --hidden_choice_len=20 \ - --base_dir=../exp_data/ \ - --num_labels=2 \ - --device=cuda:0 \ - --batch_size=1024 \ - --lr=0.001 \ - --epoch=3 \ - --iter_per_epoch=200 \ - --dataset=uci_diabetes \ - --nfeat=369 \ - --nfield=43 \ - --nemb=10 \ - --workers=0 \ - --result_dir=./internal/ml/model_selection/exp_result/ \ - --log_folder=log_uci_train_tune >uci_3.log & - - - -nohup python ./internal/ml/model_selection/exps/nas_bench_tabular/0.train_one_model.py \ - --log_name=baseline_train_based \ - --search_space=mlp_sp \ - --num_layers=4 \ - --hidden_choice_len=20 \ - --base_dir=../exp_data/ \ - --num_labels=2 \ - --device=cuda:1 \ - --batch_size=1024 \ - --lr=0.001 \ - --epoch=5 \ - --iter_per_epoch=200 \ - --dataset=uci_diabetes \ - --nfeat=369 \ - --nfield=43 \ - --nemb=10 \ - --workers=0 \ - --result_dir=./internal/ml/model_selection/exp_result/ \ - --log_folder=log_uci_train_tune >uci_5.log & - - -# default setting. 
-nohup python ./internal/ml/model_selection/exps/nas_bench_tabular/0.train_one_model.py \ - --log_name=baseline_train_based \ - --search_space=mlp_sp \ - --num_layers=4 \ - --hidden_choice_len=20 \ - --base_dir=../exp_data/ \ - --num_labels=2 \ - --device=cuda:2 \ - --batch_size=1024 \ - --lr=0.001 \ - --epoch=7 \ - --iter_per_epoch=200 \ - --dataset=uci_diabetes \ - --nfeat=369 \ - --nfield=43 \ - --nemb=10 \ - --workers=0 \ - --result_dir=./internal/ml/model_selection/exp_result/ \ - --log_folder=log_uci_train_tune >uci_7.log & - diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/controller/core/__init__.py b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/controller/core/__init__.py deleted file mode 100644 index 4e04c2b3b1..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/controller/core/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# \ No newline at end of file diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/README.md b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/README.md deleted file mode 100644 index b7c96e8845..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/README.md +++ /dev/null @@ -1,18 +0,0 @@ - diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/vote.py b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/vote.py deleted file mode 100644 index 1f1ee1f39d..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/vote.py +++ /dev/null @@ -1,133 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# - -from src.eva_engine.phase1.algo.alg_base import Evaluator -from .utils.autograd_hacks import * -from src.common.constant import Config - -class VoteEvaluator(Evaluator): - - def __init__(self): - super().__init__() - - def evaluate(self, arch: nn.Module, device, batch_data: object, batch_labels: torch.Tensor, space_name: str) -> float: - """ - This simply sums all weights' norms to estimate a model's performance - :param arch: - :param device: CPU or GPU - :param batch_data: - :param batch_labels: - :return: - """ - - pass - - -def vote_between_two_arch(arch1_info: dict, arch2_info: dict, metric: list, space: str): - """ - Return which architecture is better. - :param arch1_info: - :param arch2_info: - :param metric: - :param space: - :return: - """ - left_vote = 0 - right_vote = 0 - for m_name in metric: - # check whether this metric votes for the left architecture - if vote_to_left[space](m_name, - float(arch1_info["scores"][m_name]["score"]), - float(arch2_info["scores"][m_name]["score"])): - left_vote += 1 - else: - right_vote += 1 - - if left_vote > right_vote: - return arch1_info["architecture_id"] - else: - return arch2_info["architecture_id"] - - -def compare_score_201(m_name: str, s1: float, s2: float) -> bool: - """ - Return True if s1 is better than s2. - :param m_name: - :param s1: - :param s2: - :return: True if s1 is better than s2 - """ - if m_name == "grad_norm": - return s1 > s2 - if m_name == "grad_plain": - return s1 < s2 - if m_name == "ntk_cond_num": - return s1 < s2 - if m_name == "ntk_trace": - return s1 > s2 - if m_name == "ntk_trace_approx": - return s1 > s2 - if m_name == "fisher": - return s1 > s2 - if m_name == "grasp": - return s1 > s2 - if m_name == "snip": - return s1 > s2 - if m_name == "synflow": - return s1 > s2 - if m_name == "weight_norm": - return s1 > s2 - if m_name == "nas_wot": - return s1 > s2 - - -def compare_score_101(m_name: str, s1: float, s2: float) -> bool: - """ - Return True if s1 is better than s2. - :param m_name: - :param s1: - :param s2: - :return: True if s1 is better than s2 - """ - if m_name == "grad_norm": - return s1 < s2 - if m_name == "grad_plain": - return s1 < s2 - if m_name == "ntk_cond_num": - return s1 < s2 - if m_name == "ntk_trace": - return s1 < s2 - if m_name == "ntk_trace_approx": - return s1 < s2 - if m_name == "fisher": - return s1 < s2 - if m_name == "grasp": - return s1 > s2 - if m_name == "snip": - return s1 < s2 - if m_name == "synflow": - return s1 > s2 - if m_name == "weight_norm": - return s1 > s2 - if m_name == "nas_wot": - return s1 > s2 - - -vote_to_left = {} -vote_to_left["101"] = compare_score_101 -vote_to_left["201"] = compare_score_201 diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/query_api/__init__.py b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/query_api/__init__.py deleted file mode 100644 index 3df60b02f7..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/query_api/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership.
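A hypothetical call to vote_between_two_arch from the vote.py removed above; the score values are invented, but the dict layout matches what the function reads (scores[metric]["score"] plus architecture_id):

# Import path as it stood before this deletion.
from src.eva_engine.phase1.vote import vote_between_two_arch

arch1_info = {"architecture_id": 1,
              "scores": {"snip": {"score": 0.9}, "synflow": {"score": 3.2}}}
arch2_info = {"architecture_id": 2,
              "scores": {"snip": {"score": 0.7}, "synflow": {"score": 4.1}}}

# In the "201" space, snip favours arch1 while synflow favours arch2; on a
# tie the function returns the right-hand architecture, so this yields 2.
winner = vote_between_two_arch(arch1_info, arch2_info,
                               metric=["snip", "synflow"], space="201")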
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/search_space/__init__.py b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/search_space/__init__.py deleted file mode 100644 index 4e04c2b3b1..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/search_space/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# \ No newline at end of file diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/search_space/core/__init__.py b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/search_space/core/__init__.py deleted file mode 100644 index 3df60b02f7..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/search_space/core/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/search_space/mlp_api/__init__.py b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/search_space/mlp_api/__init__.py deleted file mode 100644 index 3df60b02f7..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/search_space/mlp_api/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/third_pkg/darts_lib/__init__.py b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/third_pkg/darts_lib/__init__.py deleted file mode 100644 index 3df60b02f7..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/third_pkg/darts_lib/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/third_pkg/darts_lib/genotypes.py b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/third_pkg/darts_lib/genotypes.py deleted file mode 100644 index fa94001867..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/third_pkg/darts_lib/genotypes.py +++ /dev/null @@ -1,36 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from collections import namedtuple - - -Genotype = namedtuple('Genotype', 'normal normal_concat reduce reduce_concat') - -PRIMITIVES = [ - 'none', - 'max_pool_3x3', - 'avg_pool_3x3', - 'skip_connect', - 'sep_conv_3x3', - 'sep_conv_5x5', - 'dil_conv_3x3', - 'dil_conv_5x5' -] - -NUM_VERTICES = 4 -NUM_OPS = 7 diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/third_pkg/darts_lib/model.py b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/third_pkg/darts_lib/model.py deleted file mode 100644 index f8be9a9dd2..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/third_pkg/darts_lib/model.py +++ /dev/null @@ -1,308 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
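The `Genotype` container and `PRIMITIVES` list above are the encoding the rest of this package consumes. As a hedged illustration (a hand-written instance, not a tuned architecture; it assumes the genotypes.py definitions above are in scope), each cell lists two `(primitive, input_state)` pairs per intermediate node, and `*_concat` names the states concatenated into the cell output:

```python
# Illustrative only: node i may read from states 0..i+1
# (the two cell inputs plus all earlier intermediate nodes).
toy = Genotype(
    normal=[('sep_conv_3x3', 0), ('skip_connect', 1),   # node 2
            ('sep_conv_3x3', 0), ('dil_conv_3x3', 2),   # node 3
            ('skip_connect', 1), ('sep_conv_5x5', 3),   # node 4
            ('avg_pool_3x3', 0), ('sep_conv_3x3', 4)],  # node 5
    normal_concat=range(2, 6),
    reduce=[('max_pool_3x3', 0), ('max_pool_3x3', 1),
            ('skip_connect', 2), ('max_pool_3x3', 0),
            ('avg_pool_3x3', 0), ('skip_connect', 2),
            ('skip_connect', 2), ('max_pool_3x3', 1)],
    reduce_concat=range(2, 6),
)
```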
-# - -import torch -import torch.nn as nn - -from .operations import * -from .utils import drop_path - - -class Cell(nn.Module): - - def __init__(self, genotype, C_prev_prev, C_prev, C, reduction, reduction_prev): - super(Cell, self).__init__() - # print(C_prev_prev, C_prev, C) - - if reduction_prev: - self.preprocess0 = FactorizedReduce(C_prev_prev, C) - else: - self.preprocess0 = ReLUConvBN(C_prev_prev, C, 1, 1, 0) - self.preprocess1 = ReLUConvBN(C_prev, C, 1, 1, 0) - - if reduction: - op_names, indices = zip(*genotype.reduce) - concat = genotype.reduce_concat - else: - op_names, indices = zip(*genotype.normal) - concat = genotype.normal_concat - self._compile(C, op_names, indices, concat, reduction) - - def _compile(self, C, op_names, indices, concat, reduction): - assert len(op_names) == len(indices) - self._steps = len(op_names) // 2 - self._concat = concat - self.multiplier = len(concat) - - self._ops = nn.ModuleList() - for name, index in zip(op_names, indices): - stride = 2 if reduction and index < 2 else 1 - op = OPS[name](C, stride, True) - self._ops += [op] - self._indices = indices - - def forward(self, s0, s1, drop_prob): - s0 = self.preprocess0(s0) - s1 = self.preprocess1(s1) - - states = [s0, s1] - for i in range(self._steps): - h1 = states[self._indices[2 * i]] - h2 = states[self._indices[2 * i + 1]] - op1 = self._ops[2 * i] - op2 = self._ops[2 * i + 1] - h1 = op1(h1) - h2 = op2(h2) - if self.training and drop_prob > 0.: - if not isinstance(op1, Identity): - h1 = drop_path(h1, drop_prob) - if not isinstance(op2, Identity): - h2 = drop_path(h2, drop_prob) - s = h1 + h2 - states += [s] - return torch.cat([states[i] for i in self._concat], dim=1) - - -class AuxiliaryHeadCIFAR(nn.Module): - - def __init__(self, C, num_classes): - """assuming input size 8x8""" - super(AuxiliaryHeadCIFAR, self).__init__() - self.features = nn.Sequential( - nn.ReLU(inplace=True), - # image size = 2 x 2 - nn.AvgPool2d(5, stride=3, padding=0, count_include_pad=False), - nn.Conv2d(C, 128, 1, bias=False), - nn.BatchNorm2d(128), - nn.ReLU(inplace=True), - nn.Conv2d(128, 768, 2, bias=False), - nn.BatchNorm2d(768), - nn.ReLU(inplace=True) - ) - self.classifier = nn.Linear(768, num_classes) - - def forward(self, x): - x = self.features(x) - x = self.classifier(x.view(x.size(0), -1)) - return x - - -class AuxiliaryHeadTinyImageNet(nn.Module): - - def __init__(self, C, num_classes): - """assuming input size 8x8""" - super(AuxiliaryHeadTinyImageNet, self).__init__() - self.features = nn.Sequential( - nn.ReLU(inplace=False), - # image size = 2 x 2 - nn.AvgPool2d(5, stride=3, padding=0, count_include_pad=False), - nn.Conv2d(C, 128, 1, bias=False), - nn.BatchNorm2d(128), - nn.ReLU(inplace=False), - nn.Conv2d(128, 768, 2, bias=False), - nn.BatchNorm2d(768), - nn.ReLU(inplace=False) - ) - self.classifier = nn.Linear(768, num_classes) - - def forward(self, x): - x = self.features(x) - x = self.classifier(x.view(x.size(0), -1)) - return x - - -class AuxiliaryHeadImageNet(nn.Module): - - def __init__(self, C, num_classes): - """assuming input size 14x14""" - super(AuxiliaryHeadImageNet, self).__init__() - self.features = nn.Sequential( - nn.ReLU(inplace=True), - nn.AvgPool2d(5, stride=2, padding=0, count_include_pad=False), - nn.Conv2d(C, 128, 1, bias=False), - nn.BatchNorm2d(128), - nn.ReLU(inplace=True), - nn.Conv2d(128, 768, 2, bias=False), - # NOTE: This batchnorm was omitted in my earlier implementation due to a typo. - # Commenting it out for consistency with the experiments in the paper.
- # nn.BatchNorm2d(768), - nn.ReLU(inplace=True) - ) - self.classifier = nn.Linear(768, num_classes) - - def forward(self, x): - x = self.features(x) - x = self.classifier(x.view(x.size(0), -1)) - return x - - -class NetworkCIFAR(nn.Module): - - def __init__(self, C, num_classes, layers, auxiliary, genotype): - super(NetworkCIFAR, self).__init__() - self._layers = layers - self._auxiliary = auxiliary - - stem_multiplier = 3 - C_curr = stem_multiplier * C - self.stem = nn.Sequential( - nn.Conv2d(3, C_curr, 3, padding=1, bias=False), - nn.BatchNorm2d(C_curr) - ) - - C_prev_prev, C_prev, C_curr = C_curr, C_curr, C - self.cells = nn.ModuleList() - reduction_prev = False - for i in range(layers): - if i in [layers // 3, 2 * layers // 3]: - C_curr *= 2 - reduction = True - else: - reduction = False - cell = Cell(genotype, C_prev_prev, C_prev, - C_curr, reduction, reduction_prev) - reduction_prev = reduction - self.cells += [cell] - C_prev_prev, C_prev = C_prev, cell.multiplier * C_curr - if i == 2 * layers // 3: - C_to_auxiliary = C_prev - - if auxiliary: - self.auxiliary_head = AuxiliaryHeadCIFAR( - C_to_auxiliary, num_classes) - self.global_pooling = nn.AdaptiveAvgPool2d(1) - self.classifier = nn.Linear(C_prev, num_classes) - - def forward(self, input): - logits_aux = None - s0 = s1 = self.stem(input) - for i, cell in enumerate(self.cells): - s0, s1 = s1, cell(s0, s1, self.drop_path_prob) - if i == 2 * self._layers // 3: - if self._auxiliary and self.training: - logits_aux = self.auxiliary_head(s1) - out = self.global_pooling(s1) - logits = self.classifier(out.view(out.size(0), -1)) - return logits, logits_aux - - -class NetworkTinyImageNet(nn.Module): - - def __init__(self, C, num_classes, layers, auxiliary, genotype): - super(NetworkTinyImageNet, self).__init__() - self._layers = layers - self._auxiliary = auxiliary - - stem_multiplier = 3 - C_curr = stem_multiplier * C - self.stem = nn.Sequential( - nn.Conv2d(3, C_curr, 3, stride=2, padding=1, bias=False), - nn.BatchNorm2d(C_curr) - ) - - C_prev_prev, C_prev, C_curr = C_curr, C_curr, C - self.cells = nn.ModuleList() - reduction_prev = False - for i in range(layers): - if i in [layers // 3, 2 * layers // 3]: - C_curr *= 2 - reduction = True - else: - reduction = False - cell = Cell(genotype, C_prev_prev, C_prev, - C_curr, reduction, reduction_prev) - reduction_prev = reduction - self.cells += [cell] - C_prev_prev, C_prev = C_prev, cell.multiplier * C_curr - if i == 2 * layers // 3: - C_to_auxiliary = C_prev - - if auxiliary: - self.auxiliary_head = AuxiliaryHeadCIFAR( - C_to_auxiliary, num_classes) - self.global_pooling = nn.AdaptiveAvgPool2d(1) - self.classifier = nn.Linear(C_prev, num_classes) - - def forward(self, input): - logits_aux = None - s0 = s1 = self.stem(input) - for i, cell in enumerate(self.cells): - s0, s1 = s1, cell(s0, s1, self.drop_path_prob) - if i == 2 * self._layers // 3: - if self._auxiliary and self.training: - logits_aux = self.auxiliary_head(s1) - out = self.global_pooling(s1) - logits = self.classifier(out.view(out.size(0), -1)) - return logits, logits_aux - - -class NetworkImageNet(nn.Module): - - def __init__(self, C, num_classes, layers, auxiliary, genotype): - super(NetworkImageNet, self).__init__() - self._layers = layers - self._auxiliary = auxiliary - - self.stem0 = nn.Sequential( - nn.Conv2d(3, C // 2, kernel_size=3, - stride=2, padding=1, bias=False), - nn.BatchNorm2d(C // 2), - nn.ReLU(inplace=True), - nn.Conv2d(C // 2, C, 3, stride=2, padding=1, bias=False), - nn.BatchNorm2d(C), - ) - - self.stem1 = 
nn.Sequential( - nn.ReLU(inplace=True), - nn.Conv2d(C, C, 3, stride=2, padding=1, bias=False), - nn.BatchNorm2d(C), - ) - - C_prev_prev, C_prev, C_curr = C, C, C - - self.cells = nn.ModuleList() - reduction_prev = True - for i in range(layers): - if i in [layers // 3, 2 * layers // 3]: - C_curr *= 2 - reduction = True - else: - reduction = False - cell = Cell(genotype, C_prev_prev, C_prev, - C_curr, reduction, reduction_prev) - reduction_prev = reduction - self.cells += [cell] - C_prev_prev, C_prev = C_prev, cell.multiplier * C_curr - if i == 2 * layers // 3: - C_to_auxiliary = C_prev - - if auxiliary: - self.auxiliary_head = AuxiliaryHeadImageNet( - C_to_auxiliary, num_classes) - self.global_pooling = nn.AvgPool2d(7) - self.classifier = nn.Linear(C_prev, num_classes) - - def forward(self, input): - logits_aux = None - s0 = self.stem0(input) - s1 = self.stem1(s0) - for i, cell in enumerate(self.cells): - s0, s1 = s1, cell(s0, s1, self.drop_path_prob) - if i == 2 * self._layers // 3: - if self._auxiliary and self.training: - logits_aux = self.auxiliary_head(s1) - out = self.global_pooling(s1) - logits = self.classifier(out.view(out.size(0), -1)) - return logits, logits_aux diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/third_pkg/darts_lib/util_convert.py b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/third_pkg/darts_lib/util_convert.py deleted file mode 100644 index fee5905301..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/third_pkg/darts_lib/util_convert.py +++ /dev/null @@ -1,126 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
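A hedged smoke test for the network classes above (assumes torch is installed, this module's `.operations`/`.utils` imports resolve, and `toy` is the illustrative Genotype sketched earlier):

```python
import torch

# NetworkCIFAR.forward reads self.drop_path_prob, which __init__ never sets;
# the training script is expected to assign it, so this sketch does it explicitly.
model = NetworkCIFAR(C=16, num_classes=10, layers=8, auxiliary=True, genotype=toy)
model.drop_path_prob = 0.0
model.eval()  # the auxiliary head only fires in training mode
with torch.no_grad():
    logits, logits_aux = model(torch.randn(2, 3, 32, 32))
print(logits.shape)  # torch.Size([2, 10]); logits_aux is None in eval mode
```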
-# - -from scipy.special import softmax -from .genotypes import * - - -def genotype(weights, steps=4, multiplier=4): - def _parse(weights): - gene = [] - n = 2 - start = 0 - for i in range(steps): - end = start + n - W = weights[start:end].copy() - edges = sorted(range(i + 2), key=lambda x: -max( - W[x][k] for k in range(len(W[x])) if k != PRIMITIVES.index('none')))[:2] - for j in edges: - k_best = None - for k in range(len(W[j])): - if k != PRIMITIVES.index('none'): - if k_best is None or W[j][k] > W[j][k_best]: - k_best = k - gene.append((PRIMITIVES[k_best], j)) - start = end - n += 1 - return gene - - gene_normal = _parse(softmax(weights[0], axis=-1)) - gene_reduce = _parse(softmax(weights[1], axis=-1)) - - concat = range(2 + steps - multiplier, steps + 2) - genotype = Genotype( - normal=gene_normal, normal_concat=concat, - reduce=gene_reduce, reduce_concat=concat - ) - return genotype - - -# from naslib -def convert_genotype_to_compact(genotype): - """Converts Genotype to the compact representation""" - OPS = [ - "max_pool_3x3", - "avg_pool_3x3", - "skip_connect", - "sep_conv_3x3", - "sep_conv_5x5", - "dil_conv_3x3", - "dil_conv_5x5", - ] - compact = [] - - for i, cell_type in enumerate(["normal", "reduce"]): - cell = eval("genotype." + cell_type) - compact.append([]) - - for j in range(8): - compact[i].append((cell[j][1], OPS.index(cell[j][0]))) - - compact_tuple = (tuple(compact[0]), tuple(compact[1])) - return compact_tuple - - -# from naslib -def convert_compact_to_genotype(compact): - """Converts the compact representation to a Genotype""" - OPS = [ - "max_pool_3x3", - "avg_pool_3x3", - "skip_connect", - "sep_conv_3x3", - "sep_conv_5x5", - "dil_conv_3x3", - "dil_conv_5x5", - ] - genotype = [] - - for i in range(2): - cell = compact[i] - genotype.append([]) - - for j in range(8): - genotype[i].append((OPS[cell[j][1]], cell[j][0])) - - return Genotype( - normal=genotype[0], - normal_concat=[2, 3, 4, 5], - reduce=genotype[1], - reduce_concat=[2, 3, 4, 5], - ) - # TODO: need to check with Colin and/or Arber - # return Genotype( - # normal = genotype[0], - # normal_concat = [2, 3, 4, 5, 6], - # reduce = genotype[1], - # reduce_concat = [4, 5, 6] - # ) - - -# from naslib -def make_compact_mutable(compact): - # convert tuple to list so that it is mutable - arch_list = [] - for cell in compact: - arch_list.append([]) - for pair in cell: - arch_list[-1].append([]) - for num in pair: - arch_list[-1][-1].append(num) - return arch_list diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/third_pkg/sp101_lib/__init__.py b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/third_pkg/sp101_lib/__init__.py deleted file mode 100644 index 3df60b02f7..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/third_pkg/sp101_lib/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/third_pkg/sp101_lib/graph_util.py b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/third_pkg/sp101_lib/graph_util.py deleted file mode 100644 index b3e8194f0c..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/third_pkg/sp101_lib/graph_util.py +++ /dev/null @@ -1,168 +0,0 @@ -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Utility functions used by generate_graph.py.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import hashlib -import itertools - -import numpy as np - - -def gen_is_edge_fn(bits): - """Generate a boolean function for the edge connectivity. - - Given a bitstring FEDCBA and a 4x4 matrix, the generated matrix is - [[0, A, B, D], - [0, 0, C, E], - [0, 0, 0, F], - [0, 0, 0, 0]] - - Note that this function is agnostic to the actual matrix dimension due to - order in which elements are filled out (column-major, starting from least - significant bit). For example, the same FEDCBA bitstring (0-padded) on a 5x5 - matrix is - [[0, A, B, D, 0], - [0, 0, C, E, 0], - [0, 0, 0, F, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0]] - - Args: - bits: integer which will be interpreted as a bit mask. - - Returns: - vectorized function that returns True when an edge is present. - """ - - def is_edge(x, y): - """Is there an edge from x to y (0-indexed)?""" - if x >= y: - return 0 - # Map x, y to index into bit string - index = x + (y * (y - 1) // 2) - return (bits >> index) % 2 == 1 - - return np.vectorize(is_edge) - - -def is_full_dag(matrix): - """Full DAG == all vertices on a path from vert 0 to (V-1). - - i.e. no disconnected or "hanging" vertices. - - It is sufficient to check for: - 1) no rows of 0 except for row V-1 (only output vertex has no out-edges) - 2) no cols of 0 except for col 0 (only input vertex has no in-edges) - - Args: - matrix: V x V upper-triangular adjacency matrix - - Returns: - True if there are no dangling vertices.
- """ - shape = np.shape(matrix) - - rows = matrix[:shape[0] - 1, :] == 0 - rows = np.all(rows, axis=1) # Any row with all 0 will be True - rows_bad = np.any(rows) - - cols = matrix[:, 1:] == 0 - cols = np.all(cols, axis=0) # Any col with all 0 will be True - cols_bad = np.any(cols) - - return (not rows_bad) and (not cols_bad) - - -def num_edges(matrix): - """Computes number of edges in adjacency matrix.""" - return np.sum(matrix) - - -def hash_module(matrix, labeling): - """Computes a graph-invariance MD5 hash of the matrix and label pair. - - Args: - matrix: np.ndarray square upper-triangular adjacency matrix. - labeling: list of int labels of length equal to both dimensions of - matrix. - - Returns: - MD5 hash of the matrix and labeling. - """ - vertices = np.shape(matrix)[0] - in_edges = np.sum(matrix, axis=0).tolist() - out_edges = np.sum(matrix, axis=1).tolist() - - assert len(in_edges) == len(out_edges) == len(labeling) - hashes = list(zip(out_edges, in_edges, labeling)) - hashes = [hashlib.md5(str(h).encode('utf-8')).hexdigest() for h in hashes] - # Computing this up to the diameter is probably sufficient but since the - # operation is fast, it is okay to repeat more times. - for _ in range(vertices): - new_hashes = [] - for v in range(vertices): - in_neighbors = [hashes[w] for w in range(vertices) if matrix[w, v]] - out_neighbors = [hashes[w] for w in range(vertices) if matrix[v, w]] - new_hashes.append(hashlib.md5( - (''.join(sorted(in_neighbors)) + '|' + - ''.join(sorted(out_neighbors)) + '|' + - hashes[v]).encode('utf-8')).hexdigest()) - hashes = new_hashes - fingerprint = hashlib.md5(str(sorted(hashes)).encode('utf-8')).hexdigest() - - return fingerprint - - -def permute_graph(graph, label, permutation): - """Permutes the graph and labels based on permutation. - - Args: - graph: np.ndarray adjacency matrix. - label: list of labels of same length as graph dimensions. - permutation: a permutation list of ints of same length as graph dimensions. - - Returns: - np.ndarray where vertex permutation[v] is vertex v from the original graph - """ - # vertex permutation[v] in new graph is vertex v in the old graph - forward_perm = zip(permutation, list(range(len(permutation)))) - inverse_perm = [x[1] for x in sorted(forward_perm)] - edge_fn = lambda x, y: graph[inverse_perm[x], inverse_perm[y]] == 1 - new_matrix = np.fromfunction(np.vectorize(edge_fn), - (len(label), len(label)), - dtype=np.int8) - new_label = [label[inverse_perm[i]] for i in range(len(label))] - return new_matrix, new_label - - -def is_isomorphic(graph1, graph2): - """Exhaustively checks if 2 graphs are isomorphic.""" - matrix1, label1 = np.array(graph1[0]), graph1[1] - matrix2, label2 = np.array(graph2[0]), graph2[1] - assert np.shape(matrix1) == np.shape(matrix2) - assert len(label1) == len(label2) - - vertices = np.shape(matrix1)[0] - # Note: input and output in our constrained graphs always map to themselves - # but this script does not enforce that. 
- for perm in itertools.permutations(range(0, vertices)): - pmatrix1, plabel1 = permute_graph(matrix1, label1, perm) - if np.array_equal(pmatrix1, matrix2) and plabel1 == label2: - return True - - return False diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/third_pkg/sp101_lib/model_spec.py b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/third_pkg/sp101_lib/model_spec.py deleted file mode 100644 index 5d5992119a..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/third_pkg/sp101_lib/model_spec.py +++ /dev/null @@ -1,343 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import copy -import hashlib -import itertools - -import numpy as np - -# Graphviz is optional and only required for visualization. -try: - import graphviz # pylint: disable=g-import-not-at-top -except ImportError: - pass - -INPUT = "input" -OUTPUT = "output" -CONV3X3 = "conv3x3-bn-relu" -CONV1X1 = "conv1x1-bn-relu" -MAXPOOL3X3 = "maxpool3x3" -OPS = [CONV3X3, CONV1X1, MAXPOOL3X3] - -NUM_VERTICES = 7 -OP_SPOTS = NUM_VERTICES - 2 -MAX_EDGES = 9 - - -class NASBench101ModelSpec(object): - """Model specification given adjacency matrix and labeling.""" - - def __init__(self, matrix, ops, data_format='channels_last'): - """Initialize the module spec. - - Args: - matrix: ndarray or nested list with shape [V, V] for the adjacency matrix. - ops: V-length list of labels for the base ops used. The first and last - elements are ignored because they are the input and output vertices - which have no operations. The elements are retained to keep consistent - indexing. - data_format: channels_last or channels_first. - - Raises: - ValueError: invalid matrix or ops - """ - if not isinstance(matrix, np.ndarray): - matrix = np.array(matrix) - shape = np.shape(matrix) - if len(shape) != 2 or shape[0] != shape[1]: - raise ValueError('matrix must be square') - if shape[0] != len(ops): - raise ValueError('length of ops must match matrix dimensions') - if not is_upper_triangular(matrix): - raise ValueError('matrix must be upper triangular') - - # Both the original and pruned matrices are deep copies of the matrix and - # ops so any changes to those after initialization are not recognized by the - # spec. - self.original_matrix = copy.deepcopy(matrix) - self.original_ops = copy.deepcopy(ops) - - self.matrix = copy.deepcopy(matrix) - self.ops = copy.deepcopy(ops) - self.valid_spec = True - self._prune() - - self.data_format = data_format - - def _prune(self): - """Prune the extraneous parts of the graph. - - General procedure: - 1) Remove parts of graph not connected to input. - 2) Remove parts of graph not connected to output. 
- 3) Reorder the vertices so that they are consecutive after steps 1 and 2. - - These 3 steps can be combined by deleting the rows and columns of the - vertices that are not reachable from both the input and output (in reverse). - """ - num_vertices = np.shape(self.original_matrix)[0] - - # DFS forward from input - visited_from_input = set([0]) - frontier = [0] - while frontier: - top = frontier.pop() - for v in range(top + 1, num_vertices): - if self.original_matrix[top, v] and v not in visited_from_input: - visited_from_input.add(v) - frontier.append(v) - - # DFS backward from output - visited_from_output = set([num_vertices - 1]) - frontier = [num_vertices - 1] - while frontier: - top = frontier.pop() - for v in range(0, top): - if self.original_matrix[v, top] and v not in visited_from_output: - visited_from_output.add(v) - frontier.append(v) - - # Any vertex that isn't connected to both input and output is extraneous to - # the computation graph. - extraneous = set(range(num_vertices)).difference( - visited_from_input.intersection(visited_from_output)) - - # If the non-extraneous graph is less than 2 vertices, the input is not - # connected to the output and the spec is invalid. - if len(extraneous) > num_vertices - 2: - self.matrix = None - self.ops = None - self.valid_spec = False - return - - self.matrix = np.delete(self.matrix, list(extraneous), axis=0) - self.matrix = np.delete(self.matrix, list(extraneous), axis=1) - for index in sorted(extraneous, reverse=True): - del self.ops[index] - - def hash_spec(self, canonical_ops): - """Computes the isomorphism-invariant graph hash of this spec. - - Args: - canonical_ops: list of operations in the canonical ordering which they - were assigned (i.e. the order provided in the config['available_ops']). - - Returns: - MD5 hash of this spec which can be used to query the dataset. - """ - # Invert the operations back to integer label indices used in graph gen. - labeling = [-1] + [canonical_ops.index(op) for op in self.ops[1:-1]] + [-2] - return hash_module(self.matrix, labeling) - - def visualize(self): - """Creates a dot graph. Can be visualized in colab directly.""" - num_vertices = np.shape(self.matrix)[0] - g = graphviz.Digraph() - g.node(str(0), 'input') - for v in range(1, num_vertices - 1): - g.node(str(v), self.ops[v]) - g.node(str(num_vertices - 1), 'output') - - for src in range(num_vertices - 1): - for dst in range(src + 1, num_vertices): - if self.matrix[src, dst]: - g.edge(str(src), str(dst)) - - return g - - @classmethod - def random_sample_one_architecture(cls, dataset_api: dict, min_size=7): - """ - This will sample a random architecture and update the edges in the - naslib object accordingly. - From the NASBench repository: - one-hot adjacency matrix - draw [0,1] for each slot in the adjacency matrix - """ - while True: - matrix = np.random.choice([0, 1], size=(NUM_VERTICES, NUM_VERTICES)) - matrix = np.triu(matrix, 1) - ops = np.random.choice(OPS, size=min_size).tolist() - ops[0] = INPUT - ops[-1] = OUTPUT - spec = dataset_api["api"].ModelSpec(matrix=matrix, ops=ops) - if not dataset_api["nb101_data"].is_valid(spec): - continue - - spec = NASBench101ModelSpec(matrix, ops) - # only sample model with 7 nodes. 
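- # (_prune() in the constructor may drop vertices disconnected from the - # input/output path, shrinking the matrix, so resample until all survive.)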
- if len(spec.matrix) == min_size: - break - - return spec - - -def is_upper_triangular(matrix): - """True if matrix is 0 on diagonal and below.""" - for src in range(np.shape(matrix)[0]): - for dst in range(0, src + 1): - if matrix[src, dst] != 0: - return False - - return True - - -def gen_is_edge_fn(bits): - """Generate a boolean function for the edge connectivity. - - Given a bitstring FEDCBA and a 4x4 matrix, the generated matrix is - [[0, A, B, D], - [0, 0, C, E], - [0, 0, 0, F], - [0, 0, 0, 0]] - - Note that this function is agnostic to the actual matrix dimension due to - order in which elements are filled out (column-major, starting from least - significant bit). For example, the same FEDCBA bitstring (0-padded) on a 5x5 - matrix is - [[0, A, B, D, 0], - [0, 0, C, E, 0], - [0, 0, 0, F, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0]] - - Args: - bits: integer which will be interpreted as a bit mask. - - Returns: - vectorized function that returns True when an edge is present. - """ - - def is_edge(x, y): - """Is there an edge from x to y (0-indexed)?""" - if x >= y: - return 0 - # Map x, y to index into bit string - index = x + (y * (y - 1) // 2) - return (bits >> index) % 2 == 1 - - return np.vectorize(is_edge) - - -def is_full_dag(matrix): - """Full DAG == all vertices on a path from vert 0 to (V-1). - - i.e. no disconnected or "hanging" vertices. - - It is sufficient to check for: - 1) no rows of 0 except for row V-1 (only output vertex has no out-edges) - 2) no cols of 0 except for col 0 (only input vertex has no in-edges) - - Args: - matrix: V x V upper-triangular adjacency matrix - - Returns: - True if there are no dangling vertices. - """ - shape = np.shape(matrix) - - rows = matrix[:shape[0] - 1, :] == 0 - rows = np.all(rows, axis=1) # Any row with all 0 will be True - rows_bad = np.any(rows) - - cols = matrix[:, 1:] == 0 - cols = np.all(cols, axis=0) # Any col with all 0 will be True - cols_bad = np.any(cols) - - return (not rows_bad) and (not cols_bad) - - -def num_edges(matrix): - """Computes number of edges in adjacency matrix.""" - return np.sum(matrix) - - -def hash_module(matrix, labeling): - """Computes a graph-invariance MD5 hash of the matrix and label pair. - - Args: - matrix: np.ndarray square upper-triangular adjacency matrix. - labeling: list of int labels of length equal to both dimensions of - matrix. - - Returns: - MD5 hash of the matrix and labeling. - """ - vertices = np.shape(matrix)[0] - in_edges = np.sum(matrix, axis=0).tolist() - out_edges = np.sum(matrix, axis=1).tolist() - - assert len(in_edges) == len(out_edges) == len(labeling) - hashes = list(zip(out_edges, in_edges, labeling)) - hashes = [hashlib.md5(str(h).encode('utf-8')).hexdigest() for h in hashes] - # Computing this up to the diameter is probably sufficient but since the - # operation is fast, it is okay to repeat more times. - for _ in range(vertices): - new_hashes = [] - for v in range(vertices): - in_neighbors = [hashes[w] for w in range(vertices) if matrix[w, v]] - out_neighbors = [hashes[w] for w in range(vertices) if matrix[v, w]] - new_hashes.append(hashlib.md5( - (''.join(sorted(in_neighbors)) + '|' + - ''.join(sorted(out_neighbors)) + '|' + - hashes[v]).encode('utf-8')).hexdigest()) - hashes = new_hashes - fingerprint = hashlib.md5(str(sorted(hashes)).encode('utf-8')).hexdigest() - - return fingerprint - - -def permute_graph(graph, label, permutation): - """Permutes the graph and labels based on permutation. - - Args: - graph: np.ndarray adjacency matrix.
- label: list of labels of same length as graph dimensions. - permutation: a permutation list of ints of same length as graph dimensions. - - Returns: - np.ndarray where vertex permutation[v] is vertex v from the original graph - """ - # vertex permutation[v] in new graph is vertex v in the old graph - forward_perm = zip(permutation, list(range(len(permutation)))) - inverse_perm = [x[1] for x in sorted(forward_perm)] - edge_fn = lambda x, y: graph[inverse_perm[x], inverse_perm[y]] == 1 - new_matrix = np.fromfunction(np.vectorize(edge_fn), - (len(label), len(label)), - dtype=np.int8) - new_label = [label[inverse_perm[i]] for i in range(len(label))] - return new_matrix, new_label - - -def is_isomorphic(graph1, graph2): - """Exhaustively checks if 2 graphs are isomorphic.""" - matrix1, label1 = np.array(graph1[0]), graph1[1] - matrix2, label2 = np.array(graph2[0]), graph2[1] - assert np.shape(matrix1) == np.shape(matrix2) - assert len(label1) == len(label2) - - vertices = np.shape(matrix1)[0] - # Note: input and output in our constrained graphs always map to themselves - # but this script does not enforce that. - for perm in itertools.permutations(range(0, vertices)): - pmatrix1, plabel1 = permute_graph(matrix1, label1, perm) - if np.array_equal(pmatrix1, matrix2) and plabel1 == label2: - return True - - return False diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/third_pkg/sp101_lib/nb101_api.py b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/third_pkg/sp101_lib/nb101_api.py deleted file mode 100644 index 0990d13cec..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/third_pkg/sp101_lib/nb101_api.py +++ /dev/null @@ -1,481 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - - -"""This is a NAS-Bench-101 version. - -Before using this API, download the data files from the links in the README. - -Usage: - # Load the data from file (this will take some time) - nasbench = api.NASBench('/path/to/pickle/or/shelve') - - # Create an Inception-like module (5x5 convolution replaced with two 3x3 - # convolutions). 
- model_spec = api.ModelSpec( - # Adjacency matrix of the module - matrix=[[0, 1, 1, 1, 0, 1, 0], # input layer - [0, 0, 0, 0, 0, 0, 1], # 1x1 conv - [0, 0, 0, 0, 0, 0, 1], # 3x3 conv - [0, 0, 0, 0, 1, 0, 0], # 5x5 conv (replaced by two 3x3's) - [0, 0, 0, 0, 0, 0, 1], # 5x5 conv (replaced by two 3x3's) - [0, 0, 0, 0, 0, 0, 1], # 3x3 max-pool - [0, 0, 0, 0, 0, 0, 0]], # output layer - # Operations at the vertices of the module, matches order of matrix - ops=[INPUT, CONV1X1, CONV3X3, CONV3X3, CONV3X3, MAXPOOL3X3, OUTPUT]) - - - # Query this model from dataset - data = nasbench.query(model_spec) - -Adjacency matrices are expected to be upper-triangular 0-1 matrices within the -defined search space (7 vertices, 9 edges, 3 allowed ops). The first and last -operations must be 'input' and 'output'. The other operations should be from -config['available_ops']. Currently, the available operations are: - CONV3X3 = "conv3x3-bn-relu" - CONV1X1 = "conv1x1-bn-relu" - MAXPOOL3X3 = "maxpool3x3" - -When querying a spec, the spec will first be automatically pruned (removing -unused vertices and edges along with ops). If the pruned spec is still out of -the search space, an OutOfDomainError will be raised, otherwise the data is -returned. - -The returned data object is a dictionary with the following keys: - - module_adjacency: numpy array for the adjacency matrix - - module_operations: list of operation labels - - trainable_parameters: number of trainable parameters in the model - - training_time: the total training time in seconds up to this point - - train_accuracy: training accuracy - - validation_accuracy: validation accuracy - - test_accuracy: testing accuracy - -Instead of querying the dataset for a single run of a model, it is also possible -to retrieve all metrics for a given spec, using: - - fixed_stats, computed_stats = nasbench.get_metrics_from_spec(model_spec) - -The fixed_stats is a dictionary with the keys: - - module_adjacency - - module_operations - - trainable_parameters - -The computed_stats is a dictionary from epoch count to a list of metric -dicts. For example, computed_stats[108][0] contains the metrics for the first -repeat of the provided model trained to 108 epochs. The available keys are: - - halfway_training_time - - halfway_train_accuracy - - halfway_validation_accuracy - - halfway_test_accuracy - - final_training_time - - final_train_accuracy - - final_validation_accuracy - - final_test_accuracy -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import copy -import random -import time -import shelve -import hashlib -import _pickle as pickle -import numpy as np - - -class OutOfDomainError(Exception): - """Indicates that the requested graph is outside of the search domain.""" - - -class NASBench(object): - """User-facing API for accessing the NASBench dataset.""" - - def __init__(self, dataset_file, seed=None, data_format='pickle'): - """Initialize dataset, this should only be done once per experiment. - - Args: - dataset_file: path to the pickle or shelve file containing the dataset - (see data_format). - seed: random seed used for sampling queried models. Two NASBench objects - created with the same seed will return the same data points when queried - with the same models in the same order. By default, the seed is randomly - generated.
- """ - self.config = { - 'module_vertices': 7, - 'max_edges': 9, - 'num_repeats': 3, - 'available_ops': ['conv3x3-bn-relu', 'conv1x1-bn-relu', 'maxpool3x3'], - } - random.seed(seed) - - print('Loading dataset from file... This may take a few minutes...') - start = time.time() - - # Stores the fixed statistics that are independent of evaluation (i.e., - # adjacency matrix, operations, and number of parameters). - # hash --> metric name --> scalar - self.fixed_statistics = {} - - # Stores the statistics that are computed via training and evaluating the - # model on CIFAR-10. Statistics are computed for multiple repeats of each - # model at each max epoch length. - # hash --> epochs --> repeat index --> metric name --> scalar - self.computed_statistics = {} - - # Valid queriable epoch lengths. {4, 12, 36, 108} for the full dataset or - # {108} for the smaller dataset with only the 108 epochs. - self.valid_epochs = set() - - # open the database - if data_format == 'shelve': - with shelve.open(dataset_file, 'r') as shelf: - for module_hash in shelf: - # Parse the data from the data file. - fixed_statistics, computed_statistics = shelf[module_hash] - - self.fixed_statistics[module_hash] = fixed_statistics - self.computed_statistics[module_hash] = computed_statistics - - self.valid_epochs.update(set(computed_statistics.keys())) - elif data_format == 'pickle': - with open(dataset_file, 'rb') as f: - data = pickle.load(f) - for module_hash, stats in data.items(): - self.fixed_statistics[module_hash] = stats[0] - self.computed_statistics[module_hash] = stats[1] - - self.valid_epochs.update(set(stats[1].keys())) - else: - raise Exception('Data format not supported') - - elapsed = time.time() - start - print('Loaded dataset in %d seconds' % elapsed) - - self.history = {} - self.training_time_spent = 0.0 - self.total_epochs_spent = 0 - - def query(self, model_spec, epochs=108, stop_halfway=False): - """Fetch one of the evaluations for this model spec. - - Each call will sample one of the config['num_repeats'] evaluations of the - model. This means that repeated queries of the same model (or isomorphic - models) may return identical metrics. - - This function will increment the budget counters for benchmarking purposes. - See self.training_time_spent, and self.total_epochs_spent. - - This function also allows querying the evaluation metrics at the halfway - point of training using stop_halfway. Using this option will increment the - budget counters only up to the halfway point. - - Args: - model_spec: ModelSpec object. - epochs: number of epochs trained. Must be one of the evaluated number of - epochs, [4, 12, 36, 108] for the full dataset. - stop_halfway: if True, returned dict will only contain the training time - and accuracies at the halfway point of training (num_epochs/2). - Otherwise, returns the time and accuracies at the end of training - (num_epochs). - - Returns: - dict containing the evaluated data for this object. - - Raises: - OutOfDomainError: if model_spec or num_epochs is outside the search space. 
- """ - if epochs not in self.valid_epochs: - raise OutOfDomainError('invalid number of epochs, must be one of %s' - % self.valid_epochs) - - fixed_stat, computed_stat = self.get_metrics_from_spec(model_spec) - sampled_index = random.randint(0, self.config['num_repeats'] - 1) - computed_stat = computed_stat[epochs][sampled_index] - - data = {} - data['module_adjacency'] = fixed_stat['module_adjacency'] - data['module_operations'] = fixed_stat['module_operations'] - data['trainable_parameters'] = fixed_stat['trainable_parameters'] - - if stop_halfway: - data['training_time'] = computed_stat['halfway_training_time'] - data['train_accuracy'] = computed_stat['halfway_train_accuracy'] - data['validation_accuracy'] = computed_stat['halfway_validation_accuracy'] - data['test_accuracy'] = computed_stat['halfway_test_accuracy'] - else: - data['training_time'] = computed_stat['final_training_time'] - data['train_accuracy'] = computed_stat['final_train_accuracy'] - data['validation_accuracy'] = computed_stat['final_validation_accuracy'] - data['test_accuracy'] = computed_stat['final_test_accuracy'] - - self.training_time_spent += data['training_time'] - if stop_halfway: - self.total_epochs_spent += epochs // 2 - else: - self.total_epochs_spent += epochs - - return data - - def is_valid(self, model_spec): - """Checks the validity of the model_spec. - - For the purposes of benchmarking, this does not increment the budget - counters. - - Args: - model_spec: ModelSpec object. - - Returns: - True if model is within space. - """ - try: - self._check_spec(model_spec) - except OutOfDomainError: - return False - - return True - - def get_budget_counters(self): - """Returns the time and budget counters.""" - return self.training_time_spent, self.total_epochs_spent - - def reset_budget_counters(self): - """Reset the time and epoch budget counters.""" - self.training_time_spent = 0.0 - self.total_epochs_spent = 0 - - def hash_iterator(self): - """Returns iterator over all unique model hashes.""" - return self.fixed_statistics.keys() - - def get_metrics_from_hash(self, module_hash): - """Returns the metrics for all epochs and all repeats of a hash. - - This method is for dataset analysis and should not be used for benchmarking. - As such, it does not increment any of the budget counters. - - Args: - module_hash: MD5 hash, i.e., the values yielded by hash_iterator(). - - Returns: - fixed stats and computed stats of the model spec provided. - """ - fixed_stat = copy.deepcopy(self.fixed_statistics[module_hash]) - computed_stat = copy.deepcopy(self.computed_statistics[module_hash]) - return fixed_stat, computed_stat - - def get_metrics_from_spec(self, model_spec): - """Returns the metrics for all epochs and all repeats of a model. - - This method is for dataset analysis and should not be used for benchmarking. - As such, it does not increment any of the budget counters. - - Args: - model_spec: ModelSpec object. - - Returns: - fixed stats and computed stats of the model spec provided. 
- """ - self._check_spec(model_spec) - module_hash = self._hash_spec(model_spec) - return self.get_metrics_from_hash(module_hash) - - def _check_spec(self, model_spec): - """Checks that the model spec is within the dataset.""" - if not model_spec.valid_spec: - raise OutOfDomainError('invalid spec, provided graph is disconnected.') - - num_vertices = len(model_spec.ops) - num_edges = np.sum(model_spec.matrix) - - if num_vertices > self.config['module_vertices']: - raise OutOfDomainError('too many vertices, got %d (max vertices = %d)' - % (num_vertices, config['module_vertices'])) - - if num_edges > self.config['max_edges']: - raise OutOfDomainError('too many edges, got %d (max edges = %d)' - % (num_edges, self.config['max_edges'])) - - if model_spec.ops[0] != 'input': - raise OutOfDomainError('first operation should be \'input\'') - if model_spec.ops[-1] != 'output': - raise OutOfDomainError('last operation should be \'output\'') - for op in model_spec.ops[1:-1]: - if op not in self.config['available_ops']: - raise OutOfDomainError('unsupported op %s (available ops = %s)' - % (op, self.config['available_ops'])) - - def _hash_spec(self, model_spec): - """Returns the MD5 hash for a provided model_spec.""" - return model_spec.hash_spec(self.config['available_ops']) - - -class ModelSpec(object): - """Model specification given adjacency matrix and labeling.""" - - def __init__(self, matrix, ops, data_format='channels_last'): - """Initialize the module spec. - - Args: - matrix: ndarray or nested list with shape [V, V] for the adjacency matrix. - ops: V-length list of labels for the base ops used. The first and last - elements are ignored because they are the input and output vertices - which have no operations. The elements are retained to keep consistent - indexing. - data_format: channels_last or channels_first. - - Raises: - ValueError: invalid matrix or ops - """ - if not isinstance(matrix, np.ndarray): - matrix = np.array(matrix) - shape = np.shape(matrix) - if len(shape) != 2 or shape[0] != shape[1]: - raise ValueError('matrix must be square') - if shape[0] != len(ops): - raise ValueError('length of ops must match matrix dimensions') - if not is_upper_triangular(matrix): - raise ValueError('matrix must be upper triangular') - - # Both the original and pruned matrices are deep copies of the matrix and - # ops so any changes to those after initialization are not recognized by the - # spec. - self.original_matrix = copy.deepcopy(matrix) - self.original_ops = copy.deepcopy(ops) - - self.matrix = copy.deepcopy(matrix) - self.ops = copy.deepcopy(ops) - self.valid_spec = True - self._prune() - - self.data_format = data_format - - def _prune(self): - """Prune the extraneous parts of the graph. - - General procedure: - 1) Remove parts of graph not connected to input. - 2) Remove parts of graph not connected to output. - 3) Reorder the vertices so that they are consecutive after steps 1 and 2. - - These 3 steps can be combined by deleting the rows and columns of the - vertices that are not reachable from both the input and output (in reverse). 
- """ - num_vertices = np.shape(self.original_matrix)[0] - - # DFS forward from input - visited_from_input = set([0]) - frontier = [0] - while frontier: - top = frontier.pop() - for v in range(top + 1, num_vertices): - if self.original_matrix[top, v] and v not in visited_from_input: - visited_from_input.add(v) - frontier.append(v) - - # DFS backward from output - visited_from_output = set([num_vertices - 1]) - frontier = [num_vertices - 1] - while frontier: - top = frontier.pop() - for v in range(0, top): - if self.original_matrix[v, top] and v not in visited_from_output: - visited_from_output.add(v) - frontier.append(v) - - # Any vertex that isn't connected to both input and output is extraneous to - # the computation graph. - extraneous = set(range(num_vertices)).difference( - visited_from_input.intersection(visited_from_output)) - - # If the non-extraneous graph is less than 2 vertices, the input is not - # connected to the output and the spec is invalid. - if len(extraneous) > num_vertices - 2: - self.matrix = None - self.ops = None - self.valid_spec = False - return - - self.matrix = np.delete(self.matrix, list(extraneous), axis=0) - self.matrix = np.delete(self.matrix, list(extraneous), axis=1) - for index in sorted(extraneous, reverse=True): - del self.ops[index] - - def hash_spec(self, canonical_ops): - """Computes the isomorphism-invariant graph hash of this spec. - - Args: - canonical_ops: list of operations in the canonical ordering which they - were assigned (i.e. the order provided in the config['available_ops']). - - Returns: - MD5 hash of this spec which can be used to query the dataset. - """ - # Invert the operations back to integer label indices used in graph gen. - labeling = [-1] + [canonical_ops.index(op) for op in self.ops[1:-1]] + [-2] - return hash_module(self.matrix, labeling) - - -def is_upper_triangular(matrix): - """True if matrix is 0 on diagonal and below.""" - for src in range(np.shape(matrix)[0]): - for dst in range(0, src + 1): - if matrix[src, dst] != 0: - return False - - return True - - -def hash_module(matrix, labeling): - """Computes a graph-invariance MD5 hash of the matrix and label pair. - - Args: - matrix: np.ndarray square upper-triangular adjacency matrix. - labeling: list of int labels of length equal to both dimensions of - matrix. - - Returns: - MD5 hash of the matrix and labeling. - """ - vertices = np.shape(matrix)[0] - in_edges = np.sum(matrix, axis=0).tolist() - out_edges = np.sum(matrix, axis=1).tolist() - - assert len(in_edges) == len(out_edges) == len(labeling) - hashes = list(zip(out_edges, in_edges, labeling)) - hashes = [hashlib.md5(str(h).encode('utf-8')).hexdigest() for h in hashes] - # Computing this up to the diameter is probably sufficient but since the - # operation is fast, it is okay to repeat more times. 
- for _ in range(vertices): - new_hashes = [] - for v in range(vertices): - in_neighbors = [hashes[w] for w in range(vertices) if matrix[w, v]] - out_neighbors = [hashes[w] for w in range(vertices) if matrix[v, w]] - new_hashes.append(hashlib.md5( - (''.join(sorted(in_neighbors)) + '|' + - ''.join(sorted(out_neighbors)) + '|' + - hashes[v]).encode('utf-8')).hexdigest()) - hashes = new_hashes - fingerprint = hashlib.md5(str(sorted(hashes)).encode('utf-8')).hexdigest() - - return fingerprint - - diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/third_pkg/sp201_lib/nasbench2.py b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/third_pkg/sp201_lib/nasbench2.py deleted file mode 100644 index e845b6442f..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/third_pkg/sp201_lib/nasbench2.py +++ /dev/null @@ -1,117 +0,0 @@ -# Copyright 2021 Samsung Electronics Co., Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================= - -from .nasbench2_ops import * - - -def gen_searchcell_mask_from_arch_str(arch_str): - nodes = arch_str.split('+') - nodes = [node[1:-1].split('|') for node in nodes] - nodes = [[op_and_input.split('~') for op_and_input in node] for node in nodes] - - keep_mask = [] - for curr_node_idx in range(len(nodes)): - for prev_node_idx in range(curr_node_idx+1): - _op = [edge[0] for edge in nodes[curr_node_idx] if int(edge[1]) == prev_node_idx] - assert len(_op) == 1, 'The arch string does not follow the assumption of 1 connection between two nodes.' 
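- # keep_mask is edge-major: one boolean per operation in OPS for every - # (node, predecessor) edge, so len(keep_mask) == num_edges * len(OPS); - # SearchCell consumes the mask in exactly this order.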
- for _op_name in OPS.keys(): - keep_mask.append(_op[0] == _op_name) - return keep_mask - - -def get_model_from_arch_str(arch_str, num_classes, use_bn=True, init_channels=16): - keep_mask = gen_searchcell_mask_from_arch_str(arch_str) - net = NAS201Model(arch_str=arch_str, num_classes=num_classes, use_bn=use_bn, keep_mask=keep_mask, stem_ch=init_channels) - return net - - -def get_super_model(arch_str, num_classes, use_bn=True): - # arch_str was previously read from an undefined global; take it explicitly. - net = NAS201Model(arch_str=arch_str, num_classes=num_classes, use_bn=use_bn) - return net - - -class NAS201Model(nn.Module): - - def __init__(self, arch_str, num_classes, use_bn=True, keep_mask=None, stem_ch=16): - super(NAS201Model, self).__init__() - self.arch_str = arch_str - self.num_classes = num_classes - self.use_bn = use_bn - self.stem_ch = stem_ch - - self.stem = stem(out_channels=stem_ch, use_bn=use_bn) - self.stack_cell1 = nn.Sequential(*[SearchCell(in_channels=stem_ch, out_channels=stem_ch, stride=1, affine=False, track_running_stats=False, use_bn=use_bn, keep_mask=keep_mask) for i in range(5)]) - self.reduction1 = reduction(in_channels=stem_ch, out_channels=stem_ch*2) - self.stack_cell2 = nn.Sequential(*[SearchCell(in_channels=stem_ch*2, out_channels=stem_ch*2, stride=1, affine=False, track_running_stats=False, use_bn=use_bn, keep_mask=keep_mask) for i in range(5)]) - self.reduction2 = reduction(in_channels=stem_ch*2, out_channels=stem_ch*4) - self.stack_cell3 = nn.Sequential(*[SearchCell(in_channels=stem_ch*4, out_channels=stem_ch*4, stride=1, affine=False, track_running_stats=False, use_bn=use_bn, keep_mask=keep_mask) for i in range(5)]) - self.top = top(in_dims=stem_ch*4, num_classes=num_classes, use_bn=use_bn) - - def forward(self, x): - x = self.stem(x) - - x = self.stack_cell1(x) - x = self.reduction1(x) - - x = self.stack_cell2(x) - x = self.reduction2(x) - - x = self.stack_cell3(x) - - x = self.top(x) - return x - - def get_prunable_copy(self, bn=False): - model_new = get_model_from_arch_str(self.arch_str, self.num_classes, use_bn=bn, init_channels=self.stem_ch) - - #TODO this is quite brittle and doesn't work with nn.Sequential when bn is different - # it is only required to maintain initialization -- maybe init after get_punable_copy? - model_new.load_state_dict(self.state_dict(), strict=False) - model_new.train() - - return model_new - - -def get_arch_str_from_model(net): - search_cell = net.stack_cell1[0].options - keep_mask = net.stack_cell1[0].keep_mask - num_nodes = net.stack_cell1[0].num_nodes - - nodes = [] - idx = 0 - for curr_node in range(num_nodes -1): - edges = [] - for prev_node in range(curr_node+1): # n-1 prev nodes - for _op_name in OPS.keys(): - if keep_mask[idx]: - edges.append(f'{_op_name}~{prev_node}') - idx += 1 - node_str = '|'.join(edges) - node_str = f'|{node_str}|' - nodes.append(node_str) - arch_str = '+'.join(nodes) - return arch_str - - -if __name__ == "__main__": - arch_str = '|nor_conv_3x3~0|+|none~0|none~1|+|avg_pool_3x3~0|nor_conv_3x3~1|nor_conv_3x3~2|' - - n = get_model_from_arch_str(arch_str=arch_str, num_classes=10) - print(n.stack_cell1[0]) - - arch_str2 = get_arch_str_from_model(n) - print(arch_str) - print(arch_str2) - print(f'Are the two arch strings same? 
{arch_str == arch_str2}') diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/third_pkg/sp201_lib/nasbench2_ops.py b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/third_pkg/sp201_lib/nasbench2_ops.py deleted file mode 100644 index efcdba3224..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/third_pkg/sp201_lib/nasbench2_ops.py +++ /dev/null @@ -1,160 +0,0 @@ -# Copyright 2021 Samsung Electronics Co., Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================= - -import torch.nn as nn - -class ReLUConvBN(nn.Module): - - def __init__(self, in_channels, out_channels, kernel_size, stride, padding, dilation, affine, track_running_stats=True, use_bn=True, name='ReLUConvBN'): - super(ReLUConvBN, self).__init__() - self.name = name - if use_bn: - self.op = nn.Sequential( - nn.ReLU(inplace=False), - nn.Conv2d(in_channels, out_channels, kernel_size, stride=stride, padding=padding, dilation=dilation, bias=not affine), - nn.BatchNorm2d(out_channels, affine=affine, track_running_stats=track_running_stats) - ) - else: - self.op = nn.Sequential( - nn.ReLU(inplace=False), - nn.Conv2d(in_channels, out_channels, kernel_size, stride=stride, padding=padding, dilation=dilation, bias=not affine) - ) - - def forward(self, x): - return self.op(x) - -class Identity(nn.Module): - def __init__(self, name='Identity'): - self.name = name - super(Identity, self).__init__() - - def forward(self, x): - return x - -class Zero(nn.Module): - - def __init__(self, stride, name='Zero'): - self.name = name - super(Zero, self).__init__() - self.stride = stride - - def forward(self, x): - if self.stride == 1: - return x.mul(0.) - return x[:,:,::self.stride,::self.stride].mul(0.) 
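To make the op wrappers above concrete, a small hedged sanity check (assumes torch and the classes defined in this file are in scope):

```python
import torch

x = torch.randn(2, 16, 8, 8)
assert Identity()(x).shape == x.shape
assert torch.all(Zero(stride=1)(x) == 0)         # same shape, all zeros
assert Zero(stride=2)(x).shape == (2, 16, 4, 4)  # zeros, spatially subsampled
```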
- -class POOLING(nn.Module): - def __init__(self, kernel_size, stride, padding, name='POOLING'): - super(POOLING, self).__init__() - self.name = name - self.avgpool = nn.AvgPool2d(kernel_size=kernel_size, stride=1, padding=1, count_include_pad=False) - - def forward(self, x): - return self.avgpool(x) - - -class reduction(nn.Module): - def __init__(self, in_channels, out_channels): - super(reduction, self).__init__() - self.residual = nn.Sequential( - nn.AvgPool2d(kernel_size=2, stride=2, padding=0), - nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, stride=1, padding=0, bias=False)) - - self.conv_a = ReLUConvBN(in_channels=in_channels, out_channels=out_channels, kernel_size=3, stride=2, padding=1, dilation=1, affine=True, track_running_stats=True) - self.conv_b = ReLUConvBN(in_channels=out_channels, out_channels=out_channels, kernel_size=3, stride=1, padding=1, dilation=1, affine=True, track_running_stats=True) - - def forward(self, x): - basicblock = self.conv_a(x) - basicblock = self.conv_b(basicblock) - residual = self.residual(x) - return residual + basicblock - -class stem(nn.Module): - def __init__(self, out_channels, use_bn=True): - super(stem, self).__init__() - if use_bn: - self.net = nn.Sequential( - nn.Conv2d(in_channels=3, out_channels=out_channels, kernel_size=3, padding=1, bias=False), - nn.BatchNorm2d(out_channels)) - else: - self.net = nn.Sequential( - nn.Conv2d(in_channels=3, out_channels=out_channels, kernel_size=3, padding=1, bias=False) - ) - - def forward(self, x): - return self.net(x) - -class top(nn.Module): - def __init__(self, in_dims, num_classes, use_bn=True): - super(top, self).__init__() - if use_bn: - self.lastact = nn.Sequential(nn.BatchNorm2d(in_dims), nn.ReLU(inplace=True)) - else: - self.lastact = nn.ReLU(inplace=True) - self.global_pooling = nn.AdaptiveAvgPool2d(1) - self.classifier = nn.Linear(in_dims, num_classes) - - def forward(self, x): - x = self.lastact(x) - x = self.global_pooling(x) - x = x.view(x.size(0), -1) - logits = self.classifier(x) - return logits - - -class SearchCell(nn.Module): - - def __init__(self, in_channels, out_channels, stride, affine, track_running_stats, use_bn=True, num_nodes=4, keep_mask=None): - super(SearchCell, self).__init__() - self.num_nodes = num_nodes - self.options = nn.ModuleList() - for curr_node in range(self.num_nodes-1): - for prev_node in range(curr_node+1): - for _op_name in OPS.keys(): - op = OPS[_op_name](in_channels, out_channels, stride, affine, track_running_stats, use_bn) - self.options.append(op) - - if keep_mask is not None: - self.keep_mask = keep_mask - else: - self.keep_mask = [True]*len(self.options) - - def forward(self, x): - outs = [x] - - idx = 0 - for curr_node in range(self.num_nodes-1): - edges_in = [] - for prev_node in range(curr_node+1): # n-1 prev nodes - for op_idx in range(len(OPS.keys())): - if self.keep_mask[idx]: - edges_in.append(self.options[idx](outs[prev_node])) - idx += 1 - node_output = sum(edges_in) - outs.append(node_output) - - return outs[-1] - - - -OPS = { - 'none' : lambda in_channels, out_channels, stride, affine, track_running_stats, use_bn: Zero(stride, name='none'), - 'avg_pool_3x3' : lambda in_channels, out_channels, stride, affine, track_running_stats, use_bn: POOLING(3, 1, 1, name='avg_pool_3x3'), - 'nor_conv_3x3' : lambda in_channels, out_channels, stride, affine, track_running_stats, use_bn: ReLUConvBN(in_channels, out_channels, 3, 1, 1, 1, affine, track_running_stats, use_bn, name='nor_conv_3x3'), - 'nor_conv_1x1' : lambda 
in_channels, out_channels, stride, affine, track_running_stats, use_bn: ReLUConvBN(in_channels, out_channels, 1, 1, 0, 1, affine, track_running_stats, use_bn, name='nor_conv_1x1'), - 'skip_connect' : lambda in_channels, out_channels, stride, affine, track_running_stats, use_bn: Identity(name='skip_connect'), -} - - diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/tools/__init__.py b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/tools/__init__.py deleted file mode 100644 index 3df60b02f7..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/tools/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/sql/filter_phase.sql b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/sql/filter_phase.sql deleted file mode 100644 index 0fbfc05aef..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/sql/filter_phase.sql +++ /dev/null @@ -1,54 +0,0 @@ -/************************************************************ -* -* Licensed to the Apache Software Foundation (ASF) under one -* or more contributor license agreements. See the NOTICE file -* distributed with this work for additional information -* regarding copyright ownership. The ASF licenses this file -* to you under the Apache License, Version 2.0 (the -* "License"); you may not use this file except in compliance -* with the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, -* software distributed under the License is distributed on an -* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -* KIND, either express or implied. See the License for the -* specific language governing permissions and limitations -* under the License. -* -*************************************************************/ - -CREATE OR REPLACE -PROCEDURE model_selection_sp( - dataset TEXT, --dataset name - selected_columns TEXT[], --used columns - N INTEGER, --number of models to evaluate - batch_size INTEGER, --batch size, for profiling, filtering - config_file TEXT --config file path -) -LANGUAGE plpgsql -AS $$ -DECLARE - -- global inputs/outputs - result_status TEXT; - column_list TEXT; -BEGIN - -- combine the columns into a string - column_list := array_to_string(selected_columns, ', '); - - -- 4. Run filtering phase to get top K models. 
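-- Worked example (illustrative values only, not part of the original file):
-- with dataset = 'frappe_train', column_list = 'col1, col2, label',
-- batch_size = 32 and N = 10, the EXECUTE format(...) below expands to roughly:
--
--   WITH batch_rows AS (
--       SELECT col1, col2, label
--       FROM frappe_train
--       ORDER BY RANDOM()
--       LIMIT 32 OFFSET 0
--   )
--   SELECT filtering_phase(json_agg(row_to_json(t))::text, 10, 1, '<config_file>')
--   FROM batch_rows AS t;
--
-- i.e. it samples one random mini-batch, serializes it to JSON, and calls the
-- filtering_phase UDF with n = N and k = 1.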
- EXECUTE format(' - WITH batch_rows AS ( - SELECT %s - FROM %I - ORDER BY RANDOM() - LIMIT %s OFFSET 0 - ) - SELECT filtering_phase( - json_agg(row_to_json(t))::text, %s, %s, %L - ) - FROM batch_rows AS t', column_list, dataset, batch_size, N, 1, config_file) INTO result_status; - RAISE NOTICE '4. run filtering phase, k models = %', result_status; - -END; $$; diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/sql/pg_extension.sql b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/sql/pg_extension.sql deleted file mode 100644 index 052f3cb5e1..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/sql/pg_extension.sql +++ /dev/null @@ -1,141 +0,0 @@ -/************************************************************ -* -* Licensed to the Apache Software Foundation (ASF) under one -* or more contributor license agreements. See the NOTICE file -* distributed with this work for additional information -* regarding copyright ownership. The ASF licenses this file -* to you under the Apache License, Version 2.0 (the -* "License"); you may not use this file except in compliance -* with the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, -* software distributed under the License is distributed on an -* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -* KIND, either express or implied. See the License for the -* specific language governing permissions and limitations -* under the License. -* -*************************************************************/ - --- src/lib.rs:66 --- pg_extension::filtering_phase -CREATE FUNCTION "filtering_phase"( - "mini_batch" TEXT, /* alloc::string::String */ - "n" INT, /* i32 */ - "k" INT, /* i32 */ - "config_file" TEXT /* alloc::string::String */ -) RETURNS TEXT /* alloc::string::String */ - IMMUTABLE STRICT PARALLEL SAFE -LANGUAGE c /* Rust */ -AS 'MODULE_PATHNAME', 'filtering_phase_wrapper'; - --- src/lib.rs:16 --- pg_extension::profiling_filtering_phase -CREATE FUNCTION "profiling_filtering_phase"( - "mini_batch" TEXT, /* alloc::string::String */ - "config_file" TEXT /* alloc::string::String */ -) RETURNS TEXT /* alloc::string::String */ - IMMUTABLE STRICT PARALLEL SAFE -LANGUAGE c /* Rust */ -AS 'MODULE_PATHNAME', 'profiling_filtering_phase_wrapper'; - --- src/lib.rs:80 --- pg_extension::refinement_phase -CREATE FUNCTION "refinement_phase"( - "config_file" TEXT /* alloc::string::String */ -) RETURNS TEXT /* alloc::string::String */ - IMMUTABLE STRICT PARALLEL SAFE -LANGUAGE c /* Rust */ -AS 'MODULE_PATHNAME', 'refinement_phase_wrapper'; - --- src/lib.rs:31 --- pg_extension::profiling_refinement_phase -CREATE FUNCTION "profiling_refinement_phase"( - "mini_batch" TEXT, /* alloc::string::String */ - "config_file" TEXT /* alloc::string::String */ -) RETURNS TEXT /* alloc::string::String */ - IMMUTABLE STRICT PARALLEL SAFE -LANGUAGE c /* Rust */ -AS 'MODULE_PATHNAME', 'profiling_refinement_phase_wrapper'; - --- src/lib.rs:46 --- pg_extension::coordinator -CREATE FUNCTION "coordinator"( - "time_score" TEXT, /* alloc::string::String */ - "time_train" TEXT, /* alloc::string::String */ - "time_budget" TEXT, /* alloc::string::String */ - "only_phase1" bool, /* bool */ - "config_file" TEXT /* alloc::string::String */ -) RETURNS TEXT /* alloc::string::String */ - IMMUTABLE STRICT PARALLEL SAFE -LANGUAGE c /* Rust */ -AS 
'MODULE_PATHNAME', 'coordinator_wrapper'; - --- src/lib.rs:94 --- pg_extension::model_selection -CREATE FUNCTION "model_selection"( - "mini_batch" TEXT, /* alloc::string::String */ - "time_budget" TEXT, /* alloc::string::String */ - "config_file" TEXT /* alloc::string::String */ -) RETURNS TEXT /* alloc::string::String */ - IMMUTABLE STRICT PARALLEL SAFE -LANGUAGE c /* Rust */ -AS 'MODULE_PATHNAME', 'model_selection_wrapper'; - --- src/lib.rs:110 --- pg_extension::model_selection_workloads -CREATE FUNCTION "model_selection_workloads"( - "mini_batch" TEXT, /* alloc::string::String */ - "n" INT, /* i32 */ - "k" INT, /* i32 */ - "config_file" TEXT /* alloc::string::String */ -) RETURNS TEXT /* alloc::string::String */ - IMMUTABLE STRICT PARALLEL SAFE -LANGUAGE c /* Rust */ -AS 'MODULE_PATHNAME', 'model_selection_workloads_wrapper'; - --- src/lib.rs:125 --- pg_extension::model_selection_trails -CREATE FUNCTION "model_selection_trails"( - "mini_batch" TEXT, /* alloc::string::String */ - "time_budget" TEXT, /* alloc::string::String */ - "config_file" TEXT /* alloc::string::String */ -) RETURNS TEXT /* alloc::string::String */ - IMMUTABLE STRICT PARALLEL SAFE -LANGUAGE c /* Rust */ -AS 'MODULE_PATHNAME', 'model_selection_trails_wrapper'; - --- src/lib.rs:138 --- pg_extension::model_selection_trails_workloads -CREATE FUNCTION "model_selection_trails_workloads"( - "mini_batch" TEXT, /* alloc::string::String */ - "n" INT, /* i32 */ - "k" INT, /* i32 */ - "config_file" TEXT /* alloc::string::String */ -) RETURNS TEXT /* alloc::string::String */ - IMMUTABLE STRICT PARALLEL SAFE -LANGUAGE c /* Rust */ -AS 'MODULE_PATHNAME', 'model_selection_trails_workloads_wrapper'; - --- src/lib.rs:152 --- pg_extension::benchmark_filtering_phase_latency -CREATE FUNCTION "benchmark_filtering_phase_latency"( - "explore_models" INT, /* i32 */ - "config_file" TEXT /* alloc::string::String */ -) RETURNS TEXT /* alloc::string::String */ - IMMUTABLE STRICT PARALLEL SAFE -LANGUAGE c /* Rust */ -AS 'MODULE_PATHNAME', 'benchmark_filtering_phase_latency_wrapper'; - --- src/lib.rs:163 --- pg_extension::benchmark_filtering_latency_in_db -CREATE FUNCTION "benchmark_filtering_latency_in_db"( - "explore_models" INT, /* i32 */ - "dataset" TEXT, /* alloc::string::String */ - "config_file" TEXT /* alloc::string::String */ -) RETURNS TEXT /* alloc::string::String */ - IMMUTABLE STRICT PARALLEL SAFE -LANGUAGE c /* Rust */ -AS 'MODULE_PATHNAME', 'benchmark_filtering_latency_in_db_wrapper'; \ No newline at end of file diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/src/bindings/model.rs b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/src/bindings/model.rs deleted file mode 100644 index 61268fea48..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/src/bindings/model.rs +++ /dev/null @@ -1,39 +0,0 @@ -/************************************************************ -* -* Licensed to the Apache Software Foundation (ASF) under one -* or more contributor license agreements. See the NOTICE file -* distributed with this work for additional information -* regarding copyright ownership. The ASF licenses this file -* to you under the Apache License, Version 2.0 (the -* "License"); you may not use this file except in compliance -* with the License. 
You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, -* software distributed under the License is distributed on an -* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -* KIND, either express or implied. See the License for the -* specific language governing permissions and limitations -* under the License. -* -*************************************************************/ - -use serde::{Serialize, Deserialize}; - - -#[derive(Debug, Serialize, Deserialize)] -pub(crate) struct Frappe { - pub(crate) id: i32, - pub(crate) label: i32, - pub(crate) col1: String, - pub(crate) col2: String, - pub(crate) col3: String, - pub(crate) col4: String, - pub(crate) col5: String, - pub(crate) col6: String, - pub(crate) col7: String, - pub(crate) col8: String, - pub(crate) col9: String, - pub(crate) col10: String, -} diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/src/bindings/ms.rs b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/src/bindings/ms.rs deleted file mode 100644 index ca946aa091..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/src/bindings/ms.rs +++ /dev/null @@ -1,228 +0,0 @@ -/************************************************************ -* -* Licensed to the Apache Software Foundation (ASF) under one -* or more contributor license agreements. See the NOTICE file -* distributed with this work for additional information -* regarding copyright ownership. The ASF licenses this file -* to you under the Apache License, Version 2.0 (the -* "License"); you may not use this file except in compliance -* with the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, -* software distributed under the License is distributed on an -* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -* KIND, either express or implied. See the License for the -* specific language governing permissions and limitations -* under the License. 
-* -*************************************************************/ - -use serde_json::json; -use std::collections::HashMap; -use pgrx::prelude::*; -use crate::bindings::ml_register::PY_MODULE; -use crate::bindings::ml_register::run_python_function; -use std::time::{Instant, Duration}; - - -pub fn profiling_filtering_phase( - task: &String -) -> serde_json::Value { - run_python_function(&PY_MODULE, task, "profiling_filtering_phase") -} - - -pub fn profiling_refinement_phase( - task: &String -) -> serde_json::Value { - run_python_function(&PY_MODULE, task, "profiling_refinement_phase") -} - - -pub fn coordinator( - task: &String -) -> serde_json::Value { - run_python_function(&PY_MODULE, task, "coordinator") -} - - -pub fn filtering_phase( - task: &String -) -> serde_json::Value { - run_python_function(&PY_MODULE, task, "filtering_phase_dataLoader") -} - - -pub fn refinement_phase() -> serde_json::Value { - let task = "refinement_phase".to_string(); - run_python_function(&PY_MODULE, &task, "refinement_phase") -} - - -// these two run filtering + refinement in the UDF runtime -pub fn model_selection( - task: &String -) -> serde_json::Value { - run_python_function(&PY_MODULE, task, "model_selection") -} - - -pub fn model_selection_workloads( - task: &String -) -> serde_json::Value { - run_python_function(&PY_MODULE, task, "model_selection_workloads") -} - - -// these two run filtering + refinement on the GPU server -pub fn model_selection_trails( - task: &String -) -> serde_json::Value { - run_python_function(&PY_MODULE, task, "model_selection_trails") -} - - -pub fn model_selection_trails_workloads( - task: &String -) -> serde_json::Value { - run_python_function(&PY_MODULE, task, "model_selection_trails_workloads") -} - -// micro benchmarks - -pub fn benchmark_filtering_phase_latency( - task: &String -) -> serde_json::Value { - run_python_function(&PY_MODULE, task, "benchmark_filtering_phase_latency") -} - -pub fn benchmark_filtering_latency_in_db( - explore_models: i32, dataset: &String, config_file: &String) -> serde_json::Value { - - let overall_start_time = Instant::now(); - - let database_name = "pg_extension"; - let mut last_id = 0; - let mut eva_results = serde_json::Value::Null; // Initializing the eva_results - - for i in 1..explore_models { - - // Step 1: Initialize State in Python - let mut task_map = HashMap::new(); - task_map.insert("config_file", config_file.clone()); - task_map.insert("dataset", dataset.clone()); - task_map.insert("eva_results", eva_results.to_string()); - let task_json = json!(task_map).to_string(); - - // here it caches the state - let sample_result = run_python_function( - &PY_MODULE, - &task_json, - "in_db_filtering_state_init"); - - // Step 2: 
query data via SPI - let start_time = Instant::now(); - let results: Result<Vec<Vec<String>>, String> = Spi::connect(|client| { - let query = format!("SELECT * FROM {}_train WHERE id > {} ORDER BY id ASC LIMIT 32", dataset, last_id); - let mut cursor = client.open_cursor(&query, None); - let table = match cursor.fetch(32) { - Ok(table) => table, - Err(e) => return Err(e.to_string()), // Convert the error to a string and return - }; - - let mut mini_batch = Vec::new(); - - for row in table.into_iter() { - let mut each_row = Vec::new(); - // add primary key - let col0 = match row.get::<i32>(1) { - Ok(Some(val)) => { - // Update last_id with the retrieved value - if val > 100000 { - last_id = 0; - } else { - last_id = val - } - val.to_string() - } - Ok(None) => "".to_string(), // Handle the case when there's no valid value - Err(e) => e.to_string(), - }; - each_row.push(col0); - // add label - let col1 = match row.get::<i32>(2) { - Ok(val) => val.map(|i| i.to_string()).unwrap_or_default(), - Err(e) => e.to_string(), - }; - each_row.push(col1); - // add fields - let texts: Vec<String> = (3..row.columns()+1) - .filter_map(|i| { - match row.get::<&str>(i) { - Ok(Some(s)) => Some(s.to_string()), - Ok(None) => None, - Err(e) => Some(e.to_string()), // Convert error to string - } - }).collect(); - each_row.extend(texts); - mini_batch.push(each_row) - } - // return - Ok(mini_batch) - }); - // serialize the mini-batch data - let tup_table = match results { - Ok(data) => { - serde_json::json!({ - "status": "success", - "data": data - }) - } - Err(e) => { - serde_json::json!({ - "status": "error", - "message": format!("Error while connecting: {}", e) - }) - } - }; - - let end_time = Instant::now(); - let elapsed_time = end_time.duration_since(start_time); - let elapsed_seconds = elapsed_time.as_secs_f64(); - - // Step 3: model evaluate in Python - let mut eva_task_map = HashMap::new(); - eva_task_map.insert("config_file", config_file.clone()); - eva_task_map.insert("sample_result", sample_result.to_string()); - let mini_batch_json = tup_table.to_string(); - eva_task_map.insert("mini_batch", mini_batch_json); - eva_task_map.insert("spi_seconds", elapsed_seconds.to_string()); - eva_task_map.insert("model_index", i.to_string()); - - let eva_task_json = json!(eva_task_map).to_string(); - - eva_results = run_python_function( - &PY_MODULE, - &eva_task_json, - "in_db_filtering_evaluate"); - } - - let mut record_task_map = HashMap::new(); - record_task_map.insert("config_file", config_file.clone()); - record_task_map.insert("dataset", dataset.clone()); - let record_task_json = json!(record_task_map).to_string(); - run_python_function( - &PY_MODULE, - &record_task_json, - "records_results"); - - let overall_end_time = Instant::now(); - let overall_elapsed_time = overall_end_time.duration_since(overall_start_time); - let overall_elapsed_seconds = overall_elapsed_time.as_secs_f64(); - - // Step 4: Return to PostgreSQL - return serde_json::json!(overall_elapsed_seconds.to_string()); -} - diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/requirement.txt b/examples/model_selection/TRAILS-Database-Native-Model-Selection/requirement.txt deleted file mode 100644 index ab233d87b5..0000000000 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/requirement.txt +++ /dev/null @@ -1,32 +0,0 @@ -ConfigSpace==0.7.1 -contourpy==1.1.0 -cycler==0.11.0 -fonttools==4.41.0 -importlib-resources==6.0.0 -joblib==1.3.1 -kiwisolver==1.4.4 -matplotlib==3.7.2 -more-itertools==9.1.0 -numpy==1.24.4 
-orjson==3.9.2 -packaging==23.1 -palettable==3.3.3 -pandas==2.0.3 -Pillow==10.0.0 -pyparsing==3.0.9 -python-dateutil==2.8.2 -pytz==2023.3 -scikit-learn==1.3.0 -scipy==1.10.1 -seaborn==0.12.2 -six==1.16.0 -sklearn==0.0 -threadpoolctl==3.1.0 -torch==1.8.1 -torchaudio==0.8.1 -torchvision==0.9.1 -tqdm==4.47.0 -typing_extensions==4.7.1 -tzdata==2023.3 -zipp==3.16.2 -requests==2.31.0 diff --git a/examples/model_selection/Trails/.gitignore b/examples/model_selection/Trails/.gitignore new file mode 100644 index 0000000000..7eace33453 --- /dev/null +++ b/examples/model_selection/Trails/.gitignore @@ -0,0 +1,20 @@ + + +.idea/* +.DS_Store +*/__pycache__/* +**/__pycache__/ + +*.pdf +logs_* +*.log + +z_dev/* +internal/pg_extension/target +result_base/* +log_*/ +exp_result/*.json +exp_result/*.pdf + + +/internal/ml/third_party/* diff --git a/examples/model_selection/Trails/.gitmodules b/examples/model_selection/Trails/.gitmodules new file mode 100644 index 0000000000..0f6e011590 --- /dev/null +++ b/examples/model_selection/Trails/.gitmodules @@ -0,0 +1,3 @@ +[submodule "internal/ml/model_slicing/algorithm"] + path = internal/ml/model_slicing/algorithm + url = https://github.com/Zrealshadow/SAMS.git diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/README.md b/examples/model_selection/Trails/README.md similarity index 55% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/README.md rename to examples/model_selection/Trails/README.md index b756ca0cac..92c66dad29 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/README.md +++ b/examples/model_selection/Trails/README.md @@ -8,7 +8,7 @@ with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -17,65 +17,73 @@ under the License. --> -# Database-Native Model Selection - - -- based on SINGA - - - -![image-20231020174425377](documents/image-20231020174425377.png) +# TRAILS: A Database Native Model Selection System # Build & Run examples ## Singa + PostgreSQL ```bash +# Remove the existing container if there is one +docker rm -f singa_trails # Create project folder. mkdir project && cd project # Download the Dockerfile. -wget https://raw.githubusercontent.com/apache/singa/dev-postgresql/examples/model_selection/TRAILS-Database-Native-Model-Selection/Dockerfile - +wget -O Dockerfile ?? # Build the image and run the container. -docker build -t trails . -docker run -d --name trails --network="host" trails -# Monitor the logs until the setup step is done. -docker logs -f trails +docker build -t singa_trails . +docker run -d --name singa_trails singa_trails +# Wait for about 5 minutes, monitoring the logs until they show "Done!", then exit the monitor +docker logs -f singa_trails -docker exec -it trails bash # Connect to the pg server and use pg_extension database. +docker exec -it singa_trails bash psql -h localhost -p 28814 -U postgres \c pg_extension +# Test the coordinator +SELECT coordinator('0.08244', '168.830156', '800', false, '/project/Trails/internal/ml/model_selection/config.ini'); # Run an example; wait about one minute while it runs filtering + refinement + training of the selected model. 
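# (For reference: coordinator's five arguments above are time_score, time_train,
#  time_budget, only_phase1 and config_file, matching its declaration in
#  internal/pg_extension/sql/pg_extension.sql.)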
CALL model_selection_end2end('frappe_train', ARRAY['col1', 'col2', 'col3', 'col4','col5','col6','col7','col8','col9','col10', 'label'], '10', '/project/Trails/internal/ml/model_selection/config.ini'); +# In another terminal, monitor the running process +docker exec -it singa_trails bash +tail -f /home/postgres/.pgrx/data-14/trails_log_folder/ ``` + ## Singa + PolarDB ```bash +# Remove the existing container if there is one +docker rm -f trails_singa_polardb # Create project folder. -mkdir project && cd project +mkdir project_polardb_singa && cd project_polardb_singa # Download the Dockerfile. -wget https://raw.githubusercontent.com/apache/singa/dev-postgresql/examples/model_selection/TRAILS-Database-Native-Model-Selection/singa.polarDB.Dockerfile - +wget -O Dockerfile ??? # Build the image and run the container. -docker build -t trails_polardb . -docker run -d --name trails_polardb --network="host" trails_polardb +docker build -t trails_singa_polardb . +docker run -d --name trails_singa_polardb trails_singa_polardb # Monitor the logs until the setup step is done. -docker logs -f trails_polardb +docker logs -f trails_singa_polardb +# Run a setup script +docker exec trails_singa_polardb /bin/bash -c "/home/postgres/Trails/init_polardb.sh" -docker exec -it trails_polardb bash -# Connect to the pg server and use pg_extension database. +# Connect to the primary pg server and use pg_extension database. +docker exec -it trails_singa_polardb bash psql -h localhost -p 5432 -U postgres \c pg_extension +# Test the coordinator +SELECT coordinator('0.08244', '168.830156', '800', false, '/home/postgres/Trails/internal/ml/model_selection/config.ini'); # Run an example; wait about one minute while it runs filtering + refinement + training of the selected model. -CALL model_selection_end2end('frappe_train', ARRAY['col1', 'col2', 'col3', 'col4','col5','col6','col7','col8','col9','col10', 'label'], '10', '/project/Trails/internal/ml/model_selection/config.ini'); +CALL model_selection_end2end('frappe_train', ARRAY['col1', 'col2', 'col3', 'col4','col5','col6','col7','col8','col9','col10', 'label'], '10', '/home/postgres/Trails/internal/ml/model_selection/config.ini'); +# In another terminal, monitor the running process +docker exec -it trails_singa_polardb bash +tail -f /var/polardb/primary_datadir/trails_log_folder/ ``` - diff --git a/examples/model_selection/Trails/dataset/frappe/decoded_train_feat_id.pt b/examples/model_selection/Trails/dataset/frappe/decoded_train_feat_id.pt new file mode 100644 index 0000000000..cf0c55922c Binary files /dev/null and b/examples/model_selection/Trails/dataset/frappe/decoded_train_feat_id.pt differ diff --git a/examples/model_selection/Trails/dataset/frappe/decoded_train_feat_value.pt b/examples/model_selection/Trails/dataset/frappe/decoded_train_feat_value.pt new file mode 100644 index 0000000000..d07449ebf6 Binary files /dev/null and b/examples/model_selection/Trails/dataset/frappe/decoded_train_feat_value.pt differ diff --git a/examples/model_selection/Trails/dataset/frappe/decoded_train_y.pt b/examples/model_selection/Trails/dataset/frappe/decoded_train_y.pt new file mode 100644 index 0000000000..869597e9a8 Binary files /dev/null and b/examples/model_selection/Trails/dataset/frappe/decoded_train_y.pt differ diff --git a/examples/model_selection/Trails/dataset/frappe/decoded_valid_feat_id.pt b/examples/model_selection/Trails/dataset/frappe/decoded_valid_feat_id.pt new file mode 100644 index 0000000000..86c14c9347 Binary files /dev/null and 
b/examples/model_selection/Trails/dataset/frappe/decoded_valid_feat_id.pt differ diff --git a/examples/model_selection/Trails/dataset/frappe/decoded_valid_feat_value.pt b/examples/model_selection/Trails/dataset/frappe/decoded_valid_feat_value.pt new file mode 100644 index 0000000000..8858b5a6f7 Binary files /dev/null and b/examples/model_selection/Trails/dataset/frappe/decoded_valid_feat_value.pt differ diff --git a/examples/model_selection/Trails/dataset/frappe/decoded_valid_y.pt b/examples/model_selection/Trails/dataset/frappe/decoded_valid_y.pt new file mode 100644 index 0000000000..80ece76d0d Binary files /dev/null and b/examples/model_selection/Trails/dataset/frappe/decoded_valid_y.pt differ diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/dataset/frappe/test.libsvm b/examples/model_selection/Trails/dataset/frappe/test.libsvm similarity index 100% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/dataset/frappe/test.libsvm rename to examples/model_selection/Trails/dataset/frappe/test.libsvm diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/dataset/frappe/train.libsvm b/examples/model_selection/Trails/dataset/frappe/train.libsvm similarity index 100% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/dataset/frappe/train.libsvm rename to examples/model_selection/Trails/dataset/frappe/train.libsvm diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/dataset/frappe/valid.libsvm b/examples/model_selection/Trails/dataset/frappe/valid.libsvm similarity index 100% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/dataset/frappe/valid.libsvm rename to examples/model_selection/Trails/dataset/frappe/valid.libsvm diff --git a/examples/model_selection/Trails/documents/dev_guide.md b/examples/model_selection/Trails/documents/dev_guide.md new file mode 100644 index 0000000000..d5ba0ee540 --- /dev/null +++ b/examples/model_selection/Trails/documents/dev_guide.md @@ -0,0 +1,12 @@ +# Test Singa for model selection + +Run these three commands to ensure Singa can run. + +```bash +python3 ./internal/ml/model_selection/exps/4.seq_score_online.py --embedding_cache_filtering=True --models_explore=10 --tfmem=synflow --log_name=score_based --search_space=mlp_sp --num_layers=4 --hidden_choice_len=20 --base_dir=./dataset --num_labels=2 --device=cpu --batch_size=32 --dataset=frappe --nfeat=5500 --nfield=10 --nemb=10 --workers=0 --result_dir=./exp_result/ --log_folder=log_folder + +python3 ./internal/ml/model_selection/exps/0.train_one_model.py --log_name=train_log --search_space=mlp_sp --base_dir=./dataset --num_labels=2 --device=cpu --batch_size=10 --lr=0.01 --epoch=5 --iter_per_epoch=2000 --dataset=frappe --nfeat=5500 --nfield=10 --nemb=10 --workers=0 --result_dir=./exp_result/ --log_folder=log_folder + +python3 internal/ml/model_selection/pg_interface.py +``` + diff --git a/examples/model_selection/Trails/init.sh b/examples/model_selection/Trails/init.sh new file mode 100644 index 0000000000..1bc3e4b39b --- /dev/null +++ b/examples/model_selection/Trails/init.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +# These commands will be triggered after docker run. + +# Compile the code, and run PostgreSQL +cd /project/Trails/internal/pg_extension || exit +/bin/bash -c "source $HOME/.cargo/env && echo '\q' | cargo pgrx run --release" + +# Wait for PostgreSQL to become available +until psql -h localhost -p 28814 -U postgres -d pg_extension -c '\q'; do + >&2 echo "Postgres is unavailable - sleeping" + sleep 1 +done + +# Run setup commands +psql -h localhost -p 28814 -U postgres -d pg_extension -c "CREATE EXTENSION pg_extension;" +psql -h localhost -p 28814 -U postgres -d pg_extension -f /project/Trails/internal/pg_extension/sql/model_selection_cpu.sql +# Load example dataset into database +bash /project/Trails/internal/ml/model_selection/scripts/database/load_data_to_db.sh /project/Trails/dataset/frappe frappe 28814 + +echo "Done!" + +# Continue with the rest of your container's CMD +tail -f /dev/null diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/nas-bench-tabular/score_all_modesl_uci.sh b/examples/model_selection/Trails/init_polardb.sh similarity index 54% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/nas-bench-tabular/score_all_modesl_uci.sh rename to examples/model_selection/Trails/init_polardb.sh index 99dfe0e4d5..70a23b320a 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/nas-bench-tabular/score_all_modesl_uci.sh +++ b/examples/model_selection/Trails/init_polardb.sh @@ -1,3 +1,4 @@ +#!/bin/bash # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file @@ -16,29 +17,19 @@ # limitations under the License. 
# -export PYTHONPATH=$PYTHONPATH:./internal/ml/model_selection -conda activate trails -nohup python ./internal/ml/model_selection/exps/nas_bench_tabular/4.seq_score_online.py \ - --embedding_cache_filtering=True \ - --models_explore=159999 \ - --tfmem=express_flow \ - --log_name=score_based \ - --search_space=mlp_sp \ - --num_layers=4 \ - --hidden_choice_len=20 \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --num_labels=2 \ - --device=cpu \ - --batch_size=32 \ - --dataset=uci_diabetes \ - --nfeat=369 \ - --nfield=43 \ - --nemb=10 \ - --workers=0 \ - --result_dir=./internal/ml/model_selection/exp_result/ \ - --log_folder=log_score_time_uci > outputUciScoreALl.log& - +# Wait for PostgreSQL to become available +until psql -h localhost -p 5432 -U postgres -c '\q'; do + >&2 echo "Postgres is unavailable - sleeping" + sleep 1 +done +# Run setup commands +psql -h localhost -p 5432 -U postgres -c "CREATE DATABASE pg_extension;" +psql -h localhost -p 5432 -U postgres -d pg_extension -c "CREATE EXTENSION pg_extension;" +psql -h localhost -p 5432 -U postgres -d pg_extension -f /home/postgres/Trails/internal/pg_extension/sql/model_selection_cpu.sql +# Load example dataset into database +bash /home/postgres/Trails/internal/ml/model_selection/scripts/database/load_data_to_db.sh /home/postgres/Trails/dataset/frappe frappe 5432 +echo "Done!" diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/cache-service/cache_service.py b/examples/model_selection/Trails/internal/cache-service/cache_service.py similarity index 99% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/cache-service/cache_service.py rename to examples/model_selection/Trails/internal/cache-service/cache_service.py index 87479a704b..035e519e5c 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/cache-service/cache_service.py +++ b/examples/model_selection/Trails/internal/cache-service/cache_service.py @@ -17,6 +17,7 @@ # limitations under the License. # + import time import threading import queue diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/cache-service/trigger_cache_svc.py b/examples/model_selection/Trails/internal/cache-service/trigger_cache_svc.py similarity index 99% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/cache-service/trigger_cache_svc.py rename to examples/model_selection/Trails/internal/cache-service/trigger_cache_svc.py index 2631abeabb..87a3843ad9 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/cache-service/trigger_cache_svc.py +++ b/examples/model_selection/Trails/internal/cache-service/trigger_cache_svc.py @@ -16,6 +16,7 @@ # limitations under the License. 
# + import requests url = 'http://localhost:8093/' diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/config.ini b/examples/model_selection/Trails/internal/ml/model_selection/config.ini similarity index 50% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/config.ini rename to examples/model_selection/Trails/internal/ml/model_selection/config.ini index e7235b1d46..34f0ff2e4e 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/config.ini +++ b/examples/model_selection/Trails/internal/ml/model_selection/config.ini @@ -1,29 +1,9 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - [DEFAULT] log_name = in_db_ms budget = 100 device = cpu -log_folder = ./internal/debug_singa_logger -;log_folder = /project/TRAILS/log_score_time_frappe -result_dir = ./internal/ml/model_selection/exp_result_singa/ -;result_dir = /project/TRAILS/internal/ml/model_selection/exp_result_sever_cache_sql_indb/ +log_folder = ./trails_log_folder +result_dir = ./trails_log_folder num_points = 12 max_load = -1 @@ -51,7 +31,7 @@ hidden_choice_len = 20 [MLP_TRAINER] epoch = 20 -batch_size = 32 +batch_size = 8 lr = 0.002 patience = 1 iter_per_epoch = 200 @@ -62,8 +42,7 @@ report_freq = 30 workers = 0 [DATASET] -;base_dir = ../exp_data/ -base_dir = /hdd1/xingnaili/exp_data/ +base_dir = ./dataset dataset = frappe num_labels = 2 @@ -98,7 +77,6 @@ db_port = 28814 [SYS_PERFORMANCE] models_explore = -1 -# tfmem = express_flow tfmem = synflow embedding_cache_filtering = True concurrency = 1 diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/eva_service.py b/examples/model_selection/Trails/internal/ml/model_selection/eva_service.py similarity index 97% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/eva_service.py rename to examples/model_selection/Trails/internal/ml/model_selection/eva_service.py index f3f5b4575d..6d089a7614 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/eva_service.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/eva_service.py @@ -50,7 +50,7 @@ def refinement_phase(u: int, k_models: List, dataset_name: str, config_file: str try: rms = RunModelSelection(args.search_space, args, is_simulate=args.is_simulate) - best_arch, best_arch_performance, _ = rms.refinement_phase( + best_arch, best_arch_performance, _, _ = rms.refinement_phase( U=u, k_models=k_models, train_loader=train_dataloader, @@ -86,7 +86,6 @@ async def start_refinement_phase(request): if __name__ == "__main__": - result = refinement_phase( u=1, k_models=["8-8-8-8", 
"16-16-16-16"], diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/exps/nas_bench_tabular/0.train_one_model.py b/examples/model_selection/Trails/internal/ml/model_selection/exps/0.train_one_model.py similarity index 94% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/exps/nas_bench_tabular/0.train_one_model.py rename to examples/model_selection/Trails/internal/ml/model_selection/exps/0.train_one_model.py index 9a19f007db..67ffcb5e73 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/exps/nas_bench_tabular/0.train_one_model.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/exps/0.train_one_model.py @@ -53,23 +53,20 @@ # 1. data loader train_loader, val_loader, test_loader = libsvm_dataloader( args=args, - data_dir=os.path.join(args.base_dir, "data", "structure_data", args.dataset), + data_dir=os.path.join(args.base_dir, args.dataset), nfield=args.nfield, batch_size=args.batch_size) - # arch_id = "256-256-256-256" arch_id = "128-128-128-128" print(f"begin to train the {arch_id}") model = search_space_ins.new_architecture(arch_id) - # model.init_embedding(requires_grad=True) if args.device == 'cpu': dev = singa_device.get_default_device() else: # GPU dev = singa_device.create_cuda_gpu_on(args.local_rank) # need to change to CPU device for CPU-only machines dev.SetRandSeed(0) np.random.seed(0) - # model.to(args.device) valid_auc, total_run_time, train_log = ModelTrainer.fully_train_arch( model=model, diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/exps/nas_bench_tabular/4.seq_score_online.py b/examples/model_selection/Trails/internal/ml/model_selection/exps/4.seq_score_online.py similarity index 82% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/exps/nas_bench_tabular/4.seq_score_online.py rename to examples/model_selection/Trails/internal/ml/model_selection/exps/4.seq_score_online.py index 5d6c16ec00..063d843511 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/exps/nas_bench_tabular/4.seq_score_online.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/exps/4.seq_score_online.py @@ -19,36 +19,23 @@ import calendar import json import os -import random import time from exps.shared_args import parse_arguments -from datetime import datetime -import gc - -# import tracemalloc -# tracemalloc.start() -# -# -# def print_memory_usg(): -# snapshot = tracemalloc.take_snapshot() -# top_stats = snapshot.statistics('lineno') -# for stat in top_stats[:10]: # top 10 memory-consuming lines -# print(stat) def generate_data_loader(): - if args.dataset in [Config.c10, Config.c100, Config.imgNet]: + if args.dataset in [Config.c10, Config.c100, Config.imgNet, Config.imgNetFull]: train_loader, val_loader, class_num = dataset.get_dataloader( train_batch_size=args.batch_size, test_batch_size=args.batch_size, dataset=args.dataset, num_workers=1, - datadir=os.path.join(args.base_dir, "data")) + datadir=os.path.join(args.base_dir)) test_loader = val_loader else: train_loader, val_loader, test_loader = libsvm_dataloader( args=args, - data_dir=os.path.join(args.base_dir, "data", "structure_data", args.dataset), + data_dir=os.path.join(args.base_dir, args.dataset), nfield=args.nfield, batch_size=args.batch_size) class_num = args.num_labels @@ -58,6 +45,7 @@ 
def generate_data_loader(): if __name__ == "__main__": args = parse_arguments() + from src.common.constant import Config # set the log name gmt = time.gmtime() @@ -66,7 +54,6 @@ def generate_data_loader(): os.environ.setdefault("log_file_name", args.log_name + "_" + str(ts) + ".log") os.environ.setdefault("base_dir", args.base_dir) - from src.common.constant import Config from src.common.structure import ModelAcquireData from src.controller.sampler_all.seq_sampler import SequenceSampler from src.eva_engine.phase1.evaluator import P1Evaluator @@ -117,6 +104,16 @@ def generate_data_loader(): explored_n += 1 result[arch_id] = model_score # print(f" {datetime.now()} finish arch = {arch_id}, model_score = {model_score}") + + if explored_n < 10: + print("3. [trails] Phase 1: filter phase explored " + str(explored_n) + + ". Total explored " + str(len(result)) + + " models, model_id = " + str(arch_id) + + " model_scores = " + json.dumps(model_score)) + logger.info("3. [trails] Phase 1: filter phase explored " + str(explored_n) + + ". Total explored " + str(len(result)) + + " models, model_id = " + str(arch_id) + + " model_scores = " + json.dumps(model_score)) if explored_n % 1000 == 0: # print_memory_usg() # _evaluator.force_gc() @@ -124,6 +121,10 @@ def generate_data_loader(): + ". Total explored " + str(len(result)) + " models, model_id = " + str(arch_id) + " model_scores = " + json.dumps(model_score)) + logger.info("3. [trails] Phase 1: filter phase explored " + str(explored_n) + + ". Total explored " + str(len(result)) + + " models, model_id = " + str(arch_id) + + " model_scores = " + json.dumps(model_score)) if explored_n % 1000 == 0: # print_memory_usg() # _evaluator.force_gc() diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/exps/README.md b/examples/model_selection/Trails/internal/ml/model_selection/exps/README.md similarity index 93% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/exps/README.md rename to examples/model_selection/Trails/internal/ml/model_selection/exps/README.md index 2adc07d972..84e9597ddb 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/exps/README.md +++ b/examples/model_selection/Trails/internal/ml/model_selection/exps/README.md @@ -17,6 +17,7 @@ under the License. --> + # Folder description ## baseline @@ -33,8 +34,4 @@ We benchmark the system at the macro level and analyze each component at the micro level ## nas_bench_tabular -We build a nas-bench-tabular dataset here - -## system - -We run the experiment here \ No newline at end of file +We build a nas-bench-tabular dataset here \ No newline at end of file diff --git a/examples/model_selection/Trails/internal/ml/model_selection/exps/shared_args.py b/examples/model_selection/Trails/internal/ml/model_selection/exps/shared_args.py new file mode 100644 index 0000000000..c8a4200320 --- /dev/null +++ b/examples/model_selection/Trails/internal/ml/model_selection/exps/shared_args.py @@ -0,0 +1,223 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import argparse +import os +import random +import numpy as np +import torch + + +def seed_everything(seed=2201): + # 2022 -> 2021 -> 2031 + ''' [reference] https://gist.github.com/KirillVladimirov/005ec7f762293d2321385580d3dbe335 ''' + random.seed(seed) + os.environ['PYTHONHASHSEED'] = str(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.backends.cudnn.deterministic = True + + +def str2bool(v): + if isinstance(v, bool): + return v + if v.lower() in ('yes', 'true', 't', 'y', '1'): + return True + elif v.lower() in ('no', 'false', 'f', 'n', '0'): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') + + +def sampler_args(parser): + # define search space, + parser.add_argument('--search_space', type=str, default="mlp_sp", + help='[nasbench101, nasbench201, mlp_sp]') + # EA sampler's parameters, + parser.add_argument('--population_size', type=int, default=10, help="population size of the EA sampler") + parser.add_argument('--sample_size', type=int, default=3, help="sample size of the EA sampler") + parser.add_argument('--simple_score_sum', default='True', type=str2bool, + help="Sum multiple TFMEM scores or use the global rank") + + +def space201_101_share_args(parser): + parser.add_argument('--api_loc', type=str, default="NAS-Bench-201-v1_1-096897.pth", + help='which search space file to use, [' + 'nasbench101: nasbench_only108.pkl' + 'nasbench201: NAS-Bench-201-v1_1-096897.pth' + ' ... 
]') + + parser.add_argument('--init_channels', default=16, type=int, help='output channels of stem convolution') + parser.add_argument('--bn', type=int, default=1, help="use batch norm in the network: 1 = true, 0 = false") + + +def nb101_args(parser): + parser.add_argument('--num_stacks', default=3, type=int, help='#stacks of modules') + parser.add_argument('--num_modules_per_stack', default=3, type=int, help='# modules per stack') + + +def nb201_args(parser): + parser.add_argument('--init_w_type', type=str, default='none', + help='weight initialization (before pruning) type [none, xavier, kaiming, zero]') + parser.add_argument('--init_b_type', type=str, default='none', + help='bias initialization (before pruning) type [none, xavier, kaiming, zero]') + parser.add_argument('--arch_size', type=int, default=1, + help='minimum number of nodes the architecture must have') + + +def mlp_args(parser): + parser.add_argument('--num_layers', default=4, type=int, help='# hidden layers') + parser.add_argument('--hidden_choice_len', default=20, type=int, help= + 'number of hidden layer choices, 10 for criteo, 20 for others') + + +def mlp_trainner_args(parser): + parser.add_argument('--epoch', type=int, default=20, + help='number of maximum epochs, ' + 'frappe: 20, uci_diabetes: 40, criteo: 10' + 'nb101: 108, nb201: 200') + + parser.add_argument('--batch_size', type=int, default=32, help='batch size') + parser.add_argument('--lr', type=float, default=0.001, help="learning rate") + parser.add_argument('--patience', type=int, default=1, help='number of epochs to wait before early stopping') + # parser.add_argument('--eval_freq', type=int, default=10000, help='max number of batches to train per epoch') + + parser.add_argument('--iter_per_epoch', type=int, default=200, + help="None, " + "200 for frappe, uci_diabetes, " + "2000 for criteo") + + # MLP model config + parser.add_argument('--nfeat', type=int, default=5500, + help='the number of features, ' + 'frappe: 5500, ' + 'uci_diabetes: 369,' + 'criteo: 2100000') + parser.add_argument('--nfield', type=int, default=10, + help='the number of fields, ' + 'frappe: 10, ' + 'uci_diabetes: 43,' + 'criteo: 39') + parser.add_argument('--nemb', type=int, default=10, + help='embedding size 10') + + # MLP train config + parser.add_argument('--report_freq', type=int, default=30, help='report frequency') + parser.add_argument('--workers', default=1, type=int, help='data loading workers') + + +def data_set_config(parser): + parser.add_argument('--base_dir', type=str, default="./dataset/", + help='path of data and result parent folder') + # define search space, + parser.add_argument('--dataset', type=str, default='frappe', + help='cifar10, cifar100, ImageNet16-120 ' + 'frappe, criteo, uci_diabetes') + + parser.add_argument('--num_labels', type=int, default=2, + help='[10, 100, 120],' + '[2, 2, 2]') + + +def seq_train_all_params(parser): + parser.add_argument('--worker_id', type=int, default=0, help='start from 0') + parser.add_argument('--total_workers', type=int, default=120, + help='total number of workers, each trains some models') + parser.add_argument('--total_models_per_worker', type=int, default=-1, help='How many models to evaluate') + parser.add_argument('--pre_partitioned_file', + default="./internal/ml/model_selection/exps/sampled_data/sampled_models_all.json", + type=str, help='all models with id') + + +def dis_train_all_models(parser): + parser.add_argument('--worker_each_gpu', default=6, type=int, help='num workers per gpu') + parser.add_argument('--gpu_num', default=8, 
type=int, help='num GPUs') + + +# tune interval and schedule the N/K rate such that it can produce a good result +def tune_interval_NK_rate(parser): + parser.add_argument('--kn_rate', default=-1, type=int, help="default N/K = 100") + + +def db4nas(parser): + parser.add_argument('--db_name', default="pg_extension", type=str) + parser.add_argument('--db_user', default="postgres", type=str) + parser.add_argument('--db_host', default="localhost", type=str) + parser.add_argument('--db_port', default=28814, type=int) + + +def anytime_exp_set(parser): + parser.add_argument('--only_phase1', default='False', type=str2bool) + parser.add_argument('--is_simulate', default='True', type=str2bool, + help='Use pre-computed result or run online') + + +def system_performance_exp(parser): + parser.add_argument('--models_explore', default=10, type=int, help='# models to explore in the filtering phase') + parser.add_argument('--tfmem', default="jacflow", type=str, help='the metric to use, all_matrix') + parser.add_argument('--embedding_cache_filtering', default='True', type=str2bool, + help='Cache embedding for MLP in filtering phase?') + parser.add_argument('--concurrency', default=1, type=int, help='number of workers in the filtering phase') + + +def parse_arguments(): + parser = argparse.ArgumentParser(description='system') + + # job config + parser.add_argument('--log_name', type=str, default="main_T_100s") + parser.add_argument('--budget', type=int, default=100, help="in seconds") + + # define base dir, where it stores apis, datasets, logs, etc. + parser.add_argument('--device', type=str, default="cpu") + parser.add_argument('--local_rank', type=int, default=1, help="local rank") + + parser.add_argument('--log_folder', default="log_debug", type=str) + + parser.add_argument('--result_dir', default="./internal/ml/model_selection/exp_result/", type=str, + help='path to store exp outputs') + parser.add_argument('--num_points', default=12, type=int, help='number of points') + + sampler_args(parser) + + nb101_args(parser) + nb201_args(parser) + space201_101_share_args(parser) + + mlp_args(parser) + data_set_config(parser) + mlp_trainner_args(parser) + seq_train_all_params(parser) + dis_train_all_models(parser) + + tune_interval_NK_rate(parser) + + db4nas(parser) + anytime_exp_set(parser) + + system_performance_exp(parser) + + # tmp + parser.add_argument('--max_load', type=int, default=-1, help="max loading time") + + # refinement server + parser.add_argument('--url', type=str, default=-1, help="URL of the refinement server") + + seed_everything() + + return parser.parse_args() diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/main.py b/examples/model_selection/Trails/internal/ml/model_selection/main.py similarity index 92% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/main.py rename to examples/model_selection/Trails/internal/ml/model_selection/main.py index 38357f3049..33cb88f3ca 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/main.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/main.py @@ -16,7 +16,6 @@ # limitations under the License. # -# this is the main function of model selection. 
diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/main.py b/examples/model_selection/Trails/internal/ml/model_selection/main.py similarity index 92% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/main.py rename to examples/model_selection/Trails/internal/ml/model_selection/main.py index 38357f3049..33cb88f3ca 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/main.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/main.py @@ -16,7 +16,6 @@ # limitations under the License. # -# this is the main function of model selection. import calendar import os @@ -33,12 +32,12 @@ def generate_data_loader(): test_batch_size=args.batch_size, dataset=args.dataset, num_workers=1, - datadir=os.path.join(args.base_dir, "data")) + datadir=os.path.join(args.base_dir)) test_loader = val_loader else: train_loader, val_loader, test_loader = libsvm_dataloader( args=args, - data_dir=os.path.join(args.base_dir, "data", "structure_data", args.dataset), + data_dir=os.path.join(args.base_dir, args.dataset), nfield=args.nfield, batch_size=args.batch_size) class_num = args.num_labels
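The `pg_interface.py` changes below replace JSON-encoded mini-batches with reads from a POSIX shared-memory segment (see `get_data_from_shared_memory_int` further down, which attaches to a segment named `my_shared_memory` and reinterprets its buffer as float32 rows). A minimal sketch of the matching writer side, useful for exercising that reader in isolation; the segment name and dtype follow the reader, everything else is illustrative:

```python
import numpy as np
from multiprocessing import shared_memory

def write_batch_to_shared_memory(batch, name="my_shared_memory"):
    # The reader calls np.frombuffer(..., dtype=np.float32).reshape(n_rows, -1),
    # so the writer must store one contiguous float32 array.
    data = np.ascontiguousarray(batch, dtype=np.float32)
    shm = shared_memory.SharedMemory(name=name, create=True, size=data.nbytes)
    shm.buf[:data.nbytes] = data.tobytes()
    return shm  # the caller must close() and unlink() the segment when done

# Example: write 32 rows that get_data_from_shared_memory_int(32) can read back.
if __name__ == "__main__":
    segment = write_batch_to_shared_memory(np.random.rand(32, 11))
    segment.close()
```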
diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/pg_interface.py b/examples/model_selection/Trails/internal/ml/model_selection/pg_interface.py similarity index 82% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/pg_interface.py rename to examples/model_selection/Trails/internal/ml/model_selection/pg_interface.py index 98317c08a4..3636c51db3 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/pg_interface.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/pg_interface.py @@ -22,6 +22,8 @@ import requests import json from typing import List, Dict +import torch +from torch.utils.data import Dataset, DataLoader import traceback import orjson from argparse import Namespace @@ -30,6 +32,7 @@ def exception_catcher(func): def wrapper(encoded_str: str): + global_res = "NA, " try: # each function accepts a JSON string params = json.loads(encoded_str) @@ -45,15 +48,15 @@ def wrapper(encoded_str: str): os.environ.setdefault("log_file_name", args.log_name + "_" + str(ts) + ".log") # Call the original function with the parsed parameters - return func(params, args) + global_res = func(params, args) + return global_res except Exception as e: return orjson.dumps( - {"Errored": traceback.format_exc()}).decode('utf-8') + {"res": global_res, "Errored": traceback.format_exc()}).decode('utf-8') return wrapper -from torch.utils.data import Dataset -import torch + class LibsvmDataset(Dataset): """ Dataset loader for Libsvm data format """ @@ -97,7 +100,6 @@ def __getitem__(self, idx): def generate_dataloader(mini_batch_data, args): from src.logger import logger - from torch.utils.data import DataLoader logger.info(f"Begin preprocessing dataset") begin_time = time.time() dataloader = DataLoader(LibsvmDataset(mini_batch_data), @@ -112,6 +114,9 @@ def model_selection(params: dict, args: Namespace): from src.logger import logger logger.info(f"begin run model_selection on UDF runtime with CPU only") + begin = time.time() + # logger.info(params["mini_batch"]) + mini_batch_data = json.loads(params["mini_batch"]) budget = float(params["budget"]) @@ -121,6 +126,10 @@ def model_selection(params: dict, args: Namespace): data_loader = [dataloader, dataloader, dataloader] + logger.info(f"[end2end model_selection] Done with dataloader generation, time usage = " + str(time.time() - begin)) + + begin = time.time() + rms = RunModelSelection(args.search_space, args, is_simulate=args.is_simulate) best_arch, best_arch_performance, time_usage, _, p1_trace_highest_score, p1_trace_highest_scored_models_id = \ rms.select_model_online_clean( @@ -129,9 +138,17 @@ def model_selection(params: dict, args: Namespace): only_phase1=False, run_workers=1) + logger.info(f"[end2end model_selection] Done with model selection, time usage = " + str(time.time() - begin)) + + # format the performance field of the response + if best_arch_performance == 0: + best_arch_performance_str = "Not Fully Trained Yet" + else: + best_arch_performance_str = str(best_arch_performance) + return orjson.dumps( {"best_arch": best_arch, - "best_arch_performance": best_arch_performance, + "best_arch_performance": best_arch_performance_str, "time_usage": time_usage}).decode('utf-8') @@ -179,8 +196,8 @@ def profiling_refinement_phase(params: dict, args: Namespace): @exception_catcher def coordinator(params: dict, args: Namespace): from src.logger import logger + logger.info(f"begin run coordinator") - # print (f"begin run coordinator") budget = float(params["budget"]) score_time_per_model = float(params["score_time_per_model"]) @@ -201,6 +218,8 @@ def coordinator(params: dict, args: Namespace): train_time_per_epoch=train_time_per_epoch, only_phase1=only_phase1) + logger.info(f"coordinator done with K, U, N = {K, U, N}") + return orjson.dumps( {"k": K, "u": U, "n": N}).decode('utf-8') @@ -229,12 +248,8 @@ def filtering_phase(params: dict, args: Namespace): def filtering_phase_dataLoader(params: dict, args: Namespace): from src.logger import logger logger.info(f"begin run filtering_phase CPU only") - # print (f"begin run filtering_phase CPU only") mini_batch_m = params["mini_batch"] - # print ("mini_batch_m: ", mini_batch_m) - - n = int(params["n"]) k = int(params["k"]) @@ -275,7 +290,7 @@ def model_selection_workloads(params: dict, args: Namespace): dataloader = generate_dataloader(mini_batch_data=mini_batch_data, args=args) rms = RunModelSelection(args.search_space, args, is_simulate=args.is_simulate) k_models, _, _, _ = rms.filtering_phase(N=n, K=k, train_loader=dataloader) - best_arch, best_arch_performance, _ = rms.refinement_phase( + best_arch, best_arch_performance, _, _ = rms.refinement_phase( U=1, k_models=k_models, train_loader=dataloader, @@ -394,7 +409,6 @@ def benchmark_filtering_phase_latency(params: dict, args: Namespace): from src.search_space.init_search_space import init_search_space from src.tools.io_tools import write_json, read_json from src.tools.res_measure import print_cpu_gpu_usage - import torch logger.info(f"begin run filtering_phase CPU only") args.models_explore = int(params["explore_models"]) @@ -457,7 +471,6 @@ def benchmark_filtering_phase_latency(params: dict, args: Namespace): # the first two are used for warming up _evaluator.time_usage["io_latency"] = \ sum(_evaluator.time_usage["track_io_model_load"][2:]) + \ - sum(_evaluator.time_usage["track_io_model_release_each_50"]) + \ sum(_evaluator.time_usage["track_io_model_init"][2:]) + \ sum(_evaluator.time_usage["track_io_res_load"][2:]) + \ sum(_evaluator.time_usage["track_io_data_retrievel"][2:]) + \ @@ -538,20 +551,28 @@ def in_db_filtering_evaluate(params: dict, args: Namespace): logger.info("search_space_ins, _evaluator, sampler is None") return orjson.dumps({"error": "errored, please call init first"}).decode('utf-8') + begin_read = time.time() + mini_batch = get_data_from_shared_memory_int(int(params["rows"])) + read_done = time.time() + # logger.info(mini_batch) + # logger.info(mini_batch.size()) + # logger.info(list(mini_batch[0])) + + logger.info(f"Data retrieval time {params['spi_seconds']}, " + f"read shared memory time = {read_done - begin_read}") + sampled_result = json.loads(params["sample_result"]) arch_id, model_encoding = str(sampled_result["arch_id"]), str(sampled_result["model_encoding"]) - mini_batch = json.loads(params["mini_batch"]) - if mini_batch["status"] == "error": - return orjson.dumps({"error": mini_batch["message"]}).decode('utf-8') logger.info(f"Begin
evaluate {params['model_index']}, " - f"with size of batch = {len(mini_batch['data'])}, " - f"size of columns = {len(mini_batch['data'][0])}") + f"with size of batch = {len(mini_batch)}, " + f"size of columns = {len(mini_batch[0])}") model_acquire_data = ModelAcquireData(model_id=arch_id, model_encoding=model_encoding, is_last=False, - spi_seconds=float(params["spi_seconds"]), - spi_mini_batch=mini_batch["data"], + spi_seconds=float(params["spi_seconds"]) + read_done - begin_read, + spi_mini_batch=mini_batch, + batch_size=int(params["rows"]) ) model_score = _evaluator._p1_evaluate_online(model_acquire_data) @@ -577,7 +598,6 @@ def records_results(params: dict, args: Namespace): time_output_file = f"{args.result_dir}/time_score_{args.search_space}_{params['dataset']}_batch_size_{args.batch_size}_{args.device}_{args.tfmem}.json" _evaluator.time_usage["io_latency"] = \ sum(_evaluator.time_usage["track_io_model_load"][2:]) + \ - sum(_evaluator.time_usage["track_io_model_release_each_50"]) + \ sum(_evaluator.time_usage["track_io_model_init"][2:]) + \ sum(_evaluator.time_usage["track_io_res_load"][2:]) + \ sum(_evaluator.time_usage["track_io_data_retrievel"][2:]) + \ @@ -604,6 +624,22 @@ def records_results(params: dict, args: Namespace): return orjson.dumps({"Done": 1}).decode('utf-8') +@exception_catcher +def measure_call_overheads(params: dict, args: Namespace): + return orjson.dumps({"Done": 1}).decode('utf-8') + + +import numpy as np +from multiprocessing import shared_memory + + +def get_data_from_shared_memory_int(n_rows): + shm = shared_memory.SharedMemory(name="my_shared_memory") + data = np.frombuffer(shm.buf, dtype=np.float32) + data = data.reshape(n_rows, -1) + return data + + if __name__ == "__main__": params = {} params["budget"] = 10 @@ -620,16 +656,3 @@ def records_results(params: dict, args: Namespace): params["k"] = 1 params["config_file"] = './internal/ml/model_selection/config.ini' print(filtering_phase_dataLoader(json.dumps(params))) - - # params = {} - # params[ - # "mini_batch"] = '[{"col1":"123:123","col2":"123:123","col3":"123:123","label":"1"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"1"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"1"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"1"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"0"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"0"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"0"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"0"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"0"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"0"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"0"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"1"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"1"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"1"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"0"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"0"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"0"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"0"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"1"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"0"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"0"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"0"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"0"}, 
{"col1":"123:123","col2":"123:123","col3":"123:123","label":"0"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"1"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"1"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"0"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"0"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"1"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"1"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"0"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"0"}]' - # params["config_file"] = './internal/ml/model_selection/config.ini' - # print(profiling_refinement_phase(json.dumps(params))) - # - # params = {} - # params["budget"] = 10 - # params[ - # "mini_batch"] = '[{"col1":"123:123","col2":"123:123","col3":"123:123","label":"1"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"1"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"1"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"1"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"0"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"0"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"0"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"0"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"0"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"0"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"0"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"1"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"1"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"1"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"0"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"0"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"0"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"0"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"1"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"0"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"0"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"0"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"0"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"0"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"1"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"1"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"0"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"0"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"1"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"1"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"0"}, {"col1":"123:123","col2":"123:123","col3":"123:123","label":"0"}]' - # params["config_file"] = './internal/ml/model_selection/config.ini' - # print(model_selection(json.dumps(params))) diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/requirement.txt b/examples/model_selection/Trails/internal/ml/model_selection/requirement.txt similarity index 83% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/requirement.txt rename to examples/model_selection/Trails/internal/ml/model_selection/requirement.txt index 591daefa59..855bab3c91 100644 --- 
a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/requirement.txt +++ b/examples/model_selection/Trails/internal/ml/model_selection/requirement.txt @@ -5,12 +5,16 @@ charset-normalizer==3.2.0 ConfigSpace==0.7.1 contourpy==1.1.0 cycler==0.11.0 +einops==0.7.0 fonttools==4.41.0 +fvcore==0.1.5.post20221221 gpustat==1.1 +h5py==3.10.0 html5tagger==1.3.0 httptools==0.6.0 idna==3.4 importlib-resources==6.0.0 +iopath==0.1.10 joblib==1.3.1 kiwisolver==1.4.4 matplotlib==3.7.2 @@ -18,16 +22,20 @@ more-itertools==9.1.0 multidict==6.0.4 numpy==1.24.4 nvidia-ml-py==12.535.77 +objgraph==3.6.0 orjson==3.9.2 packaging==23.1 palettable==3.3.3 pandas==2.0.3 Pillow==10.0.0 +portalocker==2.8.2 psutil==5.9.5 psycopg2-binary==2.9.6 +Pympler==1.0.1 pyparsing==3.0.9 python-dateutil==2.8.2 pytz==2023.3 +PyYAML==6.0.1 requests==2.31.0 sanic==23.6.0 sanic-routing==23.6.0 @@ -36,6 +44,8 @@ scipy==1.10.1 seaborn==0.12.2 six==1.16.0 sklearn==0.0 +tabulate==0.9.0 +termcolor==2.3.0 thop @ git+https://github.com/Lyken17/pytorch-OpCounter.git@43c064afb71383501e41eaef9e8c8407265cf77f threadpoolctl==3.1.0 torch==1.8.1 @@ -51,4 +61,5 @@ urllib3==2.0.4 uvloop==0.17.0 wcwidth==0.2.6 websockets==11.0.3 -zipp==3.16.2 +yacs==0.1.8 +zipp==3.16.2 \ No newline at end of file diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/database/load_data_to_db.sh b/examples/model_selection/Trails/internal/ml/model_selection/scripts/database/load_data_to_db.sh similarity index 67% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/database/load_data_to_db.sh rename to examples/model_selection/Trails/internal/ml/model_selection/scripts/database/load_data_to_db.sh index dc7e0172f5..860e23fcfd 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/database/load_data_to_db.sh +++ b/examples/model_selection/Trails/internal/ml/model_selection/scripts/database/load_data_to_db.sh @@ -1,36 +1,19 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - #!/bin/bash # Check for proper number of command line args -if [[ $# -ne 2 ]]; then - echo "Usage: $0 <data_path> <db_name>" +if [[ $# -ne 3 ]]; then + echo "Usage: $0 <data_path> <db_name> <port>" exit 1 fi # Configurations DATA_PATH="$1" DB_NAME="$2" +PORT="$3" # Connection details HOST="localhost" -PORT="28814" +#PORT="28814" USERNAME="postgres" DBNAME="pg_extension" diff --git a/examples/model_selection/Trails/internal/ml/model_selection/scripts/database/load_data_to_db_int.sh b/examples/model_selection/Trails/internal/ml/model_selection/scripts/database/load_data_to_db_int.sh new file mode 100644 index 0000000000..04f902c40c --- /dev/null +++ b/examples/model_selection/Trails/internal/ml/model_selection/scripts/database/load_data_to_db_int.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +# Check for proper number of command line args +if [[ $# -ne 2 ]]; then + echo "Usage: $0 <data_path> <db_name>" + exit 1 +fi + +# Configurations +DATA_PATH="$1" +DB_NAME="${2}_int" + +# Connection details +HOST="localhost" +PORT="28814" +USERNAME="postgres" +DBNAME="pg_extension" + +# Create the database +echo "Creating database..." +createdb -h $HOST -p $PORT -U $USERNAME $DBNAME + +# Define datasets to process +datasets=("train" "valid" "test") + +# Loop over each dataset +for dataset in "${datasets[@]}"; do + rm -f "${DATA_PATH}/${dataset}.csv" + + # 1. Identify the number of columns + num_columns=$(awk 'NF > max { max = NF } END { print max }' "${DATA_PATH}/${dataset}.libsvm") + + # 2. Create the table dynamically + create_table_cmd="CREATE TABLE ${DB_NAME}_${dataset} (id SERIAL PRIMARY KEY, label INTEGER" + + for (( i=2; i<=$num_columns; i++ )); do + create_table_cmd+=", col$(($i-1)) INTEGER" # Change to INTEGER type + done + create_table_cmd+=");" + + echo "Creating ${dataset} table..." + echo $create_table_cmd | psql -h $HOST -p $PORT -U $USERNAME -d $DBNAME + + # 3. Transform the libsvm format to CSV + echo "Transforming ${dataset} to CSV format..." + + awk '{ + printf $1; # print label + for (i = 2; i <= NF; i++) { + split($i, a, ":"); + printf " %s", a[1]; # print the feature index (the part before ":") + } + printf "\n"; # end of line + }' "${DATA_PATH}/${dataset}.libsvm" > "${DATA_PATH}/${dataset}.csv" + + # 4. Import into PostgreSQL + columns="label" + for (( i=2; i<=$num_columns; i++ )); do + columns+=", col$(($i-1))" + done + + echo "Loading ${dataset} into PostgreSQL..." + psql -h $HOST -p $PORT -U $USERNAME -d $DBNAME -c "\COPY ${DB_NAME}_${dataset}($columns) FROM '${DATA_PATH}/${dataset}.csv' DELIMITER ' '" +done + +echo "Data load complete." diff --git a/examples/model_selection/Trails/internal/ml/model_selection/scripts/database/load_data_to_db_raw_source.sh b/examples/model_selection/Trails/internal/ml/model_selection/scripts/database/load_data_to_db_raw_source.sh new file mode 100644 index 0000000000..67225b44e6 --- /dev/null +++ b/examples/model_selection/Trails/internal/ml/model_selection/scripts/database/load_data_to_db_raw_source.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# Check for proper number of command line args +if [[ $# -ne 2 ]]; then + echo "Usage: $0 <data_path> <db_name>" + exit 1 +fi + +# Configurations +DATA_PATH="$1" +DB_NAME="$2" + +# Connection details +HOST="localhost" +PORT="5432" +USERNAME="postgres" +DBNAME="model_slicing" + +# Create the database +echo "Creating database..." +createdb -h $HOST -p $PORT -U $USERNAME $DBNAME + + + +echo "Data load complete."
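The awk step in `load_data_to_db_int.sh` keeps only the label and the index of every `index:value` pair, so a libsvm line such as `1 3:1 10:1 24:1` becomes the space-separated row `1 3 10 24` that `\COPY` then bulk-loads. A small Python sketch of the same transform, handy for checking the expected CSV output (a hypothetical helper, not part of this patch):

```python
def libsvm_to_int_csv(src_path, dst_path):
    # Mirror the awk script: keep the label, then the index of each
    # "index:value" pair, joined by single spaces (the \COPY delimiter).
    with open(src_path) as src, open(dst_path, "w") as dst:
        for line in src:
            fields = line.split()
            if not fields:
                continue
            label, features = fields[0], fields[1:]
            indices = [feat.split(":", 1)[0] for feat in features]
            dst.write(" ".join([label] + indices) + "\n")

# libsvm_to_int_csv("train.libsvm", "train.csv")
```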
diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/shared_config.py b/examples/model_selection/Trails/internal/ml/model_selection/shared_config.py similarity index 99% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/shared_config.py rename to examples/model_selection/Trails/internal/ml/model_selection/shared_config.py index f40ac15d67..10f9163688 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/shared_config.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/shared_config.py @@ -16,9 +16,7 @@ # limitations under the License. # -import calendar -import os -import time + import argparse import configparser diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase2/__init__.py b/examples/model_selection/Trails/internal/ml/model_selection/src/__init__.py similarity index 99% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase2/__init__.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/__init__.py index 3df60b02f7..8c328a3bbc 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase2/__init__.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/__init__.py @@ -15,3 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # + + + + diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/__init__.py b/examples/model_selection/Trails/internal/ml/model_selection/src/common/__init__.py similarity index 100% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/__init__.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/common/__init__.py diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/common/constant.py b/examples/model_selection/Trails/internal/ml/model_selection/src/common/constant.py similarity index 92% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/common/constant.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/common/constant.py index 36227ec38d..ccc2e352c6 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/common/constant.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/common/constant.py @@ -16,6 +16,7 @@ # limitations under the License. 
# + class CommonVars: # SAMPLER @@ -44,6 +45,9 @@ class CommonVars: PRUNE_SYNFLOW = "synflow" WEIGHT_NORM = "weight_norm" + KNAS = "knas" + + JACFLOW = "jacflow" ALL_EVALUATOR = "all_matrix" @@ -72,9 +76,15 @@ class Config: c10 = "cifar10" c100 = "cifar100" imgNet = "ImageNet16-120" + imgNetFull = "ImageNet1k" # struct dataset Frappe = "frappe" Criteo = "criteo" UCIDataset = "uci_diabetes" + SUCCHALF = "SUCCHALF" + SUCCREJCT = "SUCCREJCT" + UNIFORM = "UNIFORM" + + diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/common/structure.py b/examples/model_selection/Trails/internal/ml/model_selection/src/common/structure.py similarity index 82% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/common/structure.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/common/structure.py index 521f45f1e6..b7ab685393 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/common/structure.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/common/structure.py @@ -51,7 +51,7 @@ class ModelAcquireData: """ def __init__(self, model_id: str, model_encoding: str, is_last: bool = False, - spi_seconds=None, spi_mini_batch=None): + spi_seconds=None, spi_mini_batch=None, batch_size=32): self.is_last = is_last self.model_id = model_id self.model_encoding = model_encoding @@ -59,25 +59,28 @@ def __init__(self, model_id: str, model_encoding: str, is_last: bool = False, # this is when using spi self.spi_seconds = spi_seconds self.spi_mini_batch = spi_mini_batch + self.batch_size = batch_size - def serialize_model(self) -> str: + def serialize_model(self) -> dict: data = {"is_last": self.is_last, "model_id": self.model_id, "model_encoding": self.model_encoding, "spi_seconds": self.spi_seconds, + "preprocess_seconds": self.spi_seconds, + "batch_size": self.batch_size, "spi_mini_batch": self.spi_mini_batch} - return json.dumps(data) + return data @classmethod - def deserialize(cls, data_str: str): - data = json.loads(data_str) + def deserialize(cls, data: dict): res = cls( - data["model_id"], - data["model_encoding"], - data["is_last"], - data["spi_mini_batch"], - data["spi_seconds"]) + model_id=data["model_id"], + model_encoding=data["model_encoding"], + is_last=data["is_last"], + spi_mini_batch=data["spi_mini_batch"], + batch_size=data["batch_size"], + spi_seconds=data["spi_seconds"]) return res
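`serialize_model` now returns a plain dict instead of a JSON string, and `deserialize` rebuilds the object with keyword arguments, which also removes the positional mismatch of the old call (it passed `spi_mini_batch` where `spi_seconds` was expected). A quick round-trip sketch of the patched API, with illustrative values:

```python
acquire = ModelAcquireData(model_id="123",
                           model_encoding="8-16-32-64",   # illustrative encoding
                           is_last=False,
                           spi_seconds=0.05,
                           spi_mini_batch=[[1, 3, 10]],
                           batch_size=32)

payload = acquire.serialize_model()            # a plain dict, no JSON round trip
clone = ModelAcquireData.deserialize(payload)  # keyword args, order-safe
assert clone.model_id == acquire.model_id and clone.batch_size == 32
```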
diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/controller/__init__.py b/examples/model_selection/Trails/internal/ml/model_selection/src/controller/__init__.py similarity index 87% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/controller/__init__.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/controller/__init__.py index 222757523c..69595db4bb 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/controller/__init__.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/controller/__init__.py @@ -18,14 +18,11 @@ from src.common.constant import CommonVars from src.controller.sampler_ea.regularized_ea import RegularizedEASampler -from src.controller.sampler_all.seq_sampler import SequenceSampler from src.controller.sampler_rl.reinforcement_learning import RLSampler -from src.controller.sampler_rand.random_sample import RandomSampler from src.controller.sampler_all.seq_sampler import SequenceSampler sampler_register = { CommonVars.TEST_SAMPLER: SequenceSampler, - # CommonVars.RANDOM_SAMPLER: RandomSampler, CommonVars.RANDOM_SAMPLER: SequenceSampler, CommonVars.RL_SAMPLER: RLSampler, CommonVars.EA_SAMPLER: RegularizedEASampler, diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/controller/controler.py b/examples/model_selection/Trails/internal/ml/model_selection/src/controller/controler.py similarity index 94% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/controller/controler.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/controller/controler.py index 2770a72ee8..0b49e33178 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/controller/controler.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/controller/controler.py @@ -16,10 +16,11 @@ # limitations under the License. # + import time from src.controller.core.sample import Sampler -from src.third_pkg.models import CellStructure +from src.search_space.core.model_params import ModelMicroCfg class ModelScore: @@ -84,14 +85,15 @@ def __init__(self, search_strategy: Sampler): # use when simple_score_sum=True, record the model's sum score self.history = {} - def sample_next_arch(self) -> (str, CellStructure): + def sample_next_arch(self) -> (str, ModelMicroCfg): """ Return a generator :return: """ return self.search_strategy.sample_next_arch(self.ranked_models) - def fit_sampler(self, arch_id: str, alg_score: dict, simple_score_sum: bool = False) -> float: + def fit_sampler(self, arch_id: str, alg_score: dict, simple_score_sum: bool = False, + is_sync: bool = True, arch_micro=None) -> float: """ :param arch_id: :param alg_score: {alg_name1: score1, alg_name2: score2} @@ -103,7 +105,10 @@ def fit_sampler(self, arch_id: str, alg_score: dict, simple_score_sum: bool = Fa score = self._use_pure_score_as_final_res(arch_id, alg_score) else: score = self._use_vote_rank_as_final_res(arch_id, alg_score) - self.search_strategy.fit_sampler(score) + if is_sync: + self.search_strategy.fit_sampler(score) + else: + self.search_strategy.async_fit_sampler(arch_id, arch_micro, score) return score def _use_vote_rank_as_final_res(self, model_id: str, alg_score: dict):
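With the new `is_sync` flag, `fit_sampler` either updates the sampler in submission order (the old synchronous behavior) or hands the scored architecture straight to an asynchronous sampler such as the `AsyncRegularizedEASampler` added later in this patch, which also needs the architecture's micro config. A sketch of how a caller might drive the asynchronous path (the controller construction and score dict are illustrative):

```python
arch_id, arch_micro = controller.sample_next_arch()
scores = {"jacflow": 0.92}  # e.g. one training-free metric per evaluator
controller.fit_sampler(arch_id, scores,
                       simple_score_sum=True,
                       is_sync=False,          # route to async_fit_sampler
                       arch_micro=arch_micro)  # async samplers need the micro config
```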
diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/controller/core/metrics.py b/examples/model_selection/Trails/internal/ml/model_selection/src/controller/core/metrics.py similarity index 99% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/controller/core/metrics.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/controller/core/metrics.py index 77eeea32dc..840ca16504 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/controller/core/metrics.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/controller/core/metrics.py @@ -16,6 +16,7 @@ # limitations under the License. # + from enum import Enum, auto diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/controller/core/sample.py b/examples/model_selection/Trails/internal/ml/model_selection/src/controller/core/sample.py similarity index 99% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/controller/core/sample.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/controller/core/sample.py index b48066925b..244c1abdf5 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/controller/core/sample.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/controller/core/sample.py @@ -1,3 +1,4 @@ + # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file @@ -16,6 +17,7 @@ # limitations under the License. # + from abc import abstractmethod from src.search_space.core.model_params import ModelMicroCfg diff --git a/examples/model_selection/Trails/internal/ml/model_selection/src/controller/sampler_all/__init__.py b/examples/model_selection/Trails/internal/ml/model_selection/src/controller/sampler_all/__init__.py new file mode 100644 index 0000000000..8c328a3bbc --- /dev/null +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/controller/sampler_all/__init__.py @@ -0,0 +1,21 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + + + diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/controller/sampler_all/seq_sampler.py b/examples/model_selection/Trails/internal/ml/model_selection/src/controller/sampler_all/seq_sampler.py similarity index 95% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/controller/sampler_all/seq_sampler.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/controller/sampler_all/seq_sampler.py index 4eaf04ff3f..8fd5e8f445 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/controller/sampler_all/seq_sampler.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/controller/sampler_all/seq_sampler.py @@ -16,6 +16,7 @@ # limitations under the License. # + import random from src.controller.core.sample import Sampler @@ -44,7 +45,8 @@ def sample_next_arch(self, sorted_model: list = None) -> (str, ModelMicroCfg): print("the end") return None, None else: - raise e + print("Error", str(e)) + return None, None def fit_sampler(self, score: float): pass
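Instead of re-raising once the search space is exhausted, `SequenceSampler.sample_next_arch` now logs the error and returns `(None, None)`, so callers must treat `None` as the end-of-space signal rather than catching an exception. A sketch of the corresponding consumption loop (the evaluation step is a hypothetical placeholder):

```python
while True:
    arch_id, arch_micro = sampler.sample_next_arch()
    if arch_id is None:
        break  # the sequence sampler has run out of architectures
    score_and_record(arch_id, arch_micro)  # hypothetical evaluation step
```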
diff --git a/examples/model_selection/Trails/internal/ml/model_selection/src/controller/sampler_bohb/bohb_or.py b/examples/model_selection/Trails/internal/ml/model_selection/src/controller/sampler_bohb/bohb_or.py new file mode 100644 index 0000000000..ff12c969ee --- /dev/null +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/controller/sampler_bohb/bohb_or.py @@ -0,0 +1,294 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os, sys, time, random, argparse, collections +from src.tools.env_tools import prepare_seed +from src.logger import logger +from models import CellStructure, get_search_spaces + +# BOHB: Robust and Efficient Hyperparameter Optimization at Scale, ICML 2018 +import ConfigSpace +from hpbandster.optimizers.bohb import BOHB +import hpbandster.core.nameserver as hpns +from hpbandster.core.worker import Worker + +from nats_bench import create + + + +def time_string(): + ISOTIMEFORMAT = "%Y-%m-%d %X" + string = "[{:}]".format(time.strftime(ISOTIMEFORMAT, time.gmtime(time.time()))) + return string + + +def get_topology_config_space(search_space, max_nodes=4): + cs = ConfigSpace.ConfigurationSpace() + # edge2index = {} + for i in range(1, max_nodes): + for j in range(i): + node_str = "{:}<-{:}".format(i, j) + cs.add_hyperparameter( + ConfigSpace.CategoricalHyperparameter(node_str, search_space) + ) + return cs + + +def get_size_config_space(search_space): + cs = ConfigSpace.ConfigurationSpace() + for ilayer in range(search_space["numbers"]): + node_str = "layer-{:}".format(ilayer) + cs.add_hyperparameter( + ConfigSpace.CategoricalHyperparameter(node_str, search_space["candidates"]) + ) + return cs + + +def config2topology_func(max_nodes=4): + def config2structure(config): + genotypes = [] + for i in range(1, max_nodes): + xlist = [] + for j in range(i): + node_str = "{:}<-{:}".format(i, j) + op_name = config[node_str] + xlist.append((op_name, j)) + genotypes.append(tuple(xlist)) + return CellStructure(genotypes) + + return config2structure + + +def config2size_func(search_space): + def config2structure(config): + channels = [] + for ilayer in range(search_space["numbers"]): + node_str = "layer-{:}".format(ilayer) + channels.append(str(config[node_str])) + return ":".join(channels) + + return config2structure + + +class MyWorker(Worker): + def __init__(self, *args, convert_func=None, dataset=None, api=None, **kwargs): + super().__init__(*args, **kwargs) + self.convert_func = convert_func + self._dataset = dataset +
self._api = api + self.total_times = [] + self.trajectory = [] + + def compute(self, config, budget, **kwargs): + arch = self.convert_func(config) + accuracy, latency, time_cost, total_time = self._api.simulate_train_eval( + arch, self._dataset, iepoch=int(budget) - 1, hp="12" + ) + self.trajectory.append((accuracy, arch)) + self.total_times.append(total_time) + return {"loss": 100 - accuracy, "info": self._api.query_index_by_arch(arch)} + + +def main(xargs, api): + prepare_seed(xargs.rand_seed) + + logger.info("{:} use api : {:}".format(time_string(), api)) + api.reset_time() + search_space = get_search_spaces(xargs.search_space, "nats-bench") + if xargs.search_space == "tss": + cs = get_topology_config_space(search_space) + config2structure = config2topology_func() + else: + cs = get_size_config_space(search_space) + config2structure = config2size_func(search_space) + + hb_run_id = "0" + + NS = hpns.NameServer(run_id=hb_run_id, host="localhost", port=0) + ns_host, ns_port = NS.start() + num_workers = 1 + + workers = [] + for i in range(num_workers): + w = MyWorker( + nameserver=ns_host, + nameserver_port=ns_port, + convert_func=config2structure, + dataset=xargs.dataset, + api=api, + run_id=hb_run_id, + id=i, + ) + w.run(background=True) + workers.append(w) + + start_time = time.time() + bohb = BOHB( + configspace=cs, + run_id=hb_run_id, + eta=3, + min_budget=1, + max_budget=12, + nameserver=ns_host, + nameserver_port=ns_port, + num_samples=xargs.num_samples, + random_fraction=xargs.random_fraction, + bandwidth_factor=xargs.bandwidth_factor, + ping_interval=10, + min_bandwidth=xargs.min_bandwidth, + ) + + results = bohb.run(n_iterations=xargs.n_iters, min_n_workers=num_workers) + + bohb.shutdown(shutdown_workers=True) + NS.shutdown() + + # print('There are {:} runs.'.format(len(results.get_all_runs()))) + # workers[0].total_times + # workers[0].trajectory + current_best_index = [] + for idx in range(len(workers[0].trajectory)): + trajectory = workers[0].trajectory[: idx + 1] + arch = max(trajectory, key=lambda x: x[0])[1] + current_best_index.append(api.query_index_by_arch(arch)) + + best_arch = max(workers[0].trajectory, key=lambda x: x[0])[1] + logger.log( + "Best found configuration: {:} within {:.3f} s".format( + best_arch, workers[0].total_times[-1] + ) + ) + info = api.query_info_str_by_arch( + best_arch, "200" if xargs.search_space == "tss" else "90" + ) + logger.log("{:}".format(info)) + logger.log("-" * 100) + logger.close() + + return logger.log_dir, current_best_index, workers[0].total_times + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + "BOHB: Robust and Efficient Hyperparameter Optimization at Scale" + ) + parser.add_argument( + "--dataset", + default="cifar10", + type=str, + choices=["cifar10", "cifar100", "ImageNet16-120"], + help="Choose between Cifar10/100 and ImageNet-16.", + ) + # general arg + parser.add_argument( + "--search_space", + default="tss", + type=str, + choices=["tss", "sss"], + help="Choose the search space.", + ) + parser.add_argument( + "--time_budget", + type=int, + default=20000, + help="The total time cost budget for searching (in seconds).", + ) + parser.add_argument( + "--loops_if_rand", type=int, default=500, help="The total runs for evaluation."
+ ) + # BOHB + parser.add_argument( + "--strategy", + default="sampling", + type=str, + nargs="?", + help="optimization strategy for the acquisition function", + ) + parser.add_argument( + "--min_bandwidth", + default=0.3, + type=float, + nargs="?", + help="minimum bandwidth for KDE", + ) + parser.add_argument( + "--num_samples", + default=64, + type=int, + nargs="?", + help="number of samples for the acquisition function", + ) + parser.add_argument( + "--random_fraction", + default=0.33, + type=float, + nargs="?", + help="fraction of random configurations", + ) + parser.add_argument( + "--bandwidth_factor", + default=3, + type=int, + nargs="?", + help="factor multiplied to the bandwidth", + ) + parser.add_argument( + "--n_iters", + default=300, + type=int, + nargs="?", + help="number of iterations for optimization method", + ) + # log + parser.add_argument( + "--save_dir", + type=str, + default="./output/search", + help="Folder to save checkpoints and log.", + ) + parser.add_argument("--rand_seed", type=int, default=-1, help="manual seed") + args = parser.parse_args() + + api = create(None, args.search_space, fast_mode=False, verbose=False) + + args.save_dir = os.path.join( + "{:}-{:}".format(args.save_dir, args.search_space), + "{:}-T{:}".format(args.dataset, args.time_budget), + "BOHB", + ) + print("save-dir : {:}".format(args.save_dir)) + + if args.rand_seed < 0: + save_dir, all_info = None, collections.OrderedDict() + for i in range(args.loops_if_rand): + print("{:} : {:03d}/{:03d}".format(time_string(), i, args.loops_if_rand)) + args.rand_seed = random.randint(1, 100000) + save_dir, all_archs, all_total_times = main(args, api) + all_info[i] = {"all_archs": all_archs, "all_total_times": all_total_times} + save_path = save_dir / "results.pth" + print("save into {:}".format(save_path)) + + import pickle + with open(save_path, 'wb') as f: + pickle.dump(all_info, f) + + else: + main(args, api) + + diff --git a/examples/model_selection/Trails/internal/ml/model_selection/src/controller/sampler_ea/__init__.py b/examples/model_selection/Trails/internal/ml/model_selection/src/controller/sampler_ea/__init__.py new file mode 100644 index 0000000000..8c328a3bbc --- /dev/null +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/controller/sampler_ea/__init__.py @@ -0,0 +1,21 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + + + + diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/controller/sampler_ea/regularized_ea.py b/examples/model_selection/Trails/internal/ml/model_selection/src/controller/sampler_ea/regularized_ea.py similarity index 59% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/controller/sampler_ea/regularized_ea.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/controller/sampler_ea/regularized_ea.py index 62126bef61..2e3685fa68 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/controller/sampler_ea/regularized_ea.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/controller/sampler_ea/regularized_ea.py @@ -16,6 +16,7 @@ # limitations under the License. # + import collections from src.search_space.core.model_params import ModelMicroCfg from src.controller.core.sample import Sampler @@ -54,8 +55,8 @@ def __init__(self, space: SpaceWrapper, population_size: int, sample_size: int): # use the visited to reduce the collapse self.visited = {} - self.max_mutate_time = 2 - self.max_mutate_sampler_time = 2 + self.max_mutate_time = 4 + self.max_mutate_sampler_time = 4 @@ -146,3 +147,101 @@ def fit_sampler(self, score: float): # Remove the oldest model. self.population.popleft() self.population_model_ids.popleft() + + +class AsyncRegularizedEASampler(Sampler): + + def __init__(self, space: SpaceWrapper, population_size: int, sample_size: int): + super().__init__(space) + + self.population_size = population_size + # list of Model objects + self.population = collections.deque() + # list of model-id strings, for duplicate checking + self.population_model_ids = collections.deque() + + self.space = space + self.sample_size = sample_size + self.current_sampled = 0 + + # track visited architectures to avoid repeatedly sampling the same ones + self.visited = {} + self.max_mutate_time = 2 + self.max_mutate_sampler_time = 3 + + def sample_next_arch(self, sorted_model_ids: list) -> (str, ModelMicroCfg): + # Case 1: If population hasn't reached desired size, add random architectures + if len(self.population) < self.population_size: + while True: + arch_id, arch_micro = self.space.random_architecture_id() + # Ensure the EA population has no duplicates + if str(arch_id) not in self.population_model_ids: + break + return arch_id, arch_micro + + # Case 2: If population has reached desired size, evolve population + else: + cur_mutate_sampler_time = 0 + is_found_new = False + + # Keep attempting mutations for a maximum of 'max_mutate_sampler_time' times + while cur_mutate_sampler_time < self.max_mutate_sampler_time: + cur_mutate_time = 0 + + # Randomly select a sample of models from the population + sample = [] + sample_ids = [] + while len(sample) < self.sample_size: + candidate = random.choice(list(self.population)) + candidate_id = self.population_model_ids[self.population.index(candidate)] + sample.append(candidate) + sample_ids.append(candidate_id) + + # Select the best parent from the sample (based on the order in sorted_model_ids) + parent_id = max(sample_ids, key=lambda _id: sorted_model_ids.index(str(_id))) + parent = sample[sample_ids.index(parent_id)] + + # Try to mutate the parent up to 'max_mutate_time' times + while cur_mutate_time < self.max_mutate_time: + arch_id, arch_micro = self.space.mutate_architecture(parent.arch) + + # If the mutated architecture hasn't been visited or we've visited all possible architectures, stop + if arch_id not in self.visited or len(self.space) == len(self.visited): + self.visited[arch_id] = True + is_found_new = True + break + cur_mutate_time += 1 + + # If we've found a new architecture, stop sampling + if is_found_new: + break + + cur_mutate_sampler_time += 1 + + # If we've hit the maximum number of mutation attempts, do nothing + if cur_mutate_time * cur_mutate_sampler_time == self.max_mutate_time * self.max_mutate_sampler_time: + pass + + # Return the mutated architecture + return arch_id, arch_micro + + def async_fit_sampler(self, current_arch_id, current_arch_micro, score: float): + # initialization stage: fill the population with randomly sampled models + if len(self.population) < self.population_size: + model = Model() + model.arch = current_arch_micro + model.score = score + self.population.append(model) + self.population_model_ids.append(current_arch_id) + + # mutation stage: add the child and evict the oldest member + else: + child = Model() + child.arch = current_arch_micro + child.score = score + + self.population.append(child) + self.population_model_ids.append(current_arch_id) + # Remove the oldest model. + self.population.popleft() + self.population_model_ids.popleft()
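Unlike the synchronous sampler, which updates its population inside `fit_sampler` in sampling order, the asynchronous variant receives `(arch_id, arch_micro, score)` whenever a worker finishes, so results may arrive out of order. A sketch of the intended driver loop under that assumption; the search-space object, the scoring call, and the ranking bookkeeping are illustrative (note `sorted_model_ids` must be ordered so that better models appear later, since the parent pick takes the max index):

```python
sampler = AsyncRegularizedEASampler(space, population_size=10, sample_size=3)
scores_so_far = {}  # arch_id -> score, maintained by the caller

for _ in range(100):
    ranked_ids = sorted(scores_so_far, key=scores_so_far.get)  # better models last
    arch_id, arch_micro = sampler.sample_next_arch(ranked_ids)
    score = evaluate_architecture(arch_id)  # hypothetical, possibly a remote worker
    scores_so_far[str(arch_id)] = score
    sampler.async_fit_sampler(arch_id, arch_micro, score)
```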
diff --git a/examples/model_selection/Trails/internal/ml/model_selection/src/controller/sampler_rand/__init__.py b/examples/model_selection/Trails/internal/ml/model_selection/src/controller/sampler_rand/__init__.py new file mode 100644 index 0000000000..8c328a3bbc --- /dev/null +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/controller/sampler_rand/__init__.py @@ -0,0 +1,21 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + + + diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/controller/sampler_rand/random_sample.py b/examples/model_selection/Trails/internal/ml/model_selection/src/controller/sampler_rand/random_sample.py similarity index 99% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/controller/sampler_rand/random_sample.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/controller/sampler_rand/random_sample.py index 8c31254462..ff010b9872 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/controller/sampler_rand/random_sample.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/controller/sampler_rand/random_sample.py @@ -16,6 +16,7 @@ # limitations under the License.
# + from src.controller.core.sample import Sampler from src.search_space.core.space import SpaceWrapper from src.search_space.core.model_params import ModelMicroCfg diff --git a/examples/model_selection/Trails/internal/ml/model_selection/src/controller/sampler_rl/__init__.py b/examples/model_selection/Trails/internal/ml/model_selection/src/controller/sampler_rl/__init__.py new file mode 100644 index 0000000000..8c328a3bbc --- /dev/null +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/controller/sampler_rl/__init__.py @@ -0,0 +1,21 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + + + diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/controller/sampler_rl/reinforcement_learning.py b/examples/model_selection/Trails/internal/ml/model_selection/src/controller/sampler_rl/reinforcement_learning.py similarity index 98% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/controller/sampler_rl/reinforcement_learning.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/controller/sampler_rl/reinforcement_learning.py index a65eed36e8..aabe4997b7 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/controller/sampler_rl/reinforcement_learning.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/controller/sampler_rl/reinforcement_learning.py @@ -16,10 +16,10 @@ # limitations under the License. # + from src.controller.core.sample import Sampler from src.search_space.core.space import SpaceWrapper from src.search_space.core.model_params import ModelMicroCfg -from src.third_pkg.models import CellStructure class ExponentialMovingAverage(object): diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/search_space/utils/__init__.py b/examples/model_selection/Trails/internal/ml/model_selection/src/dataset_utils/__init__.py similarity index 99% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/search_space/utils/__init__.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/dataset_utils/__init__.py index 01d7057208..42c2ac6db1 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/search_space/utils/__init__.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/dataset_utils/__init__.py @@ -16,3 +16,4 @@ # limitations under the License. 
# + diff --git a/examples/model_selection/Trails/internal/ml/model_selection/src/dataset_utils/dataset.py b/examples/model_selection/Trails/internal/ml/model_selection/src/dataset_utils/dataset.py new file mode 100644 index 0000000000..cf7ac352bd --- /dev/null +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/dataset_utils/dataset.py @@ -0,0 +1,217 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import torch +from torch import tensor +from torch.utils.data import DataLoader, Subset +from torchvision.datasets import ImageFolder +from torchvision.datasets import MNIST, CIFAR10, CIFAR100, SVHN +from torchvision.transforms import Compose +from torchvision import transforms +from .imagenet16 import * + + +def get_dataloader(train_batch_size: int, test_batch_size: int, dataset: str, + num_workers: int, datadir: str, resize=None) -> (DataLoader, DataLoader, int): + """ + Load CIFAR or imagenet datasets + :param train_batch_size: + :param test_batch_size: + :param dataset: ImageNet16, cifar, svhn, ImageNet1k, mnist + :param num_workers: + :param datadir: + :param resize: + :return: + """ + + class_num = 0 + mean = [] + std = [] + pad = 0 + + if 'ImageNet16' in dataset: + mean = [x / 255 for x in [122.68, 116.66, 104.01]] + std = [x / 255 for x in [63.22, 61.26, 65.09]] + size, pad = 16, 2 + elif 'cifar' in dataset: + mean = (0.4914, 0.4822, 0.4465) + std = (0.2023, 0.1994, 0.2010) + size, pad = 32, 4 + elif 'svhn' in dataset: + mean = (0.5, 0.5, 0.5) + std = (0.5, 0.5, 0.5) + size, pad = 32, 0 + elif dataset == 'ImageNet1k': + from .h5py_dataset import H5Dataset + size, pad = 224, 2 + mean = (0.485, 0.456, 0.406) + std = (0.229, 0.224, 0.225) + # resize = 256 + elif dataset == 'ImageNet224-120': + from .h5py_dataset import H5Dataset + size, pad = 224, 2 + mean = (0.485, 0.456, 0.406) + std = (0.229, 0.224, 0.225) + # resize = 256 + + if resize is None: + resize = size + + train_transform = transforms.Compose([ + transforms.RandomCrop(size, padding=pad), + transforms.Resize(resize), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize(mean, std), + ]) + + test_transform = transforms.Compose([ + transforms.Resize((resize, resize)), + transforms.ToTensor(), + transforms.Normalize(mean, std), + ]) + + if dataset == 'cifar10': + class_num = 10 + train_dataset = CIFAR10(datadir, True, train_transform, download=True) + test_dataset = CIFAR10(datadir, False, test_transform, download=True) + elif dataset == 'cifar100': + class_num = 100 + train_dataset = CIFAR100(datadir, True, train_transform, download=True) + test_dataset = CIFAR100(datadir, False, test_transform, download=True) + elif dataset == 'svhn': + class_num = 10 + train_dataset = SVHN(datadir, split='train', transform=train_transform, download=True) + 
+    if dataset == 'cifar10':
+        class_num = 10
+        train_dataset = CIFAR10(datadir, True, train_transform, download=True)
+        test_dataset = CIFAR10(datadir, False, test_transform, download=True)
+    elif dataset == 'cifar100':
+        class_num = 100
+        train_dataset = CIFAR100(datadir, True, train_transform, download=True)
+        test_dataset = CIFAR100(datadir, False, test_transform, download=True)
+    elif dataset == 'svhn':
+        class_num = 10
+        train_dataset = SVHN(datadir, split='train', transform=train_transform, download=True)
+        test_dataset = SVHN(datadir, split='test', transform=test_transform, download=True)
+    elif dataset == 'ImageNet16-120':
+        class_num = 120
+        train_dataset = ImageNet16(os.path.join(datadir, 'ImageNet16'), True, train_transform, 120)
+        test_dataset = ImageNet16(os.path.join(datadir, 'ImageNet16'), False, test_transform, 120)
+    elif dataset == 'ImageNet1k':
+        class_num = 1000
+        # train_dataset = ImageFolder(root=os.path.join(datadir, 'imagenet/val'), transform=train_transform)
+        test_dataset = ImageFolder(root=os.path.join(datadir, 'imagenet/val'), transform=test_transform)
+        train_dataset = test_dataset
+    elif dataset == 'ImageNet224-120':
+        class_num = 120
+        test_dataset = ImageFolder(root=os.path.join(datadir, 'imagenet/val'), transform=test_transform)
+
+        # keep only the first 120 classes (labels 0-119)
+        class_indices = list(range(120))
+        subset_indices = [i for i, (_, label) in enumerate(test_dataset.samples) if label in class_indices]
+        filtered_test_dataset = Subset(test_dataset, subset_indices)
+        train_dataset = filtered_test_dataset
+        test_dataset = filtered_test_dataset
+    elif dataset == 'mnist':
+        data_transform = Compose([transforms.ToTensor()])
+        # Normalise? transforms.Normalize((0.1307,), (0.3081,))
+        train_dataset = MNIST("_dataset", True, data_transform, download=True)
+        test_dataset = MNIST("_dataset", False, data_transform, download=True)
+    else:
+        raise ValueError(f'unsupported dataset: {dataset}')
+
+    train_loader = DataLoader(
+        train_dataset,
+        train_batch_size,
+        shuffle=True,
+        num_workers=num_workers,
+        pin_memory=False
+    )
+    test_loader = DataLoader(
+        test_dataset,
+        test_batch_size,
+        shuffle=False,
+        num_workers=num_workers,
+        pin_memory=False
+    )
+
+    print("dataset load done")
+
+    return train_loader, test_loader, class_num
+
+
+def get_mini_batch(dataloader: DataLoader, sample_alg: str, batch_size: int, num_classes: int) -> (tensor, tensor):
+    """
+    Get a mini-batch of data.
+    :param dataloader: DataLoader
+    :param sample_alg: random or grasp
+    :param batch_size: batch_size
+    :param num_classes: num_classes
+    :return: two tensors (inputs, targets)
+    """
+
+    if sample_alg == 'random':
+        inputs, targets = _get_some_data(dataloader, batch_size=batch_size)
+    elif sample_alg == 'grasp':
+        inputs, targets = _get_some_data_grasp(dataloader, num_classes, samples_per_class=batch_size // num_classes)
+    else:
+        raise NotImplementedError(f'sample_alg {sample_alg} is not supported')
+
+    return inputs, targets
+
+
+def _get_some_data(train_dataloader: DataLoader, batch_size: int) -> (torch.tensor, torch.tensor):
+    """
+    Randomly sample one batch of data; some classes may not be sampled.
+    :param train_dataloader: torch DataLoader
+    :param batch_size: batch_size of the data.
+    :return:
+    """
+    traindata = []
+
+    dataloader_iter = iter(train_dataloader)
+    traindata.append(next(dataloader_iter))
+
+    inputs = torch.cat([a for a, _ in traindata])
+    targets = torch.cat([b for _, b in traindata])
+    return inputs, targets
+
+
+def _get_some_data_grasp(train_dataloader: DataLoader, num_classes: int,
+                         samples_per_class: int) -> (torch.tensor, torch.tensor):
+    """
+    Sample data while guaranteeing that each class has an equal number of samples.
+    :param train_dataloader: torch DataLoader
+    :param num_classes: number of classes
+    :param samples_per_class: how many samples for each class.
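+    Example (editor's illustration): with num_classes=10 and samples_per_class=4,
+    the returned batch holds 40 examples, 4 per class.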
+ :return: + """ + + datas = [[] for _ in range(num_classes)] + labels = [[] for _ in range(num_classes)] + mark = dict() + dataloader_iter = iter(train_dataloader) + while True: + inputs, targets = next(dataloader_iter) + for idx in range(inputs.shape[0]): + x, y = inputs[idx:idx + 1], targets[idx:idx + 1] + category = y.item() + if len(datas[category]) == samples_per_class: + mark[category] = True + continue + datas[category].append(x) + labels[category].append(y) + if len(mark) == num_classes: + break + + x = torch.cat([torch.cat(_, 0) for _ in datas]) + y = torch.cat([torch.cat(_) for _ in labels]).view(-1) + return x, y diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/dataset_utils/download_critero_and_avazu.py b/examples/model_selection/Trails/internal/ml/model_selection/src/dataset_utils/download_critero_and_avazu.py similarity index 100% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/dataset_utils/download_critero_and_avazu.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/dataset_utils/download_critero_and_avazu.py diff --git a/examples/model_selection/Trails/internal/ml/model_selection/src/dataset_utils/h5py_dataset.py b/examples/model_selection/Trails/internal/ml/model_selection/src/dataset_utils/h5py_dataset.py new file mode 100644 index 0000000000..31feeb06fc --- /dev/null +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/dataset_utils/h5py_dataset.py @@ -0,0 +1,59 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+
+
+import h5py
+import numpy as np
+from PIL import Image
+
+import torch
+from torch.utils.data import Dataset, DataLoader
+
+
+class H5Dataset(Dataset):
+    def __init__(self, h5_path, transform=None):
+        self.h5_path = h5_path
+        self.h5_file = None
+        self.length = len(h5py.File(h5_path, 'r'))
+        self.transform = transform
+
+    def __getitem__(self, index):
+
+        # opening the file lazily in __getitem__ allows us to use multiple processes
+        # for data loading, because hdf5 file handles aren't picklable and so can't
+        # be transferred across processes
+        # https://discuss.pytorch.org/t/hdf5-a-data-format-for-pytorch/40379
+        # https://discuss.pytorch.org/t/dataloader-when-num-worker-0-there-is-bug/25643/16
+        # TODO: possibly look at __getstate__ and __setstate__ as a more elegant solution
+        if self.h5_file is None:
+            self.h5_file = h5py.File(self.h5_path, 'r')
+
+        record = self.h5_file[str(index)]
+
+        if self.transform:
+            x = Image.fromarray(record['data'][()])
+            x = self.transform(x)
+        else:
+            x = torch.from_numpy(record['data'][()])
+
+        y = record['target'][()]
+        y = torch.from_numpy(np.asarray(y))
+
+        return (x, y)
+
+    def __len__(self):
+        return self.length
diff --git a/examples/model_selection/Trails/internal/ml/model_selection/src/dataset_utils/imagenet16.py b/examples/model_selection/Trails/internal/ml/model_selection/src/dataset_utils/imagenet16.py
new file mode 100644
index 0000000000..c49428c07b
--- /dev/null
+++ b/examples/model_selection/Trails/internal/ml/model_selection/src/dataset_utils/imagenet16.py
@@ -0,0 +1,137 @@
+##################################################
+# Copyright (c) Xuanyi Dong [GitHub D-X-Y], 2019 #
+##################################################
+
+import os, sys, hashlib
+import numpy as np
+from PIL import Image
+import torch.utils.data as data
+
+if sys.version_info[0] == 2:
+    import cPickle as pickle
+else:
+    import pickle
+
+
+def calculate_md5(fpath, chunk_size=1024 * 1024):
+    md5 = hashlib.md5()
+    with open(fpath, 'rb') as f:
+        for chunk in iter(lambda: f.read(chunk_size), b''):
+            md5.update(chunk)
+    return md5.hexdigest()
+
+
+def check_md5(fpath, md5, **kwargs):
+    return md5 == calculate_md5(fpath, **kwargs)
+
+
+def check_integrity(fpath, md5=None):
+    if not os.path.isfile(fpath): return False
+    if md5 is None:
+        return True
+    else:
+        return check_md5(fpath, md5)
+
+
+class ImageNet16(data.Dataset):
+    # http://image-net.org/download-images
+    # A Downsampled Variant of ImageNet as an Alternative to the CIFAR datasets
+    # https://arxiv.org/pdf/1707.08819.pdf
+
+    train_list = [
+        ['train_data_batch_1', '27846dcaa50de8e21a7d1a35f30f0e91'],
+        ['train_data_batch_2', 'c7254a054e0e795c69120a5727050e3f'],
+        ['train_data_batch_3', '4333d3df2e5ffb114b05d2ffc19b1e87'],
+        ['train_data_batch_4', '1620cdf193304f4a92677b695d70d10f'],
+        ['train_data_batch_5', '348b3c2fdbb3940c4e9e834affd3b18d'],
+        ['train_data_batch_6', '6e765307c242a1b3d7d5ef9139b48945'],
+        ['train_data_batch_7', '564926d8cbf8fc4818ba23d2faac7564'],
+        ['train_data_batch_8', 'f4755871f718ccb653440b9dd0ebac66'],
+        ['train_data_batch_9', 'bb6dd660c38c58552125b1a92f86b5d4'],
+        ['train_data_batch_10', '8f03f34ac4b42271a294f91bf480f29b'],
+    ]
+    valid_list = [
+        ['val_data', '3410e3017fdaefba8d5073aaa65e4bd6'],
+    ]
+
+    def __init__(self, root, train, transform, use_num_of_class_only=None):
+        self.root = root
+        self.transform = transform
+        self.train = train  # training set or valid set
+        if not self._check_integrity(): raise RuntimeError('Dataset not found or corrupted.')
+
+        if self.train:
+            downloaded_list = self.train_list
+        else:
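+            # editor's note: the validation split is the single pickled batch "val_data" (see valid_list)
+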
downloaded_list = self.valid_list + self.data = [] + self.targets = [] + + # now load the picked numpy arrays + for i, (file_name, checksum) in enumerate(downloaded_list): + file_path = os.path.join(self.root, file_name) + # print ('Load {:}/{:02d}-th : {:}'.format(i, len(downloaded_list), file_path)) + with open(file_path, 'rb') as f: + if sys.version_info[0] == 2: + entry = pickle.load(f) + else: + entry = pickle.load(f, encoding='latin1') + self.data.append(entry['data']) + self.targets.extend(entry['labels']) + self.data = np.vstack(self.data).reshape(-1, 3, 16, 16) + self.data = self.data.transpose((0, 2, 3, 1)) # convert to HWC + if use_num_of_class_only is not None: + assert isinstance(use_num_of_class_only, + int) and use_num_of_class_only > 0 and use_num_of_class_only < 1000, 'invalid use_num_of_class_only : {:}'.format( + use_num_of_class_only) + new_data, new_targets = [], [] + for I, L in zip(self.data, self.targets): + if 1 <= L <= use_num_of_class_only: + new_data.append(I) + new_targets.append(L) + self.data = new_data + self.targets = new_targets + # self.mean.append(entry['mean']) + # self.mean = np.vstack(self.mean).reshape(-1, 3, 16, 16) + # self.mean = np.mean(np.mean(np.mean(self.mean, axis=0), axis=1), axis=1) + # print ('Mean : {:}'.format(self.mean)) + # temp = self.data - np.reshape(self.mean, (1, 1, 1, 3)) + # std_data = np.std(temp, axis=0) + # std_data = np.mean(np.mean(std_data, axis=0), axis=0) + # print ('Std : {:}'.format(std_data)) + + def __getitem__(self, index): + img, target = self.data[index], self.targets[index] - 1 + + img = Image.fromarray(img) + + if self.transform is not None: + img = self.transform(img) + + return img, target + + def __len__(self): + return len(self.data) + + def _check_integrity(self): + root = self.root + for fentry in (self.train_list + self.valid_list): + filename, md5 = fentry[0], fentry[1] + fpath = os.path.join(root, filename) + if not check_integrity(fpath, md5): + return False + return True + + +if __name__ == '__main__': + train = ImageNet16('/data02/dongxuanyi/.torch/cifar.python/ImageNet16', True, None) + valid = ImageNet16('/data02/dongxuanyi/.torch/cifar.python/ImageNet16', False, None) + + print(len(train)) + print(len(valid)) + image, label = train[111] + trainX = ImageNet16('/data02/dongxuanyi/.torch/cifar.python/ImageNet16', True, None, 200) + validX = ImageNet16('/data02/dongxuanyi/.torch/cifar.python/ImageNet16', False, None, 200) + print(len(trainX)) + print(len(validX)) + # import pdb; pdb.set_trace() diff --git a/examples/model_selection/Trails/internal/ml/model_selection/src/dataset_utils/save_load_torch.py b/examples/model_selection/Trails/internal/ml/model_selection/src/dataset_utils/save_load_torch.py new file mode 100644 index 0000000000..4da19a8673 --- /dev/null +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/dataset_utils/save_load_torch.py @@ -0,0 +1,106 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +import argparse + +from tqdm import tqdm +import torch +import os +import glob + + +def decode_libsvm(line): + columns = line.split(' ') + map_func = lambda pair: (int(pair[0]), float(pair[1])) + id, value = zip(*map(lambda col: map_func(col.split(':')), columns[1:])) + sample = {'id': torch.LongTensor(id), + 'value': torch.FloatTensor(value), + 'y': float(columns[0])} + return sample + + +def _save_data(data_dir, fname, nfields, namespace): + with open(fname) as f: + sample_lines = sum(1 for line in f) + + feat_id = torch.LongTensor(sample_lines, nfields) + feat_value = torch.FloatTensor(sample_lines, nfields) + y = torch.FloatTensor(sample_lines) + + nsamples = 0 + with tqdm(total=sample_lines) as pbar: + with open(fname) as fp: + line = fp.readline() + while line: + try: + sample = decode_libsvm(line) + feat_id[nsamples] = sample['id'] + feat_value[nsamples] = sample['value'] + y[nsamples] = sample['y'] + nsamples += 1 + except Exception: + print(f'incorrect data format line "{line}" !') + line = fp.readline() + pbar.update(1) + print(f'# {nsamples} data samples loaded...') + + # save the tensors to disk + torch.save(feat_id, f'{data_dir}/{namespace}_feat_id.pt') + torch.save(feat_value, f'{data_dir}/{namespace}_feat_value.pt') + torch.save(y, f'{data_dir}/{namespace}_y.pt') + + +def parse_arguments(): + parser = argparse.ArgumentParser(description='FastAutoNAS') + + parser.add_argument('--nfield', type=int, default=10, + help='the number of fields, frappe: 10, uci_diabetes: 43, criteo: 39') + + parser.add_argument('--dataset', type=str, default='frappe', + help='cifar10, cifar100, ImageNet16-120, frappe, criteo, uci_diabetes') + + return parser.parse_args() + + +def load_data(data_dir, namespace): + feat_id = torch.load(f'{data_dir}/{namespace}_feat_id.pt') + feat_value = torch.load(f'{data_dir}/{namespace}_feat_value.pt') + y = torch.load(f'{data_dir}/{namespace}_y.pt') + + print(f'# {int(y.shape[0])} data samples loaded...') + + return feat_id, feat_value, y, int(y.shape[0]) + + +if __name__ == "__main__": + args = parse_arguments() + + _data_dir = os.path.join("./dataset", args.dataset) + + train_name_space = "decoded_train" + valid_name_space = "decoded_valid" + # save + train_file = glob.glob("%s/tr*libsvm" % _data_dir)[0] + val_file = glob.glob("%s/va*libsvm" % _data_dir)[0] + _save_data(_data_dir, train_file, args.nfield, train_name_space) + _save_data(_data_dir, val_file, args.nfield, valid_name_space) + + # read + # load_data(data_dir, train_name_space) + # load_data(data_dir, valid_name_space) diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/dataset_utils/sequence_dataloader.py b/examples/model_selection/Trails/internal/ml/model_selection/src/dataset_utils/sequence_dataloader.py similarity index 99% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/dataset_utils/sequence_dataloader.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/dataset_utils/sequence_dataloader.py index 9a6587e9e0..9503d87b46 100644 --- 
a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/dataset_utils/sequence_dataloader.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/dataset_utils/sequence_dataloader.py @@ -16,10 +16,12 @@ # limitations under the License. # + import queue import threading import requests import time +import torch from src.logger import logger @@ -61,7 +63,6 @@ def fetch_data(self): # end_signal in trianing, then keep training continue else: - import torch # convert to tensor again id_tensor = torch.LongTensor(batch['id']) value_tensor = torch.FloatTensor(batch['value']) @@ -94,3 +95,5 @@ def stop(self): self.stop_event.set() self.thread.join() + + diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/dataset_utils/stream_dataloader.py b/examples/model_selection/Trails/internal/ml/model_selection/src/dataset_utils/stream_dataloader.py similarity index 99% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/dataset_utils/stream_dataloader.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/dataset_utils/stream_dataloader.py index f39499f335..9c5211081e 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/dataset_utils/stream_dataloader.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/dataset_utils/stream_dataloader.py @@ -16,10 +16,12 @@ # limitations under the License. # + import queue import threading import requests import time +import torch from src.logger import logger @@ -62,7 +64,6 @@ def fetch_data(self): continue else: # convert to tensor again - import torch id_tensor = torch.LongTensor(batch['id']) value_tensor = torch.FloatTensor(batch['value']) y_tensor = torch.FloatTensor(batch['y']) @@ -94,3 +95,5 @@ def stop(self): self.stop_event.set() self.thread.join() + + diff --git a/examples/model_selection/Trails/internal/ml/model_selection/src/dataset_utils/structure_data_loader.py b/examples/model_selection/Trails/internal/ml/model_selection/src/dataset_utils/structure_data_loader.py new file mode 100644 index 0000000000..d83dfc77c5 --- /dev/null +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/dataset_utils/structure_data_loader.py @@ -0,0 +1,276 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+
+
+from tqdm import tqdm
+import torch
+from torch.utils.data import Dataset, DataLoader, TensorDataset
+import os
+import glob
+import numpy as np
+import sklearn.model_selection
+import sklearn.preprocessing
+from scipy.io.arff import loadarff
+
+
+def load_data(data_dir, namespace):
+    print(f'# loading data from '
+          f'{data_dir}/{namespace}_feat_id.pt, '
+          f'{data_dir}/{namespace}_feat_value.pt, '
+          f'{data_dir}/{namespace}_y.pt ......')
+
+    feat_id = torch.load(f'{data_dir}/{namespace}_feat_id.pt')
+    feat_value = torch.load(f'{data_dir}/{namespace}_feat_value.pt')
+    y = torch.load(f'{data_dir}/{namespace}_y.pt')
+
+    print(f'# {int(y.shape[0])} data samples loaded...')
+
+    return feat_id, feat_value, y, int(y.shape[0])
+
+
+class LibsvmDatasetReadOnce(Dataset):
+    """ Dataset loader for Libsvm data format """
+
+    def __init__(self, fname):
+        parent_directory = os.path.dirname(fname)
+        if "train" in fname:
+            namespace = "decoded_train"
+        elif "valid" in fname:
+            namespace = "decoded_valid"
+        else:
+            raise ValueError(f'cannot infer split (train/valid) from file name {fname}')
+        self.feat_id, self.feat_value, self.y, self.nsamples = load_data(parent_directory, namespace)
+
+        print(f'# {self.nsamples} data samples loaded...')
+
+    def __len__(self):
+        return self.nsamples
+
+    def __getitem__(self, idx):
+        return {'id': self.feat_id[idx],
+                'value': self.feat_value[idx],
+                'y': self.y[idx]}
+
+
+class LibsvmDataset(Dataset):
+    """ Dataset loader for Libsvm data format """
+
+    def __init__(self, fname, nfields, max_load=-1):
+
+        def decode_libsvm(line):
+            columns = line.split(' ')
+            map_func = lambda pair: (int(pair[0]), float(pair[1]))
+            id, value = zip(*map(lambda col: map_func(col.split(':')), columns[1:]))
+            sample = {'id': torch.LongTensor(id),
+                      'value': torch.FloatTensor(value),
+                      'y': float(columns[0])}
+            return sample
+
+        with open(fname) as f:
+            sample_lines = sum(1 for line in f)
+
+        self.feat_id = torch.LongTensor(sample_lines, nfields)
+        self.feat_value = torch.FloatTensor(sample_lines, nfields)
+        self.y = torch.FloatTensor(sample_lines)
+
+        self.nsamples = 0
+        with tqdm(total=sample_lines) as pbar:
+            with open(fname) as fp:
+                line = fp.readline()
+                while line:
+                    if max_load > 0 and self.nsamples > max_load:
+                        break
+                    try:
+                        sample = decode_libsvm(line)
+                        self.feat_id[self.nsamples] = sample['id']
+                        self.feat_value[self.nsamples] = sample['value']
+                        self.y[self.nsamples] = sample['y']
+                        self.nsamples += 1
+                    except Exception:
+                        print(f'incorrect data format line "{line}" !')
+                    line = fp.readline()
+                    pbar.update(1)
+        print(f'# {self.nsamples} data samples loaded...')
+
+    def __len__(self):
+        return self.nsamples
+
+    def __getitem__(self, idx):
+        return {'id': self.feat_id[idx],
+                'value': self.feat_value[idx],
+                'y': self.y[idx]}
+
+
+def libsvm_dataloader(args, data_dir, nfield, batch_size):
+    print("Loading data from ", data_dir)
+    workers = args.workers
+    train_file_name = f"{data_dir}/train.libsvm"
+    valid_file_name = f"{data_dir}/valid.libsvm"
+    test_file_name = f"{data_dir}/test.libsvm"
+    print(f"using train={train_file_name}, valid={valid_file_name}")
+    # read the converted file
+    if args.device == "cpu":
+        train_loader = DataLoader(LibsvmDatasetReadOnce(train_file_name),
+                                  batch_size=batch_size,
+                                  shuffle=True)
+        val_loader = DataLoader(LibsvmDatasetReadOnce(valid_file_name),
+                                batch_size=batch_size * 8,
+                                shuffle=False)
+
+    else:
+        train_loader = DataLoader(LibsvmDatasetReadOnce(train_file_name),
+                                  batch_size=batch_size,
+                                  shuffle=True,
+                                  num_workers=workers,
+                                  pin_memory=False)
+
+        val_loader = DataLoader(LibsvmDatasetReadOnce(valid_file_name),
+                                batch_size=batch_size * 8,
+                                shuffle=False,
+                                num_workers=workers,
+                                pin_memory=False)
+
+    return train_loader, val_loader, val_loader
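+
+# editor's sketch of intended usage (hypothetical values, not part of the
+# original file): any object exposing `device` and `workers` attributes
+# works as `args`, e.g.
+#   from types import SimpleNamespace
+#   args = SimpleNamespace(device='cpu', workers=0)
+#   train_loader, val_loader, _ = libsvm_dataloader(args, './dataset/frappe', 10, 64)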
+
+
+def libsvm_dataloader_ori(args):
+    data_dir = args.base_dir + args.dataset
+    print(data_dir)
+    train_file = glob.glob("%s/tr*libsvm" % data_dir)[0]
+    val_file = glob.glob("%s/va*libsvm" % data_dir)[0]
+    test_file = glob.glob("%s/te*libsvm" % data_dir)[0]
+
+    train_loader = DataLoader(LibsvmDataset(train_file, args.nfield, args.max_load),
+                              batch_size=args.batch_size, shuffle=True,
+                              num_workers=args.workers, pin_memory=True)
+    val_loader = DataLoader(LibsvmDataset(val_file, args.nfield, args.max_load),
+                            batch_size=args.batch_size, shuffle=False,
+                            num_workers=args.workers, pin_memory=True)
+    # test_loader = DataLoader(LibsvmDataset(test_file, args.nfield),
+    #                          batch_size=args.batch_size, shuffle=False,
+    #                          num_workers=args.workers, pin_memory=True)
+
+    return train_loader, val_loader, val_loader
+
+
+class UCILibsvmDataset(Dataset):
+    """ Dataset loader for loading UCI dataset of Libsvm format """
+
+    def __init__(self, X, y):
+        assert X.shape[0] == y.shape[0]
+        self.nsamples, self.nfeat = X.shape
+
+        self.feat_id = torch.LongTensor(self.nsamples, self.nfeat)
+        self.feat_value = torch.FloatTensor(self.nsamples, self.nfeat)
+        self.y = torch.FloatTensor(self.nsamples)
+
+        with tqdm(total=self.nsamples) as pbar:
+            id = torch.LongTensor(range(self.nfeat))
+            for idx in range(self.nsamples):
+                self.feat_id[idx] = id
+                self.feat_value[idx] = torch.FloatTensor(X[idx])
+                self.y[idx] = y[idx]
+
+                pbar.update(1)
+        print(f'Data loader: {self.nsamples} data samples')
+
+    def __len__(self):
+        return self.nsamples
+
+    def __getitem__(self, idx):
+        return {'id': self.feat_id[idx],
+                'value': self.feat_value[idx],
+                'y': self.y[idx]}
+
+
+def uci_loader(data_dir, batch_size, valid_perc=0., libsvm=False, workers=4):
+    '''
+    :param data_dir: path to load the uci dataset from
+    :param batch_size: batch size
+    :param valid_perc: valid percentage split from train (default 0, whole train set)
+    :param libsvm: Libsvm loader of format {'id', 'value', 'y'}
+    :param workers: the number of subprocesses used to load data
+    :return: train/valid/test loader, train_loader.nclass
+    '''
+
+    def uci_validation_set(X, y, split_perc=0.2):
+        return sklearn.model_selection.train_test_split(
+            X, y, test_size=split_perc, random_state=0)
+
+    def make_loader(X, y, transformer=None, batch_size=64):
+        # remember whether this call fits its own scaler *before* the variable
+        # is reassigned; the original `shuffle=transformer is None` check ran
+        # after the reassignment and therefore never enabled shuffling
+        fit_here = transformer is None
+        if transformer is None:
+            transformer = sklearn.preprocessing.StandardScaler()
+            transformer.fit(X)
+        X = transformer.transform(X)
+        if libsvm:
+            return DataLoader(UCILibsvmDataset(X, y),
+                              batch_size=batch_size,
+                              shuffle=fit_here,
+                              num_workers=workers, pin_memory=True
+                              ), transformer
+        else:
+            return DataLoader(
+                dataset=TensorDataset(*[torch.from_numpy(e) for e in [X, y]]),
+                batch_size=batch_size,
+                shuffle=fit_here,
+                num_workers=workers, pin_memory=True
+            ), transformer
+
+    def uci_folder_to_name(f):
+        return f.split('/')[-1]
+
+    def line_to_idx(l):
+        return np.array([int(e) for e in l.split()], dtype=np.int32)
+
+    def load_uci_dataset(folder, train=True):
+        full_file = f'{folder}/{uci_folder_to_name(folder)}.arff'
+        if os.path.exists(full_file):
+            data = loadarff(full_file)
+            train_idx, test_idx = [line_to_idx(l) for l in open(f'{folder}/conxuntos.dat').readlines()]
+            assert len(set(train_idx) & set(test_idx)) == 0
+            all_idx = list(train_idx) + list(test_idx)
+            assert len(all_idx) == np.max(all_idx) + 1
+            assert np.min(all_idx) == 0
+            if train:
+                data = (data[0][train_idx], data[1])
+            else:
+                data = 
(data[0][test_idx], data[1]) + else: + typename = 'train' if train else 'test' + filename = f'{folder}/{uci_folder_to_name(folder)}_{typename}.arff' + data = loadarff(filename) + assert data[1].types() == ['numeric'] * (len(data[1].types()) - 1) + ['nominal'] + X = np.array(data[0][data[1].names()[:-1]].tolist()) + y = np.array([int(e) for e in data[0][data[1].names()[-1]]]) + nclass = len(data[1]['clase'][1]) + return X.astype(np.float32), y, nclass + + Xtrain, ytrain, nclass = load_uci_dataset(data_dir) + if valid_perc > 0: + Xtrain, Xvalid, ytrain, yvalid = uci_validation_set(Xtrain, ytrain, split_perc=valid_perc) + train_loader, _ = make_loader(Xtrain, ytrain, batch_size=batch_size) + valid_loader, _ = make_loader(Xvalid, yvalid, batch_size=batch_size) + else: + train_loader, _ = make_loader(Xtrain, ytrain, batch_size=batch_size) + valid_loader = train_loader + + print(f'{uci_folder_to_name(data_dir)}: {len(ytrain)} training samples loaded.') + Xtest, ytest, _ = load_uci_dataset(data_dir, False) + test_loader, _ = make_loader(Xtest, ytest, batch_size=batch_size) + print(f'{uci_folder_to_name(data_dir)}: {len(ytest)} testing samples loaded.') + train_loader.nclass = nclass + return train_loader, valid_loader, test_loader diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/__init__.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/__init__.py similarity index 98% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/__init__.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/__init__.py index e5ddb1e193..59ffabc364 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/__init__.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/__init__.py @@ -16,13 +16,13 @@ # limitations under the License. # + from src.common.constant import * from src.eva_engine.phase1.algo.prune_synflow import SynFlowEvaluator + # evaluator mapper to register many existing evaluation algorithms evaluator_register = { - - # prune based CommonVars.PRUNE_SYNFLOW: SynFlowEvaluator(), - } + diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/coordinator.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/coordinator.py similarity index 99% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/coordinator.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/coordinator.py index 8142a5cd6a..cd9bf95e1d 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/coordinator.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/coordinator.py @@ -16,6 +16,7 @@ # limitations under the License. 
# + from src.common.constant import Config from src.eva_engine.phase2.run_sh import BudgetAwareControllerSH from src.logger import logger diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/controller/sampler_all/__init__.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/__init__.py similarity index 100% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/controller/sampler_all/__init__.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/__init__.py diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/controller/sampler_RL/__init__.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/__init__.py similarity index 100% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/controller/sampler_RL/__init__.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/__init__.py diff --git a/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/alg_base.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/alg_base.py new file mode 100644 index 0000000000..947ce71c26 --- /dev/null +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/alg_base.py @@ -0,0 +1,150 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + + +import math +import time +from abc import abstractmethod +import torch +from torch import nn + + +class Evaluator: + def __init__(self): + pass + + @abstractmethod + def evaluate(self, arch: nn.Module, + device: str, + batch_data: object, batch_labels: torch.Tensor, + space_name: str + ) -> float: + """ + Score each architecture with predefined architecture and data + :param arch: architecture to be scored + :param device: cpu or gpu + :param batch_data: a mini batch of data, [ batch_size, channel, W, H ] or dict for structure data + :param batch_labels: a mini batch of labels + :param space_name: string + :return: score + """ + raise NotImplementedError + + def evaluate_wrapper(self, arch, device: str, space_name: str, + batch_data: torch.tensor, + batch_labels: torch.tensor) -> (float, float): + """ + :param arch: architecture to be scored + :param device: cpu or GPU + :param space_name: search space name + :param batch_data: a mini batch of data, [ batch_size, channel, W, H ] + :param batch_labels: a mini batch of labels + :return: score, timeUsage + """ + + arch.train() + # arch.zero_grad() + + # measure scoring time + if "cuda" in device: + torch.cuda.synchronize() + # use this will not need cuda.sync + # starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True) + # starter.record() + starter, ender = time.time(), time.time() + else: + starter, ender = time.time(), time.time() + + # score + score = self.evaluate(arch, device, batch_data, batch_labels, space_name) + + if "cuda" in device: + # ender.record() + # implicitly waits for the event to be marked as complete before calculating the time difference + # curr_time = starter.elapsed_time(ender) + torch.cuda.synchronize() + ender = time.time() + curr_time = ender - starter + else: + ender = time.time() + curr_time = ender - starter + + if math.isnan(score): + if score > 0: + score = 1e8 + else: + score = -1e8 + if math.isinf(score): + if score > 0: + score = 1e8 + else: + score = -1e8 + + return score, curr_time + + def evaluate_wrapper_origin(self, arch, device: str, space_name: str, + batch_data: torch.tensor, + batch_labels: torch.tensor) -> (float, float): + """ + :param arch: architecture to be scored + :param device: cpu or GPU + :param space_name: search space name + :param batch_data: a mini batch of data, [ batch_size, channel, W, H ] + :param batch_labels: a mini batch of labels + :return: score, timeUsage + """ + + arch.train() + arch.zero_grad() + + # measure scoring time + if "cuda" in device: + torch.cuda.synchronize() + # use this will not need cuda.sync + # starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True) + # starter.record() + starter, ender = time.time(), time.time() + else: + starter, ender = time.time(), time.time() + + # score + score = self.evaluate(arch, device, batch_data, batch_labels, space_name) + + if "cuda" in device: + # ender.record() + # implicitly waits for the event to be marked as complete before calculating the time difference + # curr_time = starter.elapsed_time(ender) + torch.cuda.synchronize() + ender = time.time() + curr_time = ender - starter + else: + ender = time.time() + curr_time = ender - starter + + if math.isnan(score): + if score > 0: + score = 1e8 + else: + score = -1e8 + if math.isinf(score): + if score > 0: + score = 1e8 + else: + score = -1e8 + + return score, curr_time diff --git 
a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/prune_synflow.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/prune_synflow.py similarity index 96% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/prune_synflow.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/prune_synflow.py index 1c671febda..e1088f83bc 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/prune_synflow.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/prune_synflow.py @@ -38,6 +38,7 @@ # singa_dtype = {"float16": tensor.float16, "float32": tensor.float32} singa_dtype = {"float32": tensor.float32} + ### MSOptimizer class MSOptimizer(Optimizer): def __call__(self, loss): @@ -64,6 +65,7 @@ def call_with_returns(self, loss): # print ("call_with_returns after apply loss.data: \n", loss.data) return pn_p_g_list + # MSSGD -- actually no change of code class MSSGD(MSOptimizer): """Implements stochastic gradient descent (optionally with momentum). @@ -236,14 +238,15 @@ def set_states(self, states): self.moments = states['moments'] self.mom_value = self.momentum(self.step_counter) + # Data augmentation def augmentation(x, batch_size): xpad = np.pad(x, [[0, 0], [0, 0], [4, 4], [4, 4]], 'symmetric') for data_num in range(0, batch_size): offset = np.random.randint(8, size=2) x[data_num, :, :, :] = xpad[data_num, :, - offset[0]:offset[0] + x.shape[2], - offset[1]:offset[1] + x.shape[2]] + offset[0]:offset[0] + x.shape[2], + offset[1]:offset[1] + x.shape[2]] if_flip = np.random.randint(2) if (if_flip): x[data_num, :, :, :] = x[data_num, :, :, ::-1] @@ -295,10 +298,13 @@ def resize_dataset(x, image_size): for d in range(0, dim): X[n, d, :, :] = np.array(Image.fromarray(x[n, d, :, :]).resize( (image_size, image_size), Image.BILINEAR), - dtype=np.float32) + dtype=np.float32) return X + import torch + + class SynFlowEvaluator(Evaluator): def __init__(self): @@ -319,7 +325,7 @@ def evaluate(self, arch, device, batch_data: object, batch_labels: torch.Tensor, 1. this is data-Agnostic 2. only compute on a single example """ - + ### singa configs mssgd = MSSGD(lr=0.005, momentum=0.9, weight_decay=1e-5, dtype=singa_dtype['float32']) device_id = 0 @@ -327,8 +333,8 @@ def evaluate(self, arch, device, batch_data: object, batch_labels: torch.Tensor, model = arch graph = True verbosity = 0 - dist_option='plain' - spars=None + dist_option = 'plain' + spars = None precision = 'float32' global_rank = 0 world_size = 1 @@ -353,7 +359,7 @@ def evaluate(self, arch, device, batch_data: object, batch_labels: torch.Tensor, model.train() ### process batch_data - x = batch_data.cpu().numpy() # Size([1, 100]) and all ones + x = batch_data.cpu().numpy() # Size([1, 100]) and all ones x = x.astype(np_dtype[precision]) y = np.ones(x.shape[0], dtype=np.int32) if model.dimension == 2: # input data dimension @@ -364,16 +370,15 @@ def evaluate(self, arch, device, batch_data: object, batch_labels: torch.Tensor, model.compile([tx], is_train=True, use_graph=graph, sequential=sequential) dev.SetVerbosity(verbosity) - # 1. Convert params to their abs. 
- synflow_flag = True ### just change the model to the absolute value + synflow_flag = True ### just change the model to the absolute value tx.copy_from_numpy(x) # dtype=np.float32 ty.copy_from_numpy(y) # print ("before model forward ...") pn_p_g_list, out, loss = model(tx, ty, dist_option, spars, synflow_flag) # print ("---------------------------------------") # print ("before absolute prune_synflow !!!nemb input vector!!! tensor.to_numpy(loss)[0]: ", tensor.to_numpy(loss)[0]) - # print ("before absolute prune_synflow !!!nemb input vector!!! tensor.to_numpy(loss): ", tensor.to_numpy(loss)) + # print ("before absolute prune_synflow !!!nemb input vector!!! tensor.to_numpy(loss): ", tensor.to_numpy(loss)) # train_correct += accuracy(tensor.to_numpy(out), y) # train_loss += tensor.to_numpy(loss)[0] # all params turned to positive @@ -409,7 +414,7 @@ def evaluate(self, arch, device, batch_data: object, batch_labels: torch.Tensor, score = 0.0 for pn_p_g_item in pn_p_g_list: # print ("calculate weight param * grad parameter name: \n", pn_p_g_item[0]) - if len(pn_p_g_item[1].shape) == 2: # param_value.data is "weight" + if len(pn_p_g_item[1].shape) == 2: # param_value.data is "weight" # print ("pn_p_g_item[1].shape: \n", pn_p_g_item[1].shape) # print ("tensor.to_numpy(pn_p_g_item[1][0]): ", tensor.to_numpy(pn_p_g_item[1][0])) # print ("calculate synflow parameter name: \n", pn_p_g_item[0]) @@ -420,6 +425,6 @@ def evaluate(self, arch, device, batch_data: object, batch_labels: torch.Tensor, score += np.sum(np.absolute(tensor.to_numpy(pn_p_g_item[1]) * tensor.to_numpy(pn_p_g_item[2]))) # print ("layer_hidden_list: \n", layer_hidden_list) # print ("prune_synflow !!!one-hot input vector!!! absolute step tensor.to_numpy(loss)[0]: ", tensor.to_numpy(loss)[0]) - print ("score: \n", score) + print("score: \n", score) return score diff --git a/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/README.md b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/README.md new file mode 100644 index 0000000000..90e0d8e17e --- /dev/null +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/README.md @@ -0,0 +1,2 @@ +(1) copy cnn_ms/pkg_model_code/model.py to ~/miniconda3/lib/python3.6/site-packages/singa/model.py +(2) enter cnn_ms/ and run "python train_ms_model.py ms_model_mlp mnist" diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/README.md b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/README.md similarity index 100% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/README.md rename to examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/README.md diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/autograd/cifar10_multiprocess.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/autograd/cifar10_multiprocess.py similarity index 100% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/autograd/cifar10_multiprocess.py rename to 
examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/autograd/cifar10_multiprocess.py diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/autograd/mnist_cnn.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/autograd/mnist_cnn.py similarity index 100% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/autograd/mnist_cnn.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/autograd/mnist_cnn.py diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/autograd/mnist_dist.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/autograd/mnist_dist.py similarity index 100% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/autograd/mnist_dist.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/autograd/mnist_dist.py diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/autograd/mnist_multiprocess.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/autograd/mnist_multiprocess.py similarity index 100% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/autograd/mnist_multiprocess.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/autograd/mnist_multiprocess.py diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/autograd/resnet_cifar10.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/autograd/resnet_cifar10.py similarity index 97% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/autograd/resnet_cifar10.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/autograd/resnet_cifar10.py index 7541736994..d71e0f29b6 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/autograd/resnet_cifar10.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/autograd/resnet_cifar10.py @@ -1,292 +1,292 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -try: - import pickle -except ImportError: - import cPickle as pickle - -from singa import singa_wrap as singa -from singa import autograd -from singa import tensor -from singa import device -from singa import opt -from PIL import Image -import numpy as np -import os -import sys -import time - - -def load_dataset(filepath): - with open(filepath, 'rb') as fd: - try: - cifar10 = pickle.load(fd, encoding='latin1') - except TypeError: - cifar10 = pickle.load(fd) - image = cifar10['data'].astype(dtype=np.uint8) - image = image.reshape((-1, 3, 32, 32)) - label = np.asarray(cifar10['labels'], dtype=np.uint8) - label = label.reshape(label.size, 1) - return image, label - - -def load_train_data(dir_path='cifar-10-batches-py', num_batches=5): - labels = [] - batchsize = 10000 - images = np.empty((num_batches * batchsize, 3, 32, 32), dtype=np.uint8) - for did in range(1, num_batches + 1): - fname_train_data = dir_path + "/data_batch_{}".format(did) - image, label = load_dataset(check_dataset_exist(fname_train_data)) - images[(did - 1) * batchsize:did * batchsize] = image - labels.extend(label) - images = np.array(images, dtype=np.float32) - labels = np.array(labels, dtype=np.int32) - return images, labels - - -def load_test_data(dir_path='cifar-10-batches-py'): - images, labels = load_dataset(check_dataset_exist(dir_path + "/test_batch")) - return np.array(images, dtype=np.float32), np.array(labels, dtype=np.int32) - - -def check_dataset_exist(dirpath): - if not os.path.exists(dirpath): - print( - 'Please download the cifar10 dataset using download_data.py (e.g. 
python ~/singa/examples/cifar10/download_data.py py)' - ) - sys.exit(0) - return dirpath - - -def normalize_for_resnet(train_x, test_x): - mean = [0.4914, 0.4822, 0.4465] - std = [0.2023, 0.1994, 0.2010] - train_x /= 255 - test_x /= 255 - for ch in range(0, 2): - train_x[:, ch, :, :] -= mean[ch] - train_x[:, ch, :, :] /= std[ch] - test_x[:, ch, :, :] -= mean[ch] - test_x[:, ch, :, :] /= std[ch] - return train_x, test_x - - -def resize_dataset(x, IMG_SIZE): - num_data = x.shape[0] - dim = x.shape[1] - X = np.zeros(shape=(num_data, dim, IMG_SIZE, IMG_SIZE), dtype=np.float32) - for n in range(0, num_data): - for d in range(0, dim): - X[n, d, :, :] = np.array(Image.fromarray(x[n, d, :, :]).resize( - (IMG_SIZE, IMG_SIZE), Image.BILINEAR), - dtype=np.float32) - return X - - -def augmentation(x, batch_size): - xpad = np.pad(x, [[0, 0], [0, 0], [4, 4], [4, 4]], 'symmetric') - for data_num in range(0, batch_size): - offset = np.random.randint(8, size=2) - x[data_num, :, :, :] = xpad[data_num, :, offset[0]:offset[0] + 32, - offset[1]:offset[1] + 32] - if_flip = np.random.randint(2) - if (if_flip): - x[data_num, :, :, :] = x[data_num, :, :, ::-1] - return x - - -def accuracy(pred, target): - y = np.argmax(pred, axis=1) - t = np.argmax(target, axis=1) - a = y == t - return np.array(a, "int").sum() - - -def to_categorical(y, num_classes): - y = np.array(y, dtype="int") - n = y.shape[0] - categorical = np.zeros((n, num_classes)) - for i in range(0, n): - categorical[i, y[i]] = 1 - categorical = categorical.astype(np.float32) - return categorical - - -# Function to all reduce NUMPY accuracy and loss from multiple devices -def reduce_variable(variable, dist_opt, reducer): - reducer.copy_from_numpy(variable) - dist_opt.all_reduce(reducer.data) - dist_opt.wait() - output = tensor.to_numpy(reducer) - return output - - -# Function to sychronize SINGA TENSOR initial model parameters -def synchronize(tensor, dist_opt): - dist_opt.all_reduce(tensor.data) - dist_opt.wait() - tensor /= dist_opt.world_size - - -# Data partition -def data_partition(dataset_x, dataset_y, global_rank, world_size): - data_per_rank = dataset_x.shape[0] // world_size - idx_start = global_rank * data_per_rank - idx_end = (global_rank + 1) * data_per_rank - return dataset_x[idx_start:idx_end], dataset_y[idx_start:idx_end] - - -def train_cifar10(DIST=False, - local_rank=None, - world_size=None, - nccl_id=None, - partial_update=False): - - # Define the hypermeters for the train_cifar10 - sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5) - max_epoch = 5 - batch_size = 32 - - train_x, train_y = load_train_data() - test_x, test_y = load_test_data() - train_x, test_x = normalize_for_resnet(train_x, test_x) - IMG_SIZE = 224 - num_classes = 10 - - if DIST: - # For distributed GPU training - sgd = opt.DistOpt(sgd, - nccl_id=nccl_id, - local_rank=local_rank, - world_size=world_size) - dev = device.create_cuda_gpu_on(sgd.local_rank) - - # Dataset partition for distributed training - train_x, train_y = data_partition(train_x, train_y, sgd.global_rank, - sgd.world_size) - test_x, test_y = data_partition(test_x, test_y, sgd.global_rank, - sgd.world_size) - world_size = sgd.world_size - else: - # For single GPU - dev = device.create_cuda_gpu() - world_size = 1 - - from resnet import resnet50 - model = resnet50(num_classes=num_classes) - - tx = tensor.Tensor((batch_size, 3, IMG_SIZE, IMG_SIZE), dev, tensor.float32) - ty = tensor.Tensor((batch_size,), dev, tensor.int32) - num_train_batch = train_x.shape[0] // batch_size - num_test_batch = 
test_x.shape[0] // batch_size - idx = np.arange(train_x.shape[0], dtype=np.int32) - - if DIST: - # Sychronize the initial parameters - autograd.training = True - x = np.random.randn(batch_size, 3, IMG_SIZE, - IMG_SIZE).astype(np.float32) - y = np.zeros(shape=(batch_size,), dtype=np.int32) - tx.copy_from_numpy(x) - ty.copy_from_numpy(y) - out = model(tx) - loss = autograd.softmax_cross_entropy(out, ty) - param = [] - for p, _ in autograd.backward(loss): - synchronize(p, sgd) - param.append(p) - - for epoch in range(max_epoch): - start_time = time.time() - np.random.shuffle(idx) - - if ((DIST == False) or (sgd.global_rank == 0)): - print('Starting Epoch %d:' % (epoch)) - - # Training phase - autograd.training = True - train_correct = np.zeros(shape=[1], dtype=np.float32) - test_correct = np.zeros(shape=[1], dtype=np.float32) - train_loss = np.zeros(shape=[1], dtype=np.float32) - - for b in range(num_train_batch): - x = train_x[idx[b * batch_size:(b + 1) * batch_size]] - x = augmentation(x, batch_size) - x = resize_dataset(x, IMG_SIZE) - y = train_y[idx[b * batch_size:(b + 1) * batch_size]] - tx.copy_from_numpy(x) - ty.copy_from_numpy(y) - out = model(tx) - loss = autograd.softmax_cross_entropy(out, ty) - train_correct += accuracy(tensor.to_numpy(out), - to_categorical(y, num_classes)).astype( - np.float32) - train_loss += tensor.to_numpy(loss)[0] - if not partial_update: - sgd.backward_and_update(loss) - else: - sgd.backward_and_partial_update(loss) - - if DIST: - # Reduce the evaluation accuracy and loss from multiple devices - reducer = tensor.Tensor((1,), dev, tensor.float32) - train_correct = reduce_variable(train_correct, sgd, reducer) - train_loss = reduce_variable(train_loss, sgd, reducer) - - # Output the training loss and accuracy - if ((DIST == False) or (sgd.global_rank == 0)): - print('Training loss = %f, training accuracy = %f' % - (train_loss, train_correct / - (num_train_batch * batch_size * world_size)), - flush=True) - - if partial_update: - # Sychronize parameters before evaluation phase - for p in param: - synchronize(p, sgd) - - # Evaulation phase - autograd.training = False - for b in range(num_test_batch): - x = test_x[b * batch_size:(b + 1) * batch_size] - x = resize_dataset(x, IMG_SIZE) - y = test_y[b * batch_size:(b + 1) * batch_size] - tx.copy_from_numpy(x) - ty.copy_from_numpy(y) - out_test = model(tx) - test_correct += accuracy(tensor.to_numpy(out_test), - to_categorical(y, num_classes)) - - if DIST: - # Reduce the evaulation accuracy from multiple devices - test_correct = reduce_variable(test_correct, sgd, reducer) - - # Output the evaluation accuracy - if ((DIST == False) or (sgd.global_rank == 0)): - print('Evaluation accuracy = %f, Elapsed Time = %fs' % - (test_correct / (num_test_batch * batch_size * world_size), - time.time() - start_time), - flush=True) - - -if __name__ == '__main__': - - DIST = False - train_cifar10(DIST=DIST) +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +try: + import pickle +except ImportError: + import cPickle as pickle + +from singa import singa_wrap as singa +from singa import autograd +from singa import tensor +from singa import device +from singa import opt +from PIL import Image +import numpy as np +import os +import sys +import time + + +def load_dataset(filepath): + with open(filepath, 'rb') as fd: + try: + cifar10 = pickle.load(fd, encoding='latin1') + except TypeError: + cifar10 = pickle.load(fd) + image = cifar10['data'].astype(dtype=np.uint8) + image = image.reshape((-1, 3, 32, 32)) + label = np.asarray(cifar10['labels'], dtype=np.uint8) + label = label.reshape(label.size, 1) + return image, label + + +def load_train_data(dir_path='cifar-10-batches-py', num_batches=5): + labels = [] + batchsize = 10000 + images = np.empty((num_batches * batchsize, 3, 32, 32), dtype=np.uint8) + for did in range(1, num_batches + 1): + fname_train_data = dir_path + "/data_batch_{}".format(did) + image, label = load_dataset(check_dataset_exist(fname_train_data)) + images[(did - 1) * batchsize:did * batchsize] = image + labels.extend(label) + images = np.array(images, dtype=np.float32) + labels = np.array(labels, dtype=np.int32) + return images, labels + + +def load_test_data(dir_path='cifar-10-batches-py'): + images, labels = load_dataset(check_dataset_exist(dir_path + "/test_batch")) + return np.array(images, dtype=np.float32), np.array(labels, dtype=np.int32) + + +def check_dataset_exist(dirpath): + if not os.path.exists(dirpath): + print( + 'Please download the cifar10 dataset using download_data.py (e.g. 
python ~/singa/examples/cifar10/download_data.py py)'
+        )
+        sys.exit(0)
+    return dirpath
+
+
+def normalize_for_resnet(train_x, test_x):
+    mean = [0.4914, 0.4822, 0.4465]
+    std = [0.2023, 0.1994, 0.2010]
+    train_x /= 255
+    test_x /= 255
+    for ch in range(0, 2):
+        train_x[:, ch, :, :] -= mean[ch]
+        train_x[:, ch, :, :] /= std[ch]
+        test_x[:, ch, :, :] -= mean[ch]
+        test_x[:, ch, :, :] /= std[ch]
+    return train_x, test_x
+
+
+def resize_dataset(x, IMG_SIZE):
+    num_data = x.shape[0]
+    dim = x.shape[1]
+    X = np.zeros(shape=(num_data, dim, IMG_SIZE, IMG_SIZE), dtype=np.float32)
+    for n in range(0, num_data):
+        for d in range(0, dim):
+            X[n, d, :, :] = np.array(Image.fromarray(x[n, d, :, :]).resize(
+                (IMG_SIZE, IMG_SIZE), Image.BILINEAR),
+                                     dtype=np.float32)
+    return X
+
+
+def augmentation(x, batch_size):
+    xpad = np.pad(x, [[0, 0], [0, 0], [4, 4], [4, 4]], 'symmetric')
+    for data_num in range(0, batch_size):
+        offset = np.random.randint(8, size=2)
+        x[data_num, :, :, :] = xpad[data_num, :, offset[0]:offset[0] + 32,
+                                    offset[1]:offset[1] + 32]
+        if_flip = np.random.randint(2)
+        if (if_flip):
+            x[data_num, :, :, :] = x[data_num, :, :, ::-1]
+    return x
+
+
+def accuracy(pred, target):
+    y = np.argmax(pred, axis=1)
+    t = np.argmax(target, axis=1)
+    a = y == t
+    return np.array(a, "int").sum()
+
+
+def to_categorical(y, num_classes):
+    y = np.array(y, dtype="int")
+    n = y.shape[0]
+    categorical = np.zeros((n, num_classes))
+    for i in range(0, n):
+        categorical[i, y[i]] = 1
+    categorical = categorical.astype(np.float32)
+    return categorical
+
+
+# Function to all-reduce NumPy accuracy and loss across multiple devices
+def reduce_variable(variable, dist_opt, reducer):
+    reducer.copy_from_numpy(variable)
+    dist_opt.all_reduce(reducer.data)
+    dist_opt.wait()
+    output = tensor.to_numpy(reducer)
+    return output
+
+
+# Function to synchronize SINGA tensor initial model parameters
+def synchronize(tensor, dist_opt):
+    dist_opt.all_reduce(tensor.data)
+    dist_opt.wait()
+    tensor /= dist_opt.world_size
+
+
+# Data partition
+def data_partition(dataset_x, dataset_y, global_rank, world_size):
+    data_per_rank = dataset_x.shape[0] // world_size
+    idx_start = global_rank * data_per_rank
+    idx_end = (global_rank + 1) * data_per_rank
+    return dataset_x[idx_start:idx_end], dataset_y[idx_start:idx_end]
+
+
+def train_cifar10(DIST=False,
+                  local_rank=None,
+                  world_size=None,
+                  nccl_id=None,
+                  partial_update=False):
+
+    # Define the hyperparameters for train_cifar10
+    sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)
+    max_epoch = 5
+    batch_size = 32
+
+    train_x, train_y = load_train_data()
+    test_x, test_y = load_test_data()
+    train_x, test_x = normalize_for_resnet(train_x, test_x)
+    IMG_SIZE = 224
+    num_classes = 10
+
+    if DIST:
+        # For distributed GPU training
+        sgd = opt.DistOpt(sgd,
+                          nccl_id=nccl_id,
+                          local_rank=local_rank,
+                          world_size=world_size)
+        dev = device.create_cuda_gpu_on(sgd.local_rank)
+
+        # Dataset partition for distributed training
+        train_x, train_y = data_partition(train_x, train_y, sgd.global_rank,
+                                          sgd.world_size)
+        test_x, test_y = data_partition(test_x, test_y, sgd.global_rank,
+                                        sgd.world_size)
+        world_size = sgd.world_size
+    else:
+        # For single GPU
+        dev = device.create_cuda_gpu()
+        world_size = 1
+
+    from resnet import resnet50
+    model = resnet50(num_classes=num_classes)
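+
+    # editor's note: tx/ty are device-side placeholders allocated once and
+    # reused; each step copies a host batch into them via copy_from_numpy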
+    if DIST:
+        # Synchronize the initial parameters
+        autograd.training = True
+        x = np.random.randn(batch_size, 3, IMG_SIZE,
+                            IMG_SIZE).astype(np.float32)
+        y = np.zeros(shape=(batch_size,), dtype=np.int32)
+        tx.copy_from_numpy(x)
+        ty.copy_from_numpy(y)
+        out = model(tx)
+        loss = autograd.softmax_cross_entropy(out, ty)
+        param = []
+        for p, _ in autograd.backward(loss):
+            synchronize(p, sgd)
+            param.append(p)
+
+    for epoch in range(max_epoch):
+        start_time = time.time()
+        np.random.shuffle(idx)
+
+        if ((DIST == False) or (sgd.global_rank == 0)):
+            print('Starting Epoch %d:' % (epoch))
+
+        # Training phase
+        autograd.training = True
+        train_correct = np.zeros(shape=[1], dtype=np.float32)
+        test_correct = np.zeros(shape=[1], dtype=np.float32)
+        train_loss = np.zeros(shape=[1], dtype=np.float32)
+
+        for b in range(num_train_batch):
+            x = train_x[idx[b * batch_size:(b + 1) * batch_size]]
+            x = augmentation(x, batch_size)
+            x = resize_dataset(x, IMG_SIZE)
+            y = train_y[idx[b * batch_size:(b + 1) * batch_size]]
+            tx.copy_from_numpy(x)
+            ty.copy_from_numpy(y)
+            out = model(tx)
+            loss = autograd.softmax_cross_entropy(out, ty)
+            train_correct += accuracy(tensor.to_numpy(out),
+                                      to_categorical(y, num_classes)).astype(
+                                          np.float32)
+            train_loss += tensor.to_numpy(loss)[0]
+            if not partial_update:
+                sgd.backward_and_update(loss)
+            else:
+                sgd.backward_and_partial_update(loss)
+
+        if DIST:
+            # Reduce the evaluation accuracy and loss from multiple devices
+            reducer = tensor.Tensor((1,), dev, tensor.float32)
+            train_correct = reduce_variable(train_correct, sgd, reducer)
+            train_loss = reduce_variable(train_loss, sgd, reducer)
+
+        # Output the training loss and accuracy
+        if ((DIST == False) or (sgd.global_rank == 0)):
+            print('Training loss = %f, training accuracy = %f' %
+                  (train_loss, train_correct /
+                   (num_train_batch * batch_size * world_size)),
+                  flush=True)
+
+        if partial_update:
+            # Synchronize parameters before the evaluation phase
+            for p in param:
+                synchronize(p, sgd)
+
+        # Evaluation phase
+        autograd.training = False
+        for b in range(num_test_batch):
+            x = test_x[b * batch_size:(b + 1) * batch_size]
+            x = resize_dataset(x, IMG_SIZE)
+            y = test_y[b * batch_size:(b + 1) * batch_size]
+            tx.copy_from_numpy(x)
+            ty.copy_from_numpy(y)
+            out_test = model(tx)
+            test_correct += accuracy(tensor.to_numpy(out_test),
+                                     to_categorical(y, num_classes))
+
+        if DIST:
+            # Reduce the evaluation accuracy from multiple devices
+            test_correct = reduce_variable(test_correct, sgd, reducer)
+
+        # Output the evaluation accuracy
+        if ((DIST == False) or (sgd.global_rank == 0)):
+            print('Evaluation accuracy = %f, Elapsed Time = %fs' %
+                  (test_correct / (num_test_batch * batch_size * world_size),
+                   time.time() - start_time),
+                  flush=True)
+
+
+if __name__ == '__main__':
+
+    DIST = False
+    train_cifar10(DIST=DIST)
diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/autograd/resnet_dist.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/autograd/resnet_dist.py
similarity index 100%
rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/autograd/resnet_dist.py
rename to examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/autograd/resnet_dist.py
diff --git
a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/autograd/sparsification_mnist.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/autograd/sparsification_mnist.py similarity index 100% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/autograd/sparsification_mnist.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/autograd/sparsification_mnist.py diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/autograd/xceptionnet.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/autograd/xceptionnet.py similarity index 100% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/autograd/xceptionnet.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/autograd/xceptionnet.py diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/benchmark.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/benchmark.py similarity index 100% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/benchmark.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/benchmark.py diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/data/cifar10.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/data/cifar10.py similarity index 97% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/data/cifar10.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/data/cifar10.py index 5caaf30f44..74230d0de7 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/data/cifar10.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/data/cifar10.py @@ -1,89 +1,89 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. -# - -try: - import pickle -except ImportError: - import cPickle as pickle - -import numpy as np -import os -import sys - - -def load_dataset(filepath): - with open(filepath, 'rb') as fd: - try: - cifar10 = pickle.load(fd, encoding='latin1') - except TypeError: - cifar10 = pickle.load(fd) - image = cifar10['data'].astype(dtype=np.uint8) - image = image.reshape((-1, 3, 32, 32)) - label = np.asarray(cifar10['labels'], dtype=np.uint8) - label = label.reshape(label.size, 1) - return image, label - - -def load_train_data(dir_path='/tmp/cifar-10-batches-py', num_batches=5): # need to save to specific local directories - labels = [] - batchsize = 10000 - images = np.empty((num_batches * batchsize, 3, 32, 32), dtype=np.uint8) - for did in range(1, num_batches + 1): - fname_train_data = dir_path + "/data_batch_{}".format(did) - image, label = load_dataset(check_dataset_exist(fname_train_data)) - images[(did - 1) * batchsize:did * batchsize] = image - labels.extend(label) - images = np.array(images, dtype=np.float32) - labels = np.array(labels, dtype=np.int32) - return images, labels - - -def load_test_data(dir_path='/tmp/cifar-10-batches-py'): # need to save to specific local directories - images, labels = load_dataset(check_dataset_exist(dir_path + "/test_batch")) - return np.array(images, dtype=np.float32), np.array(labels, dtype=np.int32) - - -def check_dataset_exist(dirpath): - if not os.path.exists(dirpath): - print( - 'Please download the cifar10 dataset using python data/download_cifar10.py' - ) - sys.exit(0) - return dirpath - - -def normalize(train_x, val_x): - mean = [0.4914, 0.4822, 0.4465] - std = [0.2023, 0.1994, 0.2010] - train_x /= 255 - val_x /= 255 - for ch in range(0, 2): - train_x[:, ch, :, :] -= mean[ch] - train_x[:, ch, :, :] /= std[ch] - val_x[:, ch, :, :] -= mean[ch] - val_x[:, ch, :, :] /= std[ch] - return train_x, val_x - -def load(): - train_x, train_y = load_train_data() - val_x, val_y = load_test_data() - train_x, val_x = normalize(train_x, val_x) - train_y = train_y.flatten() - val_y = val_y.flatten() - return train_x, train_y, val_x, val_y +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +try: + import pickle +except ImportError: + import cPickle as pickle + +import numpy as np +import os +import sys + + +def load_dataset(filepath): + with open(filepath, 'rb') as fd: + try: + cifar10 = pickle.load(fd, encoding='latin1') + except TypeError: + cifar10 = pickle.load(fd) + image = cifar10['data'].astype(dtype=np.uint8) + image = image.reshape((-1, 3, 32, 32)) + label = np.asarray(cifar10['labels'], dtype=np.uint8) + label = label.reshape(label.size, 1) + return image, label + + +def load_train_data(dir_path='/tmp/cifar-10-batches-py', num_batches=5): # need to save to specific local directories + labels = [] + batchsize = 10000 + images = np.empty((num_batches * batchsize, 3, 32, 32), dtype=np.uint8) + for did in range(1, num_batches + 1): + fname_train_data = dir_path + "/data_batch_{}".format(did) + image, label = load_dataset(check_dataset_exist(fname_train_data)) + images[(did - 1) * batchsize:did * batchsize] = image + labels.extend(label) + images = np.array(images, dtype=np.float32) + labels = np.array(labels, dtype=np.int32) + return images, labels + + +def load_test_data(dir_path='/tmp/cifar-10-batches-py'): # need to save to specific local directories + images, labels = load_dataset(check_dataset_exist(dir_path + "/test_batch")) + return np.array(images, dtype=np.float32), np.array(labels, dtype=np.int32) + + +def check_dataset_exist(dirpath): + if not os.path.exists(dirpath): + print( + 'Please download the cifar10 dataset using python data/download_cifar10.py' + ) + sys.exit(0) + return dirpath + + +def normalize(train_x, val_x): + mean = [0.4914, 0.4822, 0.4465] + std = [0.2023, 0.1994, 0.2010] + train_x /= 255 + val_x /= 255 + for ch in range(0, 2): + train_x[:, ch, :, :] -= mean[ch] + train_x[:, ch, :, :] /= std[ch] + val_x[:, ch, :, :] -= mean[ch] + val_x[:, ch, :, :] /= std[ch] + return train_x, val_x + +def load(): + train_x, train_y = load_train_data() + val_x, val_y = load_test_data() + train_x, val_x = normalize(train_x, val_x) + train_y = train_y.flatten() + val_y = val_y.flatten() + return train_x, train_y, val_x, val_y diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/data/cifar100.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/data/cifar100.py similarity index 96% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/data/cifar100.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/data/cifar100.py index 88b943f074..b9f121b0a7 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/data/cifar100.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/data/cifar100.py @@ -1,81 +1,81 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -try: - import pickle -except ImportError: - import cPickle as pickle - -import numpy as np -import os -import sys - - -def load_dataset(filepath): - with open(filepath, 'rb') as fd: - try: - cifar100 = pickle.load(fd, encoding='latin1') - except TypeError: - cifar100 = pickle.load(fd) - image = cifar100['data'].astype(dtype=np.uint8) - image = image.reshape((-1, 3, 32, 32)) - label = np.asarray(cifar100['fine_labels'], dtype=np.uint8) - label = label.reshape(label.size, 1) - return image, label - - -def load_train_data(dir_path='/tmp/cifar-100-python'): - images, labels = load_dataset(check_dataset_exist(dir_path + "/train")) - return np.array(images, dtype=np.float32), np.array(labels, dtype=np.int32) - - -def load_test_data(dir_path='/tmp/cifar-100-python'): - images, labels = load_dataset(check_dataset_exist(dir_path + "/test")) - return np.array(images, dtype=np.float32), np.array(labels, dtype=np.int32) - - -def check_dataset_exist(dirpath): - if not os.path.exists(dirpath): - print( - 'Please download the cifar100 dataset using python data/download_cifar100.py' - ) - sys.exit(0) - return dirpath - - -def normalize(train_x, val_x): - mean = [0.4914, 0.4822, 0.4465] - std = [0.2023, 0.1994, 0.2010] - train_x /= 255 - val_x /= 255 - for ch in range(0, 2): - train_x[:, ch, :, :] -= mean[ch] - train_x[:, ch, :, :] /= std[ch] - val_x[:, ch, :, :] -= mean[ch] - val_x[:, ch, :, :] /= std[ch] - return train_x, val_x - - -def load(): - train_x, train_y = load_train_data() - val_x, val_y = load_test_data() - train_x, val_x = normalize(train_x, val_x) - train_y = train_y.flatten() - val_y = val_y.flatten() - return train_x, train_y, val_x, val_y +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +try: + import pickle +except ImportError: + import cPickle as pickle + +import numpy as np +import os +import sys + + +def load_dataset(filepath): + with open(filepath, 'rb') as fd: + try: + cifar100 = pickle.load(fd, encoding='latin1') + except TypeError: + cifar100 = pickle.load(fd) + image = cifar100['data'].astype(dtype=np.uint8) + image = image.reshape((-1, 3, 32, 32)) + label = np.asarray(cifar100['fine_labels'], dtype=np.uint8) + label = label.reshape(label.size, 1) + return image, label + + +def load_train_data(dir_path='/tmp/cifar-100-python'): + images, labels = load_dataset(check_dataset_exist(dir_path + "/train")) + return np.array(images, dtype=np.float32), np.array(labels, dtype=np.int32) + + +def load_test_data(dir_path='/tmp/cifar-100-python'): + images, labels = load_dataset(check_dataset_exist(dir_path + "/test")) + return np.array(images, dtype=np.float32), np.array(labels, dtype=np.int32) + + +def check_dataset_exist(dirpath): + if not os.path.exists(dirpath): + print( + 'Please download the cifar100 dataset using python data/download_cifar100.py' + ) + sys.exit(0) + return dirpath + + +def normalize(train_x, val_x): + mean = [0.4914, 0.4822, 0.4465] + std = [0.2023, 0.1994, 0.2010] + train_x /= 255 + val_x /= 255 + for ch in range(0, 2): + train_x[:, ch, :, :] -= mean[ch] + train_x[:, ch, :, :] /= std[ch] + val_x[:, ch, :, :] -= mean[ch] + val_x[:, ch, :, :] /= std[ch] + return train_x, val_x + + +def load(): + train_x, train_y = load_train_data() + val_x, val_y = load_test_data() + train_x, val_x = normalize(train_x, val_x) + train_y = train_y.flatten() + val_y = val_y.flatten() + return train_x, train_y, val_x, val_y diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/data/download_cifar10.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/data/download_cifar10.py similarity index 100% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/data/download_cifar10.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/data/download_cifar10.py diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/data/download_cifar100.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/data/download_cifar100.py similarity index 100% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/data/download_cifar100.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/data/download_cifar100.py diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/data/download_mnist.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/data/download_mnist.py similarity index 100% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/data/download_mnist.py rename to 
examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/data/download_mnist.py diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/data/mnist.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/data/mnist.py similarity index 100% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/data/mnist.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/data/mnist.py diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/model/alexnet.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/model/alexnet.py similarity index 100% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/model/alexnet.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/model/alexnet.py diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/model/cnn.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/model/cnn.py similarity index 100% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/model/cnn.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/model/cnn.py diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/model/resnet.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/model/resnet.py similarity index 100% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/model/resnet.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/model/resnet.py diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/model/xceptionnet.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/model/xceptionnet.py similarity index 100% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/model/xceptionnet.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/model/xceptionnet.py diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/pkg_model_code/model.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/pkg_model_code/model.py similarity index 100% rename from 
examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/pkg_model_code/model.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/pkg_model_code/model.py diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/run.sh b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/run.sh similarity index 100% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/run.sh rename to examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/run.sh diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/train_cnn.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/train_cnn.py similarity index 100% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/train_cnn.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/train_cnn.py diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/train_mpi.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/train_mpi.py similarity index 100% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/train_mpi.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/train_mpi.py diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/train_ms_model.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/train_ms_model.py similarity index 100% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/train_ms_model.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/train_ms_model.py diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/train_multiprocess.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/train_multiprocess.py similarity index 100% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/train_multiprocess.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/cnn_ms/train_multiprocess.py diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/ms_model_mlp/model.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/ms_model_mlp/model.py 
similarity index 100% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/ms_model_mlp/model.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/ms_model_mlp/model.py diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/ms_model_mlp/native.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/ms_model_mlp/native.py similarity index 100% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/ms_model_mlp/native.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/ms_model_mlp/native.py diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/msmlp/model.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/msmlp/model.py similarity index 100% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/msmlp/model.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/msmlp/model.py diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/msmlp/native.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/msmlp/native.py similarity index 100% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/msmlp/native.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/algo/singa_ms/msmlp/native.py diff --git a/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/concurrent_evaluator.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/concurrent_evaluator.py new file mode 100644 index 0000000000..ea53d4e667 --- /dev/null +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/concurrent_evaluator.py @@ -0,0 +1,213 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+
+
+try:
+    from thop import profile
+except:
+    pass
+from src.common.constant import Config, CommonVars
+from src.common.structure import ModelAcquireData
+from src.eva_engine import evaluator_register
+from src.query_api.interface import SimulateScore
+from src.dataset_utils import dataset
+from torch.utils.data import DataLoader
+import torch
+import time
+from torch import nn
+from src.search_space.core.space import SpaceWrapper
+from multiprocessing import Manager
+import gc
+
+
+class ConcurrentP1Evaluator:
+
+    def __init__(self, device: str, num_label: int, dataset_name: str,
+                 search_space_ins: SpaceWrapper,
+                 train_loader: DataLoader, is_simulate: bool, metrics: str = CommonVars.ExpressFlow,
+                 enable_cache: bool = False):
+        """
+        :param device:
+        :param num_label:
+        :param dataset_name:
+        :param search_space_ins:
+        :param train_loader:
+        :param is_simulate:
+        :param metrics: which TFMEM to use?
+        :param enable_cache: whether to cache the embedding for scoring; only used on structured data
+        """
+        self.metrics = metrics
+        self.is_simulate = is_simulate
+
+        self.dataset_name = dataset_name
+
+        self.search_space_ins = search_space_ins
+
+        self.device = device
+        self.num_labels = num_label
+
+        self.score_getter = None
+
+        # get one mini batch
+        if not self.is_simulate:
+            if self.dataset_name in [Config.c10, Config.c100, Config.imgNet]:
+                # for img data
+                self.mini_batch, self.mini_batch_targets = dataset.get_mini_batch(
+                    dataloader=train_loader,
+                    sample_alg="random",
+                    batch_size=32,
+                    num_classes=self.num_labels)
+                self.mini_batch.to(self.device)
+                self.mini_batch_targets.to(self.device)
+            elif self.dataset_name in [Config.Criteo, Config.Frappe, Config.UCIDataset]:
+                # this is structured data
+                batch = iter(train_loader).__next__()
+                target = batch['y'].type(torch.LongTensor).to(self.device)
+                batch['id'] = batch['id'].to(self.device)
+                batch['value'] = batch['value'].to(self.device)
+                self.mini_batch = batch
+                self.mini_batch_targets = target.to(self.device)
+            else:
+                raise NotImplementedError
+
+            print("GC the large train data loader")
+            del train_loader
+            # Force garbage collection
+            gc.collect()
+
+        self.time_usage = {
+            "latency": 0.0,
+            "io_latency": 0.0,
+            "compute_latency": 0.0,
+            "track_compute": [],  # compute time
+            "track_io_model_init": [],  # init model weight
+            "track_io_model_load": [],  # load into GPU/CPU
+            "track_io_data": [],  # context switch
+        }
+
+        # this is for the experiment
+        self.enable_cache = enable_cache
+        if self.enable_cache:
+            # todo: warm-up for concurrent usage; this has only been tested for MLP with embedding.
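+            # Shared-cache design: the embedding table dominates model-init cost
+            # for MLPs, so one warm model is built from a default architecture
+            # encoding ("8-8-8-8") and its embedding is published through a
+            # multiprocessing.Manager dict, letting concurrent evaluator
+            # processes reuse it instead of re-initialising it per candidate.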
+            new_model = self.search_space_ins.new_arch_scratch_with_default_setting("8-8-8-8", bn=False)
+            new_model.init_embedding()
+            # shared embedding
+            manager = Manager()
+            self.model_cache = manager.dict()
+            self.model_cache["model"] = new_model.embedding
+            self.get_cache_data = self._get_cache_data_enabled
+            self.set_cache_data = self._set_cache_data_enabled
+        else:
+            # this is the baseline, run independently
+            self.get_cache_data = self._get_cache_data_disabled
+            self.set_cache_data = self._set_cache_data_disabled
+
+    def _get_cache_data_enabled(self):
+        return self.model_cache["model"]
+
+    def _set_cache_data_enabled(self, data):
+        self.model_cache["model"] = data
+
+    def _get_cache_data_disabled(self):
+        return None
+
+    def _set_cache_data_disabled(self, data):
+        pass
+
+    def if_cuda_avaiable(self):
+        if "cuda" in self.device:
+            return True
+        else:
+            return False
+
+    def p1_evaluate(self, data_str: str) -> dict:
+        """
+        :param data_str: encoded ModelAcquireData
+        :return:
+        """
+
+        model_acquire = ModelAcquireData.deserialize(data_str)
+        return self._p1_evaluate_online(model_acquire)
+
+    def _p1_evaluate_online(self, model_acquire: ModelAcquireData) -> dict:
+
+        model_encoding = model_acquire.model_encoding
+
+        # score using only one metric
+        if self.metrics == CommonVars.PRUNE_SYNFLOW or self.metrics == CommonVars.ExpressFlow:
+            bn = False
+        else:
+            bn = True
+
+        # measure model load time
+        begin = time.time()
+        new_model = self.search_space_ins.new_arch_scratch_with_default_setting(model_encoding, bn=bn)
+
+        # MLP has an embedding layer, which can be cached as an optimization!
+        if self.search_space_ins.name == Config.MLPSP:
+            if self.enable_cache:
+                new_model.init_embedding(self.get_cache_data())
+                if self.get_cache_data() is None:
+                    self.set_cache_data(new_model.embedding.to(self.device))
+            else:
+                new_model.init_embedding()
+
+        self.time_usage["track_io_model_init"].append(time.time() - begin)
+
+        begin = time.time()
+        new_model = new_model.to(self.device)
+
+        self.time_usage["track_io_model_load"].append(time.time() - begin)
+
+        # measure data load time
+        begin = time.time()
+        mini_batch = self.data_pre_processing(self.metrics, new_model)
+        self.time_usage["track_io_data"].append(time.time() - begin)
+
+        _score, curr_time = evaluator_register[self.metrics].evaluate_wrapper(
+            arch=new_model,
+            device=self.device,
+            space_name=self.search_space_ins.name,
+            batch_data=mini_batch,
+            batch_labels=self.mini_batch_targets)
+
+        self.time_usage["track_compute"].append(curr_time)
+
+        del new_model
+        model_score = {self.metrics: _score}
+        return model_score
+
+    def data_pre_processing(self, metrics: str, new_model: nn.Module):
+        """
+        To measure the io/compute time more accurately, we split out the data pre-processing here.
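+        For example, for synflow/expressflow the real mini-batch is replaced by
+        an all-ones batch (or an all-ones embedding for tabular data), following
+        the original papers.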
+ """ + + # for those two metrics, we use all one embedding for efficiency (as in their paper) + if metrics in [CommonVars.ExpressFlow, CommonVars.PRUNE_SYNFLOW]: + if isinstance(self.mini_batch, torch.Tensor): + feature_dim = list(self.mini_batch[0, :].shape) + # add one dimension to feature dim, [1] + [3, 32, 32] = [1, 3, 32, 32] + mini_batch = torch.ones([1] + feature_dim).float().to(self.device) + else: + # this is for the tabular data, + mini_batch = new_model.generate_all_ones_embedding().float().to(self.device) + else: + mini_batch = self.mini_batch + + return mini_batch diff --git a/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/evaluator.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/evaluator.py new file mode 100644 index 0000000000..3255d69bcc --- /dev/null +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/evaluator.py @@ -0,0 +1,526 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +# this is for checking the flops and params +try: + from thop import profile +except: + pass +from src.common.constant import Config, CommonVars +from src.common.structure import ModelAcquireData +from src.eva_engine import evaluator_register +from src.query_api.interface import SimulateScore +from src.dataset_utils import dataset +from torch.utils.data import DataLoader +import torch +import time +from torch import nn +from src.search_space.core.space import SpaceWrapper +import psycopg2 +from typing import Any, List, Dict, Tuple +from src.logger import logger + + +class P1Evaluator: + + def __init__(self, device: str, num_label: int, dataset_name: str, + search_space_ins: SpaceWrapper, + train_loader: DataLoader, is_simulate: bool, metrics: str = CommonVars.ExpressFlow, + enable_cache: bool = False, db_config: Dict = None, + data_retrievel: str = "sql"): + """ + :param device: + :param num_label: + :param dataset_name: + :param search_space_ins: + :param search_space_ins: + :param train_loader: + :param is_simulate: + :param metrics: which TFMEM to use? + :param enable_cache: if cache embedding for scoring? 
only used on structured data
+        :param db_config: how to connect to the database
+        :param data_retrievel: sql or spi
+        """
+        self.metrics = metrics
+        self.is_simulate = is_simulate
+        # used only when is_simulate = True
+        self.score_getter = None
+
+        # dataset settings
+        self.dataset_name = dataset_name
+        self.train_loader = train_loader
+        self.num_labels = num_label
+
+        self.search_space_ins = search_space_ins
+
+        self.device = device
+
+        # this is for the experiment
+        self.enable_cache = enable_cache
+        self.model_cache = None
+
+        # performance records
+        self.time_usage = {
+            "model_id": [],
+
+            "latency": 0.0,
+            "io_latency": 0.0,
+            "compute_latency": 0.0,
+
+            "track_compute": [],  # compute time
+            "track_io_model_init": [],  # init model weight
+            "track_io_model_load": [],  # load model into GPU/CPU
+            "track_io_res_load": [],  # load result into GPU/CPU
+            "track_io_data_retrievel": [],  # data retrieval
+            "track_io_data_preprocess": [],  # pre-processing
+        }
+
+        self.db_config = db_config
+        self.last_id = -1
+        self.data_retrievel = data_retrievel
+
+        # during benchmarking, we only use one batch for fast evaluation
+        self.cached_mini_batch = None
+        self.cached_mini_batch_target = None
+
+        self.conn = None
+
+    def if_cuda_avaiable(self):
+        if "cuda" in self.device:
+            return True
+        else:
+            return False
+
+    def p1_evaluate(self, data_str: dict) -> dict:
+        """
+        :param data_str: encoded ModelAcquireData
+        :return:
+        """
+
+        model_acquire = ModelAcquireData.deserialize(data_str)
+
+        if self.is_simulate:
+            if self.metrics == "jacflow":
+                return self._p1_evaluate_simu_jacflow(model_acquire)
+            else:
+                return self._p1_evaluate_simu(model_acquire)
+        else:
+            return self._p1_evaluate_online(model_acquire)
+
+    def measure_model_flops(self, data_str: dict, batch_size: int, channel_size: int):
+        # todo: check the package
+        mini_batch, mini_batch_targets, _ = self.retrievel_data(None)
+        model_acquire = ModelAcquireData.deserialize(data_str)
+        model_encoding = model_acquire.model_encoding
+        new_model = self.search_space_ins.new_arch_scratch_with_default_setting(model_encoding, bn=True)
+        if self.search_space_ins.name == Config.MLPSP:
+            new_model.init_embedding(requires_grad=True)
+        new_model = new_model.to(self.device)
+        flops, params = profile(new_model, inputs=(mini_batch,))
+        print('FLOPs = ' + str(flops / 1000 ** 3) + 'G')
+        print('Params = ' + str(params / 1000 ** 2) + 'M')
+
+        # # 1. Score NasWot
+        # new_model = self.search_space_ins.new_arch_scratch_with_default_setting(model_encoding, bn=True)
+        # new_model = new_model.to(self.device)
+        # naswot_score, _ = evaluator_register[CommonVars.NAS_WOT].evaluate_wrapper(
+        #     arch=new_model,
+        #     device=self.device,
+        #     space_name = self.search_space_ins.name,
+        #     batch_data=self.mini_batch,
+        #     batch_labels=self.mini_batch_targets)
+        #
+        # # 2. Score SynFlow
+        # new_model = self.search_space_ins.new_arch_scratch_with_default_setting(model_encoding, bn=False)
+        # new_model = new_model.to(self.device)
+        # synflow_score, _ = evaluator_register[CommonVars.PRUNE_SYNFLOW].evaluate_wrapper(
+        #     arch=new_model,
+        #     device=self.device,
+        #     space_name = self.search_space_ins.name,
+        #     batch_data=self.mini_batch,
+        #     batch_labels=self.mini_batch_targets)
+        #
+        # # 3. combine the result and return
+        # model_score = {CommonVars.NAS_WOT: naswot_score,
+        #                CommonVars.PRUNE_SYNFLOW: synflow_score}
+
+    def _p1_evaluate_online(self, model_acquire: ModelAcquireData) -> dict:
+
+        model_encoding = model_acquire.model_encoding
+
+        # 1.
Get a batch of data + mini_batch, mini_batch_targets, data_load_time_usage, data_pre_process_time = self.retrievel_data(model_acquire) + # logger.info( + # f"mini_batch sizes - id: {mini_batch['id'].size()}, value: {mini_batch['value'].size()}, + # targets: {mini_batch_targets.size()}") + # print( + # f"mini_batch sizes - id: {mini_batch['id'].size()}, value: {mini_batch['value'].size()}, + # targets: {mini_batch_targets.size()}") + self.time_usage["track_io_data_retrievel"].append(data_load_time_usage) + + # 2. Score all tfmem + if self.metrics == CommonVars.ALL_EVALUATOR: + model_score = {} + for alg, score_evaluator in evaluator_register.items(): + if alg == CommonVars.PRUNE_SYNFLOW or alg == CommonVars.ExpressFlow: + bn = False + else: + bn = True + new_model = self.search_space_ins.new_arch_scratch_with_default_setting(model_encoding, bn=bn) + if self.search_space_ins.name == Config.MLPSP: + new_model.init_embedding() + new_model = new_model.to(self.device) + + mini_batch = self.data_pre_processing(mini_batch, self.metrics, new_model) + + _score, _ = score_evaluator.evaluate_wrapper( + arch=new_model, + device=self.device, + space_name=self.search_space_ins.name, + batch_data=mini_batch, + batch_labels=mini_batch_targets) + + _score = _score.item() + model_score[alg] = abs(_score) + + # clear the cache + if "cuda" in self.device: + torch.cuda.empty_cache() + + elif self.metrics == CommonVars.JACFLOW: + begin = time.time() + new_model = self.search_space_ins.new_arch_scratch_with_default_setting(model_encoding, bn=False) + if self.search_space_ins.name == Config.MLPSP: + if self.enable_cache: + new_model.init_embedding(self.model_cache) + if self.model_cache is None: + self.model_cache = new_model.embedding.to(self.device) + else: + # init embedding every time created a new model + new_model.init_embedding() + time_usage = time.time() - begin + self.time_usage["track_io_model_init"].append(time_usage) + print("Model Init", self.enable_cache, time_usage) + + if self.if_cuda_avaiable(): + begin = time.time() + new_model = new_model.to(self.device) + torch.cuda.synchronize() + self.time_usage["track_io_model_load"].append(time.time() - begin) + else: + self.time_usage["track_io_model_load"].append(0) + + # measure data load time + begin = time.time() + all_one_mini_batch = self.data_pre_processing(mini_batch, CommonVars.PRUNE_SYNFLOW, new_model) + self.time_usage["track_io_data_preprocess"].append(data_pre_process_time + time.time() - begin) + if self.search_space_ins.name == Config.MLPSP: + print("compute with done", all_one_mini_batch.size(), mini_batch["id"].size(), mini_batch["value"].size()) + logger.info( + f"mini_batch sizes - {all_one_mini_batch.size()} " + f"id: {mini_batch['id'].size()}, value: {mini_batch['value'].size()}," + f"targets: {mini_batch_targets.size()}") + + _score_1, compute_time1 = evaluator_register[CommonVars.PRUNE_SYNFLOW].evaluate_wrapper( + arch=new_model, + device=self.device, + space_name=self.search_space_ins.name, + batch_data=all_one_mini_batch, + batch_labels=mini_batch_targets) + + _score_2, compute_time2 = evaluator_register[CommonVars.NAS_WOT].evaluate_wrapper( + arch=new_model, + device=self.device, + space_name=self.search_space_ins.name, + batch_data=mini_batch, + batch_labels=mini_batch_targets) + print(compute_time1, compute_time2) + logger.info(f"{compute_time1}, {compute_time2}") + + self.time_usage["track_compute"].append(compute_time1 + compute_time2) + self.time_usage["model_id"].append(model_encoding) + + if self.if_cuda_avaiable(): + 
begin = time.time()
+                _score = _score_1.item() + _score_2
+                torch.cuda.synchronize()
+                self.time_usage["track_io_res_load"].append(time.time() - begin)
+            else:
+                _score = _score_1.item() + _score_2
+                self.time_usage["track_io_res_load"].append(0)
+
+            model_score = {self.metrics: float(abs(_score))}
+            del new_model
+        # 2. score using only one metric
+        else:
+            if self.metrics == CommonVars.PRUNE_SYNFLOW or self.metrics == CommonVars.ExpressFlow:
+                bn = False
+            else:
+                bn = True
+            # measure model load time
+            begin = time.time()
+            new_model = self.search_space_ins.new_arch_scratch_with_default_setting(model_encoding, bn=bn)
+
+            # # mlp have embedding layer, which can be cached, optimization!
+            # if self.search_space_ins.name == Config.MLPSP:
+            #     if self.enable_cache:
+            #         new_model.init_embedding(self.model_cache)
+            #         if self.model_cache is None:
+            #             self.model_cache = new_model.embedding.to(self.device)
+            #     else:
+            #         # init embedding every time created a new model
+            #         new_model.init_embedding()
+
+            self.time_usage["track_io_model_init"].append(time.time() - begin)
+
+            if self.if_cuda_avaiable():
+                begin = time.time()
+                new_model = new_model.to(self.device)
+                torch.cuda.synchronize()
+                self.time_usage["track_io_model_load"].append(time.time() - begin)
+            else:
+                self.time_usage["track_io_model_load"].append(0)
+
+            # measure data load time
+            begin = time.time()
+            mini_batch = self.data_pre_processing(mini_batch, self.metrics, new_model)
+            self.time_usage["track_io_data_preprocess"].append(data_pre_process_time + time.time() - begin)
+
+            _score, compute_time = evaluator_register[self.metrics].evaluate_wrapper(
+                arch=new_model,
+                device=self.device,
+                space_name=self.search_space_ins.name,
+                batch_data=mini_batch,
+                batch_labels=mini_batch_targets)
+
+            self.time_usage["track_compute"].append(compute_time)
+
+            if self.if_cuda_avaiable():
+                begin = time.time()
+                _score = _score.item()
+                torch.cuda.synchronize()
+                self.time_usage["track_io_res_load"].append(time.time() - begin)
+
+            else:
+                _score = _score.item()
+                self.time_usage["track_io_res_load"].append(0)
+
+            model_score = {self.metrics: abs(_score)}
+            del new_model
+        return model_score
+
+    def _p1_evaluate_simu_jacflow(self, model_acquire: ModelAcquireData) -> dict:
+        """
+        This involves getting the rank and the jacflow score.
+        """
+        if self.score_getter is None:
+            self.score_getter = SimulateScore(space_name=self.search_space_ins.name,
+                                              dataset_name=self.dataset_name)
+
+        model_score = self.score_getter.query_tfmem_rank_score(arch_id=model_acquire.model_id)
+
+        return model_score
+
+    def _p1_evaluate_simu(self, model_acquire: ModelAcquireData) -> dict:
+        """
+        This simulates getting all scores.
+        """
+        if self.score_getter is None:
+            self.score_getter = SimulateScore(space_name=self.search_space_ins.name,
+                                              dataset_name=self.dataset_name)
+
+        score = self.score_getter.query_all_tfmem_score(arch_id=model_acquire.model_id)
+        model_score = {self.metrics: abs(float(score[self.metrics]))}
+        return model_score
+
+    def retrievel_data(self, model_acquire):
+        if not self.is_simulate:
+            if self.dataset_name in [Config.c10, Config.c100, Config.imgNet, Config.imgNetFull]:
+                if self.train_loader is None:
+                    raise ValueError(f"self.train_loader is None for {self.dataset_name}")
+                # for img data
+                begin = time.time()
+                mini_batch, mini_batch_targets = dataset.get_mini_batch(
+                    dataloader=self.train_loader,
+                    sample_alg="random",
+                    batch_size=model_acquire.batch_size,
+                    num_classes=self.num_labels)
+                mini_batch.to(self.device)
+                mini_batch_targets.to(self.device)
+                # wait for moving data to GPU
+                if self.if_cuda_avaiable():
+                    torch.cuda.synchronize()
+                time_usage = time.time() - begin
+                # todo: this measurement is inaccurate
+                return mini_batch, mini_batch_targets, time_usage, 0
+            elif self.dataset_name in [Config.Criteo, Config.Frappe, Config.UCIDataset]:
+                if self.train_loader is None:
+                    if self.data_retrievel == "sql":
+                        batch, time_usage = self._retrievel_from_db_sql(model_acquire.batch_size)
+                        data_tensor, y_tensor, process_time = self.sql_batch_data_pre_processing(batch)
+                        return data_tensor, y_tensor, time_usage, process_time
+                    elif self.data_retrievel == "spi":
+                        batch, time_usage = self._retrievel_from_db_spi(model_acquire)
+                        # pre-processing
+                        begin = time.time()
+                        id_tensor = torch.LongTensor(batch[:, 1::2]).to(self.device)
+                        value_tensor = torch.FloatTensor(batch[:, 2::2]).to(self.device)
+                        y_tensor = torch.FloatTensor(batch[:, 0:1]).to(self.device)
+                        data_tensor = {'id': id_tensor, 'value': value_tensor, 'y': y_tensor}
+                        logger.info(id_tensor.size())
+                        return data_tensor, y_tensor, time_usage + time.time() - begin, 0
+                else:
+                    if self.cached_mini_batch is None and self.cached_mini_batch_target is None:
+                        # this is structured data
+                        begin = time.time()
+                        batch = iter(self.train_loader).__next__()
+                        target = batch['y'].type(torch.LongTensor).to(self.device)
+                        batch['id'] = batch['id'].to(self.device)
+                        batch['value'] = batch['value'].to(self.device)
+
+                        # wait for moving data to GPU
+                        if self.if_cuda_avaiable():
+                            torch.cuda.synchronize()
+                        time_usage = time.time() - begin
+                        self.cached_mini_batch = batch
+                        self.cached_mini_batch_target = target
+                        return batch, target, time_usage, 0
+                    else:
+                        return self.cached_mini_batch, self.cached_mini_batch_target, 0, 0
+        else:
+            # this is to test expressflow
+            # todo: for debugging only, manually tune the dimensions
+            y_tensor = torch.rand(1)
+            dimensions = 2000
+            data_tensor = {'id': torch.rand([1, dimensions]), 'value': torch.rand([1, dimensions]), 'y': y_tensor}
+            return data_tensor, y_tensor, 0, 0
+
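+    # Database access pattern (for the "sql" retrieval path above): connect
+    # lazily via connect_to_db(), then page through the <dataset>_train table
+    # with keyset pagination, i.e.
+    #     SELECT * FROM <dataset>_train WHERE id > last_id LIMIT batch_size;
+    # last_id tracks the largest id fetched so far and wraps around so that
+    # repeated scoring calls keep cycling over the table.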
+    def connect_to_db(self):
+        try:
+            self.conn = psycopg2.connect(
+                dbname=self.db_config["db_name"],
+                user=self.db_config["db_user"],
+                host=self.db_config["db_host"],
+                port=self.db_config["db_port"]
+            )
+        except Exception as e:
+            print(f"Error connecting to the database: {e}")
+
+    def _retrievel_from_db_sql(self, batch_size):
+
+        begin_time = time.time()
+        if self.conn is None or self.conn.closed:
+            # If the connection is not established or was closed, reconnect.
+ self.connect_to_db() + + # fetch and preprocess data from PostgreSQL + cur = self.conn.cursor() + + cur.execute(f"SELECT * FROM {self.dataset_name}_train WHERE id > {self.last_id} LIMIT {batch_size};") + rows = cur.fetchall() + + if self.last_id <= 80000: + # Update last_id with max id of fetched rows + self.last_id = max(row[0] for row in rows) # assuming 'id' is at index 0 + else: + # If no more new rows, reset last_id to start over scan and return 'end_position' + self.last_id = 0 + + # block until a free slot is available + time_usage = time.time() - begin_time + return rows, time_usage + + def _retrievel_from_db_spi(self, model_acquire): + batch = model_acquire.spi_mini_batch + data_retrieval_time_usage = model_acquire.spi_seconds + return batch, data_retrieval_time_usage + + def data_pre_processing(self, mini_batch, metrics: str, new_model: nn.Module): + + # for those two metrics, we use all one embedding for efficiency (as in their paper) + if metrics in [CommonVars.ExpressFlow, CommonVars.PRUNE_SYNFLOW]: + if isinstance(mini_batch, torch.Tensor): + feature_dim = list(mini_batch[0, :].shape) + # add one dimension to feature dim, [1] + [3, 32, 32] = [1, 3, 32, 32] + mini_batch = torch.ones([1] + feature_dim).float().to(self.device) + else: + # this is for the tabular data, + mini_batch = new_model.generate_all_ones_embedding().float().to(self.device) + # print(mini_batch.size()) + else: + # for others, skip preprocessing + pass + + # wait for moving data to GPU + if self.if_cuda_avaiable(): + torch.cuda.synchronize() + return mini_batch + + def sql_batch_data_pre_processing(self, queryed_rows: List[Tuple]): + """ + mini_batch_data: [('0', '0', '123:123', '123:123', '123:123',) + """ + + # def decode_libsvm(columns): + # # Decode without additional mapping or zipping, directly processing the splits. 
+ # ids = [] + # values = [] + # for col in columns[2:]: + # id, value = col.split(':') + # ids.append(int(id)) + # values.append(float(value)) + # return {'id': ids, 'value': values, 'y': int(columns[1])} + + def decode_libsvm(columns): + map_func = lambda pair: (int(pair[0]), float(pair[1])) + # 0 is id, 1 is label + id, value = zip(*map(lambda col: map_func(col.split(':')), columns[2:])) + sample = {'id': list(id), + 'value': list(value), + 'y': int(columns[1])} + return sample + + def pre_processing(mini_batch_data: List[Tuple]): + """ + mini_batch_data: [('0', '0', '123:123', '123:123', '123:123',) + """ + sample_lines = len(mini_batch_data) + feat_id = [] + feat_value = [] + y = [] + + for i in range(sample_lines): + row_value = mini_batch_data[i] + sample = decode_libsvm(list(row_value)) + feat_id.append(sample['id']) + feat_value.append(sample['value']) + y.append(sample['y']) + return {'id': feat_id, 'value': feat_value, 'y': y} + + begin = time.time() + batch = pre_processing(queryed_rows) + id_tensor = torch.LongTensor(batch['id']).to(self.device) + value_tensor = torch.FloatTensor(batch['value']).to(self.device) + y_tensor = torch.FloatTensor(batch['y']).to(self.device) + data_tensor = {'id': id_tensor, 'value': value_tensor, 'y': y_tensor} + # wait for moving data to GPU + if self.if_cuda_avaiable(): + torch.cuda.synchronize() + duration = time.time() - begin + return data_tensor, y_tensor, duration diff --git a/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/run_phase1.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/run_phase1.py new file mode 100644 index 0000000000..3f81101e4d --- /dev/null +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase1/run_phase1.py @@ -0,0 +1,169 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +import json + +from src.common.structure import ModelAcquireData, ModelEvaData +from src.controller.controler import SampleController +from src.controller.sampler_all.seq_sampler import SequenceSampler + +from src.eva_engine.phase1.evaluator import P1Evaluator +from src.logger import logger +from src.query_api.img_explore_ea import fetch_from_db +from torch.utils.data import DataLoader +from src.controller.sampler_ea.regularized_ea import RegularizedEASampler +from src.search_space.core.space import SpaceWrapper +from src.common.constant import Config + + +# this is for image only +def p1_evaluate_query(space_name, dataset, run_id, N, K) -> (list, float): + """ + :param space_name: + :param dataset: + :param run_id: + :param N: + :param K: + :return: return list of models and time usage. 
+ """ + arch_id, candidates, current_time = fetch_from_db(space_name, dataset, run_id, N) + return candidates[-K:], current_time + + +class RunPhase1: + + def __init__(self, args, K: int, N: int, search_space_ins: SpaceWrapper, + train_loader: DataLoader = None, is_simulate: bool = False): + """ + Each model selection job will init one class here. + :param args: space, population_size, sample_size + :param K: K models return in 1st phase + :param N: N models eval in total + :param search_space_ins: + """ + + # return K models + self.K = K + # explore N models + self.N = N + + self.args = args + + self.search_space_ins = search_space_ins + + # seq: init the search strategy and controller, + if self.search_space_ins.name == Config.MLPSP and self.N >= min(len(self.search_space_ins), 100000): + print("Explore all models") + strategy = SequenceSampler(self.search_space_ins) + elif self.search_space_ins.name != Config.MLPSP and self.N >= min(len(self.search_space_ins), 8000): + print("Explore all models") + strategy = SequenceSampler(self.search_space_ins) + else: + strategy = RegularizedEASampler(self.search_space_ins, + population_size=self.args.population_size, + sample_size=self.args.sample_size) + self.sampler = SampleController(strategy) + + # generate db config + db_config = { + "db_name": self.args.db_name, + "db_user": self.args.db_user, + "db_host": self.args.db_host, + "db_port": self.args.db_port, + } + + # seq: init the phase 1 evaluator, + self._evaluator = P1Evaluator(device=self.args.device, + num_label=self.args.num_labels, + dataset_name=self.args.dataset, + search_space_ins=self.search_space_ins, + train_loader=train_loader, + is_simulate=is_simulate, + metrics=self.args.tfmem, + db_config=db_config) + + def run_phase1(self) -> (list, list, list, list): + """ + Controller explore n models, and return the top K models. + :return: + """ + + # those two are used to track performance trace + # current best model id + trace_highest_scored_models_id = [] + # current highest score + trace_highest_score = [] + explored_n = 1 + model_eva = ModelEvaData() + + while explored_n <= self.N: + # generate new model + arch_id, arch_micro = self.sampler.sample_next_arch() + # this is for sequence sampler. + if arch_id is None: + break + model_encoding = self.search_space_ins.serialize_model_encoding(arch_micro) + + explored_n += 1 + + # run the model selection + model_acquire_data = ModelAcquireData(model_id=str(arch_id), + model_encoding=model_encoding, + is_last=False) + data_str = model_acquire_data.serialize_model() + + # update the shared model eval res + try: + model_eva.model_id = str(arch_id) + model_eva.model_score = self._evaluator.p1_evaluate(data_str) + except KeyError as e: + # when it is simulate, it could be keyerror, since some arch is not scored yet + continue + + if explored_n % 100 == 0: + logger.info("3. [trails] Phase 1: filter phase explored " + str(explored_n) + + " model, model_id = " + model_eva.model_id + + " model_scores = " + json.dumps(model_eva.model_score)) + + print("3. [trails] Phase 1: filter phase explored " + str(explored_n) + + " model, model_id = " + model_eva.model_id + + " model_scores = " + json.dumps(model_eva.model_score)) + + ranked_score = self.sampler.fit_sampler(model_eva.model_id, + model_eva.model_score, + simple_score_sum=self.args.simple_score_sum) + + # this is to measure the value of metrix, sum of two value. 
+ if len(trace_highest_score) == 0: + trace_highest_score.append(ranked_score) + trace_highest_scored_models_id.append(str(arch_id)) + else: + if ranked_score > trace_highest_score[-1]: + trace_highest_score.append(ranked_score) + trace_highest_scored_models_id.append(str(arch_id)) + else: + trace_highest_score.append(trace_highest_score[-1]) + trace_highest_scored_models_id.append(trace_highest_scored_models_id[-1]) + + logger.info("3. [trails] Phase 1: filter phase explored " + str(explored_n) + + " model, model_id = " + model_eva.model_id + + " model_scores = " + json.dumps(model_eva.model_score)) + # return the top K models + return self.sampler.get_current_top_k_models(self.K), self.sampler.get_current_top_k_models(-1), \ + trace_highest_score, trace_highest_scored_models_id diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/third_pkg/__init__.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase2/__init__.py similarity index 99% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/third_pkg/__init__.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase2/__init__.py index 01d7057208..ea94ffe9d7 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/third_pkg/__init__.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase2/__init__.py @@ -16,3 +16,5 @@ # limitations under the License. # + + diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase2/algo/__init__.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase2/algo/__init__.py similarity index 99% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase2/algo/__init__.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase2/algo/__init__.py index 3df60b02f7..52a3e50080 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase2/algo/__init__.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase2/algo/__init__.py @@ -14,4 +14,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# +# \ No newline at end of file diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase2/algo/trainer.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase2/algo/trainer.py similarity index 61% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase2/algo/trainer.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase2/algo/trainer.py index 259ebea6d4..4ea5038051 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase2/algo/trainer.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase2/algo/trainer.py @@ -16,8 +16,10 @@ # limitations under the License. 
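The dataloader hunk earlier in this diff converts LibSVM-style rows into the `{'id', 'value', 'y'}` batch dict that the trainers in this file consume. A minimal standalone sketch of that parsing, with made-up sample rows (the real rows come from the `queryed_rows` query result):

```python
# Sketch of the decode_libsvm/pre_processing pair from the dataloader change
# above; illustration only, the sample rows are hypothetical.
def decode_libsvm(columns):
    # columns = (row_id, label, 'feat_id:value', ...)
    ids, values = zip(*((int(f), float(v)) for f, v in
                        (col.split(':') for col in columns[2:])))
    return {'id': list(ids), 'value': list(values), 'y': int(columns[1])}

rows = [('0', '1', '12:1.0', '37:0.5'),
        ('1', '0', '12:0.0', '41:2.0')]
batch = {'id': [], 'value': [], 'y': []}
for row in rows:
    sample = decode_libsvm(row)
    batch['id'].append(sample['id'])
    batch['value'].append(sample['value'])
    batch['y'].append(sample['y'])
print(batch['id'], batch['y'])   # [[12, 37], [12, 41]] [1, 0]
```

From there the dataloader turns the dict into `LongTensor`/`FloatTensor` pairs and, when CUDA is available, fences the copy with `torch.cuda.synchronize()` so the measured duration covers the transfer.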
# -import time - +import torch +import torch.nn as nn +from torch import optim +from torch.utils.data import DataLoader from src.tools import utils from singa import singa_wrap as singa @@ -32,12 +34,14 @@ import time import argparse from PIL import Image +import json np_dtype = {"float16": np.float16, "float32": np.float32} # singa_dtype = {"float16": tensor.float16, "float32": tensor.float32} singa_dtype = {"float32": tensor.float32} + ### MSOptimizer class MSOptimizer(Optimizer): def __call__(self, loss): @@ -64,6 +68,8 @@ def call_with_returns(self, loss): # print ("call_with_returns after apply loss.data: \n", loss.data) return pn_p_g_list + +# MSSGD -- actually no change of code class MSSGD(MSOptimizer): """Implements stochastic gradient descent (optionally with momentum). @@ -235,14 +241,15 @@ def set_states(self, states): self.moments = states['moments'] self.mom_value = self.momentum(self.step_counter) + # Data augmentation def augmentation(x, batch_size): xpad = np.pad(x, [[0, 0], [0, 0], [4, 4], [4, 4]], 'symmetric') for data_num in range(0, batch_size): offset = np.random.randint(8, size=2) x[data_num, :, :, :] = xpad[data_num, :, - offset[0]:offset[0] + x.shape[2], - offset[1]:offset[1] + x.shape[2]] + offset[0]:offset[0] + x.shape[2], + offset[1]:offset[1] + x.shape[2]] if_flip = np.random.randint(2) if (if_flip): x[data_num, :, :, :] = x[data_num, :, :, ::-1] @@ -296,10 +303,10 @@ def resize_dataset(x, image_size): for d in range(0, dim): X[n, d, :, :] = np.array(Image.fromarray(x[n, d, :, :]).resize( (image_size, image_size), Image.BILINEAR), - dtype=np.float32) + dtype=np.float32) return X -from torch.utils.data import DataLoader + class ModelTrainer: @classmethod @@ -328,9 +335,9 @@ def fully_train_arch(cls, if logger is None: from src.logger import logger logger = logger - + logger.info(f'begin to train, batch size = {args.batch_size}') start_time, best_valid_auc = time.time(), 0. - + num_labels = args.num_labels lr = args.lr iter_per_epoch = args.iter_per_epoch @@ -341,20 +348,33 @@ def fully_train_arch(cls, args.epoch_num = epoch_num # for multiple classification + # opt_metric = nn.CrossEntropyLoss(reduction='mean').to(device) + # this is only sutiable when output is dimension 1, + # opt_metric = nn.BCEWithLogitsLoss(reduction='mean').to(device) # optimizer + # optimizer = optim.Adam(model.parameters(), lr=lr) + # scheduler = optim.lr_scheduler.CosineAnnealingLR( + # optimizer, + # T_max=epoch_num, # Maximum number of iterations. + # eta_min=1e-4) # Minimum learning rate. 
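+        # The torch pipeline kept in the comments above is replaced below by the
+        # SINGA-native path: MSSGD plays the role of torch's SGD (momentum=0.9,
+        # weight_decay=1e-4), and the placeholder tensors tx/ty are allocated
+        # once on the device and refilled per batch via copy_from_numpy.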
precision = 'float32' mssgd = MSSGD(lr=args.lr, momentum=0.9, weight_decay=1e-4, dtype=singa_dtype[precision]) device_id = 0 max_epoch = epoch_num + # model = arch graph = True verbosity = 0 - dist_option='plain' - spars=None + dist_option = 'plain' + spars = None global_rank = 0 world_size = 1 + # gradient clipping, set the gradient value to be -1 - 1 + # for p in model.parameters(): + # p.register_hook(lambda grad: torch.clamp(grad, -1., 1.)) # training params + # device = args.device if args.device == 'cpu': dev = singa_device.get_default_device() else: # GPU @@ -377,11 +397,49 @@ def fully_train_arch(cls, tx = tensor.Tensor((args.batch_size, args.nfeat), dev, singa_dtype[precision]) ty = tensor.Tensor((args.batch_size,), dev, tensor.int32) ### singa data - + model.set_optimizer(mssgd) model.compile([tx], is_train=True, use_graph=graph, sequential=sequential) dev.SetVerbosity(verbosity) + # synflow_flag = False ### just change the model to the absolute value + # for epoch in range(epoch_num): + # logger.info(f'Epoch [{epoch:3d}/{epoch_num:3d}]') + # train and eval + # print("begin to train...") + # logger.info(f"Begin to train.....") + # train_auc, train_loss = ModelTrainer.run(logger, + # epoch, iter_per_epoch, model, train_loader, opt_metric, args, + # optimizer=optimizer, namespace='train') + # scheduler.step() + # logger.info(f"Begin to evaluate on valid.....") + # print("begin to evaluate...") + # valid_auc, valid_loss = ModelTrainer.run(logger, + # epoch, iter_per_epoch, model, val_loader, + # opt_metric, args, namespace='val') + + # if use_test_acc: + # logger.info(f"Begin to evaluate on test.....") + # test_auc, test_loss = ModelTrainer.run(logger, + # epoch, iter_per_epoch, model, test_loader, + # opt_metric, args, namespace='test') + # else: + # test_auc = -1 + + # info_dic[epoch] = { + # "train_auc": train_auc, + # "valid_auc": valid_auc, + # "train_loss": train_loss, + # "valid_loss": valid_loss, + # "train_val_total_time": time.time() - start_time} + + # record best auc and save checkpoint + # if valid_auc >= best_valid_auc: + # best_valid_auc, best_test_auc = valid_auc, test_auc + # logger.info(f'best valid auc: valid {valid_auc:.4f}, test {test_auc:.4f}') + # else: + # logger.info(f'valid {valid_auc:.4f}, test {test_auc:.4f}') + # Training and evaluation loop for epoch in range(max_epoch): start_time = time.time() @@ -390,6 +448,7 @@ def fully_train_arch(cls, if global_rank == 0: print('Starting Epoch %d:' % (epoch)) + logger.info('Starting Epoch %d:' % (epoch)) # Training phase train_correct = np.zeros(shape=[1], dtype=np.float32) @@ -401,9 +460,10 @@ def fully_train_arch(cls, # print () batch_idx = 0 # for b in range(num_train_batch): - for batch_idx, batch in enumerate(train_loader): + for batch_idx, batch in enumerate(train_loader, start=1): if batch_idx % 50 == 0: - print ("trainer.py train batch_idx: \n", batch_idx) + print("trainer.py train batch_idx: \n", batch_idx) + logger.info("trainer.py train batch_idx: \n", batch_idx) # Generate the batch data in this iteration # x = train_x[idx[b * batch_size:(b + 1) * batch_size]] # if model.dimension == 4: @@ -440,7 +500,7 @@ def fully_train_arch(cls, # print ("x.astype(np.float32): \n", x.astype(np.float32)) # print ("y: \n", y) tx = tensor.Tensor(x.shape, dev, singa_dtype[precision]) - ty = tensor.Tensor((y.shape[0],), dev, tensor.int32) + ty = tensor.Tensor((y.shape[0],), dev, tensor.int32) tx.copy_from_numpy(x) # dtype=np.float32 # print ("tx: \n", tx) ty.copy_from_numpy(y) @@ -468,10 +528,16 @@ def 
fully_train_arch(cls, if global_rank == 0: print('Training loss = %f, training accuracy = %f' % - (train_loss, train_correct / - (batch_idx * args.batch_size * world_size)), - flush=True) - print ("train total batch_idx: ", batch_idx) + (train_loss, train_correct / + (batch_idx * args.batch_size * world_size)), + flush=True) + print("train total batch_idx: ", batch_idx) + + logger.info('Training loss = %f, training accuracy = %f' % + (train_loss, train_correct / + (batch_idx * args.batch_size * world_size))) + + logger.info("train total batch_idx: ", batch_idx) train_metric = train_correct / (batch_idx * args.batch_size * world_size) # Evaluation phase @@ -479,7 +545,7 @@ def fully_train_arch(cls, batch_idx = 0 # for b in range(num_val_batch): # print ("evaluation begins") - for batch_idx, batch in enumerate(test_loader): + for batch_idx, batch in enumerate(test_loader, start=1): # print ("trainer.py test batch_idx: \n", batch_idx) # x = val_x[b * batch_size:(b + 1) * batch_size] # if model.dimension == 4: @@ -532,13 +598,16 @@ def fully_train_arch(cls, # Output the evaluation accuracy if global_rank == 0: print('Evaluation accuracy = %f, Elapsed Time = %fs' % - (test_correct / (batch_idx * args.batch_size * 8 * world_size), - time.time() - start_time), - flush=True) + (test_correct / (batch_idx * args.batch_size * 8 * world_size), + time.time() - start_time), + flush=True) + + logger.info('Evaluation accuracy = %f, Elapsed Time = %fs' % + (test_correct / (batch_idx * args.batch_size * 8 * world_size), + time.time() - start_time)) # print ("test all batch_idx: ", batch_idx) test_metric = test_correct / (batch_idx * args.batch_size * 8 * world_size) - info_dic[epoch] = { "train_metric": str(train_metric[0]), "test_metric": str(test_metric[0]), @@ -549,5 +618,237 @@ def fully_train_arch(cls, dev.PrintTimeProfiling() # return valid_auc, time.time() - start_time, info_dic - print ("info_dic: ", info_dic) + print("info_dic: ", info_dic) + logger.info("info_dic: ", info_dic) + + logger.info(json.dumps(info_dic)) + + test_metric = train_metric return test_metric, time.time() - start_time, info_dic + + @classmethod + def fully_train_arch_origin(cls, + model: nn.Module, + use_test_acc: bool, + epoch_num, + train_loader: DataLoader, + val_loader: DataLoader, + test_loader: DataLoader, + args, + logger=None + ) -> (float, float, dict): + """ + Args: + model: + use_test_acc: + epoch_num: how many epoch, set by scheduler + train_loader: + val_loader: + test_loader: + args: + Returns: + """ + + if logger is None: + from src.logger import logger + logger = logger + + start_time, best_valid_auc = time.time(), 0. + + # training params + device = args.device + num_labels = args.num_labels + lr = args.lr + iter_per_epoch = args.iter_per_epoch + # report_freq = args.report_freq + # given_patience = args.patience + + # assign new values + args.epoch_num = epoch_num + + # for multiple classification + opt_metric = nn.CrossEntropyLoss(reduction='mean').to(device) + # this is only sutiable when output is dimension 1, + # opt_metric = nn.BCEWithLogitsLoss(reduction='mean').to(device) + + # optimizer + optimizer = optim.Adam(model.parameters(), lr=lr) + scheduler = optim.lr_scheduler.CosineAnnealingLR( + optimizer, + T_max=epoch_num, # Maximum number of iterations. + eta_min=1e-4) # Minimum learning rate. 
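+        # For reference, CosineAnnealingLR decays the learning rate on each
+        # scheduler.step() as
+        #     lr_t = eta_min + 0.5 * (lr - eta_min) * (1 + cos(pi * t / T_max)),
+        # i.e. it anneals smoothly from the initial lr down to eta_min=1e-4
+        # over the T_max=epoch_num steps taken below.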
+ + # gradient clipping, set the gradient value to be -1 - 1 + for p in model.parameters(): + p.register_hook(lambda grad: torch.clamp(grad, -1., 1.)) + + info_dic = {} + valid_auc = -1 + valid_loss = 0 + for epoch in range(epoch_num): + logger.info(f'Epoch [{epoch:3d}/{epoch_num:3d}]') + # train and eval + # print("begin to train...") + logger.info(f"Begin to train.....") + train_auc, train_loss = ModelTrainer.run(logger, + epoch, iter_per_epoch, model, train_loader, opt_metric, args, + optimizer=optimizer, namespace='train') + scheduler.step() + logger.info(f"Begin to evaluate on valid.....") + # print("begin to evaluate...") + valid_auc, valid_loss = ModelTrainer.run(logger, + epoch, iter_per_epoch, model, val_loader, + opt_metric, args, namespace='val') + + if use_test_acc: + logger.info(f"Begin to evaluate on test.....") + test_auc, test_loss = ModelTrainer.run(logger, + epoch, iter_per_epoch, model, test_loader, + opt_metric, args, namespace='test') + else: + test_auc = -1 + + info_dic[epoch] = { + "train_auc": train_auc, + "valid_auc": valid_auc, + "train_loss": train_loss, + "valid_loss": valid_loss, + "train_val_total_time": time.time() - start_time} + + # record best auc and save checkpoint + if valid_auc >= best_valid_auc: + best_valid_auc, best_test_auc = valid_auc, test_auc + logger.info(f'best valid auc: valid {valid_auc:.4f}, test {test_auc:.4f}') + else: + logger.info(f'valid {valid_auc:.4f}, test {test_auc:.4f}') + + return valid_auc, time.time() - start_time, info_dic + + @classmethod + def fully_evaluate_arch(cls, + model: nn.Module, + use_test_acc: bool, + epoch_num, + val_loader: DataLoader, + test_loader: DataLoader, + args, + logger=None, + ) -> (float, float, dict): + """ + Args: + model: + use_test_acc: + epoch_num: how many epoch, set by scheduler + val_loader: + test_loader: + args: + Returns: + """ + + if logger is None: + from src.logger import logger + logger = logger + + start_time, best_valid_auc = time.time(), 0. 
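+        # Evaluation-only variant of fully_train_arch_origin: no optimizer is
+        # created, so ModelTrainer.run() below calls model.eval() and wraps the
+        # forward pass in torch.no_grad() (the namespace='val'/'test' branch).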
+ + device = args.device + iter_per_epoch = args.iter_per_epoch + args.epoch_num = epoch_num + opt_metric = nn.CrossEntropyLoss(reduction='mean').to(device) + + info_dic = {} + valid_auc = -1 + valid_loss = 0 + for epoch in range(epoch_num): + logger.info(f'Epoch [{epoch:3d}/{epoch_num:3d}]') + # print("begin to evaluate...") + valid_auc, valid_loss = ModelTrainer.run(logger, + epoch, iter_per_epoch, model, val_loader, + opt_metric, args, namespace='val') + + if use_test_acc: + test_auc, test_loss = ModelTrainer.run(logger, + epoch, iter_per_epoch, model, test_loader, + opt_metric, args, namespace='test') + else: + test_auc = -1 + + # record best auc and save checkpoint + if valid_auc >= best_valid_auc: + best_valid_auc, best_test_auc = valid_auc, test_auc + logger.info(f'best valid auc: valid {valid_auc:.4f}, test {test_auc:.4f}') + else: + logger.info(f'valid {valid_auc:.4f}, test {test_auc:.4f}') + + return valid_auc, time.time() - start_time, info_dic + + # train one epoch of train/val/test + @classmethod + def run(cls, logger, epoch, iter_per_epoch, model, data_loader, opt_metric, args, optimizer=None, + namespace='train'): + if optimizer: + model.train() + else: + model.eval() + + time_avg, timestamp = utils.AvgrageMeter(), time.time() + loss_avg, auc_avg = utils.AvgrageMeter(), utils.AvgrageMeter() + + batch_idx = 0 + for batch_idx, batch in enumerate(data_loader): + # if suer set this, then only train fix number of iteras + # stop training current epoch for evaluation + if namespace == 'train' and iter_per_epoch is not None and batch_idx >= iter_per_epoch: + logger.info(f"Traing Iteration {batch_idx} > iter_per_epoch = {iter_per_epoch}, breakout") + break + + target = batch['y'].type(torch.LongTensor).to(args.device) + batch['id'] = batch['id'].to(args.device) + batch['value'] = batch['value'].to(args.device) + + if namespace == 'train': + y = model(batch) + loss = opt_metric(y, target) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + else: + with torch.no_grad(): + y = model(batch) + loss = opt_metric(y, target) + + # for multiple classification + auc = utils.roc_auc_compute_fn(torch.nn.functional.softmax(y, dim=1)[:, 1], target) + # for binary classification + # auc = utils.roc_auc_compute_fn(y, target) + loss_avg.update(loss.item(), target.size(0)) + auc_avg.update(auc, target.size(0)) + + time_avg.update(time.time() - timestamp) + timestamp = time.time() + if batch_idx % args.report_freq == 0: + logger.info(f'Epoch [{epoch:3d}/{args.epoch_num}][{batch_idx:3d}/{len(data_loader)}]\t' + f'{time_avg.val:.3f} ({time_avg.avg:.3f}) AUC {auc_avg.val:4f} ({auc_avg.avg:4f}) ' + f'Loss {loss_avg.val:8.4f} ({loss_avg.avg:8.4f})') + + # print(f'Epoch [{epoch:3d}/{args.epoch_num}][{batch_idx:3d}/{len(data_loader)}]\t' + # f'{time_avg.val:.3f} ({time_avg.avg:.3f}) AUC {auc_avg.val:4f} ({auc_avg.avg:4f}) ' + # f'Loss {loss_avg.val:8.4f} ({loss_avg.avg:8.4f})') + + # record the last epoch information + logger.info(f'Epoch [{epoch:3d}/{args.epoch_num}][{batch_idx:3d}/{len(data_loader)}]\t' + f'{time_avg.val:.3f} ({time_avg.avg:.3f}) AUC {auc_avg.val:4f} ({auc_avg.avg:4f}) ' + f'Loss {loss_avg.val:8.4f} ({loss_avg.avg:8.4f})') + + # print(f'Epoch [{epoch:3d}/{args.epoch_num}][{batch_idx:3d}/{len(data_loader)}]\t' + # f'{time_avg.val:.3f} ({time_avg.avg:.3f}) AUC {auc_avg.val:4f} ({auc_avg.avg:4f}) ' + # f'Loss {loss_avg.val:8.4f} ({loss_avg.avg:8.4f})') + + logger.info(f'{namespace}\tTime {utils.timeSince(s=time_avg.sum):>12s} ' + f'AUC {auc_avg.avg:8.4f} Loss 
{loss_avg.avg:8.4f}') + + # print(f'{namespace}\tTime {utils.timeSince(s=time_avg.sum):>12s} ' + # f'AUC {auc_avg.avg:8.4f} Loss {loss_avg.avg:8.4f}') + + return auc_avg.avg, loss_avg.avg diff --git a/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase2/evaluator.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase2/evaluator.py new file mode 100644 index 0000000000..743c42e5c4 --- /dev/null +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase2/evaluator.py @@ -0,0 +1,97 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from src.common.constant import Config +from src.eva_engine.phase2.algo.trainer import ModelTrainer +from src.logger import logger +from src.query_api.interface import SimulateTrain +from src.search_space.core.space import SpaceWrapper +from torch.utils.data import DataLoader + + +class P2Evaluator: + + def __init__(self, + search_space_ins: SpaceWrapper, + dataset: str, + is_simulate: bool = True, + train_loader: DataLoader = None, + val_loader: DataLoader = None, + args=None): + """ + :param search_space_ins: + :param dataset: + :param is_simulate: train or not, default query from API. 
+ """ + self.search_space_ins = search_space_ins + + # dataset name + self.dataset = dataset + self.is_simulate = is_simulate + self.acc_getter = None + + # for training only + self.train_loader = train_loader + self.val_loader = val_loader + self.args = args + + def p2_evaluate(self, cand: str, epoch_per_model: int) -> (float, float): + """ + :param cand: candidate id + :param epoch_per_model: epoch for each model + :return: + """ + # if it's simulate or it's image dataset + if self.is_simulate or self.search_space_ins.name in [Config.NB101, Config.NB201]: + return self._evaluate_query(cand, epoch_per_model) + else: + return self._evaluate_train(cand, epoch_per_model) + + def _evaluate_query(self, cand: str, epoch_per_model: int) -> (float, float): + """ + :param cand: the candidate to evaluate + :param epoch_per_model: how many resource it can use, epoch number + :return: + """ + if self.acc_getter is None: + self.acc_getter = SimulateTrain(space_name=self.search_space_ins.name) + + acc, time_usage = self.acc_getter.get_ground_truth(arch_id=cand, epoch_num=epoch_per_model, dataset=self.dataset) + + return acc, time_usage + + def _evaluate_train(self, cand: str, epoch_per_model: int) -> (float, float): + """ + :param cand: the candidate to evaluate + :param epoch_per_model: how many resource it can use, epoch number + :return: + """ + model = self.search_space_ins.new_architecture(cand) + valid_auc, total_run_time, train_log = ModelTrainer.fully_train_arch( + model=model, + use_test_acc=False, + epoch_num=epoch_per_model, + train_loader=self.train_loader, + val_loader=self.val_loader, + test_loader=self.val_loader, + args=self.args) + + logger.info(f' ----- model id: {cand}, Val_AUC : {valid_auc} Total running time: ' + f'{total_run_time}-----') + + return valid_auc, total_run_time diff --git a/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase2/run_sh.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase2/run_sh.py new file mode 100644 index 0000000000..b2188e6b8c --- /dev/null +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase2/run_sh.py @@ -0,0 +1,182 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+
+
+from copy import copy
+
+from src.common.constant import Config
+from src.eva_engine.phase2.evaluator import P2Evaluator
+
+# successive halving
+from src.logger import logger
+from src.search_space.core.space import SpaceWrapper
+from torch.utils.data import DataLoader
+
+
+class BudgetAwareControllerSH:
+
+    @staticmethod
+    def pre_calculate_epoch_required(K: int, U: int, eta: int = 3, max_unit_per_model: int = 200):
+        if K == 1:
+            return 0
+
+        cur_cand_num = K
+        cur_epoch = min(U, max_unit_per_model)  # Limit the current epoch to max_unit_per_model
+        total_epochs = 0
+
+        while cur_cand_num > 1 and cur_epoch < max_unit_per_model:
+            total_epochs += cur_cand_num * cur_epoch
+            # Prune models
+            cur_cand_num = int(cur_cand_num * (1 / eta))
+            # Increase the training epoch for the remaining models
+            cur_epoch = min(cur_epoch * eta, max_unit_per_model)
+
+        # If the budget is fully grown and more than one candidate remains,
+        # add these final full-budget evaluations to the total
+        if cur_cand_num > 1 and cur_epoch >= max_unit_per_model:
+            total_epochs += cur_cand_num * max_unit_per_model
+
+        return total_epochs
+
+    def __init__(self,
+                 search_space_ins: SpaceWrapper, dataset_name: str,
+                 eta, time_per_epoch,
+                 train_loader: DataLoader = None,
+                 val_loader: DataLoader = None,
+                 args=None,
+                 is_simulate: bool = True):
+        """
+        :param search_space_ins:
+        :param dataset_name:
+        :param time_per_epoch:
+        :param is_simulate:
+        :param eta: pruning rate; 1/eta of the candidates are kept in each iteration
+        """
+        self.is_simulate = is_simulate
+        self._evaluator = P2Evaluator(search_space_ins, dataset_name,
+                                      is_simulate=is_simulate,
+                                      train_loader=train_loader, val_loader=val_loader,
+                                      args=args)
+        self.eta = eta
+        self.max_unit_per_model = args.epoch
+        self.time_per_epoch = time_per_epoch
+        self.name = "SUCCHALF"
+
+    def schedule_budget_per_model_based_on_T(self, space_name, fixed_time_budget, K_):
+        # for benchmarking only phase 2
+
+        # try different K and U combinations
+        # only consider 15625 arches in this paper
+        # min_budget_required: when K = 1, N = min_budget_required * 1
+        if space_name == Config.NB101:
+            U_options = [4, 12, 16, 108]
+        else:
+            U_options = list(range(1, 200))
+
+        history = []
+
+        for U in U_options:
+            real_time_used = \
+                BudgetAwareControllerSH.pre_calculate_epoch_required(
+                    K_, U, self.eta, self.max_unit_per_model) * self.time_per_epoch
+
+            if real_time_used > fixed_time_budget:
+                break
+            else:
+                history.append(U)
+        if len(history) == 0:
+            raise ValueError(f"{fixed_time_budget} is too small for current config")
+        return history[-1]
+
+    def pre_calculate_time_required(self, K, U):
+        all_epoch = BudgetAwareControllerSH.pre_calculate_epoch_required(K, U, self.eta, self.max_unit_per_model)
+        return all_epoch, all_epoch * self.time_per_epoch
+
+    def run_phase2(self, U: int, candidates_m: list) -> (str, float, float, float):
+        total_time = 0
+        if len(candidates_m) == 0:
+            raise ValueError("No model to explore during the second phase!")
+        candidates_m_ori = copy(candidates_m)
+        if len(candidates_m) == 1:
+            best_perform, _ = self._evaluator.p2_evaluate(candidates_m[0], self.max_unit_per_model)
+            return candidates_m[0], best_perform, 0, 0
+
+        eta = self.eta
+        max_unit_per_model = self.max_unit_per_model
+
+        cur_cand_num = len(candidates_m)
+        cur_epoch = min(U, max_unit_per_model)  # Limit the current epoch to max_unit_per_model
+        total_epochs = 0
+
+        while cur_cand_num > 1 and cur_epoch < max_unit_per_model:
+            logger.info(f"4. [trails] Running phase2: train {len(candidates_m)} models each with {cur_epoch} epochs")
+            scores = []
+            # Evaluate all models
+            for cand in candidates_m:
+                score, time_usage = self._evaluator.p2_evaluate(cand, cur_epoch)
+                scores.append((score, cand))
+                total_epochs += cur_epoch
+                total_time += time_usage
+
+            # Sort models based on score
+            scores.sort(reverse=True, key=lambda x: x[0])
+
+            # Prune models, keep at least one model
+            cur_cand_num = max(int(cur_cand_num * (1 / eta)), 1)
+            candidates_m = [x[1] for x in scores[:cur_cand_num]]
+
+            # Increase the training epoch for the remaining models
+            cur_epoch = min(cur_epoch * eta, max_unit_per_model)
+
+        # If the models can be fully trained and more than one candidate remains, select the top one
+        if cur_cand_num > 1 and cur_epoch >= max_unit_per_model:
+            logger.info(
+                f"4. [trails] Running phase2: train {len(candidates_m)} models each with {max_unit_per_model} epochs")
+            scores = []
+            for cand in candidates_m:
+                score, time_usage = self._evaluator.p2_evaluate(cand, max_unit_per_model)
+                scores.append((score, cand))
+                total_epochs += cur_epoch
+                total_time += time_usage
+            scores.sort(reverse=True, key=lambda x: x[0])
+            candidates_m = [scores[0][1]]
+
+        # when simulating, query the ground-truth performance of the selected
+        # model; otherwise skip the extra training and just return the model
+        if self.is_simulate:
+            logger.info(
+                f"5. [trails] Phase2 Done, Select {candidates_m[0]}, "
+                f"simulate={self.is_simulate}. Acquire the ground truth")
+            best_perform, _ = self._evaluator.p2_evaluate(candidates_m[0], self.max_unit_per_model)
+        else:
+            logger.info(
+                f"5. [trails] Phase2 Done, Select {candidates_m[0]}, "
+                f"simulate={self.is_simulate}, Skip training")
+            best_perform = 0
+        # Return the best model and the total epochs used
+        return candidates_m[0], best_perform, total_epochs, total_time
+
+
+if __name__ == "__main__":
+    # max_unit_per_model -- frappe: 20, uci_diabetes: 40, criteo: 10
+    # max_unit_per_model -- nb101: 108, nb201: 200
+    k_options = [1, 2, 4, 8, 16]
+    u_options = [1, 2, 4, 8, 16]
+    print(f"k={10}, u={8}, total_epoch = {BudgetAwareControllerSH.pre_calculate_epoch_required(10, 8, 3, 20)}")
+    for k in k_options:
+        for u in u_options:
+            print(f"k={k}, u={u}, total_epoch = {BudgetAwareControllerSH.pre_calculate_epoch_required(k, u, 3, 20)}")
diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase2/run_sr.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase2/run_sr.py
similarity index 74%
rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase2/run_sr.py
rename to examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase2/run_sr.py
index e5610c89cc..dd8ce96b31 100644
--- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase2/run_sr.py
+++ b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase2/run_sr.py
@@ -16,45 +16,17 @@ # limitations under the License.
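The successive-halving arithmetic above is easy to sanity-check by hand. Below is a standalone sketch (independent of the repo) of the same `pre_calculate_epoch_required` recurrence, plus the budget-to-U scan that `schedule_budget_per_model_based_on_T` performs; all concrete numbers are made up for illustration:

```python
def sh_total_epochs(K, U, eta=3, cap=200):
    """Total epochs successive halving spends on K candidates starting at U."""
    if K == 1:
        return 0
    cand, epoch, total = K, min(U, cap), 0
    while cand > 1 and epoch < cap:
        total += cand * epoch
        cand, epoch = int(cand / eta), min(epoch * eta, cap)
    # survivors that hit the epoch cap get one final full-budget round
    return total + (cand * cap if cand > 1 else 0)

def largest_feasible_U(K, time_per_epoch, budget, eta=3, cap=20):
    """Largest U whose projected phase-2 wall-clock time fits the budget."""
    history = []
    for U in range(1, 200):
        if sh_total_epochs(K, U, eta, cap) * time_per_epoch > budget:
            break
        history.append(U)
    if not history:
        raise ValueError(f"{budget} is too small for current config")
    return history[-1]

# K=10, U=8, eta=3, cap=20: one round of 10*8 = 80 epochs, then the 3
# survivors are fully trained for 20 epochs each: 80 + 60 = 140 in total.
assert sh_total_epochs(10, 8, eta=3, cap=20) == 140
# At 2 s/epoch with a 300 s budget, U=9 is the largest feasible start
# (150 projected epochs * 2 s = 300 s).
assert largest_feasible_U(10, time_per_epoch=2.0, budget=300.0) == 9
```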
 #
+
 from copy import copy
 
 from src.common.constant import Config
+from src.eva_engine.phase2.evaluator import P2Evaluator
+from src.search_space.core.space import SpaceWrapper
 
 
 class BudgetAwareControllerSR:
-    def __init__(self, evaluator, time_per_epoch, max_unit=200):
-        """
-        :param evaluator:
-        :param max_unit: for 201, it's 200, for 101 it's 108
-        """
-        self._evaluator = evaluator
-        self.max_unit_per_model = max_unit
-        self.time_per_epoch = time_per_epoch
-        self.name = "SUCCREJCT"
 
-    def schedule_budget_per_model_based_on_T(self, space_name, fixed_time_budget, K_):
-        # for benchmarking only phase 2
-
-        # try different K and U combinations
-        # only consider 15625 arches in this paper
-        # min_budget_required: when K = 1, N = min_budget_required * 1
-        if space_name == Config.NB101:
-            U_options = [4, 12, 16, 108]
-        else:
-            U_options = list(range(1, 200))
-
-        history = []
-
-        for U in U_options:
-            expected_time_used = self.pre_calculate_epoch_required(K_, U) * self.time_per_epoch
-            if expected_time_used > fixed_time_budget:
-                break
-            else:
-                history.append(U)
-        if len(history) == 0:
-            raise f"{fixed_time_budget} is too small for current config"
-        return history[-1]
-
-    def pre_calculate_epoch_required(self, K, U):
+    @staticmethod
+    def pre_calculate_epoch_required(K, U, eta: int = 3, max_unit_per_model: int = 200):
         """
         :param K: candidates lists
         :param U: min resource each candidate needs
@@ -73,16 +45,21 @@ def pre_calculate_epoch_required(self, K, U):
             if previous_epoch is None:
                 previous_epoch = epoch_per_model
             elif previous_epoch == epoch_per_model:
-                # which means the epoch don't increase, no need to re-evaluate each component
+                # current epoch == last epoch, no need to re-evaluate each component
                 K = cur_cand_num - 1
                 continue
 
-            if epoch_per_model >= self.max_unit_per_model:
-                epoch_per_model = self.max_unit_per_model
+            previous_epoch = epoch_per_model
+
+            if epoch_per_model >= max_unit_per_model:
+                epoch_per_model = max_unit_per_model
+
+            # print(f"[successive_reject]: {cur_cand_num} model left, "
+            #       f"and evaluate each model with {epoch_per_model} epoch, total epoch = {max_unit_per_model}")
             # evaluate each arch
             min_budget_required += epoch_per_model * cur_cand_num
             # sort from min to max
-            if epoch_per_model == self.max_unit_per_model:
+            if epoch_per_model == max_unit_per_model:
                 # each model is fully evaluated, just return top 1
                 K = 1
             else:
@@ -90,12 +67,53 @@ def pre_calculate_epoch_required(self, K, U):
                 K = cur_cand_num - 1
         return min_budget_required
 
+    def __init__(self,
+                 search_space_ins: SpaceWrapper, dataset_name: str,
+                 eta, args, time_per_epoch):
+
+        self.is_simulate = True
+        self._evaluator = P2Evaluator(search_space_ins,
+                                      dataset_name,
+                                      is_simulate=True,
+                                      train_loader=None,
+                                      val_loader=None,
+                                      args=None)
+
+        self.eta = eta
+        self.max_unit_per_model = args.epoch
+        self.time_per_epoch = time_per_epoch
+        self.name = "SUCCREJCT"
+
+    def schedule_budget_per_model_based_on_T(self, space_name, fixed_time_budget, K_):
+        # for benchmarking only phase 2
+
+        # try different K and U combinations
+        # only consider 15625 arches in this paper
+        # min_budget_required: when K = 1, N = min_budget_required * 1
+        if space_name == Config.NB101:
+            U_options = [4, 12, 16, 108]
+        else:
+            U_options = list(range(1, 200))
+
+        history = []
+
+        for U in U_options:
+            expected_time_used = self.pre_calculate_epoch_required(K_, U) * self.time_per_epoch
+            if expected_time_used > fixed_time_budget:
+                break
+            else:
+                history.append(U)
+        if len(history) == 0:
+            raise ValueError(f"{fixed_time_budget} is too small for current config")
+        return history[-1]
+
     def run_phase2(self, U: int, candidates_m: list):
         """
         :param candidates_m: candidates lists
         :param U: min resource each candidate needs
         :return:
         """
+        total_time = 0
         # print(f" *********** begin BudgetAwareControllerSR with U={U}, K={len(candidates_m)} ***********")
         candidates = copy(candidates_m)
         total_epoch_each_rounds = len(candidates) * U
@@ -118,14 +136,17 @@ def run_phase2(self, U: int, candidates_m: list):
                 candidates = [ele[0] for ele in scored_cand[-num_keep:]]
                 continue
 
+            previous_epoch = epoch_per_model
+
             if epoch_per_model >= self.max_unit_per_model:
                 epoch_per_model = self.max_unit_per_model
 
             # print(f"[successive_reject]: {cur_cand_num} model left, "
-            #       f"and evaluate each model with {epoch_per_model} epoch")
+            #       f"and evaluate each model with {epoch_per_model} epoch, total epoch = {self.max_unit_per_model}")
             # evaluate each arch
             for cand in candidates:
-                score = self._evaluator.p2_evaluate(cand, epoch_per_model)
+                score, time_usage = self._evaluator.p2_evaluate(cand, epoch_per_model)
+                total_time += time_usage
                 total_score.append((cand, score))
                 min_budget_required += epoch_per_model
             # sort from min to max
@@ -139,6 +160,5 @@ def run_phase2(self, U: int, candidates_m: list):
             num_keep = cur_cand_num - 1
             candidates = [ele[0] for ele in scored_cand[-num_keep:]]
 
-        return candidates[0], None, min_budget_required
-
-
+        best_perform, _ = self._evaluator.p2_evaluate(candidates[0], self.max_unit_per_model)
+        return candidates[0], best_perform, min_budget_required, total_time
diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase2/run_uniform.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase2/run_uniform.py
similarity index 69%
rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase2/run_uniform.py
rename to examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase2/run_uniform.py
index 02c9b8e703..8b32fab3df 100644
--- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase2/run_uniform.py
+++ b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase2/run_uniform.py
@@ -16,21 +16,29 @@ # limitations under the License.
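For contrast, the uniform baseline that follows spends exactly K·U epochs: every candidate gets the same (capped) budget and the top scorer wins. In the same made-up setting as the sketch above:

```python
# Uniform allocation cost in the hypothetical setting used above.
K, U, cap = 10, 8, 20
uniform_epochs = K * min(U, cap)
print(uniform_epochs)  # 80 epochs, vs. 140 for successive halving, which
                       # re-trains the surviving candidates at larger budgets
```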
 #
+
 from copy import copy
-from random import randint
 
+from src.search_space.core.space import SpaceWrapper
 from src.common.constant import Config
+from src.eva_engine.phase2.evaluator import P2Evaluator
 
 
 # UniformAllocation
 class UniformAllocation:
 
-    def __init__(self, evaluator, time_per_epoch, max_unit=200):
-        """
-        :param evaluator:
-        :param max_unit: for 201, it's 200, for 101 it's 108
-        """
-        self._evaluator = evaluator
-        self.max_unit_per_model = max_unit
+    def __init__(self,
+                 search_space_ins: SpaceWrapper, dataset_name: str,
+                 eta, time_per_epoch, args=None):
+
+        self.is_simulate = True
+        self._evaluator = P2Evaluator(search_space_ins,
+                                      dataset_name,
+                                      is_simulate=True,
+                                      train_loader=None,
+                                      val_loader=None,
+                                      args=None)
+        self.eta = eta
+        self.max_unit_per_model = args.epoch
         self.time_per_epoch = time_per_epoch
         self.name = "UNIFORM"
 
@@ -55,14 +63,14 @@ def schedule_budget_per_model_based_on_T(self, space_name, fixed_time_budget, K_
             history.append(U)
         return history[-1]
 
-    def pre_calculate_epoch_required(self, K, U):
+    def pre_calculate_epoch_required(self, K, U, eta: int = 3, max_unit_per_model: int = 200):
         """
         :param K: number of candidate models
        :param U: mini unit of computation (epochs) for each model
         :return:
         """
-        return K*U
+        return K * U
 
     def run_phase2(self, U: int, candidates_m: list):
         """
@@ -72,23 +80,26 @@ def run_phase2(self, U: int, candidates_m: list):
         """
 
         # print(f" *********** begin uniformly_allocate with U={U}, K={len(candidates_m)} ***********")
         candidates = copy(candidates_m)
         min_budget_required = 0
+        # todo: set U = self.max_unit_per_model here to run the full training
+        # U = self.max_unit_per_model
+
         if U >= self.max_unit_per_model:
             U = self.max_unit_per_model
         # print(f"[uniformly_allocate]: uniformly allocate {U} epoch to each model")
+        total_time = 0
         total_score = []
         for cand in candidates:
-            score = self._evaluator.p2_evaluate(cand, U)
+            score, time_usage = self._evaluator.p2_evaluate(cand, U)
+            total_time += time_usage
             total_score.append((cand, score))
             min_budget_required += U
         # sort from min to max
         scored_cand = sorted(total_score, key=lambda x: x[1])
         candidate = scored_cand[-1][0]
-        return candidate, None, min_budget_required
-
-
+        best_perform, _ = self._evaluator.p2_evaluate(candidate, self.max_unit_per_model)
+        return candidate, best_perform, min_budget_required, total_time
diff --git a/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/run_ms.py b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/run_ms.py
new file mode 100644
index 0000000000..5b0a640ba1
--- /dev/null
+++ b/examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/run_ms.py
@@ -0,0 +1,353 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# + + +import time + +from typing import Set, List + +from src.eva_engine import coordinator +from src.eva_engine.phase1.run_phase1 import RunPhase1, p1_evaluate_query +from torch.utils.data import DataLoader +from src.eva_engine.phase2.run_sh import BudgetAwareControllerSH +from src.eva_engine.phase2.run_sr import BudgetAwareControllerSR +from src.eva_engine.phase2.run_uniform import UniformAllocation +from src.logger import logger +from src.search_space.init_search_space import init_search_space +from src.query_api.interface import profile_NK_trade_off +from src.common.constant import Config + + +class RunModelSelection: + + def __init__(self, search_space_name: str, args, is_simulate: bool = False): + self.args = args + + self.eta = 3 + self.is_simulate = is_simulate + # basic + self.search_space_name = search_space_name + self.dataset = self.args.dataset + + # p2 evaluator + self.sh = None + + # instance of the search space. + self.search_space_ins = init_search_space(self.args) + + def select_model_simulate(self, budget: float, run_id: int = 0, only_phase1: bool = False, run_workers: int = 1): + """ + This is for image data only + """ + + # 0. profiling dataset and search space, get t1 and t2 + + score_time_per_model, train_time_per_epoch, N_K_ratio = self.search_space_ins.profiling(self.dataset) + self.sh = BudgetAwareControllerSH( + search_space_ins=self.search_space_ins, + dataset_name=self.dataset, + eta=self.eta, + time_per_epoch=train_time_per_epoch, + args=self.args, + is_simulate=self.is_simulate) + + # 1. run coordinator to schedule + K, U, N, B1_planed_time, B2_planed_time, B2_all_epoch = coordinator.schedule(self.dataset, self.sh, budget, + score_time_per_model, + train_time_per_epoch, + run_workers, + self.search_space_ins, + N_K_ratio, + only_phase1) + + print(f"Budget = {budget}, N={N}, K={K}") + + # 2. run phase 1 to score N models + k_models, B1_actual_time_use = p1_evaluate_query(self.search_space_name, self.dataset, run_id, N, K) + + # 3. run phase-2 to determine the final model + best_arch, best_arch_performance, B2_actual_epoch_use, _ = self.sh.run_phase2(U, k_models) + # print("best model returned from Phase2 = ", k_models) + + return best_arch, B1_actual_time_use + B2_actual_epoch_use * train_time_per_epoch, \ + B1_planed_time + B2_planed_time, B2_all_epoch + + def select_model_online_clean(self, budget: float, data_loader: List[DataLoader], + only_phase1: bool = False, run_workers: int = 1): + """ + Select model online for structured data. + :param budget: time budget + :param data_loader: time budget + :param only_phase1: + :param run_workers: + :return: + """ + begin_time = time.time() + logger.info("1. profiling....") + score_time_per_model = self.profile_filtering(data_loader) + train_time_per_epoch = self.profile_refinement(data_loader) + logger.info("2. coordination....") + K, U, N = self.coordination(budget, score_time_per_model, train_time_per_epoch, only_phase1) + logger.info("3. filtering phase....") + k_models, all_models, p1_trace_highest_score, p1_trace_highest_scored_models_id = self.filtering_phase( + N, K, train_loader=data_loader[0]) + logger.info("4. 
refinement phase....") + best_arch, best_arch_performance, _, _ = self.refinement_phase( + U, k_models, train_loader=data_loader[0], valid_loader=data_loader[1]) + + end_time = time.time() + real_time_usage = end_time - begin_time + + return best_arch, best_arch_performance, real_time_usage, all_models, \ + p1_trace_highest_score, p1_trace_highest_scored_models_id + + def select_model_online(self, budget: float, data_loader: List[DataLoader], + only_phase1: bool = False, run_workers: int = 1): + """ + Select model online for structured data. + :param budget: time budget + :param data_loader: time budget + :param only_phase1: + :param run_workers: + :return: + """ + + train_loader, valid_loader, test_loader = data_loader + + logger.info(f"0. [trails] Begin model selection, is_simulate={self.is_simulate} ... ") + begin_time = time.time() + + logger.info("1. [trails] Begin profiling.") + # 0. profiling dataset and search space, get t1 and t2 + score_time_per_model, train_time_per_epoch, N_K_ratio = self.search_space_ins.profiling( + self.dataset, + train_loader, + valid_loader, + self.args, + is_simulate=self.is_simulate) + + self.sh = BudgetAwareControllerSH( + search_space_ins=self.search_space_ins, + dataset_name=self.dataset, + eta=self.eta, + time_per_epoch=train_time_per_epoch, + is_simulate=self.is_simulate, + train_loader=train_loader, + val_loader=valid_loader, + args=self.args) + + # 1. run coordinator to schedule + logger.info("2. [trails] Begin scheduling...") + K, U, N, B1_planed_time, B2_planed_time, B2_all_epoch = coordinator.schedule(self.dataset, self.sh, budget, + score_time_per_model, + train_time_per_epoch, + run_workers, + self.search_space_ins, + N_K_ratio, + only_phase1) + + print(f"Budget = {budget}, N={N}, K={K}") + + # 2. run phase 1 to score N models + logger.info("3. [trails] Begin to run phase1: filter phase") + # lazy loading the search space if needed. + + # run phase-1 to get the K models. + p1_runner = RunPhase1( + args=self.args, + K=K, N=N, + search_space_ins=self.search_space_ins, + train_loader=train_loader, + is_simulate=self.is_simulate) + + k_models, all_models, p1_trace_highest_score, p1_trace_highest_scored_models_id \ + = p1_runner.run_phase1() + + logger.info("4. [trails] Begin to run phase2: refinement phase") + + # 3. run phase-2 to determine the final model + best_arch, best_arch_performance, B2_actual_epoch_use, _ = self.sh.run_phase2(U, k_models) + # print("best model returned from Phase2 = ", k_models) + end_time = time.time() + real_time_usage = end_time - begin_time + planned_time_usage = B1_planed_time + B2_planed_time + logger.info("5. [trails] Real time Usage = " + str(real_time_usage) + + ", Final selected model = " + str(best_arch) + + ", planned time usage = " + str(planned_time_usage) + ) + # best arch returned, + # time usage, epoch trained, + # p1 ea trace + return best_arch, best_arch_performance, \ + real_time_usage, planned_time_usage, B2_all_epoch, \ + all_models, p1_trace_highest_score, p1_trace_highest_scored_models_id + + def schedule_only(self, budget: float, data_loader: List[DataLoader], + only_phase1: bool = False, run_workers: int = 1): + """ + Select model online + :param budget: time budget + :param data_loader: time budget + :param only_phase1: + :param run_workers: + :return: + """ + + train_loader, valid_loader, test_loader = data_loader + + logger.info("0. [trails] Begin model selection ... ") + + logger.info("1. [trails] Begin profiling.") + # 0. 
profiling dataset and search space, get t1 and t2 + score_time_per_model, train_time_per_epoch, N_K_ratio = self.search_space_ins.profiling( + self.dataset, + train_loader, + valid_loader, + self.args, + is_simulate=self.is_simulate) + + self.sh = BudgetAwareControllerSH( + search_space_ins=self.search_space_ins, + dataset_name=self.dataset, + eta=self.eta, + time_per_epoch=train_time_per_epoch, + is_simulate=self.is_simulate, + train_loader=train_loader, + val_loader=valid_loader, + args=self.args) + + # 1. run coordinator to schedule + logger.info("2. [trails] Begin scheduling...") + K, U, N, B1_planed_time, B2_planed_time, B2_all_epoch = coordinator.schedule(self.dataset, self.sh, budget, + score_time_per_model, + train_time_per_epoch, + run_workers, + self.search_space_ins, + N_K_ratio, + only_phase1) + + return K, U, N, B1_planed_time, B2_planed_time, B2_all_epoch + + ############################################# + # to support in-database model selection + ############################################# + + def profile_filtering(self, data_loader: List[DataLoader] = [None, None, None]): + logger.info("0. [trails] Begin profile_filtering...") + begin_time = time.time() + train_loader, valid_loader, test_loader = data_loader + score_time_per_model = self.search_space_ins.profiling_score_time( + self.dataset, + train_loader, + valid_loader, + self.args, + is_simulate=self.is_simulate) + logger.info(f"0. [trails] profile_filtering Done, time_usage = {time.time() - begin_time}") + return score_time_per_model + + def profile_refinement(self, data_loader: List[DataLoader] = [None, None, None]): + logger.info("0. [trails] Begin profile_refinement...") + begin_time = time.time() + train_loader, valid_loader, test_loader = data_loader + train_time_per_epoch = self.search_space_ins.profiling_train_time( + self.dataset, + train_loader, + valid_loader, + self.args, + is_simulate=self.is_simulate) + logger.info(f"0. [trails] profile_refinement Done, time_usage = {time.time() - begin_time}") + return train_time_per_epoch + + def coordination(self, budget: float, score_time_per_model: float, train_time_per_epoch: float, only_phase1: bool): + logger.info("1. [trails] Begin coordination...") + begin_time = time.time() + sh = BudgetAwareControllerSH( + search_space_ins=self.search_space_ins, + dataset_name=self.dataset, + eta=self.eta, + time_per_epoch=train_time_per_epoch, + is_simulate=self.is_simulate, + train_loader=None, + val_loader=None, + args=self.args) + n_k_ratio = profile_NK_trade_off(self.dataset) + K, U, N, B1_planed_time, B2_planed_time, B2_all_epoch = coordinator.schedule( + self.dataset, sh, budget, + score_time_per_model, + train_time_per_epoch, + 1, + self.search_space_ins, + n_k_ratio, + only_phase1) + + logger.info(f"1. [trails] Coordination Done, time_usage = {time.time() - begin_time}") + return K, U, N + + def filtering_phase(self, N, K, train_loader=None): + logger.info("2. [trails] Begin filtering_phase...") + begin_time = time.time() + p1_runner = RunPhase1( + args=self.args, + K=K, N=N, + search_space_ins=self.search_space_ins, + train_loader=train_loader, + is_simulate=self.is_simulate) + + k_models, all_models, p1_trace_highest_score, p1_trace_highest_scored_models_id \ + = p1_runner.run_phase1() + logger.info(f"2. [trails] filtering_phase Done, time_usage = {time.time() - begin_time}") + print(f"2. 
[trails] filtering_phase Done, time_usage = {time.time() - begin_time}") + return k_models, all_models, p1_trace_highest_score, p1_trace_highest_scored_models_id + + def refinement_phase(self, U, k_models, alg_name: str = Config.SUCCHALF, train_loader=None, valid_loader=None, + train_time_per_epoch=None): + logger.info("3. [trails] Begin refinement...") + begin_time = time.time() + + if alg_name == Config.SUCCHALF: + self.sh = BudgetAwareControllerSH( + search_space_ins=self.search_space_ins, + dataset_name=self.dataset, + eta=self.eta, + time_per_epoch=train_time_per_epoch, + is_simulate=self.is_simulate, + train_loader=train_loader, + val_loader=valid_loader, + args=self.args) + elif alg_name == Config.SUCCREJCT: + self.sh = BudgetAwareControllerSR( + search_space_ins=self.search_space_ins, + dataset_name=self.dataset, + eta=self.eta, + time_per_epoch=train_time_per_epoch, + args=self.args) + elif alg_name == Config.UNIFORM: + self.sh = UniformAllocation( + search_space_ins=self.search_space_ins, + dataset_name=self.dataset, + eta=self.eta, + time_per_epoch=train_time_per_epoch, + args=self.args) + else: + raise NotImplementedError + + best_arch, best_arch_performance, B2_actual_epoch_use, total_time_usage = self.sh.run_phase2(U, k_models) + logger.info( + f"3. [trails] refinement phase Done, time_usage = {time.time() - begin_time}, " + f"epoches_used = {B2_actual_epoch_use}") + return best_arch, best_arch_performance, B2_actual_epoch_use, total_time_usage diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/logger/__init__.py b/examples/model_selection/Trails/internal/ml/model_selection/src/logger/__init__.py similarity index 100% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/logger/__init__.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/logger/__init__.py diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/query_api/README.md b/examples/model_selection/Trails/internal/ml/model_selection/src/query_api/README.md similarity index 100% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/query_api/README.md rename to examples/model_selection/Trails/internal/ml/model_selection/src/query_api/README.md diff --git a/examples/model_selection/Trails/internal/ml/model_selection/src/query_api/__init__.py b/examples/model_selection/Trails/internal/ml/model_selection/src/query_api/__init__.py new file mode 100644 index 0000000000..8c328a3bbc --- /dev/null +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/query_api/__init__.py @@ -0,0 +1,21 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
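`RunModelSelection` above also exposes the four steps (profiling, coordination, filtering, refinement) individually so the in-database runtime can drive them one at a time, mirroring `select_model_online_clean`. A hedged usage sketch: the search-space name, several `args` fields, and the `train_loader`/`valid_loader`/`test_loader` objects are assumptions, not values taken from this diff (except where the diff shows them), and `init_search_space` may require more fields than listed:

```python
from types import SimpleNamespace

from src.eva_engine.run_ms import RunModelSelection

# Hypothetical args namespace; real runs build a fuller one with argparse.
args = SimpleNamespace(dataset="frappe", epoch=20, device="cpu", num_labels=2,
                       tfmem="jacflow", simple_score_sum=True,
                       population_size=10, sample_size=3,
                       db_name="pg_extension", db_user="postgres",
                       db_host="localhost", db_port="28814")

# train/valid/test DataLoaders are assumed to be built elsewhere;
# simulate mode may tolerate None loaders depending on the search space.
loaders = [train_loader, valid_loader, test_loader]

rms = RunModelSelection(search_space_name="mlp_sp", args=args, is_simulate=True)
score_t = rms.profile_filtering(loaders)
train_t = rms.profile_refinement(loaders)
K, U, N = rms.coordination(budget=600, score_time_per_model=score_t,
                           train_time_per_epoch=train_t, only_phase1=False)
k_models, all_models, trace_score, trace_ids = rms.filtering_phase(
    N, K, train_loader=loaders[0])
best_arch, best_auc, epochs_used, refine_time = rms.refinement_phase(
    U, k_models, train_loader=loaders[0], valid_loader=loaders[1],
    train_time_per_epoch=train_t)
```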
+# + + + + diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/query_api/img_explore_ea.py b/examples/model_selection/Trails/internal/ml/model_selection/src/query_api/img_explore_ea.py similarity index 96% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/query_api/img_explore_ea.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/query_api/img_explore_ea.py index b032997749..ae3f691044 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/query_api/img_explore_ea.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/query_api/img_explore_ea.py @@ -25,7 +25,7 @@ base_folder_dir = os.environ.get("base_dir") if base_folder_dir is None: base_folder_dir = os.getcwd() -base_dir = os.path.join(base_folder_dir, "img_data", "ground_truth") +base_dir = os.path.join(base_folder_dir, "img_data") print("local api running at {}".format(base_dir)) # sum score is better @@ -93,4 +93,4 @@ def fetch_from_db(space_name, dataset, run_id_m, N_m): if __name__ == '__main__': - print(fetch_from_db(Config.NB201, Config.c10, 3, 10)) + print(fetch_from_db(Config.NB201, Config.c100, 3, 10)) diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/query_api/img_train_baseline.py b/examples/model_selection/Trails/internal/ml/model_selection/src/query_api/img_train_baseline.py similarity index 99% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/query_api/img_train_baseline.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/query_api/img_train_baseline.py index 7b81505165..b614fb2999 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/query_api/img_train_baseline.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/query_api/img_train_baseline.py @@ -16,6 +16,7 @@ # limitations under the License. # + import os import numpy as np from src.common.constant import Config diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/query_api/interface.py b/examples/model_selection/Trails/internal/ml/model_selection/src/query_api/interface.py similarity index 96% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/query_api/interface.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/query_api/interface.py index d2d335ceef..265ecdc5a7 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/query_api/interface.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/query_api/interface.py @@ -16,7 +16,6 @@ # limitations under the License. # -# query ground truth from src.common.constant import Config, CommonVars from src.query_api.query_api_img import Gt201, Gt101 from src.query_api.query_api_mlp import GTMLP @@ -30,13 +29,16 @@ def profile_NK_trade_off(dataset): We try various N/K combinations, and find this is better. 
""" if dataset == Config.c10: - return 85 + return 100 elif dataset == Config.c100: - return 85 + return 100 elif dataset == Config.imgNet: - return 130 + return 100 else: - return 30 + # this is the expressflow + # return 30 + # this is the jacflow + return 100 class SimulateTrain: diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/query_api/query_api_img.py b/examples/model_selection/Trails/internal/ml/model_selection/src/query_api/query_api_img.py similarity index 98% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/query_api/query_api_img.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/query_api/query_api_img.py index dd3f4ca9c8..b5f77e38ac 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/query_api/query_api_img.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/query_api/query_api_img.py @@ -68,6 +68,7 @@ def guess_train_one_epoch_time(search_space_m, dataset): return Gt101().guess_train_one_epoch_time() if search_space_m == Config.NB201: return Gt201().guess_train_one_epoch_time(dataset) + raise NotImplementedError class ImgScoreQueryApi: @@ -221,7 +222,9 @@ def guess_train_one_epoch_time(self, dataset): # if time_usage > res: # res = time_usage # return res - return 40 + arch_id = random.randint(1, 15625) + time_usage = self.data201[str(arch_id)]["200"][dataset]["0"]["time_usage"] + return time_usage def get_all_trained_model_ids(self): # 201 all data has the same model set. diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/query_api/query_api_mlp.py b/examples/model_selection/Trails/internal/ml/model_selection/src/query_api/query_api_mlp.py similarity index 88% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/query_api/query_api_mlp.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/query_api/query_api_mlp.py index affeacaa44..1527567924 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/query_api/query_api_mlp.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/query_api/query_api_mlp.py @@ -39,9 +39,9 @@ # 0.8028456677612497 # todo: here is for debug expressFlow only -exp_mlp_score_frappe = os.path.join(base_dir, "score_scale_traj_width/score_mlp_sp_frappe_batch_size_32_cpu.json") -exp_mlp_score_uci = os.path.join(base_dir, "score_scale_traj_width/score_mlp_sp_uci_diabetes_batch_size_32_cpu.json") -exp_mlp_score_criteo = os.path.join(base_dir, "score_scale_traj_width/score_mlp_sp_criteo_batch_size_32_cpu.json") +exp_mlp_score_frappe = os.path.join(base_dir, "micro_sensitivity/3_batch_size/4/score_mlp_sp_frappe_batch_size_32_cpu.json") +exp_mlp_score_uci = os.path.join(base_dir, "micro_sensitivity/3_batch_size/4/score_mlp_sp_uci_diabetes_batch_size_32_cpu.json") +exp_mlp_score_criteo = os.path.join(base_dir, "micro_sensitivity/3_batch_size/4/score_mlp_sp_criteo_batch_size_32_cpu.json") # todo here we use weigth sharing. mlp_score_frappe_weight_share = os.path.join(base_dir, "tab_data/weight_share_nas_frappe.json") @@ -139,18 +139,22 @@ def get_train_one_epoch_time(self, device: str): def get_valid_auc(self, arch_id: str, epoch_num: int): # todo: due to the too many job contention on server, the time usage may not valid. 
- time_usage = (int(epoch_num) + 1) * self.get_train_one_epoch_time(self.device) + # train on gpu, + time_usage = (int(epoch_num) + 1) * self.get_train_one_epoch_time("gpu") if self.dataset == Config.Frappe: - if epoch_num is None or epoch_num >= 20: epoch_num = 19 + if epoch_num is None or epoch_num >= 13: epoch_num = 13 t_acc = self.mlp_train[self.dataset][arch_id][str(epoch_num)]["valid_auc"] + time_usage = self.mlp_train[self.dataset][arch_id][str(epoch_num)]["train_val_total_time"] return t_acc, time_usage elif self.dataset == Config.Criteo: if epoch_num is None or epoch_num >= 10: epoch_num = 9 t_acc = self.mlp_train[self.dataset][arch_id][str(epoch_num)]["valid_auc"] + time_usage = self.mlp_train[self.dataset][arch_id][str(epoch_num)]["train_val_total_time"] return t_acc, time_usage elif self.dataset == Config.UCIDataset: if epoch_num is None or epoch_num >= 40: epoch_num = 39 - t_acc = self.mlp_train[self.dataset][arch_id][str(epoch_num)]["valid_auc"] + t_acc = self.mlp_train[self.dataset][arch_id][str(0)]["valid_auc"] + time_usage = self.mlp_train[self.dataset][arch_id][str(epoch_num)]["train_val_total_time"] return t_acc, time_usage else: raise NotImplementedError diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/query_api/singleton.py b/examples/model_selection/Trails/internal/ml/model_selection/src/query_api/singleton.py similarity index 99% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/query_api/singleton.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/query_api/singleton.py index 24814b1190..d0478d2f30 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/query_api/singleton.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/query_api/singleton.py @@ -16,6 +16,7 @@ # limitations under the License. 
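The `get_valid_auc` hunk above switches the simulated cost from a derived per-epoch estimate to the recorded `train_val_total_time` of the queried epoch. A condensed sketch of the lookup pattern; field names are taken from the hunk, while the per-dataset epoch caps (e.g. 13 for Frappe, 9 for Criteo) are handled dataset by dataset in the actual diff:

```python
# Hedged sketch: answer "what validation AUC after `epoch_num` epochs, and at
# what cost" from a pre-recorded training log instead of real training.
def simulated_valid_auc(train_log: dict, dataset: str, arch_id: str,
                        epoch_num: int, last_epoch: int):
    if epoch_num is None or epoch_num > last_epoch:
        epoch_num = last_epoch                  # clamp to the recorded range
    rec = train_log[dataset][arch_id][str(epoch_num)]
    return rec["valid_auc"], rec["train_val_total_time"]
```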
# + import threading diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/controller/sampler_rand/__init__.py b/examples/model_selection/Trails/internal/ml/model_selection/src/search_space/__init__.py similarity index 100% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/controller/sampler_rand/__init__.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/search_space/__init__.py diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/dataset_utils/__init__.py b/examples/model_selection/Trails/internal/ml/model_selection/src/search_space/core/__init__.py similarity index 100% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/dataset_utils/__init__.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/search_space/core/__init__.py diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/search_space/core/model_params.py b/examples/model_selection/Trails/internal/ml/model_selection/src/search_space/core/model_params.py similarity index 99% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/search_space/core/model_params.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/search_space/core/model_params.py index 811bf71c72..0851964e51 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/search_space/core/model_params.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/search_space/core/model_params.py @@ -15,7 +15,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # - class ModelMacroCfg: """ Macro search space config diff --git a/examples/model_selection/Trails/internal/ml/model_selection/src/search_space/core/rl_policy.py b/examples/model_selection/Trails/internal/ml/model_selection/src/search_space/core/rl_policy.py new file mode 100644 index 0000000000..eb1d169bdd --- /dev/null +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/search_space/core/rl_policy.py @@ -0,0 +1,23 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import torch.nn as nn + + +class RLPolicyBase(nn.Module): + pass diff --git a/examples/model_selection/Trails/internal/ml/model_selection/src/search_space/core/space.py b/examples/model_selection/Trails/internal/ml/model_selection/src/search_space/core/space.py new file mode 100644 index 0000000000..607c65ac5f --- /dev/null +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/search_space/core/space.py @@ -0,0 +1,193 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +from abc import abstractmethod +from typing import Generator + +from torch.utils.data import DataLoader +from src.search_space.core.model_params import ModelMacroCfg, ModelMicroCfg + + +class SpaceWrapper: + + def __init__(self, cfg: ModelMacroCfg, name: str): + self.model_cfg = cfg + self.name = name + + @abstractmethod + def sample_all_models(self) -> Generator[str, None, None]: + """ + Sample all models, return a list of arch ids + """ + raise NotImplementedError + + """serialize and deserialize""" + + @classmethod + def serialize_model_encoding(cls, arch_micro: ModelMicroCfg) -> str: + raise NotImplementedError + + @classmethod + def deserialize_model_encoding(cls, model_encoding) -> ModelMicroCfg: + raise NotImplementedError + + @classmethod + def new_arch_scratch(cls, arch_macro: ModelMacroCfg, arch_micro: ModelMicroCfg, bn: bool = True): + """ + Args: + arch_macro: macro setting for one architecture + arch_micro: micro setting for one architecture + bn: true or false + Returns: + """ + raise NotImplementedError + + def new_arch_scratch_with_default_setting(self, model_encoding: str, bn: bool): + """ + Use the current search space's macro setting. + Args: + model_encoding: str of the model encoding + bn: true or false + Returns: + """ + raise NotImplementedError + + @abstractmethod + def load(self): + """ + Load the related API + Returns: + """ + raise NotImplementedError + + @abstractmethod + def profiling(self, dataset: str, + train_loader: DataLoader = None, val_loader: DataLoader = None, + args=None, is_simulate: bool = False) -> (float, float, int): + """ + Profile the training and scoring time. 
+        Args:
+            dataset:
+            train_loader:
+            val_loader
+            args:
+            is_simulate:
+        Returns:
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def micro_to_id(self, arch_struct: ModelMicroCfg) -> str:
+        raise NotImplementedError
+
+    """init new architecture"""
+
+    @abstractmethod
+    def new_architecture(self, arch_id: str):
+        """
+        Generate an architecture with arch id
+        :return:
+        """
+        raise NotImplementedError
+
+    def new_architecture_with_micro_cfg(self, arch_micro: ModelMicroCfg):
+        """
+        Generate an architecture with arch_micro
+        :return:
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def __len__(self):
+        """
+        How many architectures the space has
+        :return:
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_arch_size(self, architecture):
+        """
+        Get how many edges in each cell of the architecture.
+        :return:
+        """
+        raise NotImplementedError
+
+    def update_bn_flag(self, bn: bool):
+        """
+        Update the architecture's bn setting.
+        :param bn:
+        :return:
+        """
+        self.model_cfg.bn = bn
+
+    """Below is for integrating the space with various samplers"""
+
+    def random_architecture_id(self) -> (str, ModelMicroCfg):
+        """
+        Randomly generate an architecture id and cell structure; supports the RN, RL, and R samplers.
+        :return:
+        """
+        raise NotImplementedError
+
+    def mutate_architecture(self, parent_arch: ModelMicroCfg) -> (str, ModelMicroCfg):
+        """
+        Mutate an architecture; this supports the EA sampler.
+        :rtype: object
+        :return:
+        """
+        raise NotImplementedError
+
+    def get_reinforcement_learning_policy(self, lr_rate):
+        """
+        This is for the reinforcement learning policy sampler.
+        :return:
+        """
+        raise NotImplementedError
+
+    """In-RDBMS Helper Functions"""
+
+    def profiling_score_time(self, dataset: str, train_loader: DataLoader = None, val_loader: DataLoader = None,
+                             args=None, is_simulate: bool = False) -> float:
+        """
+        Profile the scoring time.
+        Args:
+            dataset:
+            train_loader:
+            val_loader
+            args:
+            is_simulate:
+        Returns:
+        """
+        raise NotImplementedError
+
+    def profiling_train_time(self, dataset: str, train_loader: DataLoader = None, val_loader: DataLoader = None,
+                             args=None, is_simulate: bool = False) -> float:
+        """
+        Profile the training time.
+        Args:
+            dataset:
+            train_loader:
+            val_loader
+            args:
+            is_simulate:
+        Returns:
+        """
+        raise NotImplementedError
diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/search_space/init_search_space.py b/examples/model_selection/Trails/internal/ml/model_selection/src/search_space/init_search_space.py
similarity index 94%
rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/search_space/init_search_space.py
rename to examples/model_selection/Trails/internal/ml/model_selection/src/search_space/init_search_space.py
index 8d46ebdbd9..1ad6e4398a 100644
--- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/search_space/init_search_space.py
+++ b/examples/model_selection/Trails/internal/ml/model_selection/src/search_space/init_search_space.py
@@ -16,23 +16,25 @@
 # limitations under the License.
 #

+
 import os
 from src.common.constant import Config
 from src.search_space.core.space import SpaceWrapper
 from src.query_api.query_api_img import ImgScoreQueryApi

+
 def init_search_space(args) -> SpaceWrapper:
     """
     :param args:
     :param loapi: Local score API, records all scored arch, 101 use it to detect which arch is scored.
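For orientation, `SpaceWrapper` above is the abstract contract every search space implements. A toy subclass, purely illustrative; the three-entry space and its string encodings are invented for this sketch, and a real space returns a `ModelMicroCfg` rather than a raw string:

```python
import random

from src.search_space.core.space import SpaceWrapper

class ToySpace(SpaceWrapper):
    """Minimal example space; encodings are hidden-layer widths like '16-8'."""

    def load(self):
        self.archs = ["8-8", "16-8", "16-16"]

    def __len__(self):
        return len(self.archs)

    def random_architecture_id(self):
        idx = random.randrange(len(self.archs))
        return str(idx), self.archs[idx]        # (arch id, micro encoding)

space = ToySpace(cfg=None, name="toy")          # real code passes a ModelMacroCfg
space.load()
arch_id, micro = space.random_architecture_id()
```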
:return: """ - # elif args.search_space == Config.MLPSP: + if args.search_space == Config.MLPSP: from .mlp_api.space import MlpSpace from .mlp_api.model_params import MlpMacroCfg from .mlp_api.space import DEFAULT_LAYER_CHOICES_20, DEFAULT_LAYER_CHOICES_10 - print ("src/search_space/init_search_space.py config.MLPSP") + print("[Singa] src/search_space/init_search_space.py config.MLPSP") if args.hidden_choice_len == 10: model_cfg = MlpMacroCfg( args.nfield, diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/__init__.py b/examples/model_selection/Trails/internal/ml/model_selection/src/search_space/mlp_api/__init__.py similarity index 100% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/__init__.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/search_space/mlp_api/__init__.py diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/search_space/mlp_api/model_params.py b/examples/model_selection/Trails/internal/ml/model_selection/src/search_space/mlp_api/model_params.py similarity index 99% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/search_space/mlp_api/model_params.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/search_space/mlp_api/model_params.py index 7edf35e1d4..1d7e676525 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/search_space/mlp_api/model_params.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/search_space/mlp_api/model_params.py @@ -16,6 +16,7 @@ # limitations under the License. # + from src.search_space.core.model_params import ModelMacroCfg diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/search_space/mlp_api/rl_policy.py b/examples/model_selection/Trails/internal/ml/model_selection/src/search_space/mlp_api/rl_policy.py similarity index 99% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/search_space/mlp_api/rl_policy.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/search_space/mlp_api/rl_policy.py index e3372525da..aa8d24d978 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/search_space/mlp_api/rl_policy.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/search_space/mlp_api/rl_policy.py @@ -16,6 +16,7 @@ # limitations under the License. 
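The MLPSP branch above builds an `MlpMacroCfg` from the parsed CLI arguments. A hedged usage sketch: the placeholder values below mimic the frappe setup used elsewhere in this PR (nfield 10, nfeat 5500), they are not repo defaults, and the exact `Namespace` fields `init_search_space` reads may differ from this guess:

```python
from argparse import Namespace

from src.common.constant import Config
from src.search_space.init_search_space import init_search_space

args = Namespace(
    search_space=Config.MLPSP,   # select the MLP search space branch
    nfield=10,                   # number of columns (frappe-like)
    nfeat=5500,                  # number of distinct feature ids
    nemb=10,                     # embedding size
    num_layers=4,
    num_labels=2,
    hidden_choice_len=20,        # 10 or 20 layer-width choices
)
space = init_search_space(args)  # -> an MlpSpace (a SpaceWrapper)
```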
# + from src.search_space.core.rl_policy import RLPolicyBase diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/search_space/mlp_api/space.py b/examples/model_selection/Trails/internal/ml/model_selection/src/search_space/mlp_api/space.py similarity index 73% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/search_space/mlp_api/space.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/search_space/mlp_api/space.py index 8336750ae2..33a9b6fb12 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/search_space/mlp_api/space.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/search_space/mlp_api/space.py @@ -16,6 +16,7 @@ # limitations under the License. # + import copy import itertools import random @@ -23,6 +24,7 @@ from copy import deepcopy from typing import Generator +import torch from src.common.constant import Config, CommonVars from src.eva_engine import evaluator_register from src.eva_engine.phase2.algo.trainer import ModelTrainer @@ -30,6 +32,8 @@ from src.search_space.core.model_params import ModelMicroCfg, ModelMacroCfg from src.search_space.core.space import SpaceWrapper from src.search_space.mlp_api.model_params import MlpMacroCfg +import torch.nn as nn +from torch.utils.data import DataLoader from src.query_api.interface import profile_NK_trade_off from src.query_api.query_api_mlp import GTMLP @@ -53,12 +57,12 @@ 48, 96, 112, 144, 176, 240, 384] - np_dtype = {"float16": np.float16, "float32": np.float32} # singa_dtype = {"float16": tensor.float16, "float32": tensor.float32} singa_dtype = {"float32": tensor.float32} + class MlpMicroCfg(ModelMicroCfg): @classmethod @@ -72,6 +76,83 @@ def __init__(self, hidden_layer_list: list): def __str__(self): return "-".join(str(x) for x in self.hidden_layer_list) + +class Embedding(nn.Module): + + def __init__(self, nfeat, nemb): + super().__init__() + self.embedding = nn.Embedding(nfeat, nemb) + nn.init.xavier_uniform_(self.embedding.weight) + + def forward(self, x: dict): + """ + :param x: {'id': LongTensor B*F, 'value': FloatTensor B*F} + :return: embeddings B*F*E + """ + emb = self.embedding(x['id']) # B*F*E + return emb * x['value'].unsqueeze(2) # B*F*E + + +class MLP(nn.Module): + + def __init__(self, ninput: int, hidden_layer_list: list, dropout_rate: float, noutput: int, use_bn: bool): + super().__init__() + """ + Args: + ninput: number of input feature dim + hidden_layer_list: [a,b,c..] each value is number of Neurons in corresponding hidden layer + dropout_rate: if use drop out + noutput: number of labels. + """ + + layers = list() + # 1. all hidden layers. + for index, layer_size in enumerate(hidden_layer_list): + layers.append(nn.Linear(ninput, layer_size)) + if use_bn: + layers.append(nn.BatchNorm1d(layer_size)) + layers.append(nn.ReLU()) + layers.append(nn.Dropout(p=dropout_rate)) + ninput = layer_size + # 2. last hidden layer + if len(hidden_layer_list) == 0: + last_hidden_layer_num = ninput + else: + last_hidden_layer_num = hidden_layer_list[-1] + layers.append(nn.Linear(last_hidden_layer_num, noutput)) + + # 3. generate the MLP + self.mlp = nn.Sequential(*layers) + + self._initialize_weights() + + def forward(self, x): + """ + each element represents the probability of the positive class. 
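The `Embedding` module above fixes the batch format used throughout this file: sparse tabular features arrive as an id tensor plus a value tensor, and each field is mapped to an `nemb`-dim vector scaled by its value. A self-contained shape check of the same computation (toy sizes):

```python
import torch
import torch.nn as nn

nfeat, nemb, B, F = 100, 8, 4, 3                 # vocab, emb dim, batch, fields
table = nn.Embedding(nfeat, nemb)                # the layer Embedding wraps
batch = {
    "id": torch.randint(0, nfeat, (B, F)),       # LongTensor B*F of feature ids
    "value": torch.ones(B, F),                   # FloatTensor B*F (the ':1' part)
}
out = table(batch["id"]) * batch["value"].unsqueeze(2)   # B*F*E
assert out.shape == (B, F, nemb)
```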
+ :param x: FloatTensor B*ninput + :return: FloatTensor B*nouput + """ + return self.mlp(x) + + def _initialize_weights(self, method='xavier'): + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + elif isinstance(m, nn.Linear): + if method == 'lecun': + nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='linear') + elif method == 'xavier': + nn.init.xavier_uniform_(m.weight) + elif method == 'he': + nn.init.kaiming_uniform_(m.weight) + # m.weight.data.normal_(0, 0.01) + # m.bias.data.zero_() + + def reset_zero_grads(self): + self.zero_grad() + + #### self-defined loss begin ### from autograd.py @@ -107,11 +188,13 @@ def backward(self, dy=1.0): dx.data *= float(dy) return dx.data + def se_loss(x): # assert x.shape == t.shape, "input and target shape different: %s, %s" % ( # x.shape, t.shape) return SumError()(x)[0] + ### from layer.py class SumErrorLayer(Layer): """ @@ -124,6 +207,7 @@ def __init__(self): def forward(self, x): return se_loss(x) + #### self-defined loss end class SINGADNNModel(model.Model): @@ -131,7 +215,7 @@ class SINGADNNModel(model.Model): def __init__(self, nfield: int, nfeat: int, nemb: int, hidden_layer_list: list, dropout_rate: float, noutput: int, use_bn: bool = True): - # def __init__(self, data_size=10, perceptron_size=100, num_classes=10, layer_hidden_list=[10,10,10,10]): + # def __init__(self, data_size=10, perceptron_size=100, num_classes=10, layer_hidden_list=[10,10,10,10]): super(SINGADNNModel, self).__init__() # self.num_classes = num_classes self.dimension = 2 # data dimension = 2 @@ -165,7 +249,7 @@ def __init__(self, nfield: int, nfeat: int, nemb: int, self.hidden_layer_list = hidden_layer_list # Initialize subnet mask with ones self.subnet_mask = [np.ones(size) for size in hidden_layer_list] - + def forward(self, inputs): # print ("in space.py forward") # print ("in space.py inputs shape: ", inputs.shape) @@ -179,13 +263,12 @@ def forward(self, inputs): y = self.relu(y) y = self.linear5(y) return y - + def generate_all_ones_embedding(self): """ Only for the MLP Returns: """ - import torch # batch_data = torch.ones(1, self.mlp_ninput).double() # embedding batch_data = torch.ones(1, self.nfeat).double() # one-hot # print ("batch_data shape: ", batch_data.shape) @@ -258,7 +341,7 @@ def create_model(pretrained=False, **kwargs): Args: pretrained (bool): If True, returns a pre-trained model. - + Returns: The created CNN model. """ @@ -269,7 +352,117 @@ def create_model(pretrained=False, **kwargs): __all__ = ['SINGADNNModel', 'create_model'] -from torch.utils.data import DataLoader + +class DNNModel(torch.nn.Module): + """ + Model: Deep Neural Networks + """ + + def __init__(self, nfield: int, nfeat: int, nemb: int, + hidden_layer_list: list, dropout_rate: float, + noutput: int, use_bn: bool = True): + """ + Args: + nfield: the number of fields + nfeat: the number of features + nemb: embedding size + """ + super().__init__() + self.nfeat = nfeat + self.nemb = nemb + self.embedding = None + self.mlp_ninput = nfield * nemb + self.mlp = MLP(self.mlp_ninput, hidden_layer_list, dropout_rate, noutput, use_bn) + # self.sigmoid = nn.Sigmoid() + + # for weight-sharing + self.is_masked_subnet = False + self.hidden_layer_list = hidden_layer_list + # Initialize subnet mask with ones + self.subnet_mask = [torch.ones(size) for size in hidden_layer_list] + + def init_embedding(self, cached_embedding=None, requires_grad=False): + """ + This is slow, in filtering phase, we could enable caching here. 
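As a reference for the `MLP` constructor above: each `hidden_layer_list` entry becomes a Linear(+BatchNorm1d)+ReLU+Dropout block, and a final Linear maps the last width to `noutput`. An equivalent standalone builder with a shape check (sizes are examples):

```python
import torch
import torch.nn as nn

def build_mlp(ninput: int, hidden_layer_list: list, dropout_rate: float,
              noutput: int, use_bn: bool = True) -> nn.Sequential:
    layers, width = [], ninput
    for layer_size in hidden_layer_list:         # one block per hidden layer
        layers.append(nn.Linear(width, layer_size))
        if use_bn:
            layers.append(nn.BatchNorm1d(layer_size))
        layers += [nn.ReLU(), nn.Dropout(p=dropout_rate)]
        width = layer_size
    layers.append(nn.Linear(width, noutput))     # output layer
    return nn.Sequential(*layers)

mlp = build_mlp(ninput=30, hidden_layer_list=[64, 32], dropout_rate=0.1, noutput=2)
assert mlp(torch.randn(5, 30)).shape == (5, 2)   # B*noutput logits
```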
+ """ + if self.embedding is None: + if cached_embedding is None: + self.embedding = Embedding(self.nfeat, self.nemb) + else: + self.embedding = cached_embedding + + # in scoring process + # Disable gradients for all parameters in the embedding layer + if not requires_grad: + for param in self.embedding.parameters(): + param.requires_grad = False + + def generate_all_ones_embedding(self): + """ + Only for the MLP + Returns: + """ + batch_data = torch.ones(1, self.mlp_ninput).double() + return batch_data + + def forward_wo_embedding(self, x): + """ + Only used when embedding is generated outside, eg, all 1 embedding. + """ + y = self.mlp(x) # B*label + return y.squeeze(1) + + def forward(self, x): + """ + :param x: {'id': LongTensor B*F, 'value': FloatTensor B*F} + :return: y of size B, Regression and Classification (+sigmoid) + """ + if self.is_masked_subnet: + return self.forward_w_mask(x) + else: + x_emb = self.embedding(x) # B*F*E + y = self.mlp(x_emb.view(-1, self.mlp_ninput)) # B*label + # this is for binary classification + return y.squeeze(1) + + def sample_subnet(self, arch_id: str, device: str): + # arch_id e.g., '128-128-128-128' + sizes = list(map(int, arch_id.split('-'))) + self.is_masked_subnet = True + # randomly mask neurons in the layers. + + for idx, size in enumerate(sizes): + # Create a mask of ones and zeros with the required length + mask = torch.cat([ + torch.ones(size), + torch.zeros(self.hidden_layer_list[idx] - size)], + dim=0).to(device) + # Shuffle the mask to randomize which neurons are active + mask = mask[torch.randperm(mask.size(0))] + self.subnet_mask[idx] = mask + + def forward_w_mask(self, x): + x_emb = self.embedding(x) # B*F*E + x_emb = x_emb.view(-1, self.mlp_ninput) + + # Loop till the second last layer of the MLP + for idx, layer in enumerate(self.mlp.mlp[:-1]): # Exclude the last Linear layer + # 1. subnet_mask: idx // 4 is to map computation later => mlp later + # 2. unsqueeze(1): convert to 2 dimension, + # and then the mask is broadcasted across the row, correspond to one neuron, + # 3. matrix multiplication between input and the transposed weight + if isinstance(layer, nn.Linear): + weight = layer.weight * self.subnet_mask[idx // 4].unsqueeze(1) + x_emb = torch.nn.functional.linear(x_emb, weight, layer.bias) + else: + x_emb = layer(x_emb) # apply activation, dropout, batchnorm, etc. + + # Handle the output layer + output_layer = self.mlp.mlp[-1] + y = output_layer(x_emb) + return y.squeeze(1) + + class MlpSpace(SpaceWrapper): def __init__(self, modelCfg: MlpMacroCfg): super().__init__(modelCfg, Config.MLPSP) @@ -356,7 +549,6 @@ def profiling_score_time( else: # get a random batch. 
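`sample_subnet` above realizes weight sharing by masking neurons of a wide supernet. The mask construction, isolated into a helper (widths are examples):

```python
import torch

def make_layer_mask(full_width: int, keep: int) -> torch.Tensor:
    # keep `keep` random neurons active (1.0) and zero out the rest
    mask = torch.cat([torch.ones(keep), torch.zeros(full_width - keep)])
    return mask[torch.randperm(full_width)]      # shuffle active positions

mask = make_layer_mask(full_width=128, keep=64)  # subnet '64-...' of '128-...'
assert int(mask.sum().item()) == 64
```

`forward_w_mask` then multiplies each Linear weight row-wise (`unsqueeze(1)` broadcasts the mask across each neuron's row), so masked neurons contribute nothing to the output.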
- import torch batch = iter(train_loader).__next__() target = batch['y'].type(torch.LongTensor) batch['id'] = batch['id'].to(device) @@ -373,33 +565,44 @@ def profiling_score_time( hidden_layer_list=[DEFAULT_LAYER_CHOICES_20[-1]] * self.model_cfg.num_layers, dropout_rate=0, noutput=self.model_cfg.num_labels) - super_net.init_embedding(requires_grad=False) - super_net.to(device) + # super_net.init_embedding(requires_grad=False) + # super_net.to(device) # measure score time, score_time_begin = time.time() - naswot_score, _ = evaluator_register[CommonVars.NAS_WOT].evaluate_wrapper( - arch=super_net, - device=device, - batch_data=batch, - batch_labels=target) + # naswot_score, _ = evaluator_register[CommonVars.NAS_WOT].evaluate_wrapper( + # arch=super_net, + # device=device, + # batch_data=batch, + # batch_labels=target) + # + # # re-init hte net + # del super_net + # # super_net = DNNModel( + # super_net = SINGADNNModel( + # nfield=args.nfield, + # nfeat=args.nfeat, + # nemb=args.nemb, + # hidden_layer_list=[DEFAULT_LAYER_CHOICES_20[-1]] * self.model_cfg.num_layers, + # dropout_rate=0, + # noutput=self.model_cfg.num_labels, + # use_bn=False) + # super_net.init_embedding(requires_grad=False) + # super_net.to(device) + + # preprocessing + if isinstance(batch, torch.Tensor): + feature_dim = list(batch[0, :].shape) + # add one dimension to feature dim, [1] + [3, 32, 32] = [1, 3, 32, 32] + mini_batch = torch.ones([1] + feature_dim).float().to(device) + else: + # this is for the tabular data, + mini_batch = super_net.generate_all_ones_embedding().float().to(device) - # re-init hte net - del super_net - # super_net = DNNModel( - super_net = SINGADNNModel( - nfield=args.nfield, - nfeat=args.nfeat, - nemb=args.nemb, - hidden_layer_list=[DEFAULT_LAYER_CHOICES_20[-1]] * self.model_cfg.num_layers, - dropout_rate=0, - noutput=self.model_cfg.num_labels, - use_bn=False) - super_net.init_embedding(requires_grad=False) - super_net.to(device) synflow_score, _ = evaluator_register[CommonVars.PRUNE_SYNFLOW].evaluate_wrapper( arch=super_net, device=device, - batch_data=batch, + space_name=self.name, + batch_data=mini_batch, batch_labels=target) score_time = time.time() - score_time_begin @@ -428,8 +631,8 @@ def profiling_train_time(self, dataset: str, hidden_layer_list=[DEFAULT_LAYER_CHOICES_20[-1]] * self.model_cfg.num_layers, dropout_rate=0, noutput=self.model_cfg.num_labels) - super_net.init_embedding(requires_grad=True) - super_net.to(device) + # super_net.init_embedding(requires_grad=True) + # super_net.to(device) # only train for ony iteratin to evaluat the time usage. targs = copy.deepcopy(args) valid_auc, train_time_epoch, train_log = ModelTrainer.fully_train_arch( @@ -459,7 +662,7 @@ def profiling(self, dataset: str, _train_time_per_epoch = gtmlp.get_score_one_model_time("cpu") score_time = _train_time_per_epoch else: - import torch + # get a random batch. 
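The preprocessing block above replaces the sampled data batch with a deterministic all-ones input before SynFlow scoring. Condensed into one helper that mirrors the two branches in the hunk:

```python
import torch

def scoring_batch(batch, super_net, device: str) -> torch.Tensor:
    if isinstance(batch, torch.Tensor):          # image tensor, B*C*H*W
        feature_dim = list(batch[0, :].shape)    # e.g. [3, 32, 32]
        return torch.ones([1] + feature_dim).float().to(device)
    # tabular: one all-ones row from the model, bypassing the embedding lookup
    return super_net.generate_all_ones_embedding().float().to(device)
```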
batch = iter(train_loader).__next__() target = batch['y'].type(torch.LongTensor) @@ -477,14 +680,15 @@ def profiling(self, dataset: str, hidden_layer_list=[DEFAULT_LAYER_CHOICES_20[-1]] * self.model_cfg.num_layers, dropout_rate=0, noutput=self.model_cfg.num_labels) - super_net.init_embedding(requires_grad=False) - super_net.to(device) + # super_net.init_embedding(requires_grad=False) + # super_net.to(device) # measure score time, score_time_begin = time.time() naswot_score, _ = evaluator_register[CommonVars.NAS_WOT].evaluate_wrapper( arch=super_net, device=device, + space_name=self.name, batch_data=batch, batch_labels=target) @@ -499,12 +703,13 @@ def profiling(self, dataset: str, dropout_rate=0, noutput=self.model_cfg.num_labels, use_bn=False) - super_net.init_embedding(requires_grad=False) - super_net.to(device) + # super_net.init_embedding(requires_grad=False) + # super_net.to(device) synflow_score, _ = evaluator_register[CommonVars.PRUNE_SYNFLOW].evaluate_wrapper( arch=super_net, device=device, + space_name=self.name, batch_data=batch, batch_labels=target) @@ -527,8 +732,8 @@ def profiling(self, dataset: str, hidden_layer_list=[DEFAULT_LAYER_CHOICES_20[-1]] * self.model_cfg.num_layers, dropout_rate=0, noutput=self.model_cfg.num_labels) - super_net.init_embedding(requires_grad=True) - super_net.to(device) + # super_net.init_embedding(requires_grad=True) + # super_net.to(device) # only train for ony iteratin to evaluat the time usage. targs = copy.deepcopy(args) diff --git a/examples/model_selection/Trails/internal/ml/model_selection/src/search_space/utils/__init__.py b/examples/model_selection/Trails/internal/ml/model_selection/src/search_space/utils/__init__.py new file mode 100644 index 0000000000..fe5964787e --- /dev/null +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/search_space/utils/__init__.py @@ -0,0 +1,24 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + + + + + + + diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/search_space/utils/weight_initializers.py b/examples/model_selection/Trails/internal/ml/model_selection/src/search_space/utils/weight_initializers.py similarity index 99% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/search_space/utils/weight_initializers.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/search_space/utils/weight_initializers.py index de1c544423..5f6634bb77 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/search_space/utils/weight_initializers.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/search_space/utils/weight_initializers.py @@ -13,6 +13,9 @@ # limitations under the License. # ============================================================================= +import torch.nn as nn + + def init_net(net, w_type, b_type): """ Init network with various algorithms @@ -43,7 +46,6 @@ def init_net(net, w_type, b_type): else: raise NotImplementedError(f'init_type={b_type} is not supported.') -import torch.nn as nn def _init_weights_vs(m): if type(m) == nn.Linear or type(m) == nn.Conv2d: @@ -76,3 +78,5 @@ def _init_bias_zero(m): if type(m) == nn.Linear or type(m) == nn.Conv2d: if m.bias is not None: m.bias.data.fill_(.0) + + diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/__init__.py b/examples/model_selection/Trails/internal/ml/model_selection/src/tools/__init__.py similarity index 99% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/__init__.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/tools/__init__.py index 01d7057208..42c2ac6db1 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/eva_engine/phase1/algo/__init__.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/tools/__init__.py @@ -16,3 +16,4 @@ # limitations under the License. # + diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/tools/compute.py b/examples/model_selection/Trails/internal/ml/model_selection/src/tools/compute.py similarity index 97% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/tools/compute.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/tools/compute.py index 0400ef5467..b60e848e7e 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/tools/compute.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/tools/compute.py @@ -16,6 +16,7 @@ # limitations under the License. # + # for binary insert from typing import List import numpy as np @@ -92,12 +93,12 @@ def generate_global_rank(ml_data_score_dic: dict, alg_name_list: List) -> dict: return model_new_rank_score -def log_scale_x_array(num_points, max_minute, base=10) -> list: +def log_scale_x_array(num_points, max_minute, base=10, min_val=1) -> list: """ return a list of mins in log scale distance. 
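`log_scale_x_array` above gained a `min_val` parameter. For reference, a hedged reconstruction of the whole helper; the body below the "# Generate the log scale values" comment is not shown in this hunk, so the logspace call is an assumption:

```python
import numpy as np

def log_scale_points(num_points: int, max_minute: float,
                     base: float = 10, min_val: float = 1) -> list:
    max_val = max_minute * 60                    # minutes -> seconds
    lo = np.log(min_val) / np.log(base)          # exponents in the given base
    hi = np.log(max_val) / np.log(base)
    return list(np.logspace(lo, hi, num=num_points, base=base))
```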
""" # Set the minimum and maximum values for the log scale - min_val = 1 # 1 second + min_val = min_val # 1 second max_val = max_minute * 60 # 1440 minutes converted to seconds # Generate the log scale values diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/tools/correlation.py b/examples/model_selection/Trails/internal/ml/model_selection/src/tools/correlation.py similarity index 100% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/tools/correlation.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/tools/correlation.py diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/tools/io_tools.py b/examples/model_selection/Trails/internal/ml/model_selection/src/tools/io_tools.py similarity index 98% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/tools/io_tools.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/tools/io_tools.py index e657b9e04b..a66575b7b1 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/tools/io_tools.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/tools/io_tools.py @@ -53,8 +53,7 @@ def write_pickle(file_name, data): if __name__ == "__main__": - a = {1:1} + a = {1: 1} write_json("./asdf.json", a) - b = {2:2323} + b = {2: 2323} write_json("./asdf.json", b) - diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/tools/res_measure.py b/examples/model_selection/Trails/internal/ml/model_selection/src/tools/res_measure.py similarity index 99% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/tools/res_measure.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/tools/res_measure.py index 93270ae31e..bc261fa54b 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/tools/res_measure.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/tools/res_measure.py @@ -23,6 +23,16 @@ import time from src.tools.io_tools import write_json import sys +import torch + + +def get_variable_memory_size(obj): + # If it's a PyTorch tensor and on the GPU + if torch.is_tensor(obj) and obj.is_cuda: + return obj.element_size() * obj.nelement() + else: + return sys.getsizeof(obj) + def print_cpu_gpu_usage(interval=1, output_file="path_to_folder", stop_event=None): def print_usage(): @@ -72,13 +82,6 @@ def print_usage(): thread.start() return stop_event, thread -def get_variable_memory_size(obj): - # If it's a PyTorch tensor and on the GPU - import torch - if torch.is_tensor(obj) and obj.is_cuda: - return obj.element_size() * obj.nelement() - else: - return sys.getsizeof(obj) def print_memory_usage(): # Get current process diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/tools/utils.py b/examples/model_selection/Trails/internal/ml/model_selection/src/tools/utils.py similarity index 99% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/tools/utils.py rename to examples/model_selection/Trails/internal/ml/model_selection/src/tools/utils.py index 8e4232caa0..c2c45a705c 100644 --- 
a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/src/tools/utils.py +++ b/examples/model_selection/Trails/internal/ml/model_selection/src/tools/utils.py @@ -25,9 +25,16 @@ import numpy import numpy as np +import torch import shutil import logging +import torchvision.transforms as transforms +from torch.autograd import Variable +import torch.nn.functional as F +import torchvision.datasets as dset +import torch.nn as nn + warnings.filterwarnings("error") @@ -59,7 +66,7 @@ def update(self, val, n=1): self.count += n self.avg = self.sum / self.count -import torch + def get_correct_num(y, target): pred_label = torch.argmax(y, dim=1) return (target == pred_label).sum().item() @@ -101,7 +108,7 @@ def __call__(self, img): img *= mask return img -import torchvision.transforms as transforms + def _data_transforms_cifar10(args): CIFAR_MEAN = [0.49139968, 0.48215827, 0.44653124] CIFAR_STD = [0.24703233, 0.24348505, 0.26158768] @@ -121,7 +128,7 @@ def _data_transforms_cifar10(args): ]) return train_transform, valid_transform -import torchvision.datasets as dset + def _get_cifar10(args): train_transform, valid_transform = _data_transforms_cifar10(args) train_data = dset.CIFAR10( @@ -342,7 +349,6 @@ def save_ckpt(ckpt, file_dir, file_name='model.ckpt', is_best=False): def drop_path(x, drop_prob, dims=(0,)): - from torch.autograd import Variable var_size = [1 for _ in range(x.dim())] for i in dims: var_size[i] = x.size(i) @@ -372,7 +378,6 @@ def __init__(self, path): self.data = None def update(self, alphas_normal, alphas_reduce, val_loss): - import torch.nn.functional as F a_normal = F.softmax(alphas_normal, dim=-1) # print("alpha normal size: ", a_normal.data.size()) a_reduce = F.softmax(alphas_reduce, dim=-1) @@ -406,7 +411,7 @@ def logger(log_dir, need_time=True, need_stdout=False): log.addHandler(fh) return log -import torch.nn as nn + class CrossEntropyLabelSmooth(nn.Module): def __init__(self, num_classes, epsilon): diff --git a/examples/model_selection/Trails/internal/ml/model_slicing/README.md b/examples/model_selection/Trails/internal/ml/model_slicing/README.md new file mode 100644 index 0000000000..503c94b9dc --- /dev/null +++ b/examples/model_selection/Trails/internal/ml/model_slicing/README.md @@ -0,0 +1,1092 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +# Powering In-Database Dynamic Model Slicing for Structured Data Analytics + +The general model based on LEADS is at [algorithm](https://github.com/Zrealshadow/SAMS/tree/f0570730563e7e05e073d5b7eaedabebe6286f56). 
+
+# Envs
+
+```bash
+pip install orjson
+pip install einops
+pip install tqdm
+pip install matplotlib
+
+unset PYTHONPATH
+export PYTHONPATH=$PYTHONPATH:/project/Trails/internal/ml/
+export PYTHONPATH=$PYTHONPATH:/project/Trails/internal/ml/model_slicing/
+export PYTHONPATH=$PYTHONPATH:/project/Trails/internal/ml/model_slicing/algorithm/
+echo $PYTHONPATH
+
+
+export PYTHONPATH=$PYTHONPATH:/home/xingnaili/Trails/internal/ml/
+export PYTHONPATH=$PYTHONPATH:/home/xingnaili/Trails/internal/ml/model_slicing/
+export PYTHONPATH=$PYTHONPATH:/home/xingnaili/Trails/internal/ml/model_slicing/algorithm/
+
+
+/project/Trails/internal/ml/
+/project/Trails/internal/ml/model_slicing/algorithm:
+/project/Trails/internal/ml/model_slicing:
+
+```
+
+# Save data
+
+8 datasets are used here.
+
+```
+adult  bank  cvd  frappe  payment(credit)  credit(hcdr)  census  diabetes
+```
+
+Save the statistics
+
+```bash
+# save the data cardinalities, run in docker
+
+# frappe
+python3 ./internal/ml/model_slicing/algorithm/save_satistics.py --dataset frappe --data_dir /hdd1/sams/data/ --nfeat 5500 --nfield 10 --max_filter_col 10 --train_dir ./
+
+# adult
+python3 ./internal/ml/model_slicing/algorithm/save_satistics.py --dataset adult --data_dir /hdd1/sams/data/ --nfeat 140 --nfield 13 --max_filter_col 13 --train_dir ./
+
+# cvd
+python3 ./internal/ml/model_slicing/algorithm/save_satistics.py --dataset cvd --data_dir /hdd1/sams/data/ --nfeat 110 --nfield 11 --max_filter_col 11 --train_dir ./
+
+# bank
+python3 ./internal/ml/model_slicing/algorithm/save_satistics.py --dataset bank --data_dir /hdd1/sams/data/ --nfeat 80 --nfield 16 --max_filter_col 16 --train_dir ./
+
+
+
+# New datasets
+# census
+python3 ./internal/ml/model_slicing/algorithm/save_satistics.py --dataset census --data_dir /hdd1/sams/data/ --nfeat 540 --nfield 41 --max_filter_col 41 --train_dir ./
+
+# Payment (credit)
+python3 ./internal/ml/model_slicing/algorithm/save_satistics.py --dataset credit --data_dir /hdd1/sams/data/ --nfeat 350 --nfield 23 --max_filter_col 23 --train_dir ./
+
+# diabetes
+python3 ./internal/ml/model_slicing/algorithm/save_satistics.py --dataset diabetes --data_dir /hdd1/sams/data/ --nfeat 850 --nfield 48 --max_filter_col 48 --train_dir ./
+
+# credit (hcdr)
+python3 ./internal/ml/model_slicing/algorithm/save_satistics.py --dataset hcdr --data_dir /hdd1/sams/data/ --nfeat 550 --nfield 69 --max_filter_col 69 --train_dir ./
+
+```
+
+# Run docker
+
+```bash
+# in server
+ssh panda17
+
+# go to /home/xingnaili/firmest_docker/Trails
+git submodule update --recursive --remote
+
+# run container
+docker run -d --name moe_inf \
+    --network="host" \
+    -v $(pwd)/Trails:/project/Trails \
+    -v /hdd1/sams/tensor_log/:/project/tensor_log \
+    -v /hdd1/sams/data/:/project/data_all \
+    trails
+
+# Enter the docker container.
+docker exec -it moe_inf bash
+```
+
+
+
+# Run in database
+
+Configure the database runtime
+
+```bash
+cargo pgrx run --release
+```
+
+Load data into RDBMS
+
+```bash
+
+psql -h localhost -p 28814 -U postgres
+\l
+\c pg_extension
+\dt
+\d frappe_train
+
+
+# frappe
+bash /project/Trails/internal/ml/model_selection/scripts/database/load_data_to_db.sh /project/data_all/frappe frappe
+# frappe, only feature ids
+bash /project/Trails/internal/ml/model_selection/scripts/database/load_data_to_db_int.sh /project/data_all/frappe frappe
+
+
+# adult
+bash ./internal/ml/model_selection/scripts/database/load_data_to_db.sh /project/data_all/adult adult
+# adult, only feature ids
+bash ./internal/ml/model_selection/scripts/database/load_data_to_db_int.sh /project/data_all/adult adult
+# check that the column types are correct
+SELECT column_name, data_type, column_default, is_nullable
+FROM information_schema.columns
+WHERE table_name = 'adult_int_train';
+
+
+# cvd
+bash /project/Trails/internal/ml/model_selection/scripts/database/load_data_to_db.sh /project/data_all/cvd cvd
+# cvd, only feature ids
+bash /project/Trails/internal/ml/model_selection/scripts/database/load_data_to_db_int.sh /project/data_all/cvd cvd
+
+
+# bank
+bash /project/Trails/internal/ml/model_selection/scripts/database/load_data_to_db.sh /project/data_all/bank bank
+# bank, only feature ids
+bash /project/Trails/internal/ml/model_selection/scripts/database/load_data_to_db_int.sh /project/data_all/bank bank
+
+
+# New datasets
+
+# census
+bash /project/Trails/internal/ml/model_selection/scripts/database/load_data_to_db_int.sh /project/data_all/census census
+
+# credit
+bash /project/Trails/internal/ml/model_selection/scripts/database/load_data_to_db_int.sh /project/data_all/credit credit
+
+# hcdr
+bash /project/Trails/internal/ml/model_selection/scripts/database/load_data_to_db_int.sh /project/data_all/hcdr hcdr
+
+# diabetes
+bash /project/Trails/internal/ml/model_selection/scripts/database/load_data_to_db_int.sh /project/data_all/diabetes diabetes
+```
+
+Verify data is in the DB
+
+```sql
+# check table status
+\dt
+\d frappe_train
+SELECT * FROM frappe_train LIMIT 10;
+```
+
+Config
+
+```sql
+# after running pgrx, edit the sql
+# generate schema
+cargo pgrx schema >> /home/postgres/.pgrx/14.9/pgrx-install/share/extension/pg_extension--0.1.0.sql
+
+
+-- src/lib.rs:266
+-- pg_extension::model_init
+CREATE FUNCTION "model_init"(
+    "condition" TEXT, /* alloc::string::String */
+    "config_file" TEXT, /* alloc::string::String */
+    "col_cardinalities_file" TEXT, /* alloc::string::String */
+    "model_path" TEXT /* alloc::string::String */
+) RETURNS TEXT /* alloc::string::String */
+IMMUTABLE STRICT PARALLEL SAFE
+LANGUAGE c /* Rust */
+AS 'MODULE_PATHNAME', 'model_init_wrapper';
+
+-- src/lib.rs:242
+-- pg_extension::inference_shared_write_once_int
+CREATE FUNCTION "inference_shared_write_once_int"(
+    "dataset" TEXT, /* alloc::string::String */
+    "condition" TEXT, /* alloc::string::String */
+    "config_file" TEXT, /* alloc::string::String */
+    "col_cardinalities_file" TEXT, /* alloc::string::String */
+    "model_path" TEXT, /* alloc::string::String */
+    "sql" TEXT, /* alloc::string::String */
+    "batch_size" INT /* i32 */
+) RETURNS TEXT /* alloc::string::String */
+IMMUTABLE STRICT PARALLEL SAFE
+LANGUAGE c /* Rust */
+AS 'MODULE_PATHNAME', 'inference_shared_write_once_int_wrapper';
+
+-- src/lib.rs:219
+-- pg_extension::inference_shared_write_once
+CREATE FUNCTION "inference_shared_write_once"(
+    "dataset" TEXT, /* alloc::string::String
*/
+    "condition" TEXT, /* alloc::string::String */
+    "config_file" TEXT, /* alloc::string::String */
+    "col_cardinalities_file" TEXT, /* alloc::string::String */
+    "model_path" TEXT, /* alloc::string::String */
+    "sql" TEXT, /* alloc::string::String */
+    "batch_size" INT /* i32 */
+) RETURNS TEXT /* alloc::string::String */
+IMMUTABLE STRICT PARALLEL SAFE
+LANGUAGE c /* Rust */
+AS 'MODULE_PATHNAME', 'inference_shared_write_once_wrapper';
+
+-- src/lib.rs:196
+-- pg_extension::inference_shared
+CREATE FUNCTION "inference_shared"(
+    "dataset" TEXT, /* alloc::string::String */
+    "condition" TEXT, /* alloc::string::String */
+    "config_file" TEXT, /* alloc::string::String */
+    "col_cardinalities_file" TEXT, /* alloc::string::String */
+    "model_path" TEXT, /* alloc::string::String */
+    "sql" TEXT, /* alloc::string::String */
+    "batch_size" INT /* i32 */
+) RETURNS TEXT /* alloc::string::String */
+IMMUTABLE STRICT PARALLEL SAFE
+LANGUAGE c /* Rust */
+AS 'MODULE_PATHNAME', 'run_inference_shared_wrapper';
+
+-- src/lib.rs:173
+-- pg_extension::inference
+CREATE FUNCTION "inference"(
+    "dataset" TEXT, /* alloc::string::String */
+    "condition" TEXT, /* alloc::string::String */
+    "config_file" TEXT, /* alloc::string::String */
+    "col_cardinalities_file" TEXT, /* alloc::string::String */
+    "model_path" TEXT, /* alloc::string::String */
+    "sql" TEXT, /* alloc::string::String */
+    "batch_size" INT /* i32 */
+) RETURNS TEXT /* alloc::string::String */
+IMMUTABLE STRICT PARALLEL SAFE
+LANGUAGE c /* Rust */
+AS 'MODULE_PATHNAME', 'run_inference_wrapper';
+
+
+# record the necessary functions above, then copy them into the following file
+rm /home/postgres/.pgrx/14.9/pgrx-install/share/extension/pg_extension--0.1.0.sql
+vi /home/postgres/.pgrx/14.9/pgrx-install/share/extension/pg_extension--0.1.0.sql
+
+# then drop/create extension
+DROP EXTENSION IF EXISTS pg_extension;
+CREATE EXTENSION pg_extension;
+```
+
+Examples
+
+```sql
+
+# inference() takes: dataset name, condition JSON, config file, column-cardinalities file, model path, WHERE clause, and batch size
+SELECT count(*) FROM frappe_train WHERE col2='973:1' LIMIT 1000;
+SELECT col2, count(*) FROM frappe_train group by col2 order by count(*) desc;
+
+# query with two conditions
+SELECT inference(
+    'frappe',
+    '{"1":266, "2":1244}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/frappe_col_cardinalities',
+    '/project/tensor_log/frappe/dnn_K16_alpha4',
+    'WHERE col1=''266:1'' and col2=''1244:1''',
+    32
+);
+
+# query with one condition
+SELECT inference(
+    'frappe',
+    '{"2":977}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/frappe_col_cardinalities',
+    '/project/tensor_log/frappe/dnn_K16_alpha4',
+    'WHERE col2=''977:1''',
+    10000
+);
+
+# query with no conditions
+SELECT inference(
+    'frappe',
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/frappe_col_cardinalities',
+    '/project/tensor_log/frappe/dnn_K16_alpha4',
+    '',
+    8000
+);
+
+# explanation
+EXPLAIN (ANALYZE, BUFFERS) SELECT inference(
+    'frappe',
+    '{"2":977}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/frappe_col_cardinalities',
+    '/project/tensor_log/frappe/dnn_K16_alpha4',
+    'WHERE col2=''977:1''',
+    8000
+);
+
+
+```
+
+# Clear cache
+
+```sql
+DISCARD ALL;
+```
+
+# Benchmark Latency over all datasets
+
+## Adult
+
+```sql
+SELECT inference(
+    'adult',
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/data/adult_col_cardinalities',
+    
'/project/tensor_log/adult/Ednn_K16_alpha2-5',
+    '',
+    10000
+);
+
+
+# exps
+SELECT model_init(
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/data/adult_col_cardinalities',
+    '/project/tensor_log/adult/Ednn_K16_alpha2-5'
+);
+SELECT inference(
+    'adult',
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/data/adult_col_cardinalities',
+    '/project/tensor_log/adult/Ednn_K16_alpha2-5',
+    '',
+    10000
+);
+
+
+SELECT model_init(
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/data/adult_col_cardinalities',
+    '/project/tensor_log/adult/Ednn_K16_alpha2-5'
+);
+SELECT inference_shared_write_once(
+    'adult',
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/data/adult_col_cardinalities',
+    '/project/tensor_log/adult/Ednn_K16_alpha2-5',
+    '',
+    100000
+);
+
+# replicate data
+INSERT INTO adult_train (label, col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11, col12, col13)
+SELECT label, col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11, col12, col13
+FROM adult_train;
+
+INSERT INTO adult_int_train (label, col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11, col12, col13)
+SELECT label, col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11, col12, col13
+FROM adult_int_train;
+```
+
+## Frappe
+
+```sql
+SELECT inference(
+    'frappe',
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/data/frappe_col_cardinalities',
+    '/project/tensor_log/frappe/dnn_K16_alpha4',
+    '',
+    10000
+);
+
+SELECT inference(
+    'frappe',
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/data/frappe_col_cardinalities',
+    '/project/tensor_log/frappe/dnn_K16_alpha4',
+    '',
+    20000
+);
+
+
+SELECT model_init(
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/data/frappe_col_cardinalities',
+    '/project/tensor_log/frappe/dnn_K16_alpha4'
+);
+SELECT inference(
+    'frappe',
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/data/frappe_col_cardinalities',
+    '/project/tensor_log/frappe/dnn_K16_alpha4',
+    '',
+    10000
+);
+
+
+
+SELECT model_init(
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/data/frappe_col_cardinalities',
+    '/project/tensor_log/frappe/dnn_K16_alpha4'
+);
+SELECT inference_shared_write_once(
+    'frappe',
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/data/frappe_col_cardinalities',
+    '/project/tensor_log/frappe/dnn_K16_alpha4',
+    '',
+    100000
+);
+
+
+SELECT inference_shared(
+    'frappe',
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/data/frappe_col_cardinalities',
+    '/project/tensor_log/frappe/dnn_K16_alpha4',
+    '',
+    40000
+);
+
+
+
+SELECT inference(
+    'frappe',
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/data/frappe_col_cardinalities',
+    '/project/tensor_log/frappe/dnn_K16_alpha4',
+    '',
+    80000
+);
+
+
+SELECT inference(
+    'frappe',
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/data/frappe_col_cardinalities',
+    '/project/tensor_log/frappe/dnn_K16_alpha4',
+    '',
+    160000
+);
+
+# replicate data
+INSERT INTO frappe_train (label, col1, col2, col3, col4, col5, col6, col7, col8, col9, col10)
+SELECT label, col1, col2, col3, col4, col5, col6, col7, col8, col9, col10
+FROM frappe_train;
+
+
+INSERT INTO frappe_int_train (label, col1, col2, col3, col4, col5, col6, col7, col8, col9, col10)
+SELECT label, col1, col2, col3, col4, col5, col6, col7, col8, col9, col10
+FROM frappe_int_train;
+```
+
+## CVD
+
+```sql
+SELECT inference(
+    'cvd',
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/data/cvd_col_cardinalities',
+    '/project/tensor_log/cvd/dnn_K16_alpha2-5',
+    '',
+    10000
+);
+
+# exps
+SELECT model_init(
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/data/cvd_col_cardinalities',
+    '/project/tensor_log/cvd/dnn_K16_alpha2-5'
+);
+SELECT inference(
+    'cvd',
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/data/cvd_col_cardinalities',
+    '/project/tensor_log/cvd/dnn_K16_alpha2-5',
+    '',
+    10000
+);
+
+
+SELECT model_init(
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/data/cvd_col_cardinalities',
+    '/project/tensor_log/cvd/dnn_K16_alpha2-5'
+);
+SELECT inference_shared_write_once(
+    'cvd',
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/data/cvd_col_cardinalities',
+    '/project/tensor_log/cvd/dnn_K16_alpha2-5',
+    '',
+    100000
+);
+
+
+# replicate data
+INSERT INTO cvd_train (label, col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11)
+SELECT label, col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11
+FROM cvd_train;
+
+INSERT INTO cvd_int_train (label, col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11)
+SELECT label, col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11
+FROM cvd_int_train;
+
+```
+
+## Bank
+
+```sql
+SELECT inference(
+    'bank',
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/data/bank_col_cardinalities',
+    '/project/tensor_log/bank/dnn_K16_alpha2-3_beta1e-3',
+    '',
+    10000
+);
+
+
+# exps
+SELECT model_init(
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/data/bank_col_cardinalities',
+    '/project/tensor_log/bank/dnn_K16_alpha2-3_beta1e-3'
+);
+SELECT inference(
+    'bank',
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/data/bank_col_cardinalities',
+    '/project/tensor_log/bank/dnn_K16_alpha2-3_beta1e-3',
+    '',
+    10000
+);
+
+
+SELECT model_init(
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/bank_col_cardinalities',
+    '/project/tensor_log/bank/dnn_K16_alpha2-3_beta1e-3'
+);
+SELECT inference_shared_write_once(
+    'bank',
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/bank_col_cardinalities',
+    '/project/tensor_log/bank/dnn_K16_alpha2-3_beta1e-3',
+    '',
+    100000
+);
+
+
+# replicate data
+INSERT INTO bank_train (label, col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11, col12, col13, col14, col15, col16)
+SELECT label, col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11, col12, col13, col14, col15, col16
+FROM bank_train;
+
+
+INSERT INTO bank_int_train (label, col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11, col12, col13, col14, col15, col16)
+SELECT label, col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11, col12, col13, col14, col15, col16
+FROM bank_int_train;
+
+```
+
+## Census
+
+```sql
+# replicate data
+INSERT INTO census_int_train (label, col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11, col12, col13,
col14, col15, col16,col17,col18,col19,col20,col21,col22,col23,col24,col25,col26,col27,col28,col29,col30,col31,col32,col33,col34,col35,col36,col37,col38,col39,col40,col41) +SELECT label, col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11, col12, col13, col14, col15, col16,col17,col18,col19,col20,col21,col22,col23,col24,col25,col26,col27,col28,col29,col30,col31,col32,col33,col34,col35,col36,col37,col38,col39,col40,col41 +FROM census_int_train; + + +``` + +## Credit + +```sql +# replicate data +INSERT INTO credit_int_train (label, col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11, col12, col13, col14, col15, col16,col17,col18,col19,col20,col21,col22,col23) +SELECT label, col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11, col12, col13, col14, col15, col16,col17,col18,col19,col20,col21,col22,col23 +FROM credit_int_train; + + + +``` + +## Diabetes + +```sql +# replicate data +INSERT INTO diabetes_int_train (label, col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11, col12, col13, col14, col15, col16,col17,col18,col19,col20,col21,col22,col23,col24,col25,col26,col27,col28,col29,col30,col31,col32,col33,col34,col35,col36,col37,col38,col39,col40,col41,col42,col43,col44,col45,col46,col47,col48) +SELECT label, col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11, col12, col13, col14, col15, col16,col17,col18,col19,col20,col21,col22,col23,col24,col25,col26,col27,col28,col29,col30,col31,col32,col33,col34,col35,col36,col37,col38,col39,col40,col41,col42,col43,col44,col45,col46,col47,col48 +FROM diabetes_int_train; + + + +``` + +## Hcdr + +```sql +# replicate data +INSERT INTO hcdr_int_train (label,col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14,col15,col16,col17,col18,col19,col20,col21,col22,col23,col24,col25,col26,col27,col28,col29,col30,col31,col32,col33,col34,col35,col36,col37,col38,col39,col40,col41,col42,col43,col44,col45,col46,col47,col48,col49,col50,col51,col52,col53,col54,col55,col56,col57,col58,col59,col60,col61,col62,col63,col64,col65,col66,col67,col68,col69) +SELECT label,col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14,col15,col16,col17,col18,col19,col20,col21,col22,col23,col24,col25,col26,col27,col28,col29,col30,col31,col32,col33,col34,col35,col36,col37,col38,col39,col40,col41,col42,col43,col44,col45,col46,col47,col48,col49,col50,col51,col52,col53,col54,col55,col56,col57,col58,col59,col60,col61,col62,col63,col64,col65,col66,col67,col68,col69 +FROM hcdr_int_train; + + +``` + + + +# Baseline System & SAMS + +## Frappe + +```bash +# frappe +CUDA_VISIBLE_DEVICES=-1 python ./internal/ml/model_slicing/algorithm/baseline.py /hdd1/sams/tensor_log/frappe/dnn_K16_alpha4 --device cpu --dataset frappe --batch_size 10 --col_cardinalities_file data/frappe_col_cardinalities --target_batch 10 + + +CUDA_VISIBLE_DEVICES="0" python ./internal/ml/model_slicing/algorithm/baseline.py /hdd1/sams/tensor_log/frappe/dnn_K16_alpha4 --device cuda:0 --dataset frappe --batch_size 100000 --col_cardinalities_file data/frappe_col_cardinalities --target_batch 100000 + +CUDA_VISIBLE_DEVICES=-1 python ./internal/ml/model_slicing/algorithm/baseline_int.py /hdd1/sams/tensor_log/frappe/dnn_K16_alpha4 --device cpu --dataset frappe --batch_size 100000 --col_cardinalities_file data/frappe_col_cardinalities --target_batch 100000 + + +SELECT model_init( + '{}', + '/project/Trails/internal/ml/model_selection/config.ini', + '/project/Trails/frappe_col_cardinalities', + 
+
+## Adult
+
+```bash
+
+# adult
+CUDA_VISIBLE_DEVICES=-1 python ./internal/ml/model_slicing/algorithm/baseline.py /hdd1/sams/tensor_log/adult/Ednn_K16_alpha2-5 --device cpu --dataset adult --batch_size 100000 --col_cardinalities_file data/adult_col_cardinalities --target_batch 100000
+
+CUDA_VISIBLE_DEVICES="0" python ./internal/ml/model_slicing/algorithm/baseline.py /hdd1/sams/tensor_log/adult/Ednn_K16_alpha2-5 --device cuda:0 --dataset adult --batch_size 100000 --col_cardinalities_file data/adult_col_cardinalities --target_batch 100000
+
+CUDA_VISIBLE_DEVICES=-1 python ./internal/ml/model_slicing/algorithm/baseline_int.py /hdd1/sams/tensor_log/adult/Ednn_K16_alpha2-5 --device cpu --dataset adult --batch_size 100000 --col_cardinalities_file data/adult_col_cardinalities --target_batch 100000
+
+SELECT model_init(
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/adult_col_cardinalities',
+    '/project/tensor_log/adult/Ednn_K16_alpha2-5'
+);
+SELECT inference_shared_write_once(
+    'adult',
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/adult_col_cardinalities',
+    '/project/tensor_log/adult/Ednn_K16_alpha2-5',
+    '',
+    100000
+);
+
+SELECT model_init(
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/adult_col_cardinalities',
+    '/project/tensor_log/adult/Ednn_K16_alpha2-5'
+);
+SELECT inference_shared_write_once_int(
+    'adult',
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/adult_col_cardinalities',
+    '/project/tensor_log/adult/Ednn_K16_alpha2-5',
+    '',
+    640000
+);
+```
+
+## CVD
+```bash
+# CVD
+CUDA_VISIBLE_DEVICES=-1 python ./internal/ml/model_slicing/algorithm/baseline.py /hdd1/sams/tensor_log/cvd/dnn_K16_alpha2-5 --device cpu --dataset cvd --batch_size 100000 --col_cardinalities_file data/cvd_col_cardinalities --target_batch 100000
+
+CUDA_VISIBLE_DEVICES="0" python ./internal/ml/model_slicing/algorithm/baseline.py /hdd1/sams/tensor_log/cvd/dnn_K16_alpha2-5 --device cuda:0 --dataset cvd --batch_size 100000 --col_cardinalities_file data/cvd_col_cardinalities --target_batch 100000
+
+CUDA_VISIBLE_DEVICES=-1 python ./internal/ml/model_slicing/algorithm/baseline_int.py /hdd1/sams/tensor_log/cvd/dnn_K16_alpha2-5 --device cpu --dataset cvd --batch_size 100000 --col_cardinalities_file data/cvd_col_cardinalities --target_batch 100000
+
+
+SELECT model_init(
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/cvd_col_cardinalities',
+    '/project/tensor_log/cvd/dnn_K16_alpha2-5'
+);
+SELECT inference_shared_write_once(
+    'cvd',
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/cvd_col_cardinalities',
+    '/project/tensor_log/cvd/dnn_K16_alpha2-5',
+    '',
+    100000
+);
+
+SELECT model_init(
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/cvd_col_cardinalities',
+    '/project/tensor_log/cvd/dnn_K16_alpha2-5'
+);
+SELECT inference_shared_write_once_int(
+    'cvd',
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/cvd_col_cardinalities',
+    '/project/tensor_log/cvd/dnn_K16_alpha2-5',
+    '',
+    100000
+);
+```
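+
+The GPU runs above time the host-to-device copy and the forward pass separately. Because CUDA kernels launch asynchronously, `baseline_int.py` calls `torch.cuda.synchronize()` before reading the clock; a minimal sketch of that pattern, with illustrative shapes:
+
+```python
+import time
+
+import torch
+
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+x = torch.randint(0, 100, (100000, 10))
+
+# Host-to-device copy: synchronize so the copy has actually finished.
+begin = time.time()
+x = x.to(device)
+if "cuda" in device:
+    torch.cuda.synchronize()
+tensor_to_gpu = time.time() - begin
+
+# Compute: without the synchronize, the kernel may still be in flight.
+begin = time.time()
+y = (x * 2).sum()
+if "cuda" in device:
+    torch.cuda.synchronize()
+py_compute = time.time() - begin
+print(f"tensor_to_gpu={tensor_to_gpu:.4f}s, py_compute={py_compute:.4f}s")
+```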
+
+## Bank
+
+```bash
+# Bank
+CUDA_VISIBLE_DEVICES=-1 python ./internal/ml/model_slicing/algorithm/baseline.py /hdd1/sams/tensor_log/bank/dnn_K16_alpha2-3_beta1e-3 --device cpu --dataset bank --batch_size 100000 --col_cardinalities_file data/bank_col_cardinalities --target_batch 100000
+
+CUDA_VISIBLE_DEVICES="0" python ./internal/ml/model_slicing/algorithm/baseline.py /hdd1/sams/tensor_log/bank/dnn_K16_alpha2-3_beta1e-3 --device cuda:0 --dataset bank --batch_size 100000 --col_cardinalities_file data/bank_col_cardinalities --target_batch 100000
+
+
+CUDA_VISIBLE_DEVICES=-1 python ./internal/ml/model_slicing/algorithm/baseline_int.py /hdd1/sams/tensor_log/bank/dnn_K16_alpha2-3_beta1e-3 --device cpu --dataset bank --batch_size 100000 --col_cardinalities_file data/bank_col_cardinalities --target_batch 100000
+
+SELECT model_init(
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/bank_col_cardinalities',
+    '/project/tensor_log/bank/dnn_K16_alpha2-3_beta1e-3'
+);
+SELECT inference_shared_write_once(
+    'bank',
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/bank_col_cardinalities',
+    '/project/tensor_log/bank/dnn_K16_alpha2-3_beta1e-3',
+    '',
+    100000
+);
+
+
+SELECT model_init(
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/bank_col_cardinalities',
+    '/project/tensor_log/bank/dnn_K16_alpha2-3_beta1e-3'
+);
+SELECT inference_shared_write_once_int(
+    'bank',
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/bank_col_cardinalities',
+    '/project/tensor_log/bank/dnn_K16_alpha2-3_beta1e-3',
+    '',
+    100000
+);
+
+
+```
+
+## Census
+
+```bash
+# Census
+CUDA_VISIBLE_DEVICES=-1 python ./internal/ml/model_slicing/baseline_int.py /hdd1/sams/tensor_log/census/dnn_K16 --device cpu --dataset census --batch_size 100000 --col_cardinalities_file ./internal/ml/model_slicing/data/census_col_cardinalities --target_batch 100000
+
+SELECT model_init(
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/internal/ml/model_slicing/data/census_col_cardinalities',
+    '/project/tensor_log/census/dnn_K16'
+);
+SELECT inference_shared_write_once_int(
+    'census',
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/internal/ml/model_slicing/data/census_col_cardinalities',
+    '/project/tensor_log/census/dnn_K16',
+    '',
+    100000
+);
+```
+
+## Credit
+
+```bash
+# Credit
+CUDA_VISIBLE_DEVICES=-1 python ./internal/ml/model_slicing/baseline_int.py /hdd1/sams/tensor_log/credit/dnn_K16_epoch50 --device cpu --dataset credit --batch_size 100000 --col_cardinalities_file ./internal/ml/model_slicing/data/credit_col_cardinalities --target_batch 100000
+
+SELECT model_init(
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/internal/ml/model_slicing/data/credit_col_cardinalities',
+    '/project/tensor_log/credit/dnn_K16_epoch50'
+);
+SELECT inference_shared_write_once_int(
+    'credit',
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/internal/ml/model_slicing/data/credit_col_cardinalities',
+    '/project/tensor_log/credit/dnn_K16_epoch50',
+    '',
+    100000
+);
+```
+
+## Diabetes
+
+```bash
+# Diabetes
+CUDA_VISIBLE_DEVICES=-1 python ./internal/ml/model_slicing/baseline_int.py /hdd1/sams/tensor_log/diabetes/dnn_K16_epoch50 --device cpu --dataset diabetes --batch_size 100000 --col_cardinalities_file ./internal/ml/model_slicing/data/diabetes_col_cardinalities --target_batch 100000
+
+SELECT model_init(
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/internal/ml/model_slicing/data/diabetes_col_cardinalities',
+    '/project/tensor_log/diabetes/dnn_K16_epoch50'
+);
+SELECT inference_shared_write_once_int(
+    'diabetes',
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/internal/ml/model_slicing/data/diabetes_col_cardinalities',
+    '/project/tensor_log/diabetes/dnn_K16_epoch50',
+    '',
+    100000
+);
+```
+
+## Hcdr
+
+```bash
+# Hcdr
+CUDA_VISIBLE_DEVICES=-1 python ./internal/ml/model_slicing/baseline_int.py /hdd1/sams/tensor_log/hcdr/dnn_K16 --device cpu --dataset hcdr --batch_size 100000 --col_cardinalities_file ./internal/ml/model_slicing/data/hcdr_col_cardinalities --target_batch 100000
+
+SELECT model_init(
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/internal/ml/model_slicing/data/hcdr_col_cardinalities',
+    '/project/tensor_log/hcdr/dnn_K16'
+);
+SELECT inference_shared_write_once_int(
+    'hcdr',
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/internal/ml/model_slicing/data/hcdr_col_cardinalities',
+    '/project/tensor_log/hcdr/dnn_K16',
+    '',
+    100000
+);
+```
+
+# Data Scale
+
+```bash
+# Credit, scaled to 640k rows
+CUDA_VISIBLE_DEVICES=-1 python ./internal/ml/model_slicing/baseline_int.py /hdd1/sams/tensor_log/credit/dnn_K16_epoch50 --device cpu --dataset credit --batch_size 640000 --col_cardinalities_file ./internal/ml/model_slicing/data/credit_col_cardinalities --target_batch 640000
+
+SELECT model_init(
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/internal/ml/model_slicing/data/credit_col_cardinalities',
+    '/project/tensor_log/credit/dnn_K16_epoch50'
+);
+SELECT inference_shared_write_once_int(
+    'credit',
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/internal/ml/model_slicing/data/credit_col_cardinalities',
+    '/project/tensor_log/credit/dnn_K16_epoch50',
+    '',
+    640000
+);
+```
+
+# Micro
+
+## Profiling
+
+```bash
+CUDA_VISIBLE_DEVICES=-1 python ./internal/ml/model_slicing/algorithm/baseline.py /hdd1/sams/tensor_log/frappe/dnn_K16_alpha4 --device cpu --dataset frappe --batch_size 20000 --col_cardinalities_file frappe_col_cardinalities --target_batch 20000
+```
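+
+The profiling run above only yields the coarse wall-clock buckets that the baseline scripts print (`data_query_time`, `py_conver_to_tensor`, `tensor_to_gpu`, `py_compute`). For operator-level detail, `torch.profiler` can wrap the forward pass; a minimal sketch, with a stand-in model and batch:
+
+```python
+import torch
+from torch.profiler import ProfilerActivity, profile
+
+net = torch.nn.Linear(10, 2)   # stand-in for the loaded model
+x = torch.randn(20000, 10)     # stand-in for one 20k-row batch
+
+with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:
+    with torch.no_grad():
+        net(x)
+
+# Most expensive operators first, complementing the time_dict breakdown.
+print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))
+```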
+
+## Optimizations
+
+```bash
+
+# 1. with all opt
+SELECT model_init(
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/frappe_col_cardinalities',
+    '/project/tensor_log/frappe/dnn_K16_alpha4'
+);
+SELECT inference_shared_write_once(
+    'frappe',
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/frappe_col_cardinalities',
+    '/project/tensor_log/frappe/dnn_K16_alpha4',
+    '',
+    100000
+);
+
+# 2. w/o model cache
+SELECT inference_shared_write_once(
+    'frappe',
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/frappe_col_cardinalities',
+    '/project/tensor_log/frappe/dnn_K16_alpha4',
+    '',
+    100000
+);
+
+# 3. w/o shared memory
+SELECT model_init(
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/frappe_col_cardinalities',
+    '/project/tensor_log/frappe/dnn_K16_alpha4'
+);
+SELECT inference(
+    'frappe',
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/frappe_col_cardinalities',
+    '/project/tensor_log/frappe/dnn_K16_alpha4',
+    '',
+    100000
+);
+
+# w/o SPI: measures the time spent when SPI is not used
+CUDA_VISIBLE_DEVICES=-1 python ./internal/ml/model_slicing/algorithm/baseline.py /hdd1/sams/tensor_log/frappe/dnn_K16_alpha4 --device cpu --dataset frappe --batch_size 100000 --col_cardinalities_file frappe_col_cardinalities --target_batch 100000
+```
+
+Int dataset
+
+```bash
+
+# 1. with all opt
+SELECT model_init(
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/internal/ml/model_slicing/data/credit_col_cardinalities',
+    '/project/tensor_log/credit/dnn_K16_epoch50'
+);
+SELECT inference_shared_write_once_int(
+    'credit',
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/internal/ml/model_slicing/data/credit_col_cardinalities',
+    '/project/tensor_log/credit/dnn_K16_epoch50',
+    '',
+    100000
+);
+
+# 2. w/o model cache
+SELECT inference_shared_write_once_int(
+    'credit',
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/internal/ml/model_slicing/data/credit_col_cardinalities',
+    '/project/tensor_log/credit/dnn_K16_epoch50',
+    '',
+    100000
+);
+
+# 3. w/o shared memory
+SELECT model_init(
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/internal/ml/model_slicing/data/credit_col_cardinalities',
+    '/project/tensor_log/credit/dnn_K16_epoch50'
+);
+SELECT inference(
+    'credit',
+    '{}',
+    '/project/Trails/internal/ml/model_selection/config.ini',
+    '/project/Trails/internal/ml/model_slicing/data/credit_col_cardinalities',
+    '/project/tensor_log/credit/dnn_K16_epoch50',
+    '',
+    100000
+);
+
+# w/o SPI: measures the time spent when SPI is not used
+CUDA_VISIBLE_DEVICES=-1 python ./internal/ml/model_slicing/algorithm/baseline.py /hdd1/sams/tensor_log/frappe/dnn_K16_alpha4 --device cpu --dataset frappe --batch_size 100000 --col_cardinalities_file frappe_col_cardinalities --target_batch 100000
+
+CUDA_VISIBLE_DEVICES=-1 python ./internal/ml/model_slicing/baseline_int.py /hdd1/sams/tensor_log/credit/dnn_K16_epoch50 --device cpu --dataset credit --batch_size 100000 --col_cardinalities_file ./internal/ml/model_slicing/data/credit_col_cardinalities --target_batch 100000
+```
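+
+The numbered variants above form an ablation: (1) all optimizations, (2) no model cache (skip `model_init`), and (3) no shared memory (plain `inference` instead of `inference_shared_write_once`). A small psycopg2 harness to time each variant end to end; connection settings mirror `baseline_int.py`, and each variant should run in a fresh session so the cache state is clean:
+
+```python
+import time
+
+import psycopg2
+
+CONFIG = "/project/Trails/internal/ml/model_selection/config.ini"
+CARD = "/project/Trails/frappe_col_cardinalities"
+MODEL = "/project/tensor_log/frappe/dnn_K16_alpha4"
+
+INIT = f"SELECT model_init('{{}}', '{CONFIG}', '{CARD}', '{MODEL}')"
+RUN = f"SELECT inference_shared_write_once('frappe', '{{}}', '{CONFIG}', '{CARD}', '{MODEL}', '', 100000)"
+RUN_NO_SHM = f"SELECT inference('frappe', '{{}}', '{CONFIG}', '{CARD}', '{MODEL}', '', 100000)"
+
+VARIANTS = {
+    "1_all_opt": [INIT, RUN],
+    "2_no_model_cache": [RUN],
+    "3_no_shared_memory": [INIT, RUN_NO_SHM],
+}
+
+with psycopg2.connect(database="pg_extension", user="postgres",
+                      host="localhost", port="28814") as conn:
+    cur = conn.cursor()
+    for name, stmts in VARIANTS.items():
+        begin = time.time()
+        for stmt in stmts:
+            cur.execute(stmt)
+            cur.fetchall()  # drain the UDF's result set
+        print(f"{name}: {time.time() - begin:.3f}s")
+```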
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/examples/model_selection/Trails/internal/ml/model_slicing/baseline_int.py b/examples/model_selection/Trails/internal/ml/model_slicing/baseline_int.py
new file mode 100644
index 0000000000..e10fd7a148
--- /dev/null
+++ b/examples/model_selection/Trails/internal/ml/model_slicing/baseline_int.py
@@ -0,0 +1,206 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+import os
+import torch
+import argparse
+from model_slicing.algorithm.src.model.sparsemax_verticalMoe import SliceModel, SparseMax_VerticalSAMS
+import time
+import psycopg2
+from model_slicing.algorithm.src.model.factory import initialize_model
+from typing import Any, List, Dict, Tuple
+import json
+
+USER = "postgres"
+HOST = "localhost"
+PORT = "28814"
+DB_NAME = "pg_extension"
+PASSWORD = "1234"  # note: not passed to psycopg2.connect below
+
+# Wall-clock buckets reported at the end of a run.
+time_dict = {
+    "load_model": 0,
+    "data_query_time": 0,
+    "py_conver_to_tensor": 0,
+    "tensor_to_gpu": 0,
+    "py_compute": 0
+}
+
+
+def read_json(file_name):
+    print(f"Loading {file_name}...")
+    is_exist = os.path.exists(file_name)
+    if is_exist:
+        with open(file_name, 'r') as readfile:
+            data = json.load(readfile)
+            return data
+    else:
+        print(f"{file_name} does not exist")
+        return {}
+
+
+def fetch_and_preprocess(conn, batch_size, database):
+    cur = conn.cursor()
+    # Read one batch of rows from the integer-encoded training table.
+    cur.execute(f"SELECT * FROM {database}_int_train LIMIT {batch_size}")
+    rows = cur.fetchall()
+    return rows
+
+
+def pre_processing(mini_batch_data: List[Tuple]):
+    """
+    mini_batch_data: list of tuples fetched from {database}_int_train,
+    laid out as (id, label, col1, ..., colN); the first two entries are
+    sliced off below so only the feature ids remain.
+    """
+    feat_id = torch.LongTensor(mini_batch_data)
+    print("feat_id = ", feat_id[:, 2:].size())
+    return {'id': feat_id[:, 2:]}
+
+
+def fetch_data(database, batch_size):
+    global time_dict
+    print("Data fetching ....")
+    begin_time = time.time()
+    with psycopg2.connect(database=DB_NAME, user=USER, host=HOST, port=PORT) as conn:
+        rows = fetch_and_preprocess(conn, batch_size, database)
+    time_dict["data_query_time"] += time.time() - begin_time
+    print(f"Data fetching done {rows[0]}, size = {len(rows)}, type = {type(rows)}, {type(rows[0])}")
+
+    print("Data preprocessing ....")
+    begin_time = time.time()
+    batch = pre_processing(rows)
+    time_dict["py_conver_to_tensor"] += time.time() - begin_time
+    print("Data preprocessing done")
+    return batch
+
+
+def load_model(tensorboard_path: str, device: str = "cuda"):
+    """
+    Args:
+        tensorboard_path: path to the tensorboard run directory that holds
+            args.txt and best_model.pth
+    """
+    arg_file_path = os.path.join(tensorboard_path, "args.txt")
+    model_config = reload_argparse(arg_file_path)
+
+    net = initialize_model(model_config)
+
+    model_pth_path = os.path.join(tensorboard_path, "best_model.pth")
+    saved_state_dict = torch.load(model_pth_path, map_location=device)
+
+    net.load_state_dict(saved_state_dict)
+    print("successfully loaded model")
+    return net, model_config
+
+
+def if_cuda_avaiable(device):
+    if "cuda" in device:
+        return True
+    else:
+        return False
+
+
+def reload_argparse(file_path: str):
+    d = {}
+
+    with open(file_path, 'r', encoding='utf-8') as f:
+        for line in f.readlines():
+            key, value = line.strip('\n').split(',')
+            try:
+                # values were serialized with str(); eval recovers numbers/bools,
+                # anything that fails to parse is kept as the raw string
+                re = eval(value)
+            except:
+                re = value
+            d[key] = re
+
+    return argparse.Namespace(**d)
+
+
+parser = argparse.ArgumentParser(description='predict FLOPS')
+parser.add_argument('path', type=str,
+                    help="directory to model file")
+parser.add_argument('--flag', '-p', action='store_true',
+                    help="whether to print profile")
+parser.add_argument('--print_net', '--b',
action='store_true', + help="print the structure of network") + +parser.add_argument('--device', type=str, default="cuda") +parser.add_argument('--dataset', type=str, default="frappe") +parser.add_argument('--target_batch', type=int, default=10000) +parser.add_argument('--batch_size', type=int, default=10000) +parser.add_argument('--col_cardinalities_file', type=str, default="path to the stored file") + +if __name__ == '__main__': + args = parser.parse_args() + path = args.path + flag = args.flag + device = torch.device(args.device) + print(path) + load_time = time.time() + net, config = load_model(path, args.device) + net: SparseMax_VerticalSAMS = net + config.workload = 'random' + time_dict["load_model"] = time.time() - load_time + + print(config.workload) + + overall_query_latency = time.time() + if config.net == "sparsemax_vertical_sams": + alpha = net.sparsemax.alpha + print(alpha) + + print() + + col_cardinalities = read_json(args.col_cardinalities_file) + target_sql = torch.tensor([col[-1] for col in col_cardinalities]).reshape(1, -1) + + net.eval() + net = net.to(device) + with torch.no_grad(): + sql = target_sql.to(device) + if config.net == "sparsemax_vertical_sams": + subnet: SliceModel = net.tailor_by_sql(sql) + subnet.to(device) + else: + subnet = net + subnet.eval() + target_list, y_list = [], [] + ops = 0 + + # default batch to 1024 + num_ite = args.target_batch // args.batch_size + print(f"num_ite = {num_ite}") + for i in range(num_ite): + # fetch from db + data_batch = fetch_data(args.dataset, args.batch_size) + print("Copy to device") + # wait for moving data to GPU + begin = time.time() + x_id = data_batch['id'].to(device) + if if_cuda_avaiable(args.device): + torch.cuda.synchronize() + time_dict["tensor_to_gpu"] += time.time() - begin + + print(f"begin to compute on {args.device}, is_cuda = {if_cuda_avaiable(args.device)}") + # compute + begin = time.time() + y = subnet(x_id, None) + if if_cuda_avaiable(args.device): + torch.cuda.synchronize() + time_dict["py_compute"] += time.time() - begin + time_dict["overall_query_latency"] = time.time() - overall_query_latency + print(time_dict) diff --git a/examples/model_selection/Trails/internal/ml/model_slicing/data/adult_col_cardinalities b/examples/model_selection/Trails/internal/ml/model_slicing/data/adult_col_cardinalities new file mode 100755 index 0000000000..5aa7be266c --- /dev/null +++ b/examples/model_selection/Trails/internal/ml/model_slicing/data/adult_col_cardinalities @@ -0,0 +1 @@ +[[1, 2, 3, 4, 5, 6, 140], [7, 8, 9, 10, 11, 12, 13, 141], [14, 15, 16, 17, 18, 19, 142], [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 143], [36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 144], [52, 53, 54, 55, 56, 57, 58, 145], [59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 146], [73, 74, 75, 76, 77, 78, 147], [79, 80, 81, 82, 83, 148], [84, 85, 149], [86, 87, 88, 89, 150], [90, 91, 92, 93, 94, 95, 96, 97, 98, 151], [99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 133, 134, 135, 136, 137, 138, 139, 152]] \ No newline at end of file diff --git a/examples/model_selection/Trails/internal/ml/model_slicing/data/bank_col_cardinalities b/examples/model_selection/Trails/internal/ml/model_slicing/data/bank_col_cardinalities new file mode 100644 index 0000000000..2d876c271a --- /dev/null +++ b/examples/model_selection/Trails/internal/ml/model_slicing/data/bank_col_cardinalities @@ -0,0 
+1 @@ +[[1, 2, 3, 4, 5, 75], [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 76], [18, 19, 20, 77], [21, 22, 23, 24, 78], [25, 26, 79], [27, 28, 29, 30, 31, 80], [32, 33, 81], [34, 35, 82], [36, 37, 38, 83], [39, 40, 41, 42, 84], [43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 85], [55, 56, 57, 58, 86], [59, 60, 61, 62, 87], [63, 64, 65, 66, 67, 88], [68, 69, 70, 89], [71, 72, 73, 74, 90]] \ No newline at end of file diff --git a/examples/model_selection/Trails/internal/ml/model_slicing/data/census_col_cardinalities b/examples/model_selection/Trails/internal/ml/model_slicing/data/census_col_cardinalities new file mode 100755 index 0000000000..ecacf07efc --- /dev/null +++ b/examples/model_selection/Trails/internal/ml/model_slicing/data/census_col_cardinalities @@ -0,0 +1 @@ +[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 532], [46, 47, 48, 49, 50, 51, 52, 53, 54, 533], [55, 56, 57, 58, 59, 60, 534], [61, 62, 535], [63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 536], [80, 81, 82, 83, 84, 85, 537], [86, 87, 88, 538], [89, 90, 91, 92, 93, 94, 95, 539], [96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 540], [120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 541], [135, 136, 137, 138, 139, 542], [140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 543], [150, 151, 544], [152, 153, 154, 545], [155, 156, 157, 158, 159, 160, 546], [161, 162, 163, 164, 165, 166, 167, 168, 547], [169, 170, 171, 172, 173, 174, 175, 548], [176, 177, 178, 179, 180, 181, 182, 549], [183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 550], [200, 201, 202, 203, 204, 205, 551], [206, 207, 208, 209, 210, 211, 552], [212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 553], [263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 554], [301, 302, 303, 304, 305, 306, 307, 308, 555], [309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 556], [320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 557], [330, 331, 332, 333, 334, 335, 336, 337, 338, 558], [339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 559], [349, 350, 351, 560], [352, 353, 354, 355, 561], [356, 357, 358, 359, 360, 361, 362, 562], [363, 364, 365, 366, 367, 563], [368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 564], [411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 565], [454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 566], [497, 498, 499, 500, 501, 567], [502, 503, 504, 568], [505, 506, 507, 569], [508, 509, 510, 570], [511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 
521, 522, 523, 524, 525, 526, 527, 528, 529, 571], [530, 531, 572]] \ No newline at end of file diff --git a/examples/model_selection/Trails/internal/ml/model_slicing/data/credit_col_cardinalities b/examples/model_selection/Trails/internal/ml/model_slicing/data/credit_col_cardinalities new file mode 100755 index 0000000000..0e8246bf81 --- /dev/null +++ b/examples/model_selection/Trails/internal/ml/model_slicing/data/credit_col_cardinalities @@ -0,0 +1 @@ +[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 348], [81, 82, 349], [83, 84, 85, 86, 87, 88, 89, 350], [90, 91, 92, 93, 351], [94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 352], [150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 353], [161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 354], [172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 355], [183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 356], [194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 357], [204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 358], [214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 359], [226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 360], [237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 361], [247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 362], [260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 363], [273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 364], [283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 365], [294, 295, 296, 297, 298, 299, 300, 301, 302, 366], [303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 367], [314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 368], [325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 369], [337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 370]] \ No newline at end of file diff --git a/examples/model_selection/Trails/internal/ml/model_slicing/data/cvd_col_cardinalities b/examples/model_selection/Trails/internal/ml/model_slicing/data/cvd_col_cardinalities new file mode 100644 index 0000000000..1464dfe8f0 --- /dev/null +++ b/examples/model_selection/Trails/internal/ml/model_slicing/data/cvd_col_cardinalities @@ -0,0 +1 @@ +[[1, 2, 102], [3, 4, 5, 6, 7, 8, 9, 103], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 104], [33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 105], [49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 106], [64, 65, 66, 107], [67, 68, 69, 108], [70, 71, 109], [72, 73, 110], [74, 75, 111], [76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 112]] \ No newline at end of file diff --git a/examples/model_selection/Trails/internal/ml/model_slicing/data/diabetes_col_cardinalities b/examples/model_selection/Trails/internal/ml/model_slicing/data/diabetes_col_cardinalities new file mode 100755 index 0000000000..1b4f12cea8 --- /dev/null +++ b/examples/model_selection/Trails/internal/ml/model_slicing/data/diabetes_col_cardinalities @@ -0,0 +1 @@ +[[0, 1, 833], [2, 3, 4, 5, 6, 7, 834], [8, 9, 10, 
835], [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 836], [21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 837], [31, 32, 33, 34, 35, 36, 37, 38, 838], [39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 839], [65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 840], [82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 841], [96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 842], [114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 155, 156, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 179, 180, 181, 182, 183, 184, 185, 186, 843], [187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 844], [305, 306, 307, 308, 309, 310, 311, 845], [312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 846], [387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 847], [426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 439, 440, 441, 442, 443, 444, 446, 447, 449, 451, 452, 453, 454, 456, 457, 458, 848], [459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 473, 474, 475, 476, 477, 478, 479, 849], [480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 850], [571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 851], [651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 
728, 729, 730, 731, 732, 733, 852], [734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 853], [750, 751, 752, 753, 854], [754, 755, 756, 757, 855], [758, 759, 760, 761, 856], [762, 763, 764, 765, 857], [766, 767, 768, 769, 858], [770, 772, 773, 859], [774, 775, 776, 777, 860], [778, 861], [780, 781, 782, 783, 862], [784, 785, 786, 787, 863], [788, 789, 864], [790, 791, 792, 793, 865], [794, 795, 796, 797, 866], [798, 799, 800, 801, 867], [802, 803, 804, 805, 868], [806, 807, 869], [808, 810, 870], [811, 871], [812, 872], [813, 814, 815, 816, 873], [817, 818, 819, 820, 874], [821, 822, 875], [823, 824, 876], [825, 826, 877], [827, 828, 878], [829, 830, 879], [831, 832, 880]] \ No newline at end of file diff --git a/examples/model_selection/Trails/internal/ml/model_slicing/data/frappe_col_cardinalities b/examples/model_selection/Trails/internal/ml/model_slicing/data/frappe_col_cardinalities new file mode 100755 index 0000000000..219e664784 --- /dev/null +++ b/examples/model_selection/Trails/internal/ml/model_slicing/data/frappe_col_cardinalities @@ -0,0 +1 @@ +[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 
512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948, 949, 950, 951, 952, 953, 954, 955, 956, 5382], [957, 958, 959, 960, 961, 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 1064, 1065, 1066, 1067, 1068, 1069, 1070, 1071, 1072, 1073, 1074, 1075, 1076, 1077, 1078, 1079, 1080, 1081, 1082, 1083, 1084, 1085, 1086, 1087, 1088, 1089, 1090, 1091, 1092, 1093, 1094, 1095, 1096, 1097, 1098, 1099, 1100, 1101, 1102, 1103, 1104, 1105, 1106, 1107, 1108, 1109, 1110, 1111, 1112, 1113, 1114, 1115, 1116, 1117, 1118, 1119, 1120, 1121, 1122, 1123, 1124, 1125, 1126, 1127, 1128, 1129, 1130, 1131, 1132, 1133, 1134, 1135, 1136, 1137, 1138, 1139, 1140, 1141, 1142, 1143, 1144, 1145, 1146, 1147, 1148, 1149, 1150, 1151, 1152, 1153, 1154, 1155, 1156, 1157, 1158, 1159, 1160, 1161, 1162, 1163, 1164, 1165, 1166, 1167, 1168, 1169, 1170, 1171, 1172, 1173, 1174, 1175, 1176, 1177, 1178, 1179, 1180, 1181, 1182, 1183, 1184, 1185, 
1186, 1187, 1188, 1189, 1190, 1191, 1192, 1193, 1194, 1195, 1196, 1197, 1198, 1199, 1200, 1201, 1202, 1203, 1204, 1205, 1206, 1207, 1208, 1209, 1210, 1211, 1212, 1213, 1214, 1215, 1216, 1217, 1218, 1219, 1220, 1221, 1222, 1223, 1224, 1225, 1226, 1227, 1228, 1229, 1230, 1231, 1232, 1233, 1234, 1235, 1236, 1237, 1238, 1239, 1240, 1241, 1242, 1243, 1244, 1245, 1246, 1247, 1248, 1249, 1250, 1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258, 1259, 1260, 1261, 1262, 1263, 1264, 1265, 1266, 1267, 1268, 1269, 1270, 1271, 1272, 1273, 1274, 1275, 1276, 1277, 1278, 1279, 1280, 1281, 1282, 1283, 1284, 1285, 1286, 1287, 1288, 1289, 1290, 1291, 1292, 1293, 1294, 1295, 1296, 1297, 1298, 1299, 1300, 1301, 1302, 1303, 1304, 1305, 1306, 1307, 1308, 1309, 1310, 1311, 1312, 1313, 1314, 1315, 1316, 1317, 1318, 1319, 1320, 1321, 1322, 1323, 1324, 1325, 1326, 1327, 1328, 1329, 1330, 1331, 1332, 1333, 1334, 1335, 1336, 1337, 1338, 1339, 1340, 1341, 1342, 1343, 1344, 1345, 1346, 1347, 1348, 1349, 1350, 1351, 1352, 1353, 1354, 1355, 1356, 1357, 1358, 1359, 1360, 1361, 1362, 1363, 1364, 1365, 1366, 1367, 1368, 1369, 1370, 1371, 1372, 1373, 1374, 1375, 1376, 1377, 1378, 1379, 1380, 1381, 1382, 1383, 1384, 1385, 1386, 1387, 1388, 1389, 1390, 1391, 1392, 1393, 1394, 1395, 1396, 1397, 1398, 1399, 1400, 1401, 1402, 1403, 1404, 1405, 1406, 1407, 1408, 1409, 1410, 1411, 1412, 1413, 1414, 1415, 1416, 1417, 1418, 1419, 1420, 1421, 1422, 1423, 1424, 1425, 1426, 1427, 1428, 1429, 1430, 1431, 1432, 1433, 1434, 1435, 1436, 1437, 1438, 1439, 1440, 1441, 1442, 1443, 1444, 1445, 1446, 1447, 1448, 1449, 1450, 1451, 1452, 1453, 1454, 1455, 1456, 1457, 1458, 1459, 1460, 1461, 1462, 1463, 1464, 1465, 1466, 1467, 1468, 1469, 1470, 1471, 1472, 1473, 1474, 1475, 1476, 1477, 1478, 1479, 1480, 1481, 1482, 1483, 1484, 1485, 1486, 1487, 1488, 1489, 1490, 1491, 1492, 1493, 1494, 1495, 1496, 1497, 1498, 1499, 1500, 1501, 1502, 1503, 1504, 1505, 1506, 1507, 1508, 1509, 1510, 1511, 1512, 1513, 1514, 1515, 1516, 1517, 1518, 1519, 1520, 1521, 1522, 1523, 1524, 1525, 1526, 1527, 1528, 1529, 1530, 1531, 1532, 1533, 1534, 1535, 1536, 1537, 1538, 1539, 1540, 1541, 1542, 1543, 1544, 1545, 1546, 1547, 1548, 1549, 1550, 1551, 1552, 1553, 1554, 1555, 1556, 1557, 1558, 1559, 1560, 1561, 1562, 1563, 1564, 1565, 1566, 1567, 1568, 1569, 1570, 1571, 1572, 1573, 1574, 1575, 1576, 1577, 1578, 1579, 1580, 1581, 1582, 1583, 1584, 1585, 1586, 1587, 1588, 1589, 1590, 1591, 1592, 1593, 1594, 1595, 1596, 1597, 1598, 1599, 1600, 1601, 1602, 1603, 1604, 1605, 1606, 1607, 1608, 1609, 1610, 1611, 1612, 1613, 1614, 1615, 1616, 1617, 1618, 1619, 1620, 1621, 1622, 1623, 1624, 1625, 1626, 1627, 1628, 1629, 1630, 1631, 1632, 1633, 1634, 1635, 1636, 1637, 1638, 1639, 1640, 1641, 1642, 1643, 1644, 1645, 1646, 1647, 1648, 1649, 1650, 1651, 1652, 1653, 1654, 1655, 1656, 1657, 1658, 1659, 1660, 1661, 1662, 1663, 1664, 1665, 1666, 1667, 1668, 1669, 1670, 1671, 1672, 1673, 1674, 1675, 1676, 1677, 1678, 1679, 1680, 1681, 1682, 1683, 1684, 1685, 1686, 1687, 1688, 1689, 1690, 1691, 1692, 1693, 1694, 1695, 1696, 1697, 1698, 1699, 1700, 1701, 1702, 1703, 1704, 1705, 1706, 1707, 1708, 1709, 1710, 1711, 1712, 1713, 1714, 1715, 1716, 1717, 1718, 1719, 1720, 1721, 1722, 1723, 1724, 1725, 1726, 1727, 1728, 1729, 1730, 1731, 1732, 1733, 1734, 1735, 1736, 1737, 1738, 1739, 1740, 1741, 1742, 1743, 1744, 1745, 1746, 1747, 1748, 1749, 1750, 1751, 1752, 1753, 1754, 1755, 1756, 1757, 1758, 1759, 1760, 1761, 1762, 1763, 1764, 1765, 1766, 1767, 1768, 1769, 1770, 1771, 1772, 1773, 1774, 1775, 1776, 1777, 
1778, 1779, 1780, 1781, 1782, 1783, 1784, 1785, 1786, 1787, 1788, 1789, 1790, 1791, 1792, 1793, 1794, 1795, 1796, 1797, 1798, 1799, 1800, 1801, 1802, 1803, 1804, 1805, 1806, 1807, 1808, 1809, 1810, 1811, 1812, 1813, 1814, 1815, 1816, 1817, 1818, 1819, 1820, 1821, 1822, 1823, 1824, 1825, 1826, 1827, 1828, 1829, 1830, 1831, 1832, 1833, 1834, 1835, 1836, 1837, 1838, 1839, 1840, 1841, 1842, 1843, 1844, 1845, 1846, 1847, 1848, 1849, 1850, 1851, 1852, 1853, 1854, 1855, 1856, 1857, 1858, 1859, 1860, 1861, 1862, 1863, 1864, 1865, 1866, 1867, 1868, 1869, 1870, 1871, 1872, 1873, 1874, 1875, 1876, 1877, 1878, 1879, 1880, 1881, 1882, 1883, 1884, 1885, 1886, 1887, 1888, 1889, 1890, 1891, 1892, 1893, 1894, 1895, 1896, 1897, 1898, 1899, 1900, 1901, 1902, 1903, 1904, 1905, 1906, 1907, 1908, 1909, 1910, 1911, 1912, 1913, 1914, 1915, 1916, 1917, 1918, 1919, 1920, 1921, 1922, 1923, 1924, 1925, 1926, 1927, 1928, 1929, 1930, 1931, 1932, 1933, 1934, 1935, 1936, 1937, 1938, 1939, 1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025, 2026, 2027, 2028, 2029, 2030, 2031, 2032, 2033, 2034, 2035, 2036, 2037, 2038, 2039, 2040, 2041, 2042, 2043, 2044, 2045, 2046, 2047, 2048, 2049, 2050, 2051, 2052, 2053, 2054, 2055, 2056, 2057, 2058, 2059, 2060, 2061, 2062, 2063, 2064, 2065, 2066, 2067, 2068, 2069, 2070, 2071, 2072, 2073, 2074, 2075, 2076, 2077, 2078, 2079, 2080, 2081, 2082, 2083, 2084, 2085, 2086, 2087, 2088, 2089, 2090, 2091, 2092, 2093, 2094, 2095, 2096, 2097, 2098, 2099, 2100, 2101, 2102, 2103, 2104, 2105, 2106, 2107, 2108, 2109, 2110, 2111, 2112, 2113, 2114, 2115, 2116, 2117, 2118, 2119, 2120, 2121, 2122, 2123, 2124, 2125, 2126, 2127, 2128, 2129, 2130, 2131, 2132, 2133, 2134, 2135, 2136, 2137, 2138, 2139, 2140, 2141, 2142, 2143, 2144, 2145, 2146, 2147, 2148, 2149, 2150, 2151, 2152, 2153, 2154, 2155, 2156, 2157, 2158, 2159, 2160, 2161, 2162, 2163, 2164, 2165, 2166, 2167, 2168, 2169, 2170, 2171, 2172, 2173, 2174, 2175, 2176, 2177, 2178, 2179, 2180, 2181, 2182, 2183, 2184, 2185, 2186, 2187, 2188, 2189, 2190, 2191, 2192, 2193, 2194, 2195, 2196, 2197, 2198, 2199, 2200, 2201, 2202, 2203, 2204, 2205, 2206, 2207, 2208, 2209, 2210, 2211, 2212, 2213, 2214, 2215, 2216, 2217, 2218, 2219, 2220, 2221, 2222, 2223, 2224, 2225, 2226, 2227, 2228, 2229, 2230, 2231, 2232, 2233, 2234, 2235, 2236, 2237, 2238, 2239, 2240, 2241, 2242, 2243, 2244, 2245, 2246, 2247, 2248, 2249, 2250, 2251, 2252, 2253, 2254, 2255, 2256, 2257, 2258, 2259, 2260, 2261, 2262, 2263, 2264, 2265, 2266, 2267, 2268, 2269, 2270, 2271, 2272, 2273, 2274, 2275, 2276, 2277, 2278, 2279, 2280, 2281, 2282, 2283, 2284, 2285, 2286, 2287, 2288, 2289, 2290, 2291, 2292, 2293, 2294, 2295, 2296, 2297, 2298, 2299, 2300, 2301, 2302, 2303, 2304, 2305, 2306, 2307, 2308, 2309, 2310, 2311, 2312, 2313, 2314, 2315, 2316, 2317, 2318, 2319, 2320, 2321, 2322, 2323, 2324, 2325, 2326, 2327, 2328, 2329, 2330, 2331, 2332, 2333, 2334, 2335, 2336, 2337, 2338, 2339, 2340, 2341, 2342, 2343, 2344, 2345, 2346, 2347, 2348, 2349, 2350, 2351, 2352, 2353, 2354, 2355, 2356, 2357, 2358, 2359, 2360, 2361, 2362, 2363, 2364, 2365, 2366, 2367, 2368, 2369, 
2370, 2371, 2372, 2373, 2374, 2375, 2376, 2377, 2378, 2379, 2380, 2381, 2382, 2383, 2384, 2385, 2386, 2387, 2388, 2389, 2390, 2391, 2392, 2393, 2394, 2395, 2396, 2397, 2398, 2399, 2400, 2401, 2402, 2403, 2404, 2405, 2406, 2407, 2408, 2409, 2410, 2411, 2412, 2413, 2414, 2415, 2416, 2417, 2418, 2419, 2420, 2421, 2422, 2423, 2424, 2425, 2426, 2427, 2428, 2429, 2430, 2431, 2432, 2433, 2434, 2435, 2436, 2437, 2438, 2439, 2440, 2441, 2442, 2443, 2444, 2445, 2446, 2447, 2448, 2449, 2450, 2451, 2452, 2453, 2454, 2455, 2456, 2457, 2458, 2459, 2460, 2461, 2462, 2463, 2464, 2465, 2466, 2467, 2468, 2469, 2470, 2471, 2472, 2473, 2474, 2475, 2476, 2477, 2478, 2479, 2480, 2481, 2482, 2483, 2484, 2485, 2486, 2487, 2488, 2489, 2490, 2491, 2492, 2493, 2494, 2495, 2496, 2497, 2498, 2499, 2500, 2501, 2502, 2503, 2504, 2505, 2506, 2507, 2508, 2509, 2510, 2511, 2512, 2513, 2514, 2515, 2516, 2517, 2518, 2519, 2520, 2521, 2522, 2523, 2524, 2525, 2526, 2527, 2528, 2529, 2530, 2531, 2532, 2533, 2534, 2535, 2536, 2537, 2538, 2539, 2540, 2541, 2542, 2543, 2544, 2545, 2546, 2547, 2548, 2549, 2550, 2551, 2552, 2553, 2554, 2555, 2556, 2557, 2558, 2559, 2560, 2561, 2562, 2563, 2564, 2565, 2566, 2567, 2568, 2569, 2570, 2571, 2572, 2573, 2574, 2575, 2576, 2577, 2578, 2579, 2580, 2581, 2582, 2583, 2584, 2585, 2586, 2587, 2588, 2589, 2590, 2591, 2592, 2593, 2594, 2595, 2596, 2597, 2598, 2599, 2600, 2601, 2602, 2603, 2604, 2605, 2606, 2607, 2608, 2609, 2610, 2611, 2612, 2613, 2614, 2615, 2616, 2617, 2618, 2619, 2620, 2621, 2622, 2623, 2624, 2625, 2626, 2627, 2628, 2629, 2630, 2631, 2632, 2633, 2634, 2635, 2636, 2637, 2638, 2639, 2640, 2641, 2642, 2643, 2644, 2645, 2646, 2647, 2648, 2649, 2650, 2651, 2652, 2653, 2654, 2655, 2656, 2657, 2658, 2659, 2660, 2661, 2662, 2663, 2664, 2665, 2666, 2667, 2668, 2669, 2670, 2671, 2672, 2673, 2674, 2675, 2676, 2677, 2678, 2679, 2680, 2681, 2682, 2683, 2684, 2685, 2686, 2687, 2688, 2689, 2690, 2691, 2692, 2693, 2694, 2695, 2696, 2697, 2698, 2699, 2700, 2701, 2702, 2703, 2704, 2705, 2706, 2707, 2708, 2709, 2710, 2711, 2712, 2713, 2714, 2715, 2716, 2717, 2718, 2719, 2720, 2721, 2722, 2723, 2724, 2725, 2726, 2727, 2728, 2729, 2730, 2731, 2732, 2733, 2734, 2735, 2736, 2737, 2738, 2739, 2740, 2741, 2742, 2743, 2744, 2745, 2746, 2747, 2748, 2749, 2750, 2751, 2752, 2753, 2754, 2755, 2756, 2757, 2758, 2759, 2760, 2761, 2762, 2763, 2764, 2765, 2766, 2767, 2768, 2769, 2770, 2771, 2772, 2773, 2774, 2775, 2776, 2777, 2778, 2779, 2780, 2781, 2782, 2783, 2784, 2785, 2786, 2787, 2788, 2789, 2790, 2791, 2792, 2793, 2794, 2795, 2796, 2797, 2798, 2799, 2800, 2801, 2802, 2803, 2804, 2805, 2806, 2807, 2808, 2809, 2810, 2811, 2812, 2813, 2814, 2815, 2816, 2817, 2818, 2819, 2820, 2821, 2822, 2823, 2824, 2825, 2826, 2827, 2828, 2829, 2830, 2831, 2832, 2833, 2834, 2835, 2836, 2837, 2838, 2839, 2840, 2841, 2842, 2843, 2844, 2845, 2846, 2847, 2848, 2849, 2850, 2851, 2852, 2853, 2854, 2855, 2856, 2857, 2858, 2859, 2860, 2861, 2862, 2863, 2864, 2865, 2866, 2867, 2868, 2869, 2870, 2871, 2872, 2873, 2874, 2875, 2876, 2877, 2878, 2879, 2880, 2881, 2882, 2883, 2884, 2885, 2886, 2887, 2888, 2889, 2890, 2891, 2892, 2893, 2894, 2895, 2896, 2897, 2898, 2899, 2900, 2901, 2902, 2903, 2904, 2905, 2906, 2907, 2908, 2909, 2910, 2911, 2912, 2913, 2914, 2915, 2916, 2917, 2918, 2919, 2920, 2921, 2922, 2923, 2924, 2925, 2926, 2927, 2928, 2929, 2930, 2931, 2932, 2933, 2934, 2935, 2936, 2937, 2938, 2939, 2940, 2941, 2942, 2943, 2944, 2945, 2946, 2947, 2948, 2949, 2950, 2951, 2952, 2953, 2954, 2955, 2956, 2957, 2958, 2959, 2960, 2961, 
2962, 2963, 2964, 2965, 2966, 2967, 2968, 2969, 2970, 2971, 2972, 2973, 2974, 2975, 2976, 2977, 2978, 2979, 2980, 2981, 2982, 2983, 2984, 2985, 2986, 2987, 2988, 2989, 2990, 2991, 2992, 2993, 2994, 2995, 2996, 2997, 2998, 2999, 3000, 3001, 3002, 3003, 3004, 3005, 3006, 3007, 3008, 3009, 3010, 3011, 3012, 3013, 3014, 3015, 3016, 3017, 3018, 3019, 3020, 3021, 3022, 3023, 3024, 3025, 3026, 3027, 3028, 3029, 3030, 3031, 3032, 3033, 3034, 3035, 3036, 3037, 3038, 3039, 3040, 3041, 3042, 3043, 3044, 3045, 3046, 3047, 3048, 3049, 3050, 3051, 3052, 3053, 3054, 3055, 3056, 3057, 3058, 3059, 3060, 3061, 3062, 3063, 3064, 3065, 3066, 3067, 3068, 3069, 3070, 3071, 3072, 3073, 3074, 3075, 3076, 3077, 3078, 3079, 3080, 3081, 3082, 3083, 3084, 3085, 3086, 3087, 3088, 3089, 3090, 3091, 3092, 3093, 3094, 3095, 3096, 3097, 3098, 3099, 3100, 3101, 3102, 3103, 3104, 3105, 3106, 3107, 3108, 3109, 3110, 3111, 3112, 3113, 3114, 3115, 3116, 3117, 3118, 3119, 3120, 3121, 3122, 3123, 3124, 3125, 3126, 3127, 3128, 3129, 3130, 3131, 3132, 3133, 3134, 3135, 3136, 3137, 3138, 3139, 3140, 3141, 3142, 3143, 3144, 3145, 3146, 3147, 3148, 3149, 3150, 3151, 3152, 3153, 3154, 3155, 3156, 3157, 3158, 3159, 3160, 3161, 3162, 3163, 3164, 3165, 3166, 3167, 3168, 3169, 3170, 3171, 3172, 3173, 3174, 3175, 3176, 3177, 3178, 3179, 3180, 3181, 3182, 3183, 3184, 3185, 3186, 3187, 3188, 3189, 3190, 3191, 3192, 3193, 3194, 3195, 3196, 3197, 3198, 3199, 3200, 3201, 3202, 3203, 3204, 3205, 3206, 3207, 3208, 3209, 3210, 3211, 3212, 3213, 3214, 3215, 3216, 3217, 3218, 3219, 3220, 3221, 3222, 3223, 3224, 3225, 3226, 3227, 3228, 3229, 3230, 3231, 3232, 3233, 3234, 3235, 3236, 3237, 3238, 3239, 3240, 3241, 3242, 3243, 3244, 3245, 3246, 3247, 3248, 3249, 3250, 3251, 3252, 3253, 3254, 3255, 3256, 3257, 3258, 3259, 3260, 3261, 3262, 3263, 3264, 3265, 3266, 3267, 3268, 3269, 3270, 3271, 3272, 3273, 3274, 3275, 3276, 3277, 3278, 3279, 3280, 3281, 3282, 3283, 3284, 3285, 3286, 3287, 3288, 3289, 3290, 3291, 3292, 3293, 3294, 3295, 3296, 3297, 3298, 3299, 3300, 3301, 3302, 3303, 3304, 3305, 3306, 3307, 3308, 3309, 3310, 3311, 3312, 3313, 3314, 3315, 3316, 3317, 3318, 3319, 3320, 3321, 3322, 3323, 3324, 3325, 3326, 3327, 3328, 3329, 3330, 3331, 3332, 3333, 3334, 3335, 3336, 3337, 3338, 3339, 3340, 3341, 3342, 3343, 3344, 3345, 3346, 3347, 3348, 3349, 3350, 3351, 3352, 3353, 3354, 3355, 3356, 3357, 3358, 3359, 3360, 3361, 3362, 3363, 3364, 3365, 3366, 3367, 3368, 3369, 3370, 3371, 3372, 3373, 3374, 3375, 3376, 3377, 3378, 3379, 3380, 3381, 3382, 3383, 3384, 3385, 3386, 3387, 3388, 3389, 3390, 3391, 3392, 3393, 3394, 3395, 3396, 3397, 3398, 3399, 3400, 3401, 3402, 3403, 3404, 3405, 3406, 3407, 3408, 3409, 3410, 3411, 3412, 3413, 3414, 3415, 3416, 3417, 3418, 3419, 3420, 3421, 3422, 3423, 3424, 3425, 3426, 3427, 3428, 3429, 3430, 3431, 3432, 3433, 3434, 3435, 3436, 3437, 3438, 3439, 3440, 3441, 3442, 3443, 3444, 3445, 3446, 3447, 3448, 3449, 3450, 3451, 3452, 3453, 3454, 3455, 3456, 3457, 3458, 3459, 3460, 3461, 3462, 3463, 3464, 3465, 3466, 3467, 3468, 3469, 3470, 3471, 3472, 3473, 3474, 3475, 3476, 3477, 3478, 3479, 3480, 3481, 3482, 3483, 3484, 3485, 3486, 3487, 3488, 3489, 3490, 3491, 3492, 3493, 3494, 3495, 3496, 3497, 3498, 3499, 3500, 3501, 3502, 3503, 3504, 3505, 3506, 3507, 3508, 3509, 3510, 3511, 3512, 3513, 3514, 3515, 3516, 3517, 3518, 3519, 3520, 3521, 3522, 3523, 3524, 3525, 3526, 3527, 3528, 3529, 3530, 3531, 3532, 3533, 3534, 3535, 3536, 3537, 3538, 3539, 3540, 3541, 3542, 3543, 3544, 3545, 3546, 3547, 3548, 3549, 3550, 3551, 3552, 3553, 
3554, 3555, 3556, 3557, 3558, 3559, 3560, 3561, 3562, 3563, 3564, 3565, 3566, 3567, 3568, 3569, 3570, 3571, 3572, 3573, 3574, 3575, 3576, 3577, 3578, 3579, 3580, 3581, 3582, 3583, 3584, 3585, 3586, 3587, 3588, 3589, 3590, 3591, 3592, 3593, 3594, 3595, 3596, 3597, 3598, 3599, 3600, 3601, 3602, 3603, 3604, 3605, 3606, 3607, 3608, 3609, 3610, 3611, 3612, 3613, 3614, 3615, 3616, 3617, 3618, 3619, 3620, 3621, 3622, 3623, 3624, 3625, 3626, 3627, 3628, 3629, 3630, 3631, 3632, 3633, 3634, 3635, 3636, 3637, 3638, 3639, 3640, 3641, 3642, 3643, 3644, 3645, 3646, 3647, 3648, 3649, 3650, 3651, 3652, 3653, 3654, 3655, 3656, 3657, 3658, 3659, 3660, 3661, 3662, 3663, 3664, 3665, 3666, 3667, 3668, 3669, 3670, 3671, 3672, 3673, 3674, 3675, 3676, 3677, 3678, 3679, 3680, 3681, 3682, 3683, 3684, 3685, 3686, 3687, 3688, 3689, 3690, 3691, 3692, 3693, 3694, 3695, 3696, 3697, 3698, 3699, 3700, 3701, 3702, 3703, 3704, 3705, 3706, 3707, 3708, 3709, 3710, 3711, 3712, 3713, 3714, 3715, 3716, 3717, 3718, 3719, 3720, 3721, 3722, 3723, 3724, 3725, 3726, 3727, 3728, 3729, 3730, 3731, 3732, 3733, 3734, 3735, 3736, 3737, 3738, 3739, 3740, 3741, 3742, 3743, 3744, 3745, 3746, 3747, 3748, 3749, 3750, 3751, 3752, 3753, 3754, 3755, 3756, 3757, 3758, 3759, 3760, 3761, 3762, 3763, 3764, 3765, 3766, 3767, 3768, 3769, 3770, 3771, 3772, 3773, 3774, 3775, 3776, 3777, 3778, 3779, 3780, 3781, 3782, 3783, 3784, 3785, 3786, 3787, 3788, 3789, 3790, 3791, 3792, 3793, 3794, 3795, 3796, 3797, 3798, 3799, 3800, 3801, 3802, 3803, 3804, 3805, 3806, 3807, 3808, 3809, 3810, 3811, 3812, 3813, 3814, 3815, 3816, 3817, 3818, 3819, 3820, 3821, 3822, 3823, 3824, 3825, 3826, 3827, 3828, 3829, 3830, 3831, 3832, 3833, 3834, 3835, 3836, 3837, 3838, 3839, 3840, 3841, 3842, 3843, 3844, 3845, 3846, 3847, 3848, 3849, 3850, 3851, 3852, 3853, 3854, 3855, 3856, 3857, 3858, 3859, 3860, 3861, 3862, 3863, 3864, 3865, 3866, 3867, 3868, 3869, 3870, 3871, 3872, 3873, 3874, 3875, 3876, 3877, 3878, 3879, 3880, 3881, 3882, 3883, 3884, 3885, 3886, 3887, 3888, 3889, 3890, 3891, 3892, 3893, 3894, 3895, 3896, 3897, 3898, 3899, 3900, 3901, 3902, 3903, 3904, 3905, 3906, 3907, 3908, 3909, 3910, 3911, 3912, 3913, 3914, 3915, 3916, 3917, 3918, 3919, 3920, 3921, 3922, 3923, 3924, 3925, 3926, 3927, 3928, 3929, 3930, 3931, 3932, 3933, 3934, 3935, 3936, 3937, 3938, 3939, 3940, 3941, 3942, 3943, 3944, 3945, 3946, 3947, 3948, 3949, 3950, 3951, 3952, 3953, 3954, 3955, 3956, 3957, 3958, 3959, 3960, 3961, 3962, 3963, 3964, 3965, 3966, 3967, 3968, 3969, 3970, 3971, 3972, 3973, 3974, 3975, 3976, 3977, 3978, 3979, 3980, 3981, 3982, 3983, 3984, 3985, 3986, 3987, 3988, 3989, 3990, 3991, 3992, 3993, 3994, 3995, 3996, 3997, 3998, 3999, 4000, 4001, 4002, 4003, 4004, 4005, 4006, 4007, 4008, 4009, 4010, 4011, 4012, 4013, 4014, 4015, 4016, 4017, 4018, 4019, 4020, 4021, 4022, 4023, 4024, 4025, 4026, 4027, 4028, 4029, 4030, 4031, 4032, 4033, 4034, 4035, 4036, 4037, 4038, 4039, 4040, 4041, 4042, 4043, 4044, 4045, 4046, 4047, 4048, 4049, 4050, 4051, 4052, 4053, 4054, 4055, 4056, 4057, 4058, 4059, 4060, 4061, 4062, 4063, 4064, 4065, 4066, 4067, 4068, 4069, 4070, 4071, 4072, 4073, 4074, 4075, 4076, 4077, 4078, 4079, 4080, 4081, 4082, 4083, 4084, 4085, 4086, 4087, 4088, 4089, 4090, 4091, 4092, 4093, 4094, 4095, 4096, 4097, 4098, 4099, 4100, 4101, 4102, 4103, 4104, 4105, 4106, 4107, 4108, 4109, 4110, 4111, 4112, 4113, 4114, 4115, 4116, 4117, 4118, 4119, 4120, 4121, 4122, 4123, 4124, 4125, 4126, 4127, 4128, 4129, 4130, 4131, 4132, 4133, 4134, 4135, 4136, 4137, 4138, 4139, 4140, 4141, 4142, 4143, 4144, 4145, 
4146, 4147, 4148, 4149, 4150, 4151, 4152, 4153, 4154, 4155, 4156, 4157, 4158, 4159, 4160, 4161, 4162, 4163, 4164, 4165, 4166, 4167, 4168, 4169, 4170, 4171, 4172, 4173, 4174, 4175, 4176, 4177, 4178, 4179, 4180, 4181, 4182, 4183, 4184, 4185, 4186, 4187, 4188, 4189, 4190, 4191, 4192, 4193, 4194, 4195, 4196, 4197, 4198, 4199, 4200, 4201, 4202, 4203, 4204, 4205, 4206, 4207, 4208, 4209, 4210, 4211, 4212, 4213, 4214, 4215, 4216, 4217, 4218, 4219, 4220, 4221, 4222, 4223, 4224, 4225, 4226, 4227, 4228, 4229, 4230, 4231, 4232, 4233, 4234, 4235, 4236, 4237, 4238, 4239, 4240, 4241, 4242, 4243, 4244, 4245, 4246, 4247, 4248, 4249, 4250, 4251, 4252, 4253, 4254, 4255, 4256, 4257, 4258, 4259, 4260, 4261, 4262, 4263, 4264, 4265, 4266, 4267, 4268, 4269, 4270, 4271, 4272, 4273, 4274, 4275, 4276, 4277, 4278, 4279, 4280, 4281, 4282, 4283, 4284, 4285, 4286, 4287, 4288, 4289, 4290, 4291, 4292, 4293, 4294, 4295, 4296, 4297, 4298, 4299, 4300, 4301, 4302, 4303, 4304, 4305, 4306, 4307, 4308, 4309, 4310, 4311, 4312, 4313, 4314, 4315, 4316, 4317, 4318, 4319, 4320, 4321, 4322, 4323, 4324, 4325, 4326, 4327, 4328, 4329, 4330, 4331, 4332, 4333, 4334, 4335, 4336, 4337, 4338, 4339, 4340, 4341, 4342, 4343, 4344, 4345, 4346, 4347, 4348, 4349, 4350, 4351, 4352, 4353, 4354, 4355, 4356, 4357, 4358, 4359, 4360, 4361, 4362, 4363, 4364, 4365, 4366, 4367, 4368, 4369, 4370, 4371, 4372, 4373, 4374, 4375, 4376, 4377, 4378, 4379, 4380, 4381, 4382, 4383, 4384, 4385, 4386, 4387, 4388, 4389, 4390, 4391, 4392, 4393, 4394, 4395, 4396, 4397, 4398, 4399, 4400, 4401, 4402, 4403, 4404, 4405, 4406, 4407, 4408, 4409, 4410, 4411, 4412, 4413, 4414, 4415, 4416, 4417, 4418, 4419, 4420, 4421, 4422, 4423, 4424, 4425, 4426, 4427, 4428, 4429, 4430, 4431, 4432, 4433, 4434, 4435, 4436, 4437, 4438, 4439, 4440, 4441, 4442, 4443, 4444, 4445, 4446, 4447, 4448, 4449, 4450, 4451, 4452, 4453, 4454, 4455, 4456, 4457, 4458, 4459, 4460, 4461, 4462, 4463, 4464, 4465, 4466, 4467, 4468, 4469, 4470, 4471, 4472, 4473, 4474, 4475, 4476, 4477, 4478, 4479, 4480, 4481, 4482, 4483, 4484, 4485, 4486, 4487, 4488, 4489, 4490, 4491, 4492, 4493, 4494, 4495, 4496, 4497, 4498, 4499, 4500, 4501, 4502, 4503, 4504, 4505, 4506, 4507, 4508, 4509, 4510, 4511, 4512, 4513, 4514, 4515, 4516, 4517, 4518, 4519, 4520, 4521, 4522, 4523, 4524, 4525, 4526, 4527, 4528, 4529, 4530, 4531, 4532, 4533, 4534, 4535, 4536, 4537, 4538, 4539, 4540, 4541, 4542, 4543, 4544, 4545, 4546, 4547, 4548, 4549, 4550, 4551, 4552, 4553, 4554, 4555, 4556, 4557, 4558, 4559, 4560, 4561, 4562, 4563, 4564, 4565, 4566, 4567, 4568, 4569, 4570, 4571, 4572, 4573, 4574, 4575, 4576, 4577, 4578, 4579, 4580, 4581, 4582, 4583, 4584, 4585, 4586, 4587, 4588, 4589, 4590, 4591, 4592, 4593, 4594, 4595, 4596, 4597, 4598, 4599, 4600, 4601, 4602, 4603, 4604, 4605, 4606, 4607, 4608, 4609, 4610, 4611, 4612, 4613, 4614, 4615, 4616, 4617, 4618, 4619, 4620, 4621, 4622, 4623, 4624, 4625, 4626, 4627, 4628, 4629, 4630, 4631, 4632, 4633, 4634, 4635, 4636, 4637, 4638, 4639, 4640, 4641, 4642, 4643, 4644, 4645, 4646, 4647, 4648, 4649, 4650, 4651, 4652, 4653, 4654, 4655, 4656, 4657, 4658, 4659, 4660, 4661, 4662, 4663, 4664, 4665, 4666, 4667, 4668, 4669, 4670, 4671, 4672, 4673, 4674, 4675, 4676, 4677, 4678, 4679, 4680, 4681, 4682, 4683, 4684, 4685, 4686, 4687, 4688, 4689, 4690, 4691, 4692, 4693, 4694, 4695, 4696, 4697, 4698, 4699, 4700, 4701, 4702, 4703, 4704, 4705, 4706, 4707, 4708, 4709, 4710, 4711, 4712, 4713, 4714, 4715, 4716, 4717, 4718, 4719, 4720, 4721, 4722, 4723, 4724, 4725, 4726, 4727, 4728, 4729, 4730, 4731, 4732, 4733, 4734, 4735, 4736, 4737, 
4738, 4739, 4740, 4741, 4742, 4743, 4744, 4745, 4746, 4747, 4748, 4749, 4750, 4751, 4752, 4753, 4754, 4755, 4756, 4757, 4758, 4759, 4760, 4761, 4762, 4763, 4764, 4765, 4766, 4767, 4768, 4769, 4770, 4771, 4772, 4773, 4774, 4775, 4776, 4777, 4778, 4779, 4780, 4781, 4782, 4783, 4784, 4785, 4786, 4787, 4788, 4789, 4790, 4791, 4792, 4793, 4794, 4795, 4796, 4797, 4798, 4799, 4800, 4801, 4802, 4803, 4804, 4805, 4806, 4807, 4808, 4809, 4810, 4811, 4812, 4813, 4814, 4815, 4816, 4817, 4818, 4819, 4820, 4821, 4822, 4823, 4824, 4825, 4826, 4827, 4828, 4829, 4830, 4831, 4832, 4833, 4834, 4835, 4836, 4837, 4838, 4839, 4840, 4841, 4842, 4843, 4844, 4845, 4846, 4847, 4848, 4849, 4850, 4851, 4852, 4853, 4854, 4855, 4856, 4857, 4858, 4859, 4860, 4861, 4862, 4863, 4864, 4865, 4866, 4867, 4868, 4869, 4870, 4871, 4872, 4873, 4874, 4875, 4876, 4877, 4878, 4879, 4880, 4881, 4882, 4883, 4884, 4885, 4886, 4887, 4888, 4889, 4890, 4891, 4892, 4893, 4894, 4895, 4896, 4897, 4898, 4899, 4900, 4901, 4902, 4903, 4904, 4905, 4906, 4907, 4908, 4909, 4910, 4911, 4912, 4913, 4914, 4915, 4916, 4917, 4918, 4919, 4920, 4921, 4922, 4923, 4924, 4925, 4926, 4927, 4928, 4929, 4930, 4931, 4932, 4933, 4934, 4935, 4936, 4937, 4938, 4939, 4940, 4941, 4942, 4943, 4944, 4945, 4946, 4947, 4948, 4949, 4950, 4951, 4952, 4953, 4954, 4955, 4956, 4957, 4958, 4959, 4960, 4961, 4962, 4963, 4964, 4965, 4966, 4967, 4968, 4969, 4970, 4971, 4972, 4973, 4974, 4975, 4976, 4977, 4978, 4979, 4980, 4981, 4982, 4983, 4984, 4985, 4986, 4987, 4988, 4989, 4990, 4991, 4992, 4993, 4994, 4995, 4996, 4997, 4998, 4999, 5000, 5001, 5002, 5003, 5004, 5005, 5006, 5007, 5008, 5009, 5010, 5011, 5012, 5013, 5014, 5015, 5016, 5017, 5018, 5019, 5020, 5021, 5022, 5023, 5024, 5025, 5026, 5027, 5028, 5029, 5030, 5031, 5032, 5033, 5034, 5035, 5036, 5037, 5038, 5383], [5039, 5040, 5041, 5042, 5043, 5044, 5045, 5384], [5046, 5047, 5048, 5049, 5050, 5051, 5052, 5385], [5053, 5054, 5386], [5055, 5056, 5057, 5387], [5058, 5059, 5388], [5060, 5061, 5062, 5063, 5064, 5065, 5066, 5067, 5068, 5389], [5069, 5070, 5071, 5072, 5073, 5074, 5075, 5076, 5077, 5078, 5079, 5080, 5081, 5082, 5083, 5084, 5085, 5086, 5087, 5088, 5089, 5090, 5091, 5092, 5093, 5094, 5095, 5096, 5097, 5098, 5099, 5100, 5101, 5102, 5103, 5104, 5105, 5106, 5107, 5108, 5109, 5110, 5111, 5112, 5113, 5114, 5115, 5116, 5117, 5118, 5119, 5120, 5121, 5122, 5123, 5124, 5125, 5126, 5127, 5128, 5129, 5130, 5131, 5132, 5133, 5134, 5135, 5136, 5137, 5138, 5139, 5140, 5141, 5142, 5143, 5144, 5145, 5146, 5147, 5148, 5390], [5149, 5150, 5151, 5152, 5153, 5154, 5155, 5156, 5157, 5158, 5159, 5160, 5161, 5162, 5163, 5164, 5165, 5166, 5167, 5168, 5169, 5170, 5171, 5172, 5173, 5174, 5175, 5176, 5177, 5178, 5179, 5180, 5181, 5182, 5183, 5184, 5185, 5186, 5187, 5188, 5189, 5190, 5191, 5192, 5193, 5194, 5195, 5196, 5197, 5198, 5199, 5200, 5201, 5202, 5203, 5204, 5205, 5206, 5207, 5208, 5209, 5210, 5211, 5212, 5213, 5214, 5215, 5216, 5217, 5218, 5219, 5220, 5221, 5222, 5223, 5224, 5225, 5226, 5227, 5228, 5229, 5230, 5231, 5232, 5233, 5234, 5235, 5236, 5237, 5238, 5239, 5240, 5241, 5242, 5243, 5244, 5245, 5246, 5247, 5248, 5249, 5250, 5251, 5252, 5253, 5254, 5255, 5256, 5257, 5258, 5259, 5260, 5261, 5262, 5263, 5264, 5265, 5266, 5267, 5268, 5269, 5270, 5271, 5272, 5273, 5274, 5275, 5276, 5277, 5278, 5279, 5280, 5281, 5282, 5283, 5284, 5285, 5286, 5287, 5288, 5289, 5290, 5291, 5292, 5293, 5294, 5295, 5296, 5297, 5298, 5299, 5300, 5301, 5302, 5303, 5304, 5305, 5306, 5307, 5308, 5309, 5310, 5311, 5312, 5313, 5314, 5315, 5316, 5317, 5318, 
5319, 5320, 5321, 5322, 5323, 5324, 5325, 5326, 5327, 5328, 5329, 5330, 5331, 5332, 5333, 5334, 5335, 5336, 5337, 5338, 5339, 5340, 5341, 5342, 5343, 5344, 5345, 5346, 5347, 5348, 5349, 5350, 5351, 5352, 5353, 5354, 5355, 5356, 5357, 5358, 5359, 5360, 5361, 5362, 5363, 5364, 5365, 5366, 5367, 5368, 5369, 5370, 5371, 5372, 5373, 5374, 5375, 5376, 5377, 5378, 5379, 5380, 5381, 5391]] \ No newline at end of file diff --git a/examples/model_selection/Trails/internal/ml/model_slicing/data/hcdr_col_cardinalities b/examples/model_selection/Trails/internal/ml/model_slicing/data/hcdr_col_cardinalities new file mode 100755 index 0000000000..389a3c589c --- /dev/null +++ b/examples/model_selection/Trails/internal/ml/model_slicing/data/hcdr_col_cardinalities @@ -0,0 +1 @@ +[[0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 544], [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 545], [33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 546], [50, 51, 52, 53, 54, 55, 547], [56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 548], [78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 549], [89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 550], [103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 551], [114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 552], [130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 553], [144, 146, 147, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 554], [161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 555], [185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 556], [209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 557], [241, 242, 243, 244, 245, 246, 247, 248, 249, 558], [250, 251, 252, 253, 254, 255, 256, 257, 259, 559], [260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 560], [271, 272, 273, 274, 275, 276, 277, 278, 561], [279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 562], [293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 563], [317, 318, 319, 320, 321, 322, 323, 324, 325, 564], [326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 565], [350, 351, 566], [352, 353, 567], [354, 355, 356, 357, 358, 359, 360, 568], [361, 362, 569], [363, 364, 570], [365, 366, 571], [367, 368, 369, 370, 371, 372, 572], [373, 374, 375, 573], [376, 377, 574], [378, 379, 575], [380, 381, 382, 383, 384, 576], [385, 386, 577], [387, 388, 578], [389, 390, 391, 392, 393, 579], [394, 395, 396, 397, 398, 399, 400, 580], [401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 581], [459, 460, 582], [461, 462, 583], [463, 464, 465, 466, 467, 468, 469, 470, 471, 584], [472, 473, 474, 585], [475, 476, 586], [477, 478, 587], [479, 480, 588], [481, 482, 589], [483, 484, 590], [485, 486, 591], [487, 488, 489, 592], [490, 491, 593], [492, 493, 594], [494, 495, 595], [496, 596], [497, 498, 597], [499, 500, 598], [501, 502, 599], [503, 
504, 600], [505, 506, 601], [507, 508, 602], [509, 510, 603], [511, 512, 604], [513, 514, 605], [515, 516, 517, 518, 519, 520, 521, 522, 523, 606], [524, 525, 607], [526, 527, 608], [528, 529, 530, 531, 532, 533, 534, 609], [535, 536, 537, 538, 539, 610], [540, 541, 611], [542, 543, 612]] \ No newline at end of file diff --git a/examples/model_selection/Trails/internal/ml/model_slicing/pg_interface.py b/examples/model_selection/Trails/internal/ml/model_slicing/pg_interface.py new file mode 100644 index 0000000000..b33ad148b9 --- /dev/null +++ b/examples/model_selection/Trails/internal/ml/model_slicing/pg_interface.py @@ -0,0 +1,351 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +# set PYTHONPATH +import sys + +# Paths to add to the module search path +sys.path = [ + '/project/Trails/internal/ml/model_slicing', + '/project/Trails/internal/ml/model_slicing/algorithm', + '/project/Trails/internal/ml', + '/usr/lib/python38.zip', '/usr/lib/python3.8', '/usr/lib/python3.8/lib-dynload', '/home/postgres/.local/lib/python3.8/site-packages', '/usr/local/lib/python3.8/dist-packages', '/usr/lib/python3/dist-packages'] + +import calendar +import os +import time +import json +import traceback +import orjson +from argparse import Namespace +from model_selection.shared_config import parse_config_arguments +from multiprocessing import shared_memory +import torch +from typing import Any, List, Dict, Tuple + + +def read_json(file_name): + print(f"Loading {file_name}...") + is_exist = os.path.exists(file_name) + if is_exist: + with open(file_name, 'r') as readfile: + data = json.load(readfile) + return data + else: + print(f"{file_name} does not exist") + return {} + + +def exception_catcher(func): + def wrapper(encoded_str: str): + try: + # each wrapped function accepts a JSON string + params = json.loads(encoded_str) + config_file = params.get("config_file") + + # Parse the config file + args = parse_config_arguments(config_file) + + # Set the environment variables + ts = calendar.timegm(time.gmtime()) + os.environ.setdefault("base_dir", args.base_dir) + os.environ.setdefault("log_logger_folder_name", args.log_folder) + os.environ.setdefault("log_file_name", args.log_name + "_" + str(ts) + ".log") + + # Call the original function with the parsed parameters + return func(params, args) + except Exception as e: + return orjson.dumps( + {"Errored": traceback.format_exc()}).decode('utf-8') + + return wrapper + + +# Micro-benchmarking the filtering phase +model = None +sliced_model = None +col_cardinalities = None +time_usage_dic = {} + + +@exception_catcher +def model_inference_load_model(params: dict, args: Namespace): + global model, sliced_model, col_cardinalities + from model_selection.src.logger import logger + try: + logger.info(f"Received parameters: {params}") + + from
model_slicing.algorithm.src.data_loader import sql_attached_dataloader + from model_slicing.algorithm.profile_model_clean import load_model + # read the saved col_cardinalities file + if col_cardinalities is None: + col_cardinalities = read_json(params["col_cardinalities_file"]) + + # read the model path + model_path = params["model_path"] + + # get the where condition + where_cond = json.loads(params["where_cond"]) + # generate the default sql and the selected sql + target_sql = [col[-1] for col in col_cardinalities] + for col_index, value in where_cond.items(): + target_sql[int(col_index)] = value + logger.info(f"target_sql encoding is: {target_sql}") + + if model is None: + logger.info("Load model .....") + model, config = load_model(model_path) + model.eval() + sliced_model = model.tailor_by_sql(torch.tensor(target_sql).reshape(1, -1)) + sliced_model.eval() + logger.info("Load model Done!") + else: + logger.info("Skip Load model") + except: + logger.info(orjson.dumps( + {"Errored": traceback.format_exc()}).decode('utf-8')) + return orjson.dumps({"ok": 1}).decode('utf-8') + + +@exception_catcher +def model_inference_compute(params: dict, args: Namespace): + global model, sliced_model, col_cardinalities, time_usage_dic + from model_selection.src.logger import logger + try: + + overall_begin = time.time() + mini_batch = json.loads(params["mini_batch"]) + logger.info("-----" * 10) + + time_usage_dic = {} + + # logger.info(f"Received status: {mini_batch['status']}") + # if mini_batch["status"] != 'success': + # raise Exception + + # todo: hard-coded for the credit dataset, which has 23 fields; generalize this + mini_batch_used = [mini_batch[i:i + 23] for i in range(0, len(mini_batch), 23)] + + begin = time.time() + # pre-processing mini_batch + transformed_data = torch.LongTensor(mini_batch_used) + time_usage_dic["py_conver_to_tensor"] = time.time() - begin + + logger.info(f"transformed data size: {transformed_data.size()}") + + begin = time.time() + y = sliced_model(transformed_data, None) + time_usage_dic["py_compute"] = time.time() - begin + logger.info(f"Prediction Results = {y.tolist()[:2]}...") + + logger.info("-----" * 10) + overall_end = time.time() + time_usage_dic["py_overall_duration"] = overall_end - overall_begin + time_usage_dic["py_diff"] = time_usage_dic["py_overall_duration"] - \ + (time_usage_dic["py_conver_to_tensor"] + time_usage_dic["py_compute"]) + + logger.info(f"time usage of inference {len(transformed_data)} rows is {time_usage_dic}") + except: + logger.info(orjson.dumps( + {"Errored": traceback.format_exc()}).decode('utf-8')) + + return orjson.dumps({"model_outputs": 1}).decode('utf-8') + + +@exception_catcher +def model_inference_compute_shared_memory(params: dict, args: Namespace): + global model, sliced_model, col_cardinalities, time_usage_dic + from model_selection.src.logger import logger + try: + mini_batch_shared = get_data_from_shared_memory() + logger.info(f"mini_batch_shared: {mini_batch_shared[:100]}") + + overall_begin = time.time() + mini_batch = json.loads(mini_batch_shared) + logger.info("-----" * 10) + + time_usage_dic = {} + + logger.info(f"Received status: {mini_batch['status']}") + if mini_batch["status"] != 'success': + raise Exception + + begin = time.time() + # pre-processing mini_batch + transformed_data = torch.LongTensor([ + [int(item.split(':')[0]) for item in sublist[2:]] + for sublist in mini_batch["data"]]) + time_usage_dic["py_conver_to_tensor"] = time.time() - begin + + logger.info(f"transformed data size: {len(transformed_data)}") + + begin = time.time() + y =
sliced_model(transformed_data, None) + time_usage_dic["py_compute"] = time.time() - begin + logger.info(f"Prediction Results = {y.tolist()[:2]}...") + + logger.info("-----" * 10) + overall_end = time.time() + time_usage_dic["py_overall_duration"] = overall_end - overall_begin + time_usage_dic["py_diff"] = time_usage_dic["py_overall_duration"] - \ + (time_usage_dic["py_conver_to_tensor"] + time_usage_dic["py_compute"]) + + logger.info(f"time usage of inference {len(transformed_data)} rows is {time_usage_dic}") + except: + logger.info(orjson.dumps( + {"Errored": traceback.format_exc()}).decode('utf-8')) + + return orjson.dumps({"model_outputs": 1}).decode('utf-8') + + +def decode_libsvm(columns): + map_func = lambda pair: (int(pair[0]), float(pair[1])) + # columns[0] is the id, columns[1] is the label + ids, values = zip(*map(lambda col: map_func(col.split(':')), columns[2:])) + sample = {'id': list(ids)} + return sample + + +def pre_processing(mini_batch_data: List[Tuple]): + """ + mini_batch_data: [('0', '0', '123:123', '123:123', '123:123'), ...] + """ + sample_lines = len(mini_batch_data) + feat_id = [] + feat_value = [] + y = [] + for i in range(sample_lines): + row_value = mini_batch_data[i] + sample = decode_libsvm(row_value) + feat_id.append(sample['id']) + feat_id = torch.LongTensor(feat_id) + return {'id': feat_id} + + +@exception_catcher +def model_inference_compute_shared_memory_write_once(params: dict, args: Namespace): + global model, sliced_model, col_cardinalities, time_usage_dic + from model_selection.src.logger import logger + try: + mini_batch_shared = get_data_from_shared_memory() + logger.info(f"mini_batch_shared: <-{mini_batch_shared[:50]}->, type: {type(mini_batch_shared)}") + + overall_begin = time.time() + mini_batch = json.loads(mini_batch_shared) + logger.info("-----" * 10) + + time_usage_dic = {} + + begin = time.time() + # pre-processing mini_batch + transformed_data = pre_processing(mini_batch)['id'] + time_usage_dic["py_conver_to_tensor"] = time.time() - begin + logger.info(f"transformed data size: {len(transformed_data)}") + + begin = time.time() + y = sliced_model(transformed_data, None) + time_usage_dic["py_compute"] = time.time() - begin + logger.info(f"Prediction Results = {y.tolist()[:2]}...") + + logger.info("-----" * 10) + overall_end = time.time() + time_usage_dic["py_overall_duration"] = overall_end - overall_begin + time_usage_dic["py_diff"] = time_usage_dic["py_overall_duration"] - \ + (time_usage_dic["py_conver_to_tensor"] + time_usage_dic["py_compute"]) + + logger.info(f"time usage of inference {len(transformed_data)} rows is {time_usage_dic}") + except: + logger.info(orjson.dumps( + {"Errored": traceback.format_exc()}).decode('utf-8')) + + return orjson.dumps({"model_outputs": 1}).decode('utf-8') + +@exception_catcher +def model_inference_compute_shared_memory_write_once_int(params: dict, args: Namespace): + global model, sliced_model, col_cardinalities, time_usage_dic + from model_selection.src.logger import logger + time_usage_dic = {} + + try: + mini_batch_shared = get_data_from_shared_memory_int(int(params["rows"])) + # logger.info(f"mini_batch_shared: <-{mini_batch_shared[:50]}->, type: {type(mini_batch_shared)}") + logger.info(f"mini_batch_shared: <-{mini_batch_shared}->, type: {type(mini_batch_shared)}") + + overall_begin = time.time() + logger.info("-----" * 10) + + begin = time.time() + # pre-processing mini_batch + transformed_data = torch.LongTensor(mini_batch_shared) + time_usage_dic["py_conver_to_tensor"] = time.time() - begin + logger.info(f"transformed data size:
{transformed_data.size()}") + + begin = time.time() + y = sliced_model(transformed_data, None) + time_usage_dic["py_compute"] = time.time() - begin + logger.info(f"Prediction Results = {y.tolist()[:2]}...") + + logger.info("-----" * 10) + overall_end = time.time() + time_usage_dic["py_overall_duration"] = overall_end - overall_begin + time_usage_dic["py_diff"] = time_usage_dic["py_overall_duration"] - \ + (time_usage_dic["py_conver_to_tensor"] + time_usage_dic["py_compute"]) + + logger.info(f"time usage of inference {len(transformed_data)} rows is {time_usage_dic}") + except: + logger.info(orjson.dumps( + {"Errored": traceback.format_exc()}).decode('utf-8')) + + return orjson.dumps({"model_outputs": 1}).decode('utf-8') + + +def records_results(params: str): + global time_usage_dic + from model_selection.src.logger import logger + try: + params = json.loads(params) + params.update(time_usage_dic) + logger.info(f"final result = {params}") + except: + logger.info(orjson.dumps( + {"Errored": traceback.format_exc()}).decode('utf-8')) + return orjson.dumps({"Done": 1}).decode('utf-8') + + +def get_data_from_shared_memory(shmem_name="my_shared_memory"): + # Open the existing shared memory segment by name + shm = shared_memory.SharedMemory(name=shmem_name) + # Read data + data = shm.buf.tobytes().decode() + # Close + shm.close() + return data.rstrip('\x00') + + +import numpy as np + +def get_data_from_shared_memory_int(n_rows): + # Connect to existing shared memory by name + shm = shared_memory.SharedMemory(name="my_shared_memory") + # Map the shared memory to a numpy array. Assuming i32 integers. + data = np.frombuffer(shm.buf, dtype=np.int32) + # Reshape the 1D array to have n_rows and let numpy infer the number of columns + data = data.reshape(n_rows, -1) + return data + diff --git a/examples/model_selection/Trails/internal/ml/model_slicing/profile_preprocess.py b/examples/model_selection/Trails/internal/ml/model_slicing/profile_preprocess.py new file mode 100644 index 0000000000..5a0e82475b --- /dev/null +++ b/examples/model_selection/Trails/internal/ml/model_slicing/profile_preprocess.py @@ -0,0 +1,74 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +from typing import List, Tuple +import time + + +def decode_libsvm(columns): + map_func = lambda pair: (int(pair[0]), float(pair[1])) + # columns[0] is the id, columns[1] is the label + id, value = zip(*map(lambda col: map_func(col.split(':')), columns[2:])) + sample = {'id': list(id), + 'value': list(value), + 'y': int(columns[1])} + return sample + +def decode_libsvm(columns): + # Decode without additional mapping or zipping, directly processing the splits. + # (This second definition deliberately overrides the first; both variants are kept in this file for profiling.)
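+ # Editor's note (hedged): the row layout below is inferred from the sample mini_batch at the bottom of this file, not from a separate spec. + # columns[0] is the row id (e.g. '4801'), columns[1] the label (e.g. '0'), and columns[2:] are 'feature_id:value' pairs such as '2:1'. + # For example, decode_libsvm(['4801', '0', '2:1', '4656:1']) returns {'id': [2, 4656], 'value': [1.0, 1.0], 'y': 0}.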
+ ids = [] + values = [] + for col in columns[2:]: + fid, fval = col.split(':') + ids.append(int(fid)) + values.append(float(fval)) + return {'id': ids, 'value': values, 'y': int(columns[1])} + + +def pre_processing(mini_batch_data: List[Tuple]): + # Prepare storage for the results. + all_feat_ids = [] + all_feat_values = [] + all_ys = [] + + for row_value in mini_batch_data: + # Decode and extract data directly without additional unpacking. + sample = decode_libsvm(list(row_value)) + all_feat_ids.append(sample['id']) + all_feat_values.append(sample['value']) + all_ys.append(sample['y']) + + return {'id': all_feat_ids, 'value': all_feat_values, 'y': all_ys} + + +mini_batch = [ + ('4801', '0', '2:1', '4656:1', '5042:1', '5051:1', '5054:1', '5055:1', '5058:1', '5061:1', '5070:1', '5150:1'), +] + +mini_batch = mini_batch * 100000 +print(len(mini_batch)) + +begin = time.time() +pre_processing(mini_batch) +end = time.time() +print(end - begin) + + + + diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/pre_processing/pre_processing_data.sh b/examples/model_selection/Trails/internal/ml/model_slicing/save_satistics.py similarity index 54% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/pre_processing/pre_processing_data.sh rename to examples/model_selection/Trails/internal/ml/model_slicing/save_satistics.py index a54caf042e..2f77d313c2 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/scripts/pre_processing/pre_processing_data.sh +++ b/examples/model_selection/Trails/internal/ml/model_slicing/save_satistics.py @@ -16,48 +16,35 @@ # limitations under the License. # -export PYTHONPATH=$PYTHONPATH:./internal/ml/model_selection -conda activate trails - - - - -python ./internal/ml/model_selection/exps/nas_bench_tabular/4.seq_score_online.py \ - --models_explore=1000 \ - --log_name=score_based \ - --search_space=mlp_sp \ - --num_layers=4 \ - --hidden_choice_len=10 \ - --base_dir=/hdd1/xingnaili/exp_data/ \ - --num_labels=2 \ - --device=cuda:6 \ - --batch_size=32 \ - --dataset=criteo \ - --nfeat=2100000 \ - --nfield=39 \ - --nemb=10 \ - --workers=0 \ - --result_dir=./internal/ml/model_selection/exp_result/ \ - --log_folder=log_score_time_criteo > outputCriScorAll.log& - - - - - - - - - - +from main import parse_arguments, seed_everything +import os +import glob +import json +from model_slicing.algorithm.src.data_loader import SQLAttacedLibsvmDataset +def write_json(file_name, data): + print(f"writing {file_name}...") + with open(file_name, 'w') as outfile: + outfile.write(json.dumps(data)) +args = parse_arguments() +seed_everything(args.seed) +data_dir = os.path.join(args.data_dir, args.dataset) +train_file = glob.glob("%s/tr*libsvm" % data_dir)[0] +train_loader = SQLAttacedLibsvmDataset( + train_file, + args.nfield, + args.max_filter_col) +write_json( + f"{args.dataset}_col_cardinalities", + train_loader.col_cardinalities) diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/.cargo/config.toml b/examples/model_selection/Trails/internal/pg_extension/.cargo/config.toml similarity index 100% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/.cargo/config.toml rename to examples/model_selection/Trails/internal/pg_extension/.cargo/config.toml diff --git a/examples/model_selection/Trails/internal/pg_extension/.gitignore
b/examples/model_selection/Trails/internal/pg_extension/.gitignore new file mode 100644 index 0000000000..3906c33241 --- /dev/null +++ b/examples/model_selection/Trails/internal/pg_extension/.gitignore @@ -0,0 +1,6 @@ +.DS_Store +.idea/ +/target +*.iml +**/*.rs.bk +Cargo.lock diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/pg_extension.control b/examples/model_selection/Trails/internal/pg_extension/pg_extension.control similarity index 100% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/pg_extension.control rename to examples/model_selection/Trails/internal/pg_extension/pg_extension.control diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/sql/model_selection_cpu.sql b/examples/model_selection/Trails/internal/pg_extension/sql/model_selection_cpu.sql similarity index 94% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/sql/model_selection_cpu.sql rename to examples/model_selection/Trails/internal/pg_extension/sql/model_selection_cpu.sql index bcbf5c0fce..ea90dea72b 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/sql/model_selection_cpu.sql +++ b/examples/model_selection/Trails/internal/pg_extension/sql/model_selection_cpu.sql @@ -1,5 +1,5 @@ /************************************************************ -* +* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -19,6 +19,7 @@ * *************************************************************/ + CREATE OR REPLACE PROCEDURE model_selection_end2end( dataset TEXT, --dataset name @@ -40,11 +41,11 @@ BEGIN WITH batch_rows AS ( SELECT %s FROM %I - ORDER BY RANDOM() + LIMIT 3200 ) SELECT model_selection( json_agg(row_to_json(t))::text, %L, %L ) FROM batch_rows AS t', column_list, dataset, budget, config_file) INTO result_status; - RAISE NOTICE '1. model_selection result: %', result_status; + RAISE NOTICE 'model_selection result: %', result_status; END; $$; diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/sql/model_selection_cpu_workloads.sql b/examples/model_selection/Trails/internal/pg_extension/sql/model_selection_cpu_workloads.sql similarity index 99% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/sql/model_selection_cpu_workloads.sql rename to examples/model_selection/Trails/internal/pg_extension/sql/model_selection_cpu_workloads.sql index 882be67f4e..932cc33b96 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/sql/model_selection_cpu_workloads.sql +++ b/examples/model_selection/Trails/internal/pg_extension/sql/model_selection_cpu_workloads.sql @@ -1,5 +1,5 @@ /************************************************************ -* +* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information @@ -19,6 +19,7 @@ * *************************************************************/ + CREATE OR REPLACE PROCEDURE model_selection_workloads( dataset TEXT, --dataset name diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/sql/model_selection_dev.sql b/examples/model_selection/Trails/internal/pg_extension/sql/model_selection_dev.sql similarity index 99% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/sql/model_selection_dev.sql rename to examples/model_selection/Trails/internal/pg_extension/sql/model_selection_dev.sql index 6a72975471..1d1f1b3e1e 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/sql/model_selection_dev.sql +++ b/examples/model_selection/Trails/internal/pg_extension/sql/model_selection_dev.sql @@ -1,5 +1,5 @@ /************************************************************ -* +* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -19,6 +19,7 @@ * *************************************************************/ + CREATE OR REPLACE PROCEDURE model_selection_sp( dataset TEXT, --dataset name diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/sql/model_selection_trails.sql b/examples/model_selection/Trails/internal/pg_extension/sql/model_selection_trails.sql similarity index 99% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/sql/model_selection_trails.sql rename to examples/model_selection/Trails/internal/pg_extension/sql/model_selection_trails.sql index 3d91da5017..9163e4aab2 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/sql/model_selection_trails.sql +++ b/examples/model_selection/Trails/internal/pg_extension/sql/model_selection_trails.sql @@ -1,5 +1,5 @@ /************************************************************ -* +* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -19,6 +19,7 @@ * *************************************************************/ + CREATE OR REPLACE PROCEDURE model_selection_end2end( dataset TEXT, --dataset name diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/sql/model_selection_trails_workloads.sql b/examples/model_selection/Trails/internal/pg_extension/sql/model_selection_trails_workloads.sql similarity index 99% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/sql/model_selection_trails_workloads.sql rename to examples/model_selection/Trails/internal/pg_extension/sql/model_selection_trails_workloads.sql index aaf62c63d2..8a36afdaf3 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/sql/model_selection_trails_workloads.sql +++ b/examples/model_selection/Trails/internal/pg_extension/sql/model_selection_trails_workloads.sql @@ -1,5 +1,5 @@ /************************************************************ -* +* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information @@ -19,6 +19,7 @@ * *************************************************************/ + CREATE OR REPLACE PROCEDURE model_selection_workloads( dataset TEXT, --dataset name diff --git a/examples/model_selection/Trails/internal/pg_extension/sql/pg_extension--0.1.0.sql b/examples/model_selection/Trails/internal/pg_extension/sql/pg_extension--0.1.0.sql new file mode 100644 index 0000000000..ed4cd0626f --- /dev/null +++ b/examples/model_selection/Trails/internal/pg_extension/sql/pg_extension--0.1.0.sql @@ -0,0 +1,221 @@ +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +* +*************************************************************/ + + +/* +This file is auto generated by pgrx. + +The ordering of items is not stable, it is driven by a dependency graph. +*/ + +-- src/lib.rs:80 +-- pg_extension::refinement_phase +CREATE FUNCTION "refinement_phase"( + "config_file" TEXT /* alloc::string::String */ +) RETURNS TEXT /* alloc::string::String */ +IMMUTABLE STRICT PARALLEL SAFE +LANGUAGE c /* Rust */ +AS 'MODULE_PATHNAME', 'refinement_phase_wrapper'; + +-- src/lib.rs:31 +-- pg_extension::profiling_refinement_phase +CREATE FUNCTION "profiling_refinement_phase"( + "mini_batch" TEXT, /* alloc::string::String */ + "config_file" TEXT /* alloc::string::String */ +) RETURNS TEXT /* alloc::string::String */ +IMMUTABLE STRICT PARALLEL SAFE +LANGUAGE c /* Rust */ +AS 'MODULE_PATHNAME', 'profiling_refinement_phase_wrapper'; + +-- src/lib.rs:16 +-- pg_extension::profiling_filtering_phase +CREATE FUNCTION "profiling_filtering_phase"( + "mini_batch" TEXT, /* alloc::string::String */ + "config_file" TEXT /* alloc::string::String */ +) RETURNS TEXT /* alloc::string::String */ +IMMUTABLE STRICT PARALLEL SAFE +LANGUAGE c /* Rust */ +AS 'MODULE_PATHNAME', 'profiling_filtering_phase_wrapper'; + +-- src/lib.rs:110 +-- pg_extension::model_selection_workloads +CREATE FUNCTION "model_selection_workloads"( + "mini_batch" TEXT, /* alloc::string::String */ + "n" INT, /* i32 */ + "k" INT, /* i32 */ + "config_file" TEXT /* alloc::string::String */ +) RETURNS TEXT /* alloc::string::String */ +IMMUTABLE STRICT PARALLEL SAFE +LANGUAGE c /* Rust */ +AS 'MODULE_PATHNAME', 'model_selection_workloads_wrapper'; + +-- src/lib.rs:138 +-- pg_extension::model_selection_trails_workloads +CREATE FUNCTION "model_selection_trails_workloads"( + "mini_batch" TEXT, /* alloc::string::String */ + "n" INT, /* i32 */ + "k" INT, /* i32 */ + "config_file" TEXT /* alloc::string::String */ +) RETURNS TEXT /* alloc::string::String */ +IMMUTABLE STRICT PARALLEL SAFE +LANGUAGE c /* Rust */ +AS 'MODULE_PATHNAME', 'model_selection_trails_workloads_wrapper'; + +-- src/lib.rs:125 +-- 
pg_extension::model_selection_trails +CREATE FUNCTION "model_selection_trails"( + "mini_batch" TEXT, /* alloc::string::String */ + "time_budget" TEXT, /* alloc::string::String */ + "config_file" TEXT /* alloc::string::String */ +) RETURNS TEXT /* alloc::string::String */ +IMMUTABLE STRICT PARALLEL SAFE +LANGUAGE c /* Rust */ +AS 'MODULE_PATHNAME', 'model_selection_trails_wrapper'; + +-- src/lib.rs:94 +-- pg_extension::model_selection +CREATE FUNCTION "model_selection"( + "mini_batch" TEXT, /* alloc::string::String */ + "time_budget" TEXT, /* alloc::string::String */ + "config_file" TEXT /* alloc::string::String */ +) RETURNS TEXT /* alloc::string::String */ +IMMUTABLE STRICT PARALLEL SAFE +LANGUAGE c /* Rust */ +AS 'MODULE_PATHNAME', 'model_selection_wrapper'; + +-- src/lib.rs:267 +-- pg_extension::model_init +CREATE FUNCTION "model_init"( + "condition" TEXT, /* alloc::string::String */ + "config_file" TEXT, /* alloc::string::String */ + "col_cardinalities_file" TEXT, /* alloc::string::String */ + "model_path" TEXT /* alloc::string::String */ +) RETURNS TEXT /* alloc::string::String */ +IMMUTABLE STRICT PARALLEL SAFE +LANGUAGE c /* Rust */ +AS 'MODULE_PATHNAME', 'model_init_wrapper'; + +-- src/lib.rs:243 +-- pg_extension::inference_shared_write_once_int +CREATE FUNCTION "inference_shared_write_once_int"( + "dataset" TEXT, /* alloc::string::String */ + "condition" TEXT, /* alloc::string::String */ + "config_file" TEXT, /* alloc::string::String */ + "col_cardinalities_file" TEXT, /* alloc::string::String */ + "model_path" TEXT, /* alloc::string::String */ + "sql" TEXT, /* alloc::string::String */ + "batch_size" INT /* i32 */ +) RETURNS TEXT /* alloc::string::String */ +IMMUTABLE STRICT PARALLEL SAFE +LANGUAGE c /* Rust */ +AS 'MODULE_PATHNAME', 'inference_shared_write_once_int_wrapper'; + +-- src/lib.rs:220 +-- pg_extension::inference_shared_write_once +CREATE FUNCTION "inference_shared_write_once"( + "dataset" TEXT, /* alloc::string::String */ + "condition" TEXT, /* alloc::string::String */ + "config_file" TEXT, /* alloc::string::String */ + "col_cardinalities_file" TEXT, /* alloc::string::String */ + "model_path" TEXT, /* alloc::string::String */ + "sql" TEXT, /* alloc::string::String */ + "batch_size" INT /* i32 */ +) RETURNS TEXT /* alloc::string::String */ +IMMUTABLE STRICT PARALLEL SAFE +LANGUAGE c /* Rust */ +AS 'MODULE_PATHNAME', 'inference_shared_write_once_wrapper'; + +-- src/lib.rs:197 +-- pg_extension::inference_shared +CREATE FUNCTION "inference_shared"( + "dataset" TEXT, /* alloc::string::String */ + "condition" TEXT, /* alloc::string::String */ + "config_file" TEXT, /* alloc::string::String */ + "col_cardinalities_file" TEXT, /* alloc::string::String */ + "model_path" TEXT, /* alloc::string::String */ + "sql" TEXT, /* alloc::string::String */ + "batch_size" INT /* i32 */ +) RETURNS TEXT /* alloc::string::String */ +IMMUTABLE STRICT PARALLEL SAFE +LANGUAGE c /* Rust */ +AS 'MODULE_PATHNAME', 'run_inference_shared_wrapper'; + +-- src/lib.rs:174 +-- pg_extension::inference +CREATE FUNCTION "inference"( + "dataset" TEXT, /* alloc::string::String */ + "condition" TEXT, /* alloc::string::String */ + "config_file" TEXT, /* alloc::string::String */ + "col_cardinalities_file" TEXT, /* alloc::string::String */ + "model_path" TEXT, /* alloc::string::String */ + "sql" TEXT, /* alloc::string::String */ + "batch_size" INT /* i32 */ +) RETURNS TEXT /* alloc::string::String */ +IMMUTABLE STRICT PARALLEL SAFE +LANGUAGE c /* Rust */ +AS 'MODULE_PATHNAME', 'run_inference_wrapper'; + +-- 
src/lib.rs:66 +-- pg_extension::filtering_phase +CREATE FUNCTION "filtering_phase"( + "mini_batch" TEXT, /* alloc::string::String */ + "n" INT, /* i32 */ + "k" INT, /* i32 */ + "config_file" TEXT /* alloc::string::String */ +) RETURNS TEXT /* alloc::string::String */ +IMMUTABLE STRICT PARALLEL SAFE +LANGUAGE c /* Rust */ +AS 'MODULE_PATHNAME', 'filtering_phase_wrapper'; + +-- src/lib.rs:46 +-- pg_extension::coordinator +CREATE FUNCTION "coordinator"( + "time_score" TEXT, /* alloc::string::String */ + "time_train" TEXT, /* alloc::string::String */ + "time_budget" TEXT, /* alloc::string::String */ + "only_phase1" bool, /* bool */ + "config_file" TEXT /* alloc::string::String */ +) RETURNS TEXT /* alloc::string::String */ +IMMUTABLE STRICT PARALLEL SAFE +LANGUAGE c /* Rust */ +AS 'MODULE_PATHNAME', 'coordinator_wrapper'; + +-- src/lib.rs:152 +-- pg_extension::benchmark_filtering_phase_latency +CREATE FUNCTION "benchmark_filtering_phase_latency"( + "explore_models" INT, /* i32 */ + "config_file" TEXT /* alloc::string::String */ +) RETURNS TEXT /* alloc::string::String */ +IMMUTABLE STRICT PARALLEL SAFE +LANGUAGE c /* Rust */ +AS 'MODULE_PATHNAME', 'benchmark_filtering_phase_latency_wrapper'; + +-- src/lib.rs:163 +-- pg_extension::benchmark_filtering_latency_in_db +CREATE FUNCTION "benchmark_filtering_latency_in_db"( + "explore_models" INT, /* i32 */ + "dataset" TEXT, /* alloc::string::String */ + "batch_size_m" INT, /* i32 */ + "config_file" TEXT /* alloc::string::String */ +) RETURNS TEXT /* alloc::string::String */ +IMMUTABLE STRICT PARALLEL SAFE +LANGUAGE c /* Rust */ +AS 'MODULE_PATHNAME', 'benchmark_filtering_latency_in_db_wrapper'; \ No newline at end of file diff --git a/examples/model_selection/Trails/internal/pg_extension/src/bindings/inference.rs b/examples/model_selection/Trails/internal/pg_extension/src/bindings/inference.rs new file mode 100644 index 0000000000..aa6161184f --- /dev/null +++ b/examples/model_selection/Trails/internal/pg_extension/src/bindings/inference.rs @@ -0,0 +1,797 @@ +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. 
+* +*************************************************************/ + + +use serde_json::json; +use std::collections::HashMap; +use std::ffi::c_long; +use pgrx::prelude::*; +use crate::bindings::ml_register::PY_MODULE_INFERENCE; +use crate::bindings::ml_register::run_python_function; +use std::time::{Instant}; +use shared_memory::*; + + +pub fn run_inference_shared_memory( + dataset: &String, + condition: &String, + config_file: &String, + col_cardinalities_file: &String, + model_path: &String, + sql: &String, + batch_size: i32, +) -> serde_json::Value { + let mut response = HashMap::new(); + + let overall_start_time = Instant::now(); + + let mut last_id = 0; + + // Step 1: load model and columns etc + let mut task_map = HashMap::new(); + task_map.insert("where_cond", condition.clone()); + task_map.insert("config_file", config_file.clone()); + task_map.insert("col_cardinalities_file", col_cardinalities_file.clone()); + task_map.insert("model_path", model_path.clone()); + let task_json = json!(task_map).to_string(); + // this call caches state (model, columns, etc.) on the Python side + run_python_function( + &PY_MODULE_INFERENCE, + &task_json, + "model_inference_load_model"); + + let _end_time = Instant::now(); + let model_init_time = _end_time.duration_since(overall_start_time).as_secs_f64(); + response.insert("model_init_time", model_init_time.clone()); + + // Step 2: query data via SPI + let start_time = Instant::now(); + let results: Result<Vec<Vec<String>>, String> = Spi::connect(|client| { + let query = format!("SELECT * FROM {}_train {} LIMIT {}", + dataset, sql, batch_size); + let mut cursor = client.open_cursor(&query, None); + let table = match cursor.fetch(batch_size as c_long) { + Ok(table) => table, + Err(e) => return Err(e.to_string()), // Convert the error to a string and return + }; + + let mut mini_batch = Vec::new(); + + for row in table.into_iter() { + let mut each_row = Vec::new(); + // add primary key + let col0 = match row.get::<i32>(1) { + Ok(Some(val)) => { + // Update last_id with the retrieved value + if val > 100000 { + last_id = 0; + } else { + last_id = val + } + val.to_string() + } + Ok(None) => "".to_string(), // Handle the case when there's no valid value + Err(e) => e.to_string(), + }; + each_row.push(col0); + // add label + let col1 = match row.get::<i32>(2) { + Ok(val) => val.map(|i| i.to_string()).unwrap_or_default(), + Err(e) => e.to_string(), + }; + each_row.push(col1); + // add fields + let texts: Vec<String> = (3..row.columns() + 1) + .filter_map(|i| { + match row.get::<&str>(i) { + Ok(Some(s)) => Some(s.to_string()), + Ok(None) => None, + Err(e) => Some(e.to_string()), // Convert error to string + } + }).collect(); + each_row.extend(texts); + mini_batch.push(each_row) + } + // return + Ok(mini_batch) + }); + // serialize the mini-batch data + let tup_table = match results { + Ok(data) => { + serde_json::json!({ + "status": "success", + "data": data + }) + } + Err(e) => { + serde_json::json!({ + "status": "error", + "message": format!("Error while connecting: {}", e) + }) + } + }; + let mini_batch_json = tup_table.to_string(); + + let end_time = Instant::now(); + let data_query_time = end_time.duration_since(start_time).as_secs_f64(); + response.insert("data_query_time", data_query_time.clone()); + + + let start_time = Instant::now(); + // Set an identifier for the shared memory + let shmem_name = "my_shared_memory"; + let my_shmem = ShmemConf::new() + .size(tup_table.to_string().len()) + .os_id(shmem_name) + .create() + .unwrap(); + + // Use unsafe to access and write to the raw memory + let data_to_write =
mini_batch_json.as_bytes(); + unsafe { + // Get the raw pointer to the shared memory + let shmem_ptr = my_shmem.as_ptr() as *mut u8; + // Copy data into the shared memory + std::ptr::copy_nonoverlapping( + data_to_write.as_ptr(), shmem_ptr, data_to_write.len()); + } + + let end_time = Instant::now(); + let data_copy_time = end_time.duration_since(start_time).as_secs_f64(); + response.insert("data_copy", data_copy_time.clone()); + + let start_time = Instant::now(); + // Step 3: model evaluate in Python + let mut eva_task_map = HashMap::new(); + eva_task_map.insert("config_file", config_file.clone()); + eva_task_map.insert("spi_seconds", data_query_time.to_string()); + + let eva_task_json = json!(eva_task_map).to_string(); + + run_python_function( + &PY_MODULE_INFERENCE, + &eva_task_json, + "model_inference_compute_shared_memory"); + + let end_time = Instant::now(); + let python_compute_time = end_time.duration_since(start_time).as_secs_f64(); + response.insert("python_compute_time", python_compute_time.clone()); + + let overall_end_time = Instant::now(); + let overall_elapsed_time = overall_end_time.duration_since(overall_start_time).as_secs_f64(); + let diff_time = model_init_time + data_query_time + data_copy_time + python_compute_time - overall_elapsed_time; + + response.insert("overall_query_latency", overall_elapsed_time.clone()); + response.insert("diff", diff_time.clone()); + + // Step 4: Return to PostgreSQL + return serde_json::json!(response); +} + + +pub fn run_inference( + dataset: &String, + condition: &String, + config_file: &String, + col_cardinalities_file: &String, + model_path: &String, + sql: &String, + batch_size: i32, +) -> serde_json::Value { + let mut response = HashMap::new(); + + let overall_start_time = Instant::now(); + +// let mut last_id = 0; + + // Step 1: load model and columns etc + let mut task_map = HashMap::new(); + task_map.insert("where_cond", condition.clone()); + task_map.insert("config_file", config_file.clone()); + task_map.insert("col_cardinalities_file", col_cardinalities_file.clone()); + task_map.insert("model_path", model_path.clone()); + let task_json = json!(task_map).to_string(); + // this call caches state (model, columns, etc.) on the Python side + run_python_function( + &PY_MODULE_INFERENCE, + &task_json, + "model_inference_load_model"); + + let _end_time = Instant::now(); + let model_init_time = _end_time.duration_since(overall_start_time).as_secs_f64(); + response.insert("model_init_time", model_init_time.clone()); + + // Step 2: query data via SPI + let start_time = Instant::now(); + let mut all_rows = Vec::new(); + let _ = Spi::connect(|client| { + let query = format!("SELECT * FROM {}_int_train {} LIMIT {}", dataset, sql, batch_size); + let mut cursor = client.open_cursor(&query, None); + let table = match cursor.fetch(batch_size as c_long) { + Ok(table) => table, + Err(e) => return Err(e.to_string()), + }; + + let end_time = Instant::now(); + let data_query_time_spi = end_time.duration_since(start_time).as_secs_f64(); + response.insert("data_query_time_spi", data_query_time_spi); + + // todo(nl): this part must be optimized; it iterates over every row and every value.
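+ // Editor's note (hedged): the loop below flattens columns 3..=row.columns() of every row into one row-major Vec, so a batch of R rows with C feature columns yields R*C values that the Python side regroups (model_inference_compute chunks them into rows of 23 for the credit dataset). + // A sketch of a cheaper variant, assuming a known per-dataset column count `n_features` (a hypothetical constant, not defined in this file), would reserve capacity once instead of growing the Vec: + // let mut all_rows: Vec<i32> = Vec::with_capacity(batch_size as usize * n_features); + // for row in table.into_iter() { for i in 3..=row.columns() { if let Ok(Some(v)) = row.get::<i32>(i) { all_rows.push(v); } } }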
+ for row in table.into_iter() { + for i in 3..=row.columns() { + match row.get::<i32>(i) { + Ok(Some(val)) => all_rows.push(val), // Handle the case when a valid i32 is obtained + Ok(None) => { + // Handle the case when the value is missing or erroneous + // For example, you can add a default value, like -1 + all_rows.push(-1); + } + Err(e) => { + // Handle the error, e.g., log it or handle it in some way + eprintln!("Error fetching value: {:?}", e); + } + } + } + } + // Return OK or some status + Ok(()) + }); + + let mini_batch_json = serde_json::to_string(&all_rows).unwrap(); + + let end_time = Instant::now(); + let data_query_time = end_time.duration_since(start_time).as_secs_f64(); + response.insert("data_query_time", data_query_time.clone()); + + let start_time = Instant::now(); + // Step 3: model evaluate in Python + let mut eva_task_map = HashMap::new(); + eva_task_map.insert("config_file", config_file.clone()); + eva_task_map.insert("mini_batch", mini_batch_json); + eva_task_map.insert("spi_seconds", data_query_time.to_string()); + + let eva_task_json = json!(eva_task_map).to_string(); + + run_python_function( + &PY_MODULE_INFERENCE, + &eva_task_json, + "model_inference_compute"); + + let end_time = Instant::now(); + let python_compute_time = end_time.duration_since(start_time).as_secs_f64(); + response.insert("python_compute_time", python_compute_time.clone()); + + let overall_end_time = Instant::now(); + let overall_elapsed_time = overall_end_time.duration_since(overall_start_time).as_secs_f64(); + let diff_time = model_init_time + data_query_time + python_compute_time - overall_elapsed_time; + + response.insert("overall_query_latency", overall_elapsed_time.clone()); + response.insert("diff", diff_time.clone()); + + let response_json = json!(response).to_string(); + run_python_function( + &PY_MODULE_INFERENCE, + &response_json, + "records_results"); + + // Step 4: Return to PostgreSQL + return serde_json::json!(response); +} + + +pub fn run_inference_shared_memory_write_once( + dataset: &String, + condition: &String, + config_file: &String, + col_cardinalities_file: &String, + model_path: &String, + sql: &String, + batch_size: i32, +) -> serde_json::Value { + let mut response = HashMap::new(); + + let overall_start_time = Instant::now(); + + let mut last_id = 0; + + // Step 1: load model and columns etc + let mut task_map = HashMap::new(); + task_map.insert("where_cond", condition.clone()); + task_map.insert("config_file", config_file.clone()); + task_map.insert("col_cardinalities_file", col_cardinalities_file.clone()); + task_map.insert("model_path", model_path.clone()); + let task_json = json!(task_map).to_string(); + // this call caches state (model, columns, etc.) on the Python side + run_python_function( + &PY_MODULE_INFERENCE, + &task_json, + "model_inference_load_model"); + + let _end_time = Instant::now(); + let model_init_time = _end_time.duration_since(overall_start_time).as_secs_f64(); + response.insert("model_init_time", model_init_time.clone()); + + // Step 2: query data via SPI + let start_time = Instant::now(); + // Allocate shared memory in advance + // Set an identifier for the shared memory + let shmem_name = "my_shared_memory"; + + // Pre-allocate a size for shared memory (this might need some logic to determine a reasonable size) + let avg_row_size = 120; + let shmem_size = (1.5 * (avg_row_size * batch_size as usize) as f64) as usize; + let my_shmem = ShmemConf::new() + .size(shmem_size) + .os_id(shmem_name) + .create() + .unwrap(); + + let shmem_ptr = my_shmem.as_ptr() as *mut u8;
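+ // Editor's note (hedged sketch of the framing used below): rather than serializing the whole batch at once, the writer streams a JSON array into the segment by hand: one '[' byte, each row serialized with serde_json and separated by ',', then a closing ']'. The Python reader (get_data_from_shared_memory) calls json.loads on the buffer after stripping trailing '\x00' padding, which is why the segment may be larger than the payload. + // An equivalent single-shot form (simpler, but it materializes the full batch in memory first) could be: + // let payload = serde_json::to_string(&collected_rows).unwrap(); // collected_rows: Vec<Vec<String>>, a hypothetical pre-collected batch + // unsafe { std::ptr::copy_nonoverlapping(payload.as_ptr(), shmem_ptr, payload.len()); }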
+ + let end_time = Instant::now(); + let mem_allocate_time = end_time.duration_since(start_time).as_secs_f64(); + response.insert("mem_allocate_time", mem_allocate_time.clone()); + + let start_time = Instant::now(); + // Use unsafe to access and write to the raw memory + unsafe { + let _ = Spi::connect(|client| { + let query = format!("SELECT * FROM {}_train {} LIMIT {}", dataset, sql, batch_size); + let mut cursor = client.open_cursor(&query, None); + let table = match cursor.fetch(batch_size as c_long) { + Ok(table) => table, + Err(e) => return Err(e.to_string()), + }; + + let end_time = Instant::now(); + let data_query_time_spi = end_time.duration_since(start_time).as_secs_f64(); + response.insert("data_query_time_spi", data_query_time_spi.clone()); + + let mut offset = 0; // Keep track of how much we've written to shared memory + + // Write the opening square bracket + shmem_ptr.offset(offset as isize).write(b"["[0]); + offset += 1; + + let mut is_first_row = true; + for row in table.into_iter() { + + // If not the first row, write a comma before the next row's data + if !is_first_row { + shmem_ptr.offset(offset as isize).write(b","[0]); + offset += 1; + } else { + is_first_row = false; + } + + let mut each_row = Vec::new(); + // add primary key + let col0 = match row.get::<i32>(1) { + Ok(Some(val)) => { + // Update last_id with the retrieved value + if val > 100000 { + last_id = 0; + } else { + last_id = val + } + val.to_string() + } + Ok(None) => "".to_string(), // Handle the case when there's no valid value + Err(e) => e.to_string(), + }; + each_row.push(col0); + // add label + let col1 = match row.get::<i32>(2) { + Ok(val) => val.map(|i| i.to_string()).unwrap_or_default(), + Err(e) => e.to_string(), + }; + each_row.push(col1); + // add fields + let texts: Vec<String> = (3..row.columns() + 1) + .filter_map(|i| { + match row.get::<&str>(i) { + Ok(Some(s)) => Some(s.to_string()), + Ok(None) => None, + Err(e) => Some(e.to_string()), // Convert error to string + } + }).collect(); + each_row.extend(texts); + + // Serialize each row into shared memory + let serialized_row = serde_json::to_string(&each_row).unwrap(); + let bytes = serialized_row.as_bytes(); + + // Check if there's enough space left in shared memory + if offset + bytes.len() > shmem_size { + // Handle error: not enough space in shared memory + return Err("Shared memory exceeded estimated size.".to_string()); + } + + // Copy the serialized row into shared memory + std::ptr::copy_nonoverlapping(bytes.as_ptr(), + shmem_ptr.offset(offset as isize), + bytes.len()); + offset += bytes.len(); + } + // Write the closing square bracket after all rows + shmem_ptr.offset(offset as isize).write(b"]"[0]); + + // Return OK or some status + Ok(()) + }); + } + + let end_time = Instant::now(); + let data_query_time = end_time.duration_since(start_time).as_secs_f64(); + response.insert("data_query_time", data_query_time.clone()); + + let start_time = Instant::now(); + // Step 3: model evaluate in Python + let mut eva_task_map = HashMap::new(); + eva_task_map.insert("config_file", config_file.clone()); + eva_task_map.insert("spi_seconds", data_query_time.to_string()); + + let eva_task_json = json!(eva_task_map).to_string(); + + run_python_function( + &PY_MODULE_INFERENCE, + &eva_task_json, + "model_inference_compute_shared_memory_write_once"); + + let end_time = Instant::now(); + let python_compute_time = end_time.duration_since(start_time).as_secs_f64(); + response.insert("python_compute_time", python_compute_time.clone()); + + let
overall_end_time = Instant::now(); + let overall_elapsed_time = overall_end_time.duration_since(overall_start_time).as_secs_f64(); + let diff_time = model_init_time + data_query_time + python_compute_time - overall_elapsed_time; + + response.insert("overall_query_latency", overall_elapsed_time.clone()); + response.insert("diff", diff_time.clone()); + + + let response_json = json!(response).to_string(); + run_python_function( + &PY_MODULE_INFERENCE, + &response_json, + "records_results"); + + // Step 4: Return to PostgreSQL + return serde_json::json!(response); +} + + +pub fn run_inference_shared_memory_write_once_int_exp( + dataset: &String, + condition: &String, + config_file: &String, + col_cardinalities_file: &String, + model_path: &String, + sql: &String, + batch_size: i32, +) -> serde_json::Value { + let mut response = HashMap::new(); + // let mut response_log = HashMap::new(); + + let mut num_columns: i32 = 0; + match dataset.as_str() { // assuming dataset is a String + "frappe" => num_columns = 12, + "adult" => num_columns = 15, + "cvd" => num_columns = 13, + "bank" => num_columns = 18, + "census" => num_columns = 41+2, + "credit" => num_columns = 23+2, + "diabetes" => num_columns = 48+2, + "hcdr" => num_columns = 69+2, + _ => {}, + } + + let overall_start_time = Instant::now(); + + // Step 1: load model and columns etc + let mut task_map = HashMap::new(); + task_map.insert("where_cond", condition.clone()); + task_map.insert("config_file", config_file.clone()); + task_map.insert("col_cardinalities_file", col_cardinalities_file.clone()); + task_map.insert("model_path", model_path.clone()); + let task_json = json!(task_map).to_string(); + // this call caches state (model, columns, etc.) on the Python side + run_python_function( + &PY_MODULE_INFERENCE, + &task_json, + "model_inference_load_model"); + + let _end_time = Instant::now(); + let model_init_time = _end_time.duration_since(overall_start_time).as_secs_f64(); + response.insert("model_init_time", model_init_time.clone()); + + + // Step 2: query data via SPI + let start_time = Instant::now(); + let mut all_rows = Vec::new(); + + let _ = Spi::connect(|client| { + let query = format!("SELECT * FROM {}_int_train {} LIMIT {}", dataset, sql, batch_size); + let mut cursor = client.open_cursor(&query, None); + let table = match cursor.fetch(batch_size as c_long) { + Ok(table) => table, + Err(e) => return Err(e.to_string()), + }; + + let end_time = Instant::now(); + let data_query_time_spi = end_time.duration_since(start_time).as_secs_f64(); + response.insert("data_query_time_spi", data_query_time_spi); + + let mut t1: f64 = 0.0; + // todo(nl): this part must be optimized; it iterates over every row and every value.
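+ // Editor's note: this _exp variant instruments the copy loop twice on purpose: t1 (reported as data_query_time2) sums an Instant::now() pair around every single row.get call, while data_query_time3 times the whole loop once. The per-value clock reads add their own overhead, so treat data_query_time2 as an upper bound on the raw SPI getter cost and data_query_time3 as the realistic figure.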
+
+pub fn run_inference_shared_memory_write_once_int(
+    dataset: &String,
+    condition: &String,
+    config_file: &String,
+    col_cardinalities_file: &String,
+    model_path: &String,
+    sql: &String,
+    batch_size: i32,
+) -> serde_json::Value {
+    let mut response = HashMap::new();
+
+    let mut num_columns: i32 = 0;
+    match dataset.as_str() { // assuming dataset is a String
+        "frappe" => num_columns = 12,
+        "adult" => num_columns = 15,
+        "cvd" => num_columns = 13,
+        "bank" => num_columns = 18,
+        "census" => num_columns = 41 + 2,
+        "credit" => num_columns = 23 + 2,
+        "diabetes" => num_columns = 48 + 2,
+        "hcdr" => num_columns = 69 + 2,
+        _ => {},
+    }
+
+    let overall_start_time = Instant::now();
+
+    // Step 1: load the model, columns, etc.
+    let mut task_map = HashMap::new();
+    task_map.insert("where_cond", condition.clone());
+    task_map.insert("config_file", config_file.clone());
+    task_map.insert("col_cardinalities_file", col_cardinalities_file.clone());
+    task_map.insert("model_path", model_path.clone());
+    let task_json = json!(task_map).to_string();
+    // this caches the model state on the Python side
+    run_python_function(
+        &PY_MODULE_INFERENCE,
+        &task_json,
+        "model_inference_load_model");
+
+    let _end_time = Instant::now();
+    let model_init_time = _end_time.duration_since(overall_start_time).as_secs_f64();
+    response.insert("model_init_time", model_init_time.clone());
+
+
+    // Step 2: query data
+    let start_time = Instant::now();
+    let mut all_rows = Vec::new();
+    let _ = Spi::connect(|client| {
+        let query = format!("SELECT * FROM {}_int_train {} LIMIT {}", dataset, sql, batch_size);
+        let mut cursor = client.open_cursor(&query, None);
+        let table = match cursor.fetch(batch_size as c_long) {
+            Ok(table) => table,
+            Err(e) => return Err(e.to_string()),
+        };
+        let end_time = Instant::now();
+        let data_query_time_spi = end_time.duration_since(start_time).as_secs_f64();
+        response.insert("data_query_time_spi", data_query_time_spi);
+
+        // TODO: this loop should be optimized; it currently visits every cell one by one.
+        let start_time_3 = Instant::now();
+        for row in table.into_iter() {
+            for i in 3..=num_columns as usize {
+                if let Ok(Some(val)) = row.get::<i32>(i) {
+                    all_rows.push(val);
+                }
+            }
+        }
+        let end_time_min3 = Instant::now();
+        let data_query_time_min3 = end_time_min3.duration_since(start_time_3).as_secs_f64();
+        response.insert("data_type_convert_time", data_query_time_min3.clone());
+
+        // Return OK or some status
+        Ok(())
+    });
+    let end_time = Instant::now();
+    let data_query_time = end_time.duration_since(start_time).as_secs_f64();
+    response.insert("data_query_time", data_query_time.clone());
+
+
+    // log the queried data
+    // let serialized_row = serde_json::to_string(&all_rows).unwrap();
+    // response_log.insert("query_data", serialized_row);
+
+    // Step 3: put all the data into shared memory
+    let start_time = Instant::now();
+    let shmem_name = "my_shared_memory";
+    let my_shmem = ShmemConf::new()
+        .size(4 * all_rows.len())
+        .os_id(shmem_name)
+        .create()
+        .unwrap();
+    let shmem_ptr = my_shmem.as_ptr() as *mut i32;
+
+    unsafe {
+        // Copy data into shared memory
+        std::ptr::copy_nonoverlapping(
+            all_rows.as_ptr(),
+            shmem_ptr as *mut i32,
+            all_rows.len(),
+        );
+    }
+    let end_time = Instant::now();
+    let mem_allocate_time = end_time.duration_since(start_time).as_secs_f64();
+    response.insert("mem_allocate_time", mem_allocate_time.clone());
+
+
+    let start_time = Instant::now();
+    // Step 4: model evaluation in Python
+    let mut eva_task_map = HashMap::new();
+    eva_task_map.insert("config_file", config_file.clone());
+    eva_task_map.insert("spi_seconds", data_query_time.to_string());
+    eva_task_map.insert("rows", batch_size.to_string());
+
+    let eva_task_json = json!(eva_task_map).to_string();
+
+    run_python_function(
+        &PY_MODULE_INFERENCE,
+        &eva_task_json,
+        "model_inference_compute_shared_memory_write_once_int");
+
+    let end_time = Instant::now();
+    let python_compute_time = end_time.duration_since(start_time).as_secs_f64();
+    response.insert("python_compute_time", python_compute_time.clone());
+
+    let overall_end_time = Instant::now();
+    let overall_elapsed_time = overall_end_time.duration_since(overall_start_time).as_secs_f64();
+    let diff_time = model_init_time + data_query_time + python_compute_time - overall_elapsed_time;
+
+    response.insert("overall_query_latency", overall_elapsed_time.clone());
+    response.insert("diff", diff_time.clone());
+
+    let response_json = json!(response).to_string();
+    run_python_function(
+        &PY_MODULE_INFERENCE,
+        &response_json,
+        "records_results");
+
+    // Step 5: return to PostgreSQL
+    return serde_json::json!(response);
+}
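Because this variant ships raw `i32`s instead of JSON, the Python consumer (`model_inference_compute_shared_memory_write_once_int`) can view the segment as a typed array. Here is a sketch under the assumption that the reader knows `batch_size` and the per-row width (the loop above pushes `num_columns - 2` values per row); the function and segment names are illustrative.

```python
# Hypothetical reader for the i32 layout written above; the reshape and names
# are inferred from the writer, not code from this patch.
import numpy as np
from multiprocessing import shared_memory

def read_int_batch(batch_size: int, n_cols: int,
                   segment_name: str = "my_shared_memory") -> np.ndarray:
    shm = shared_memory.SharedMemory(name=segment_name)
    try:
        n_values = batch_size * n_cols  # columns 3..=num_columns of each row
        flat = np.frombuffer(shm.buf, dtype=np.int32, count=n_values)
        return flat.reshape(batch_size, n_cols).copy()  # copy out of the segment
    finally:
        shm.close()
```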
+
+
+pub fn init_model(
+    condition: &String,
+    config_file: &String,
+    col_cardinalities_file: &String,
+    model_path: &String,
+) -> serde_json::Value {
+    let overall_start_time = Instant::now();
+    // Step 1: load the model, columns, etc.
+    let mut task_map = HashMap::new();
+    task_map.insert("where_cond", condition.clone());
+    task_map.insert("config_file", config_file.clone());
+    task_map.insert("col_cardinalities_file", col_cardinalities_file.clone());
+    task_map.insert("model_path", model_path.clone());
+    let task_json = json!(task_map).to_string();
+    // this caches the model state on the Python side
+    run_python_function(
+        &PY_MODULE_INFERENCE,
+        &task_json,
+        "model_inference_load_model");
+    let _end_time = Instant::now();
+    let model_init_time = _end_time.duration_since(overall_start_time).as_secs_f64();
+    return serde_json::json!(model_init_time);
+}
\ No newline at end of file
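Once installed, these bindings are exposed as SQL functions through the `#[pg_extern]` wrappers in `lib.rs` below. A minimal sketch of driving them from Python with `psycopg2` (pinned in `requirement.txt`); the connection settings, paths, and argument values are placeholders, not taken from this patch.

```python
# Illustrative client only; host, database, paths, and arguments are placeholders.
import psycopg2

conn = psycopg2.connect(host="localhost", user="postgres", dbname="postgres")
with conn, conn.cursor() as cur:
    # warm the cached Python-side model state once
    cur.execute(
        "SELECT model_init(%s, %s, %s, %s)",
        ("", "/path/to/config.ini", "/path/to/col_cardinalities", "/path/to/model"))
    # run batched inference over the integer-encoded table
    cur.execute(
        "SELECT inference_shared_write_once_int(%s, %s, %s, %s, %s, %s, %s)",
        ("frappe", "", "/path/to/config.ini", "/path/to/col_cardinalities",
         "/path/to/model", "", 32))
    print(cur.fetchone()[0])  # JSON string with the timing breakdown
```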
diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/src/bindings/ml_register.rs b/examples/model_selection/Trails/internal/pg_extension/src/bindings/ml_register.rs
similarity index 75%
rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/src/bindings/ml_register.rs
rename to examples/model_selection/Trails/internal/pg_extension/src/bindings/ml_register.rs
index 5ca5396533..617edf52fc 100644
--- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/src/bindings/ml_register.rs
+++ b/examples/model_selection/Trails/internal/pg_extension/src/bindings/ml_register.rs
@@ -1,5 +1,5 @@
 /************************************************************
-*
+*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
@@ -19,10 +19,13 @@
 *
 *************************************************************/
+
 use log::error;
 use once_cell::sync::Lazy;
 use pyo3::prelude::*;
 use pyo3::types::PyTuple;
+use std::env;
+use pyo3::types::PyList;
 
 
 pub fn run_python_function(
@@ -32,6 +35,12 @@
 ) -> serde_json::Value {
     let parameters_str = parameters.to_string();
     let results = Python::with_gil(|py| -> String {
+
+        // extend sys.path so the embedded interpreter can import its Python packages;
+        // we do this only to integrate with the polarDB env
+        let sys_module = py.import("sys").unwrap();
+        let sys_path: &PyList = sys_module.getattr("path").unwrap().downcast().unwrap();
+        sys_path.append("/home/postgres/Trails/internal/ml/model_selection/").unwrap();
+
         let run_script: Py<PyAny> = py_module.getattr(py, function_name).unwrap().into();
         let result = run_script.call1(
             py,
@@ -69,6 +78,18 @@
 });
 
 
+/*
+   Python Module Path for SAMS
+ */
+pub static PY_MODULE_INFERENCE: Lazy<Py<PyModule>> = Lazy::new(|| {
+    Python::with_gil(|py| -> Py<PyModule> {
+        let src = include_str!(concat!(
+            env!("CARGO_MANIFEST_DIR"),
+            "/../ml/model_slicing/pg_interface.py"
+        ));
+        PyModule::from_code(py, src, "", "").unwrap().into()
+    })
+});
diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/src/bindings/mod.rs b/examples/model_selection/Trails/internal/pg_extension/src/bindings/mod.rs
similarity index 95%
rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/src/bindings/mod.rs
rename to examples/model_selection/Trails/internal/pg_extension/src/bindings/mod.rs
index 4e976d605e..b7d29e6b70 100644
--- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/src/bindings/mod.rs
+++ b/examples/model_selection/Trails/internal/pg_extension/src/bindings/mod.rs
@@ -1,5 +1,5 @@
 /************************************************************
-*
+*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
@@ -19,8 +19,12 @@
 *
 *************************************************************/
+
+mod ml_register;
+
 #[cfg(feature = "python")]
 pub mod ms;
-mod ml_register;
-mod model;
+#[cfg(feature = "python")]
+pub mod inference;
+
diff --git a/examples/model_selection/Trails/internal/pg_extension/src/bindings/ms.rs b/examples/model_selection/Trails/internal/pg_extension/src/bindings/ms.rs
new file mode 100644
index 0000000000..bc8640430f
--- /dev/null
+++ b/examples/model_selection/Trails/internal/pg_extension/src/bindings/ms.rs
@@ -0,0 +1,265 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements. See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership. The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied. See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+use serde_json::json;
+use std::collections::HashMap;
+use pgrx::prelude::*;
+use crate::bindings::ml_register::PY_MODULE;
+use crate::bindings::ml_register::run_python_function;
+use std::time::{Instant};
+use shared_memory::*;
+
+pub fn profiling_filtering_phase(
+    task: &String
+) -> serde_json::Value {
+    run_python_function(&PY_MODULE, task, "profiling_filtering_phase")
+}
+
+
+pub fn profiling_refinement_phase(
+    task: &String
+) -> serde_json::Value {
+    run_python_function(&PY_MODULE, task, "profiling_refinement_phase")
+}
+
+
+pub fn coordinator(
+    task: &String
+) -> serde_json::Value {
+    run_python_function(&PY_MODULE, task, "coordinator")
+}
+
+
+pub fn filtering_phase(
+    task: &String
+) -> serde_json::Value {
+    run_python_function(&PY_MODULE, task, "filtering_phase")
+}
+
+
+pub fn refinement_phase() -> serde_json::Value {
+    let task = "refinement_phase".to_string();
+    run_python_function(&PY_MODULE, &task, "refinement_phase")
+}
+
+
+// these two run filtering + refinement inside the UDF runtime
+pub fn model_selection(
+    task: &String
+) -> serde_json::Value {
+    run_python_function(&PY_MODULE, task, "model_selection")
+}
+
+
+pub fn model_selection_workloads(
+    task: &String
+) -> serde_json::Value {
+    run_python_function(&PY_MODULE, task, "model_selection_workloads")
+}
+
+
+// these two run filtering + refinement on the GPU server
+pub fn model_selection_trails(
+    task: &String
+) -> serde_json::Value {
+    run_python_function(&PY_MODULE, task, "model_selection_trails")
+}
+
+
+pub fn model_selection_trails_workloads(
+    task: &String
+) -> serde_json::Value {
+    run_python_function(&PY_MODULE, task, "model_selection_trails_workloads")
+}
+
+// micro benchmarks
+// this queries the data of the filtering phase via SQL
+pub fn benchmark_filtering_phase_latency(
+    task: &String
+) -> serde_json::Value {
+    run_python_function(&PY_MODULE, task, "benchmark_filtering_phase_latency")
+}
+
+// this queries the data of the filtering phase via SPI
+pub fn benchmark_filtering_latency_in_db(
+    explore_models: i32, dataset: &String, batch_size_m: i32, config_file: &String) -> serde_json::Value {
+    let mut return_result = HashMap::new();
+
+
+    let mut total_columns: i32 = 0;
+    match dataset.as_str() { // assuming dataset is a String
+        "frappe" => total_columns = 12,
+        "criteo" => total_columns = 41,
+        "uci_diabetes" => total_columns = 45,
+        _ => {}
+    }
+
+    let mut num_columns: i64 = 0;
+    match dataset.as_str() { // assuming dataset is a String
+        "frappe" => num_columns = 10 * 2 + 1,
+        "criteo" => num_columns = 39 * 2 + 1,
+        "uci_diabetes" => num_columns = 43 * 2 + 1,
+        _ => {}
+    }
+
+    let batch_size: i64 = batch_size_m as i64;
+
+    let call_time_begin = Instant::now();
+    for _ in 1..=5000 {
+        run_python_function(
+            &PY_MODULE,
+            &"".to_string(),
+            "measure_call_overheads");
+    }
+    let _end_time = Instant::now();
+    let call_time = _end_time.duration_since(call_time_begin).as_secs_f64();
+    return_result.insert("call_time", call_time.to_string());
+
+
+    let overall_start_time = Instant::now();
+
+    let mut last_id = 0;
+    let mut eva_results = serde_json::Value::Null; // Initializing the eva_results
+
+    // create the shared memory segment used to pass each mini-batch to Python
+    let shmem_name = "my_shared_memory";
+    let my_shmem = ShmemConf::new()
+        .size((4 * batch_size * num_columns) as usize)
+        .os_id(shmem_name)
+        .create()
+        .unwrap();
+
+    let mut numbers: Vec<f32> = Vec::with_capacity((num_columns - 1) as usize);
+    let _ = Spi::connect(|client| {
+        for i in 1..explore_models + 1 {
+            // Step 1: initialize state in Python
+            let mut task_map = HashMap::new();
+            task_map.insert("config_file", config_file.clone());
+            task_map.insert("dataset", dataset.clone());
+            task_map.insert("eva_results", eva_results.to_string());
+            let task_json = json!(task_map).to_string();
+
+            // this caches the sampling state on the Python side
+            let sample_result = run_python_function(
+                &PY_MODULE,
+                &task_json,
+                "in_db_filtering_state_init");
+
+            // Step 2: query data via SPI
+            let start_time = Instant::now();
+            let mut mini_batch = Vec::new();
+
+            let query = format!("SELECT * FROM {}_train WHERE id > {} ORDER BY id ASC LIMIT {}", dataset, last_id, batch_size);
+            let mut cursor = client.open_cursor(&query, None);
+            let table = match cursor.fetch(batch_size) {
+                Ok(table) => table,
+                Err(e) => return Err(e.to_string()), // Convert the error to a string and return
+            };
+
+            for row in table.into_iter() {
+                // add primary key
+                let val = row.get::<i32>(1)
+                    .expect("Failed to retrieve value") // This will panic if it encounters `Err`
+                    .expect("Retrieved value is NULL"); // This will panic if it encounters `None`
+
+                if val > 80000 {
+                    last_id = 0;
+                } else {
+                    last_id = val;
+                }
+
+                // add label
+                if let Ok(Some(col1)) = row.get::<i32>(2) {
+                    mini_batch.push(col1 as f32);
+                };
+
+                numbers.clear();
+                for i in 3..=total_columns as usize {
+                    if let Some(s) = row.get::<&str>(i).ok().flatten() { // Ensuring it's Some(&str)
+                        for part in s.split(':') {
+                            match part.parse::<f32>() {
+                                Ok(num) => numbers.push(num),
+                                Err(_) => eprintln!("Failed to parse part as f32"), // Handle the error as appropriate for your application.
+                            }
+                        }
+                    }
+                }
+
+                mini_batch.extend_from_slice(&numbers);
+            }
+
+            unsafe {
+                let shmem_ptr = my_shmem.as_ptr() as *mut f32;
+                // Copy this iteration's mini-batch into shared memory, overwriting the previous one
+                std::ptr::copy_nonoverlapping(
+                    mini_batch.as_ptr(),
+                    shmem_ptr as *mut f32,
+                    mini_batch.len(),
+                );
+            }
+
+            let end_time = Instant::now();
+            let elapsed_time = end_time.duration_since(start_time);
+            let elapsed_seconds = elapsed_time.as_secs_f64();
+
+            // Step 3: model evaluation in Python
+            let mut eva_task_map = HashMap::new();
+            eva_task_map.insert("config_file", config_file.clone());
+            eva_task_map.insert("sample_result", sample_result.to_string());
+            eva_task_map.insert("spi_seconds", elapsed_seconds.to_string());
+            eva_task_map.insert("rows", batch_size.to_string());
+            eva_task_map.insert("model_index", i.to_string());
+            let eva_task_json = json!(eva_task_map).to_string();
+
+            eva_results = run_python_function(
+                &PY_MODULE,
+                &eva_task_json,
+                "in_db_filtering_evaluate");
+
+            // debug the fetched data
+            // if i == 1 {
+            //     let serialized_data = json!(mini_batch).to_string();
+            //     return_result.insert("serialized_data", serialized_data);
+            // };
+        };
+        Ok(())
+    });
+
+    let overall_end_time = Instant::now();
+    let overall_elapsed_time = overall_end_time.duration_since(overall_start_time);
+    let overall_elapsed_seconds = overall_elapsed_time.as_secs_f64();
+
+    return_result.insert("overall time usage", overall_elapsed_seconds.to_string());
+
+    let mut record_task_map = HashMap::new();
+    record_task_map.insert("config_file", config_file.clone());
+    record_task_map.insert("dataset", dataset.clone());
+    let record_task_json = json!(record_task_map).to_string();
+    run_python_function(
+        &PY_MODULE,
+        &record_task_json,
+        "records_results");
+
+    // Step 4: return to PostgreSQL
+    return serde_json::json!(return_result);
+}
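Each iteration above overwrites `my_shared_memory` with one mini-batch of `f32`s laid out row-major: the label first, then the floats parsed from the `id:value` pairs, so a row holds `num_columns = 2 * n_fields + 1` values (e.g. 21 for frappe). A sketch of how a consumer such as `in_db_filtering_evaluate` could recover the batch; the reading side shown here is an assumption, not code from this patch.

```python
# Hypothetical consumer of the f32 mini-batch; the layout is inferred from the
# Rust writer above.
import numpy as np
from multiprocessing import shared_memory

def read_mini_batch(batch_size: int, num_columns: int) -> np.ndarray:
    shm = shared_memory.SharedMemory(name="my_shared_memory")
    try:
        flat = np.frombuffer(shm.buf, dtype=np.float32,
                             count=batch_size * num_columns)
        # column 0 is the label; the rest alternate feature id / feature value
        return flat.reshape(batch_size, num_columns).copy()
    finally:
        shm.close()
```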
diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/src/lib.rs b/examples/model_selection/Trails/internal/pg_extension/src/lib.rs similarity index 71% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/src/lib.rs rename to examples/model_selection/Trails/internal/pg_extension/src/lib.rs index 5ff49a7085..e4f361a868 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/src/lib.rs +++ b/examples/model_selection/Trails/internal/pg_extension/src/lib.rs @@ -1,5 +1,5 @@ /************************************************************ -* +* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -182,11 +182,118 @@ pub fn benchmark_filtering_phase_latency(explore_models: i32, config_file: Strin #[pg_extern(immutable, parallel_safe, name = "benchmark_filtering_latency_in_db")] #[allow(unused_variables)] pub fn benchmark_filtering_latency_in_db( - explore_models: i32, dataset: String, config_file: String) -> String { - crate::bindings::ms::benchmark_filtering_latency_in_db(explore_models, &dataset, &config_file).to_string() + explore_models: i32, dataset: String, batch_size_m: i32, config_file: String) -> String { + crate::bindings::ms::benchmark_filtering_latency_in_db(explore_models, &dataset, batch_size_m ,&config_file).to_string() } +// Model Inference +#[cfg(feature = "python")] +#[pg_extern(immutable, parallel_safe, name = "inference")] +#[allow(unused_variables)] +pub fn run_inference( + dataset: String, + condition: String, + config_file: String, + col_cardinalities_file: String, + model_path: String, + sql: String, + batch_size: i32, +) -> String { + crate::bindings::inference::run_inference( + &dataset, + &condition, + &config_file, + &col_cardinalities_file, + &model_path, + &sql, + batch_size).to_string() +} +// Model Inference +#[cfg(feature = "python")] +#[pg_extern(immutable, parallel_safe, name = "inference_shared")] +#[allow(unused_variables)] +pub fn run_inference_shared( + dataset: String, + condition: String, + config_file: String, + col_cardinalities_file: String, + model_path: String, + sql: String, + batch_size: i32, +) -> String { + crate::bindings::inference::run_inference_shared_memory( + &dataset, + &condition, + &config_file, + &col_cardinalities_file, + &model_path, + &sql, + batch_size).to_string() +} +// Model Inference +#[cfg(feature = "python")] +#[pg_extern(immutable, parallel_safe, name = "inference_shared_write_once")] +#[allow(unused_variables)] +pub fn inference_shared_write_once( + dataset: String, + condition: String, + config_file: String, + col_cardinalities_file: String, + model_path: String, + sql: String, + batch_size: i32, +) -> String { + crate::bindings::inference::run_inference_shared_memory_write_once( + &dataset, + &condition, + &config_file, + &col_cardinalities_file, + &model_path, + &sql, + batch_size).to_string() +} + +// Model Inference +#[cfg(feature = "python")] +#[pg_extern(immutable, parallel_safe, name = "inference_shared_write_once_int")] +#[allow(unused_variables)] +pub fn inference_shared_write_once_int( + dataset: String, + condition: String, + config_file: String, + col_cardinalities_file: String, + model_path: String, + sql: String, + batch_size: i32, +) -> String { + crate::bindings::inference::run_inference_shared_memory_write_once_int( + &dataset, + &condition, + 
&config_file, + &col_cardinalities_file, + &model_path, + &sql, + batch_size).to_string() +} + + +// Model Inference +#[cfg(feature = "python")] +#[pg_extern(immutable, parallel_safe, name = "model_init")] +#[allow(unused_variables)] +pub fn model_init( + condition: String, + config_file: String, + col_cardinalities_file: String, + model_path: String +) -> String { + crate::bindings::inference::init_model( + &condition, + &config_file, + &col_cardinalities_file, + &model_path).to_string() +} diff --git a/examples/model_selection/Trails/internal/pg_extension/template/Cargo.pg11.toml b/examples/model_selection/Trails/internal/pg_extension/template/Cargo.pg11.toml new file mode 100644 index 0000000000..02b15a4564 --- /dev/null +++ b/examples/model_selection/Trails/internal/pg_extension/template/Cargo.pg11.toml @@ -0,0 +1,40 @@ +[package] +name = "pg_extension" +version = "0.1.0" +edition = "2021" + +[lib] +crate-type = ["cdylib"] + +[features] +default = ["pg11", "python"] +python = ["pyo3"] +pg11 = ["pgrx/pg11", "pgrx-tests/pg11" ] +pg12 = ["pgrx/pg12", "pgrx-tests/pg12" ] +pg13 = ["pgrx/pg13", "pgrx-tests/pg13" ] +pg14 = ["pgrx/pg14", "pgrx-tests/pg14" ] +pg15 = ["pgrx/pg15", "pgrx-tests/pg15" ] +pg_test = [] + +[dependencies] +pgrx = "=0.9.7" +pgrx-pg-sys = "=0.9.7" +serde_json = { version = "1.0.85", features = ["preserve_order"] } +pyo3 = { version = "0.17", features = ["auto-initialize"], optional = true } +once_cell = "1.8.0" +log = "0.4.14" +serde = "1.0" +serde_derive = "1.0" +shared_memory = "0.12.4" + +[dev-dependencies] +pgrx-tests = "=0.9.7" + +[profile.dev] +panic = "unwind" + +[profile.release] +panic = "unwind" +opt-level = 3 +lto = "fat" +codegen-units = 1 \ No newline at end of file diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/Cargo.toml b/examples/model_selection/Trails/internal/pg_extension/template/Cargo.pg14.toml similarity index 96% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/Cargo.toml rename to examples/model_selection/Trails/internal/pg_extension/template/Cargo.pg14.toml index 5c3e747391..667fa75360 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/Cargo.toml +++ b/examples/model_selection/Trails/internal/pg_extension/template/Cargo.pg14.toml @@ -25,6 +25,7 @@ once_cell = "1.8.0" log = "0.4.14" serde = "1.0" serde_derive = "1.0" +shared_memory = "0.12.4" [dev-dependencies] pgrx-tests = "=0.9.7" diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/test/lib.rs b/examples/model_selection/Trails/internal/pg_extension/test/lib.rs similarity index 97% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/test/lib.rs rename to examples/model_selection/Trails/internal/pg_extension/test/lib.rs index bb91c29811..21a45e1d34 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/test/lib.rs +++ b/examples/model_selection/Trails/internal/pg_extension/test/lib.rs @@ -1,21 +1,21 @@ -/************************************************************ -* -* Licensed to the Apache Software Foundation (ASF) under one -* or more contributor license agreements. See the NOTICE file -* distributed with this work for additional information -* regarding copyright ownership. 
The ASF licenses this file -* to you under the Apache License, Version 2.0 (the -* "License"); you may not use this file except in compliance -* with the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, -* software distributed under the License is distributed on an -* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -* KIND, either express or implied. See the License for the -* specific language governing permissions and limitations -* under the License. -* -*************************************************************/ - +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +* +*************************************************************/ + diff --git a/examples/model_selection/Trails/requirement.txt b/examples/model_selection/Trails/requirement.txt new file mode 100644 index 0000000000..855bab3c91 --- /dev/null +++ b/examples/model_selection/Trails/requirement.txt @@ -0,0 +1,65 @@ +aiofiles==23.1.0 +blessed==1.20.0 +certifi==2023.7.22 +charset-normalizer==3.2.0 +ConfigSpace==0.7.1 +contourpy==1.1.0 +cycler==0.11.0 +einops==0.7.0 +fonttools==4.41.0 +fvcore==0.1.5.post20221221 +gpustat==1.1 +h5py==3.10.0 +html5tagger==1.3.0 +httptools==0.6.0 +idna==3.4 +importlib-resources==6.0.0 +iopath==0.1.10 +joblib==1.3.1 +kiwisolver==1.4.4 +matplotlib==3.7.2 +more-itertools==9.1.0 +multidict==6.0.4 +numpy==1.24.4 +nvidia-ml-py==12.535.77 +objgraph==3.6.0 +orjson==3.9.2 +packaging==23.1 +palettable==3.3.3 +pandas==2.0.3 +Pillow==10.0.0 +portalocker==2.8.2 +psutil==5.9.5 +psycopg2-binary==2.9.6 +Pympler==1.0.1 +pyparsing==3.0.9 +python-dateutil==2.8.2 +pytz==2023.3 +PyYAML==6.0.1 +requests==2.31.0 +sanic==23.6.0 +sanic-routing==23.6.0 +scikit-learn==1.3.0 +scipy==1.10.1 +seaborn==0.12.2 +six==1.16.0 +sklearn==0.0 +tabulate==0.9.0 +termcolor==2.3.0 +thop @ git+https://github.com/Lyken17/pytorch-OpCounter.git@43c064afb71383501e41eaef9e8c8407265cf77f +threadpoolctl==3.1.0 +torch==1.8.1 +torchaudio==0.8.1 +torchinfo==1.8.0 +torchvision==0.9.1 +tqdm==4.47.0 +tracerite==1.1.0 +typing_extensions==4.7.1 +tzdata==2023.3 +ujson==5.8.0 +urllib3==2.0.4 +uvloop==0.17.0 +wcwidth==0.2.6 +websockets==11.0.3 +yacs==0.1.8 +zipp==3.16.2 \ No newline at end of file diff --git a/examples/model_selection/Trails/singa.polarDB.Dockerfile b/examples/model_selection/Trails/singa.polarDB.Dockerfile new file mode 100644 index 0000000000..cba0e92033 --- /dev/null +++ b/examples/model_selection/Trails/singa.polarDB.Dockerfile @@ -0,0 +1,83 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +# Based on PolarDB with PostgreSQL 11.9 +FROM polardb/polardb_pg_local_instance:latest + +# LABEL maintainer="Naili Xing " + +# Install Python, Vim, and necessary libraries +# Note: The 'pip' package might not be directly available like this, usually python3-pip is the package name. +USER root +RUN apt-get update && apt-get install -y \ + python3-pip \ + bzip2 \ + libbz2-dev \ + build-essential \ + libffi-dev \ + libssl-dev \ + zlib1g-dev \ + libbz2-dev \ + libreadline-dev \ + libsqlite3-dev \ + wget \ + llvm \ + libncursesw5-dev \ + xz-utils \ + tk-dev \ + libxml2-dev \ + libxmlsec1-dev \ + liblzma-dev \ + && rm -rf /var/lib/apt/lists/* + +USER postgres +# Install pyenv and Python 3.8 +RUN curl https://pyenv.run | bash \ + && export PYENV_ROOT="$HOME/.pyenv" \ + && export PATH="$PYENV_ROOT/bin:$PATH" \ + && eval "$(pyenv init --path)" \ + && eval "$(pyenv init -)" \ + && env PYTHON_CONFIGURE_OPTS="--enable-shared" pyenv install 3.8 + + +# Switch to the postgres user, install Rust, init the cargo +# polarDB uses the pg 11.9 +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y && \ + echo 'source $HOME/.cargo/env' >> $HOME/.bashrc && \ + /bin/bash -c "source $HOME/.cargo/env && cargo install cargo-pgrx --version '0.9.7' --locked" && \ + /bin/bash -c "source $HOME/.cargo/env && cargo pgrx init --pg11 /home/postgres/tmp_basedir_polardb_pg_1100_bld/bin/pg_config" + + +# Clone code to there, install dependences, +WORKDIR /home/postgres +RUN git clone https://github.com/NLGithubWP/Trails.git && \ + cd Trails && \ + git checkout trails_singa && \ + cp ./internal/pg_extension/template/Cargo.pg11.toml ./internal/pg_extension/Cargo.toml && \ + cd ./internal/ml/model_selection && \ + pip install -r requirement.txt && \ + pip install ../../../singa_pkg_code/singa-3.1.0-cp38-cp38-manylinux2014_x86_64.whl + + +WORKDIR /home/postgres/Trails/internal/pg_extension +RUN /bin/bash -c "source $HOME/.cargo/env && cargo pgrx install --pg-config /home/postgres/tmp_basedir_polardb_pg_1100_bld/bin/pg_config" + +WORKDIR /home/postgres +RUN chmod +x ./Trails/init_polardb.sh +# here we run the default script in /home/postgres diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/singa.polarDB.Dockerfile b/examples/model_selection/Trails/singa.psql.Dockerfile similarity index 81% rename from examples/model_selection/TRAILS-Database-Native-Model-Selection/singa.polarDB.Dockerfile rename to examples/model_selection/Trails/singa.psql.Dockerfile index e85bdeeba9..75e90c3795 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/singa.polarDB.Dockerfile +++ b/examples/model_selection/Trails/singa.psql.Dockerfile @@ -16,9 +16,7 @@ # limitations under the License. 
# - - -FROM polardb/polardb_pg_local_instance:latest +FROM ubuntu:20.04 #LABEL maintainer="Naili Xing " @@ -55,6 +53,12 @@ RUN adduser --disabled-password --gecos "" postgres && \ RUN wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add - \ && sh -c 'echo "deb http://apt.postgresql.org/pub/repos/apt $(. /etc/os-release; echo $VERSION_CODENAME)-pgdg main" > /etc/apt/sources.list.d/pgdg.list' +# Install postgresql client +RUN apt-get update && apt-get install -y \ + postgresql-client-14 && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + # Switch to the postgres user and Install Rust and init the cargo USER postgres RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y && \ @@ -71,13 +75,16 @@ ARG CACHEBUST=1 # Clone code to there, install dependences, WORKDIR /project -RUN git clone https://github.com/apache/singa/tree/dev-postgresql && \ - cd ./singa/examples/model_selection && \ - pip install -r requirement.txt - +RUN git clone https://github.com/NLGithubWP/Trails.git && \ + cd Trails && \ + git checkout trails_singa && \ + cp ./internal/pg_extension/template/Cargo.pg14.toml ./internal/pg_extension/Cargo.toml && \ + cd ./internal/ml/model_selection && \ + pip install -r requirement.txt && \ + pip install ../../../singa_pkg_code/singa-3.1.0-cp38-cp38-manylinux2014_x86_64.whl WORKDIR /project -RUN chmod +x ./singa/examples/model_selection/init.sh +RUN chmod +x ./Trails/init.sh # Set the entry point to your script -ENTRYPOINT ["/project/singa/examples/model_selection/init.sh"] +ENTRYPOINT ["/project/Trails/init.sh"] diff --git a/examples/model_selection/Trails/singa_pkg_code/model.py b/examples/model_selection/Trails/singa_pkg_code/model.py new file mode 100644 index 0000000000..34ae3ce3e2 --- /dev/null +++ b/examples/model_selection/Trails/singa_pkg_code/model.py @@ -0,0 +1,383 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# ============================================================================= +''' +This script includes Model class for python users +to use Computational Graph in their model. +''' + +import os +import gc +import time +import json +import zipfile +import numpy as np +from functools import wraps +# from collections import Iterable + +try: + from collections.abc import Iterable +except ImportError: + from collections import Iterable + +from singa import tensor +from singa import autograd +from singa import layer +from .tensor import Tensor +from . 
import singa_wrap as singa + + +class ModelMeta(layer.LayerMeta): + + def buffer_operation(func): + + def remove_creator(tensors): + if not tensors: + return + + if isinstance(tensors, Iterable): + if isinstance(tensors, str): + return + else: + for item in tensors: + if isinstance(item, Iterable): + remove_creator(item) + elif isinstance(item, tensor.Tensor): + item.creator = None + elif isinstance(tensors, tensor.Tensor): + tensors.creator = None + + @wraps(func) + def wrapper(self, *args, **kwargs): + # print ("in model.py wrapper function") + # print ("in model.py wrapper function args[0] shape: ", args[0].shape) + # print ("in model.py self._buffered: ", self._buffered) + # print ("in model.py begin wrapper self._results: ", self._results) + if self.graph_mode and self.training: + if len(args) == 0: + raise ValueError('expect at least one input tensor') + + if isinstance(args[0], list): + assert isinstance( + args[0][0], + Tensor), ('function expects PlaceHolders or Tensors') + dev = args[0][0].device + else: + assert isinstance( + args[0], + Tensor), ('function expects PlaceHolders or Tensors') + dev = args[0].device + + if not self._buffered: + # buffer operations + dev.EnableGraph(True) + # print ("model.py wrap not self._buffered args[0].shape", args[0].shape) + self._results = func(self, *args, **kwargs) + # print ("model.py wrap not self._buffered func: ", func) + dev.Sync() + dev.EnableGraph(False) + self._buffered = True + + # deconstruct Operations before running the entire graph + remove_creator(self._results) + + # make sure all Operations are deallocated + gc.collect() + + # run graph + # print ("in model.py before dev.RunGraph self._results[0] shape: ", self._results[0].shape) + # print ("in model.py before dev.RunGraph args[0] shape: ", args[0].shape) + # print ("in model.py before dev.RunGraph self._results: ", self._results) + dev.RunGraph(self.sequential) + # print ("in model.py after dev.RunGraph") + # print ("in model.py after dev.RunGraph self._results[0] shape: ", self._results[0].shape) + # print ("in model.py after dev.RunGraph self._results: ", self._results) + # print ("in model.py after dev.RunGraph args[0] shape: ", args[0].shape) + return self._results + else: + return func(self, *args, **kwargs) + + print ("model.py return buffer_operation wrapper: ", wrapper) + return wrapper + + def __new__(cls, name, bases, attr): + print ("in __new__ attr['train_one_batch']: \n", attr['train_one_batch']) + if 'train_one_batch' in attr: + attr['train_one_batch'] = ModelMeta.buffer_operation( + attr['train_one_batch']) + + return super(ModelMeta, cls).__new__(cls, name, bases, attr) + + +class Model(layer.Layer, metaclass=ModelMeta): + """ Base class for your neural network models. 
+
+    Example usage::
+
+        import numpy as np
+        from singa import opt
+        from singa import tensor
+        from singa import device
+        from singa import autograd
+        from singa import layer
+        from singa import model
+
+        class MyModel(model.Model):
+            def __init__(self):
+                super(MyModel, self).__init__()
+
+                self.softmax_cross_entropy = layer.SoftMaxCrossEntropy()
+                self.conv1 = layer.Conv2d(1, 20, 5, padding=0)
+                self.conv2 = layer.Conv2d(20, 50, 5, padding=0)
+                self.sgd = opt.SGD(lr=0.01)
+
+            def forward(self, x):
+                y = self.conv1(x)
+                y = self.conv2(y)
+                return y
+
+            def train_one_batch(self, x, y):
+                out = self.forward(x)
+                loss = self.softmax_cross_entropy(out, y)
+                self.sgd(loss)
+                return out, loss
+
+    """
+
+    # save load states constant
+    TENSOR_DICT_FILENAME = '/tensor_dict.npz'
+    STATES_ATTR_FILENAME = '/states_attr.json'
+    MODEL_STATE_TYPE = 0
+    AUX_STATE_TYPE = 1
+
+    def __init__(self):
+        """
+        Initializes internal Model state
+        """
+        super(Model, self).__init__()
+
+        self.training = True
+        self.graph_mode = True
+        self.sequential = False
+        self._buffered = False
+        self._results = None
+
+    def compile(self, inputs, is_train=True, use_graph=False, sequential=False):
+        """ Compile and initialize the model
+
+        This function will automatically derive the shape of parameters
+        in each sublayer based on the shape of input placeholders. It will
+        also do some settings.
+
+        Args:
+            inputs(list): the list of input tensors (placeholders)
+            is_train(bool): when is_train is True, this model will enter
+                training mode, otherwise it will enter evaluation mode
+            use_graph(bool): when use_graph is True, the computational graph
+                will be used to train this model
+            sequential(bool): when sequential is True, the model will execute
+                ops in the graph in the order they joined the graph
+        """
+        assert len(inputs) > 0 and isinstance(inputs[0], Tensor), (
+            'compile function expects PlaceHolders or Tensors')
+
+        dev = inputs[0].device
+        dev.EnableGraph(True)
+        self.forward(*inputs)
+        dev.EnableGraph(False)
+        dev.ResetGraph()
+
+        autograd.training = is_train
+        self.training = is_train
+        self.graph_mode = use_graph
+        self.sequential = sequential
+
+    def forward(self, *input):
+        """Defines the computation performed in every forward propagation.
+
+        Should be overridden by all subclasses.
+
+        Args:
+            *input: the input training data for the model
+
+        Returns:
+            out: the outputs of the forward propagation.
+        """
+        raise NotImplementedError
+
+    def train_one_batch(self, *input, **kwargs):
+        """Defines the computation performed in every training iteration
+
+        Should be overridden by all subclasses.
+
+        Args:
+            *input: the arguments of train_one_batch
+            **kwargs: the keyword arguments of train_one_batch
+        """
+        raise NotImplementedError
+
+    def train(self, mode=True):
+        """Set the model in training mode.
+
+        Args:
+            mode(bool): when mode is True, this model will enter training mode
+        """
+        self.training = mode
+        autograd.training = mode
+
+    def eval(self):
+        """Sets the model in evaluation mode.
+        """
+        self.train(mode=False)
+
+    def graph(self, mode=True, sequential=False):
+        """ Turn on the computational graph. Specify execution mode.
+ + Args: + mode(bool): when mode is True, model will use computational graph + sequential(bool): when sequential is True, model will execute ops + in the graph follow the order of joining the graph + """ + self.graph_mode = mode + self.sequential = sequential + + def __get_name__(self): + return self.__class__.__name__ + + def __call__(self, *input, **kwargs): + # print ("in pkg model.py __call__") + if self.training: + # print ("in pkg model.py train_one_batch") + # print ("self: ", self) + # print ("self.num_classes: ", self.num_classes) + # print ("input[0].shape: ", input[0].shape) + return self.train_one_batch(*input, **kwargs) + else: + # print ("in pkg model.py forward") + return self.forward(*input, **kwargs) + + def save_states(self, fpath, aux_states={}): + """Save states. + + Args: + fpath: output file path (without the extension) + aux_states(dict): values are standard data types or Tensor, + e.g., epoch ID, learning rate, optimizer states + """ + assert not os.path.isfile(fpath), ( + "Failed to save states, %s is already existed." % fpath) + + states = self.get_states() + + # save states data and attr + tensor_dict = {} + states_attr = {} + for k, v in states.items(): + assert isinstance(v, tensor.Tensor), "Only tensor state is allowed" + tensor_dict[k] = tensor.to_numpy(v) + states_attr[k] = { + 'state_type': self.MODEL_STATE_TYPE, + 'shape': v.shape, + 'dtype': v.dtype + } + + for k, v in aux_states.items(): + assert isinstance(v, + tensor.Tensor), "Only tensor aux state is allowed" + tensor_dict[k] = tensor.to_numpy(v) + states_attr[k] = { + 'state_type': self.AUX_STATE_TYPE, + 'shape': v.shape, + 'dtype': v.dtype + } + + # save to files + timestamp = time.time() + tmp_dir = '/tmp/singa_save_states_%s' % timestamp + os.mkdir(tmp_dir) + tensor_dict_fp = tmp_dir + self.TENSOR_DICT_FILENAME + states_attr_fp = tmp_dir + self.STATES_ATTR_FILENAME + + np.savez(tensor_dict_fp, **tensor_dict) + + with open(states_attr_fp, 'w') as fp: + json.dump(states_attr, fp) + + compression = zipfile.ZIP_DEFLATED + with zipfile.ZipFile(fpath, mode="w") as zf: + zf.write(tensor_dict_fp, + os.path.basename(tensor_dict_fp), + compress_type=compression) + zf.write(states_attr_fp, + os.path.basename(states_attr_fp), + compress_type=compression) + + # clean up tmp files + os.remove(tensor_dict_fp) + os.remove(states_attr_fp) + os.rmdir(tmp_dir) + + def load_states(self, fpath): + """Load the model states and auxiliary states from disk. + + Usage: + m = MyModel() + m.compile(...) + aux_states = m.load_states('mymodel.zip') + + Args: + path: input file path (without the extension) + Returns: + dict + """ + + assert os.path.isfile(fpath), ( + "Failed to load states, %s is not exist." 
% fpath) + + timestamp = time.time() + tmp_dir = '/tmp/singa_load_states_%s' % timestamp + os.mkdir(tmp_dir) + + with zipfile.ZipFile(fpath, 'r') as zf: + zf.extractall(tmp_dir) + + tensor_dict_fp = tmp_dir + self.TENSOR_DICT_FILENAME + states_attr_fp = tmp_dir + self.STATES_ATTR_FILENAME + + with open(states_attr_fp) as f: + states_attr = json.load(f) + + tensor_dict = np.load(tensor_dict_fp) + + # restore singa tensor from numpy + model_states = dict() + aux_states = dict() + + for k in tensor_dict.files: + if states_attr[k]['state_type'] == self.MODEL_STATE_TYPE: + model_states[k] = tensor.from_numpy(tensor_dict[k]) + elif states_attr[k]['state_type'] == self.AUX_STATE_TYPE: + aux_states[k] = tensor.from_numpy(tensor_dict[k]) + + # restore model_states + self.set_states(model_states) + + # clean up tmp files + os.remove(tensor_dict_fp) + os.remove(states_attr_fp) + os.rmdir(tmp_dir) + return aux_states diff --git a/examples/model_selection/Trails/singa_pkg_code/singa-3.1.0-cp38-cp38-manylinux2014_x86_64.whl b/examples/model_selection/Trails/singa_pkg_code/singa-3.1.0-cp38-cp38-manylinux2014_x86_64.whl new file mode 100644 index 0000000000..cfb101382f Binary files /dev/null and b/examples/model_selection/Trails/singa_pkg_code/singa-3.1.0-cp38-cp38-manylinux2014_x86_64.whl differ diff --git a/examples/model_selection/Trails/singa_pkg_code/tensor.py b/examples/model_selection/Trails/singa_pkg_code/tensor.py new file mode 100644 index 0000000000..d1ebb61d9c --- /dev/null +++ b/examples/model_selection/Trails/singa_pkg_code/tensor.py @@ -0,0 +1,1804 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# ============================================================================= +""" +Example usage:: + + import numpy as np + from singa import tensor + from singa import device + + # create a tensor with shape (2,3), default CppCPU device and float32 + x = tensor.Tensor((2, 3)) + x.set_value(0.4) + + # create a tensor from a numpy array + npy = np.zeros((3, 3), dtype=np.float32) + y = tensor.from_numpy(npy) + + y.uniform(-1, 1) # sample values from the uniform distribution + + z = tensor.mult(x, y) # gemm -> z of shape (2, 3) + + x += z # element-wise addition + + dev = device.get_default_device() + x.to_device(dev) # move the data to a gpu device + + s = tensor.to_numpy(x) # tensor -> numpy array + +There are two sets of tensor functions, + +Tensor member functions + which would change the internal state of the Tensor instance. + +Tensor module functions + which accept Tensor instances as arguments and return Tensor instances. + +Every Tesor instance must be initialized before reading data from it. 
+""" +from __future__ import division +from __future__ import print_function +from __future__ import absolute_import + +from deprecated import deprecated +from builtins import object +import numpy as np +from functools import reduce +import re + +from . import singa_wrap as singa +from .device import get_default_device + +int32 = 2 #core.proto.kInt32 +float32 = 0 #core.proto.kFloat32 +CTensor = singa.Tensor + + +class Tensor(object): + '''Python Tensor, which wraps a swig converted Tensor from CPP Tensor. + + Args: + shape (tuple): a tuple of integers for the tensor shape. If shape + is not specified, the created tensor is called a dummy tensor. + device: a swig device. If None, the default host device is used. + dtype: data type. currently, most operations only accept float32. + data: a numpy array or swig tensor. + requires_grad: boolean indicator for computing the gradient. + stores_grad: boolean indicator for storing and returning the gradient. + Some intermediate tensors' gradient can be released + during the backward propagation. A tensor may require + grad but not store grad; But if a tensor stores grad + then it must require grad. + ''' + tensor_count = 0 + + def __init__(self, + shape=(), + device=None, + dtype=float32, + data=None, + requires_grad=True, + stores_grad=False, + creator=None, + name=None): + if device is None: + device = get_default_device() + if isinstance(data, np.ndarray): + self.data = CTensor(list(data.shape), device, dtype) + copy_from_numpy(self.data, data) + elif isinstance(data, CTensor): + self.data = data + assert data.device().id() == device.id(), 'not the same device' + else: + self.data = CTensor(list(shape), device, dtype) + + self.shape = tuple(self.data.shape()) + self.device = device + self.dtype = self.data.data_type() + self.requires_grad = requires_grad + self.stores_grad = stores_grad + if name is None: + self.name = 'Dummy#{}'.format(Tensor.tensor_count) + Tensor.tensor_count += 1 + else: + self.name = name + if creator is None: + from . import autograd + self.creator = autograd.Dummy(self, name) + else: + self.creator = creator + + def __getitem__(self, keys): + if type(keys) != tuple: + keys = (keys,) + + ret = self.clone() + axis_index = 0 + for key in keys: + if type(key) == int: + key += self.shape[axis_index] if key < 0 else 0 + + if not (key >= 0 and key < self.shape[axis_index]): + raise ValueError("Invalid Index") + + ret.data = singa.SliceOn(ret.data, key, key + 1, axis_index) + elif type(key) == slice: + start = key.start if key.start else 0 + end = key.stop if key.stop else self.shape[axis_index] + + start += self.shape[axis_index] if start < 0 else 0 + end += self.shape[axis_index] if end < 0 else 0 + + if not (start >= 0 and start < end and + end <= self.shape[axis_index]): + raise ValueError("Invalid Index") + + ret.data = singa.SliceOn(ret.data, start, end, axis_index) + else: + raise ValueError("Invalid Index") + axis_index += 1 + + return ret + + def is_dummy(self): + ''' + Returns: + True if the tensor is a dummy tensor + ''' + match = re.match(r'Dummy#\d+', self.name) + if match: + return True + else: + return False + + def ndim(self): + ''' + Returns: + the number of dimensions of the tensor. + ''' + return self.data.nDim() + + def is_empty(self): + ''' + Returns: + True if the tensor is empty according to its shape + ''' + return self.ndim() == 0 + + def is_transpose(self): + ''' + Returns: + True if the internal data is transposed; otherwise False. 
+ ''' + return self.data.transpose() + + def transpose(self, axes=None): + ''' To transpose the tensor + + Args: + axes: axes to transpose + + Returns: + new transposed tensor + ''' + t = Tensor(self.shape, self.device, self.dtype) + if axes is None: + tshape = [self.shape[x] for x in range(len(t.shape))] + t.shape = tuple(tshape) + t.data = singa.DefaultTranspose(self.data) + else: + if (len(axes) != len(self.shape)): + raise ValueError('dimensions do not match') + tshape = [self.shape[x] for x in axes] + t.shape = tuple(tshape) + t.data = singa.Transpose(self.data, list(axes)) + return t + + def size(self): # TODO(wangwei) compute size + ''' + Returns: + the number of elements of the tensor. + ''' + return self.data.Size() + + def memsize(self): + ''' + Returns: + the number of Bytes allocated for this tensor. + ''' + return self.data.MemSize() + + def contiguous(self): + t = Tensor(self.shape, self.device, self.dtype) + t.data = singa.Contiguous(self.data) + return t + + def reshape(self, shape): + '''Return a new tensor with the given shape, and the original + tensor is not changed. + + Args: + shape (list): new shape, which should have the same + volumn as the original shape. + + Returns: + new tensor reshaped + ''' + t = Tensor(self.shape, self.device, self.dtype) + assert product(self.shape) == product(shape), \ + 'product of shape should be equal' + t.shape = shape + t.data = singa.Reshape(self.data, shape) + return t + + def reset_like(self, t): + '''Reset the shape, dtype and device as the given tensor. + + Args: + t (Tensor): a tensor + ''' + self.data.ResetLike(t.data) + self.shape = t.shape + self.device = t.device + self.dtype = t.dtype + + def as_type(self, dtype): + '''Change the data type. + + Args: + dtype: accepts 'int', 'float', 'singa.kFloat32', 'singa.kInt' + + Returns: + new tensor with new type + ''' + if dtype == singa.kInt: + pass + elif dtype == singa.kFloat32: + pass + elif dtype == 'int': + dtype = singa.kInt + elif dtype == 'float': + dtype = singa.kFloat32 + else: + raise TypeError("invalid data type %s" % dtype) + t = Tensor(self.shape, self.device, dtype) + t.data = self.data.AsType(dtype) + return t + + def to_device(self, device): + '''Move the tensor data onto a given device. + + Args: + device: a swig Device converted from CudaGPU or CppCPU or OpenclGPU + ''' + self.data.ToDevice(device) + self.device = device + + def to_host(self): + '''Move the tensor data onto the default host CppCPU device. + ''' + self.data.ToHost() + self.device = get_default_device() + + def l2(self): + ''' + Returns: + the L2 norm. + ''' + return self.data.L2() + + def l1(self): + ''' + Returns: + the L1 norm. + ''' + return self.data.L1() + + def set_value(self, x, inplace=True): + '''Set all elements of the tensor to be the give value. + + Args: + x (float): a float value to be set to all elements. + inplace: inplace flag + + Returns: + this tensor + ''' + # assert type(x) == float, 'set value only accepts float input' + # if isinstance(x, float): + if not inplace: + # return new tensor filled with value + raise NotImplementedError + + self.data.SetFloatValue(float(x)) + return self + + def copy_from_numpy(self, np_array, offset=0): + ''' Copy the data from the numpy array. 
+ + Args: + np_array: source numpy array + offset (int): destination offset + ''' + assert np_array.size == self.size(), 'tensor shape should be the same' + if not np_array.ndim == 1: + np_array = np_array.flatten() + dt = np_array.dtype + if dt == np.float32: + self.data.CopyFloatDataFromHostPtr(np_array) + elif dt == int or dt == np.int32: + self.data.CopyIntDataFromHostPtr(np_array) + else: + print('Not implemented yet for ', dt) + + def copy_data(self, t): + '''Copy data from other Tensor instance. + + Args: + t (Tensor): source Tensor. + ''' + assert (t.size() == self.size()), "tensor shape should be the same" + assert isinstance(t, Tensor), 't must be a singa Tensor instance' + self.data.CopyData(t.data) + + def copy_from(self, t, offset=0): + ''' Copy the data from the numpy array or other Tensor instance + + Args: + t (Tensor or np array): source Tensor or numpy array + offset (int): destination offset + ''' + if isinstance(t, Tensor): + self.copy_data(t) + elif isinstance(t, np.ndarray): + self.copy_from_numpy(t) + else: + raise ValueError("t should be Tensor or numpy array.") + + def clone(self): + ''' + Returns: + a new Tensor which does deep copy of this tensor + ''' + return _call_singa_func(self.data.Clone) + + def repeat(self, repeats, axis): + '''Repeat data of a tensor + + Args: + repeats(int or a sequence): the number that the tensor need to repeat for + axis (int):the axis to do repeat + If it is None, then the repeated tensor will be flattened.If it isn't None, + the repeats could be sequence, but it's size should match the axis's shape + + Returns: + the tensor which has been repeated + + ''' + t = Tensor() + t_ndim = self.ndim() + if isinstance(repeats, int) or isinstance(repeats, complex): + if repeats < 0: + raise ValueError( + "'repeats' should not be negative: {}".format(repeats)) + if axis != None and axis < 0: + axis += t_ndim + # broadcast = True + if axis is None: + axis = 9999 + t.shape = (product(self.shape) * repeats,) + Repeats = [ + repeats, + ] + t.data = self.data.Repeat(Repeats, axis) + elif axis >= 0: + t_shape = list(self.shape) + t_shape[axis] = self.shape[axis] * repeats + t.shape = tuple(t_shape) + Repeats = [ + repeats, + ] + t.data = self.data.Repeat(Repeats, axis) + + elif isinstance(repeats, tuple) or isinstance(repeats, list): + for rep in repeats: + if rep < 0: + raise ValueError( + "'repeats' should be int or sequence: {}".format( + repeats)) + + if axis != None and axis < 0: + axis += t_ndim + if axis is None: + raise ValueError( + "when axis us None, 'repeats' should be int: {}".format( + repeats)) + elif axis >= 0: + t_shape = list(self.shape) + t_shape[axis] = sum(repeats) + t.shape = tuple(t_shape) + t.data = self.data.Repeat(list(repeats), axis) + else: + raise ValueError('repeats should be int or sequence') + + return t + + def T(self): + ''' shallow copy. + + Returns: + a new Tensor which shares the underlying data memory (shallow copy). + ''' + return _call_singa_func(singa.DefaultTranspose, self.data) + + def copy(self): + '''shallow copy calls copy constructor of singa::Tensor + + Returns: + new tensor copied + ''' + return _call_singa_func(CTensor, self.data) + + def deepcopy(self): + '''Same as clone(). + + Returns: + a new Tensor + ''' + return self.clone() + + def bernoulli(self, p, inplace=True): + '''Sample 0/1 for each element according to the given probability. + + Args: + p (float): with probability p, each element is sample to 1. 
+    def gaussian(self, mean, std, inplace=True):
+        '''Generate a value for each element following a Gaussian
+        distribution.
+
+        Args:
+            mean (float): mean of the distribution
+            std (float): standard deviation of the distribution
+            inplace: inplace flag
+
+        Returns:
+            this tensor
+        '''
+        if not inplace:
+            # returning a new tensor is not supported yet
+            raise NotImplementedError
+
+        singa.Gaussian(float(mean), float(std), self.data)
+        return self
+
+    def uniform(self, low, high, inplace=True):
+        '''Generate a value for each element following a uniform
+        distribution.
+
+        Args:
+            low (float): the lower bound
+            high (float): the upper bound
+            inplace: inplace flag
+
+        Returns:
+            this tensor
+        '''
+        if not inplace:
+            # returning a new tensor is not supported yet
+            raise NotImplementedError
+
+        singa.Uniform(float(low), float(high), self.data)
+        return self
+
+    @deprecated(reason="use broadcast instead")
+    def add_column(self, v):
+        '''(DEPRECATED, use broadcast) Add a tensor to each column of this
+        tensor.
+
+        Args:
+            v (Tensor): a Tensor to be added as a column to this tensor.
+        '''
+        singa.AddColumn(v.data, self.data)
+
+    @deprecated(reason="use broadcast instead")
+    def add_row(self, v):
+        '''(DEPRECATED, use broadcast) Add a tensor to each row of this
+        tensor.
+
+        Args:
+            v (Tensor): a Tensor to be added as a row to this tensor.
+        '''
+        singa.AddRow(v.data, self.data)
+
+    @deprecated(reason="use broadcast instead")
+    def div_column(self, v):
+        '''(DEPRECATED, use broadcast) Divide each column of this tensor
+        by v.
+
+        Args:
+            v (Tensor): 1d tensor of the same length as the columns of self.
+        '''
+        singa.DivColumn(v.data, self.data)
+
+    @deprecated(reason="use broadcast instead")
+    def div_row(self, v):
+        '''(DEPRECATED, use broadcast) Divide each row of this tensor by v.
+
+        Args:
+            v (Tensor): 1d tensor of the same length as the rows of self.
+        '''
+        singa.DivRow(v.data, self.data)
+
+    @deprecated(reason="use broadcast instead")
+    def mult_column(self, v):
+        '''(DEPRECATED, use broadcast) Multiply each column of this tensor
+        by v element-wise.
+
+        Args:
+            v (Tensor): 1d tensor of the same length as the columns of self.
+        '''
+        singa.MultColumn(v.data, self.data)
+
+    @deprecated(reason="use broadcast instead")
+    def mult_row(self, v):
+        '''(DEPRECATED, use broadcast) Multiply each row of this tensor by v
+        element-wise.
+
+        Args:
+            v (Tensor): 1d tensor of the same length as the rows of self.
+        '''
+        singa.MultRow(v.data, self.data)
+
+    '''
+    python in-place operators (+=, -=, *=, /=) for singa::Tensor
+    '''
+
+    def __iadd__(self, x):
+        '''Inplace element-wise addition with a tensor or a float value.
+
+        Args:
+            x (float or Tensor): input value
+
+        Returns:
+            this tensor
+        '''
+        if isinstance(x, Tensor):
+            self.data += x.data
+        else:
+            self.data += float(x)
+        return self
+
+    def __isub__(self, x):
+        '''Inplace element-wise subtraction with a tensor or a float value.
+
+        Args:
+            x (float or Tensor): input value
+
+        Returns:
+            this tensor
+        '''
+        if isinstance(x, Tensor):
+            self.data -= x.data
+        else:
+            self.data -= float(x)
+        return self
+
+    def __imul__(self, x):
+        '''Inplace element-wise multiplication with a tensor or a float
+        value.
+
+        Args:
+            x (float or Tensor): input value
+
+        Returns:
+            this tensor
+        '''
+        if isinstance(x, Tensor):
+            self.data *= x.data
+        else:
+            self.data *= float(x)
+        return self
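+
+    '''
+    Example of the initializers and in-place operators above (a minimal
+    sketch; assumes a host tensor):
+
+        t = Tensor((2, 3))
+        t.gaussian(0.0, 1.0)    # fill with N(0, 1) samples
+        t *= 0.5                # __imul__ with a float
+        t += 1.0                # __iadd__ with a float
+    '''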
+    def __itruediv__(self, x):
+        '''Inplace element-wise division by a tensor or a float value.
+
+        Args:
+            x (float or Tensor): input value
+
+        Returns:
+            this tensor
+        '''
+        if isinstance(x, Tensor):
+            self.data /= x.data
+        else:
+            self.data /= float(x)
+        return self
+
+    '''
+    python operators (+, -, *, /, <, <=, >, >=) for singa binary operators
+    https://docs.python.org/3/library/operator.html#mapping-operators-to-functions
+    '''
+
+    def __add__(self, rhs):
+        if isinstance(rhs, Tensor):
+            return from_raw_tensor(singa.__add__(self.data, rhs.data))
+        else:
+            return _call_singa_func(singa.AddFloat, self.data, rhs)
+
+    def __sub__(self, rhs):
+        if isinstance(rhs, Tensor):
+            return from_raw_tensor(singa.__sub__(self.data, rhs.data))
+        else:
+            return _call_singa_func(singa.SubFloat, self.data, rhs)
+
+    def __mul__(self, rhs):
+        if isinstance(rhs, Tensor):
+            return from_raw_tensor(singa.__mul__(self.data, rhs.data))
+        else:
+            return _call_singa_func(singa.MultFloat, self.data, rhs)
+
+    def __div__(self, rhs):
+        # kept for Python 2 compatibility; Python 3 calls __truediv__
+        if isinstance(rhs, Tensor):
+            return from_raw_tensor(singa.__div__(self.data, rhs.data))
+        else:
+            return _call_singa_func(singa.DivFloat, self.data, rhs)
+
+    def __truediv__(self, rhs):
+        if isinstance(rhs, Tensor):
+            return from_raw_tensor(singa.__div__(self.data, rhs.data))
+        else:
+            return _call_singa_func(singa.DivFloat, self.data, rhs)
+
+    def __floordiv__(self, rhs):
+        if isinstance(rhs, Tensor):
+            tmp = from_raw_tensor(singa.__div__(self.data, rhs.data))
+        else:
+            tmp = _call_singa_func(singa.DivFloat, self.data, rhs)
+        return _call_singa_func(singa.Floor, tmp.data)
+
+    def __lt__(self, rhs):
+        if isinstance(rhs, Tensor):
+            return from_raw_tensor(singa.__lt__(self.data, rhs.data))
+        else:
+            return _call_singa_func(singa.LTFloat, self.data, rhs)
+
+    def __le__(self, rhs):
+        if isinstance(rhs, Tensor):
+            return from_raw_tensor(singa.__le__(self.data, rhs.data))
+        else:
+            return _call_singa_func(singa.LEFloat, self.data, rhs)
+
+    def __gt__(self, rhs):
+        if isinstance(rhs, Tensor):
+            return from_raw_tensor(singa.__gt__(self.data, rhs.data))
+        else:
+            return _call_singa_func(singa.GTFloat, self.data, rhs)
+
+    def __ge__(self, rhs):
+        if isinstance(rhs, Tensor):
+            return from_raw_tensor(singa.__ge__(self.data, rhs.data))
+        else:
+            return _call_singa_func(singa.GEFloat, self.data, rhs)
+
+    def __eq__(self, rhs):
+        if isinstance(rhs, Tensor):
+            return from_raw_tensor(singa.__eq__(self.data, rhs.data))
+        elif rhs is None:
+            return False
+        else:
+            return _call_singa_func(singa.EQFloat, self.data, rhs)
+
+    def __radd__(self, lhs):
+        ret = Tensor(self.shape, self.device, self.dtype)
+        ret.set_value(float(lhs))
+        ret += self
+        return ret
+
+    def __rsub__(self, lhs):
+        ret = Tensor(self.shape, self.device, self.dtype)
+        ret.set_value(float(lhs))
+        ret -= self
+        return ret
+
+    def __rmul__(self, lhs):
+        ret = Tensor(self.shape, self.device, self.dtype)
+        ret.set_value(float(lhs))
+        ret *= self
+        return ret
+
+    def __rdiv__(self, lhs):
+        ret = Tensor(self.shape, self.device, self.dtype)
+        ret.set_value(float(lhs))
+        ret /= self
+        return ret
+
+    def __rtruediv__(self, lhs):
+        ret = Tensor(self.shape, self.device, self.dtype)
+        ret.set_value(float(lhs))
+        ret /= self
+        return ret
+
+    def __repr__(self):
+        return np.array2string(to_numpy(self))
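+
+
+# A minimal sketch of the overloaded operators above; `ones` and `to_numpy`
+# are defined later in this module, so this helper is for illustration only.
+def _example_operators():
+    a = ones((2, 2))
+    b = 2.0 - a          # __rsub__: every element becomes 1.0
+    c = a + b            # __add__: every element becomes 2.0
+    mask = c >= 1.5      # __ge__: 1.0 where true, 0.0 otherwise
+    return to_numpy(mask)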
+
+
+''' alias Tensor to PlaceHolder
+'''
+PlaceHolder = Tensor
+
+''' python functions for global functions in Tensor.h
+'''
+
+
+def from_raw_tensor(t):
+    x = Tensor(t.shape(), t.device(), t.data_type())
+    x.data = t
+    return x
+
+
+def from_raw_tensors(tt):
+    ret = []
+    for t in list(tt):
+        ret.append(from_raw_tensor(t))
+    return ret
+
+
+def zeros_like(t):
+    ret = Tensor(t.shape, t.device, t.dtype)
+    ret.set_value(float(0))
+    return ret
+
+
+def ones_like(t):
+    ret = Tensor(t.shape, t.device, t.dtype)
+    ret.set_value(float(1))
+    return ret
+
+
+def product(shape):
+    return reduce(lambda x, y: x * y, shape)
+
+
+def sizeof(dtype):
+    '''Get the size of a datatype.
+
+    Args:
+        dtype: singa datatype
+
+    Returns:
+        the number of bytes of the given SINGA data type defined in
+        core.proto
+    '''
+    return singa.SizeOf(dtype)
+
+
+def contiguous(tensor):
+    '''Return a new tensor with contiguous memory.'''
+    return _call_singa_func(singa.Contiguous, tensor.data)
+
+
+def reshape(tensor, shape):
+    '''Reshape the input tensor to the given shape; the original tensor is
+    not changed.
+
+    Args:
+        tensor (Tensor): the tensor to be changed
+        shape (list): the new shape, which should have the same volume as
+            the old shape.
+
+    Returns:
+        the new Tensor
+    '''
+    return _call_singa_func(singa.Reshape, tensor.data, shape)
+
+
+def transpose(t, axes=None):
+    '''Transpose the tensor.
+
+    Args:
+        t: input tensor
+        axes: axes to transpose
+
+    Returns:
+        the transposed tensor
+    '''
+    ret = t.transpose(axes)
+    return ret
+
+
+def copy_data_to_from(dst, src, size, dst_offset=0, src_offset=0):
+    '''Copy the data between two Tensor instances which could be on
+    different devices.
+
+    Args:
+        dst (Tensor): destination Tensor
+        src (Tensor): source Tensor
+        size (int): number of elements to copy
+        dst_offset (int): offset in terms of elements to the start of dst
+        src_offset (int): offset in terms of elements to the start of src
+    '''
+    singa.CopyDataToFrom(dst.data, src.data, size, dst_offset, src_offset)
+
+
+def from_numpy(np_array, dev=None):
+    '''Create a Tensor instance with the shape, dtype and values from the
+    numpy array.
+
+    Args:
+        np_array: the numpy array.
+        dev (optional): the device of the new tensor; if None, the tensor is
+            allocated on the default CppCPU device.
+
+    Returns:
+        A Tensor instance
+    '''
+    assert type(np_array) is np.ndarray, 'Must input numpy array'
+    # convert float64 to float32 and int64 to int32
+    if np_array.dtype == np.float64 or np_array.dtype == float:
+        np_array = np_array.astype(np.float32)
+
+    if np_array.dtype == np.int64 or np_array.dtype == int:
+        np_array = np_array.astype(np.int32)
+
+    if np_array.dtype == np.float32:
+        dtype = float32
+    else:
+        assert np_array.dtype == np.int32, \
+            'Only float and int tensors are supported'
+        dtype = int32
+    ret = Tensor(np_array.shape, dtype=dtype)
+    ret.copy_from_numpy(np_array)
+    if dev:
+        ret.to_device(dev)
+    return ret
+
+
+def to_host(t):
+    '''Copy the data to a host tensor.
+
+    Args:
+        t (Tensor): a Tensor
+
+    Returns:
+        new Tensor at host
+    '''
+    ret = t.clone()
+    ret.to_host()
+    return ret
+
+
+def to_numpy(t):
+    '''Copy the tensor into a numpy array.
+
+    Args:
+        t (Tensor): a Tensor
+
+    Returns:
+        a numpy array
+    '''
+    th = to_host(t)
+    if th.dtype == float32:
+        np_array = th.data.GetFloatValue(int(th.size()))
+    elif th.dtype == int32:
+        np_array = th.data.GetIntValue(int(th.size()))
+    else:
+        raise NotImplementedError('Not implemented yet for %s' % th.dtype)
+    return np_array.reshape(th.shape)
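+
+
+# A minimal sketch of the numpy round trip above; note that from_numpy casts
+# float64 input down to float32.
+def _example_numpy_round_trip():
+    x = np.arange(6, dtype=np.float32).reshape(2, 3)
+    t = from_numpy(x)
+    y = to_numpy(t)      # same shape (2, 3), dtype float32, same values
+    return y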
+def abs(t):
+    '''
+    Args:
+        t (Tensor): input Tensor
+
+    Returns:
+        a new Tensor whose element y = abs(x), x is an element of t
+    '''
+    return _call_singa_func(singa.Abs, t.data)
+
+
+def exp(t):
+    '''
+    Args:
+        t (Tensor): input Tensor
+
+    Returns:
+        a new Tensor whose element y = exp(x), x is an element of t
+    '''
+    return _call_singa_func(singa.Exp, t.data)
+
+
+def ceil(t):
+    '''
+    Args:
+        t (Tensor): input Tensor
+
+    Returns:
+        a new Tensor whose element y = ceil(x), x is an element of t
+    '''
+    return _call_singa_func(singa.Ceil, t.data)
+
+
+def log(t):
+    '''
+    Args:
+        t (Tensor): input Tensor
+
+    Returns:
+        a new Tensor whose element y = log(x), x is an element of t
+    '''
+    return _call_singa_func(singa.Log, t.data)
+
+
+def sigmoid(t):
+    '''
+    Args:
+        t (Tensor): input Tensor
+
+    Returns:
+        a new Tensor whose element y = sigmoid(x), x is an element of t
+    '''
+    return _call_singa_func(singa.Sigmoid, t.data)
+
+
+def sign(t):
+    '''
+    Args:
+        t (Tensor): input Tensor
+
+    Returns:
+        a new Tensor whose element y = sign(x), x is an element of t
+    '''
+    return _call_singa_func(singa.Sign, t.data)
+
+
+def sqrt(t):
+    '''
+    Args:
+        t (Tensor): input Tensor
+
+    Returns:
+        a new Tensor whose element y = sqrt(x), x is an element of t
+    '''
+    return _call_singa_func(singa.Sqrt, t.data)
+
+
+def square(t):
+    '''
+    Args:
+        t (Tensor): input Tensor
+
+    Returns:
+        a new Tensor whose element y = x * x, x is an element of t
+    '''
+    return _call_singa_func(singa.Square, t.data)
+
+
+def tanh(t):
+    '''
+    Args:
+        t (Tensor): input Tensor
+
+    Returns:
+        a new Tensor whose element y = tanh(x), x is an element of t
+    '''
+    return _call_singa_func(singa.Tanh, t.data)
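+
+
+# A minimal sketch of the element-wise functions above, applied to a small
+# host tensor built with from_numpy.
+def _example_elementwise():
+    t = from_numpy(np.array([-1.0, 0.0, 4.0], dtype=np.float32))
+    s = sign(t)          # [-1, 0, 1]
+    r = sqrt(abs(t))     # [1, 0, 2]
+    return to_numpy(r)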
+def sum(t, axis=None, out=None):
+    '''Sum of tensor elements over the given axis.
+
+    Args:
+        t (Tensor): the tensor to be summed
+        axis (None, int or tuple of ints, optional): axis or axes along
+            which a sum is performed. The default, axis=None, sums all of
+            the elements of the input tensor. If axis is negative, it counts
+            from the last to the first axis. If axis is a tuple of ints, a
+            sum is performed on all of the axes specified in the tuple.
+        out (Tensor, optional): alternative output tensor in which to place
+            the result. It must have the same shape as the expected output.
+
+    Returns:
+        a tensor with the same shape as t, with the specified axis removed.
+        If t is a 0-d tensor, or if axis is None, a scalar is returned. If
+        an output tensor is specified, a reference to out is returned.
+    '''
+
+    t_shape = t.shape
+    t_ndim = t.ndim()
+
+    if axis is None:
+        one = Tensor(t.shape, t.device)
+        one.set_value(1.0)
+        ret = tensordot(t, one, t_ndim)
+    elif isinstance(axis, int):
+        if axis < 0:
+            axis += t_ndim
+        axis_shape = int(t_shape[axis])
+        one = Tensor(shape=(axis_shape,), device=t.device)
+        one.set_value(1.0)
+        ret = tensordot(t, one, axes=([axis], [0]))
+    elif isinstance(axis, tuple):
+        l_axis = list(axis)
+        axis_shape = tuple(t_shape[x] for x in axis)
+        one = Tensor(axis_shape, t.device)
+        one.set_value(1.0)
+        one_axis = list(range(one.ndim()))
+        ret = tensordot(t, one, (l_axis, one_axis))
+    else:
+        raise TypeError('axis should be None, an int or a tuple of ints')
+
+    if out is not None:
+        if out.shape != ret.shape:
+            raise ValueError('dimensions do not match')
+        out[:] = ret
+        return out
+    else:
+        return ret
+
+
+def pow(t, x, out=None):
+    '''
+    Args:
+        t (Tensor): input tensor
+        x (float or Tensor): y[i] = t[i]^x if x is a float value; otherwise,
+            y[i] = t[i]^x[i] if x is a tensor.
+        out (None or Tensor): if None, a new Tensor would be constructed to
+            store the result; otherwise, the result is put into out.
+
+    Returns:
+        the result tensor.
+    '''
+    if out is None:
+        if isinstance(x, Tensor):
+            return _call_singa_func(singa.Pow, t.data, x.data)
+        else:
+            return _call_singa_func(singa.PowFloat, t.data, x)
+    else:
+        if isinstance(x, Tensor):
+            singa.PowWithRet(t.data, x.data, out.data)
+        else:
+            singa.PowFloatWitRet(t.data, x, out.data)
+        return out
+
+
+def average(t, axis=None):
+    '''
+    Args:
+        t (Tensor): input Tensor
+        axis (int, optional): if None, average all elements; otherwise
+            average along the given dimension. 0 for averaging each column;
+            1 for averaging each row.
+
+    Returns:
+        a float value if axis is None; otherwise, a new Tensor for the
+        result.
+    '''
+    if t.ndim() > 1:
+        return _call_singa_func(singa.Average, t.data, axis)
+    else:
+        return singa.SumAsFloat(t.data) / t.size()
+
+
+def softmax(t, out=None):
+    '''Apply SoftMax for each row of the Tensor.
+
+    Args:
+        t (Tensor): the input 1d or 2d tensor
+        out (Tensor, optional): if not None, it is used to store the result
+
+    Returns:
+        the result Tensor
+    '''
+    if out is None:
+        return _call_singa_func(singa.SoftMax, t.data)
+    else:
+        singa.SoftMax(t.data, out.data)
+        return out
+
+
+def lt(t, x):
+    '''Element-wise comparison for t < x.
+
+    Args:
+        t (Tensor): left hand side operand
+        x (Tensor or float): right hand side operand
+
+    Returns:
+        a Tensor with each element being t[i] < x ? 1.0f:0.0f,
+        or t[i] < x[i] ? 1.0f:0.0f
+    '''
+    return t < x
+
+
+def le(t, x):
+    '''Element-wise comparison for t <= x.
+
+    Args:
+        t (Tensor): left hand side operand
+        x (Tensor or float): right hand side operand
+
+    Returns:
+        a Tensor with each element being t[i] <= x ? 1.0f:0.0f,
+        or t[i] <= x[i] ? 1.0f:0.0f
+    '''
+    return t <= x
+
+
+def gt(t, x):
+    '''Element-wise comparison for t > x.
+
+    Args:
+        t (Tensor): left hand side operand
+        x (Tensor or float): right hand side operand
+
+    Returns:
+        a Tensor with each element being t[i] > x ? 1.0f:0.0f,
+        or t[i] > x[i] ? 1.0f:0.0f
+    '''
+    return t > x
+
+
+def ge(t, x):
+    '''Element-wise comparison for t >= x.
+
+    Args:
+        t (Tensor): left hand side operand
+        x (Tensor or float): right hand side operand
+
+    Returns:
+        a Tensor with each element being t[i] >= x ? 1.0f:0.0f,
+        or t[i] >= x[i] ? 1.0f:0.0f
+    '''
+    return t >= x
+
+
+def eq(t, x):
+    '''Element-wise comparison for t == x.
+
+    Args:
+        t (Tensor): left hand side operand
+        x (Tensor or float): right hand side operand
+
+    Returns:
+        a Tensor with each element being t[i] == x ? 1.0f:0.0f,
+        or t[i] == x[i] ? 1.0f:0.0f
+    '''
+    return t == x
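+
+
+# A minimal sketch of sum() with the axis forms documented above.
+def _example_sum():
+    t = from_numpy(np.ones((2, 3, 4), dtype=np.float32))
+    s0 = sum(t, axis=0)         # shape (3, 4), every element 2.0
+    s01 = sum(t, axis=(0, 1))   # shape (4,), every element 6.0
+    return to_numpy(s01)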
+def add(lhs, rhs, ret=None):
+    '''Element-wise addition.
+
+    Args:
+        lhs (Tensor): lhs tensor
+        rhs (Tensor): rhs tensor
+        ret (Tensor, optional): if not None, the result is stored in it;
+            otherwise, a new Tensor would be created for the result.
+
+    Returns:
+        the result Tensor
+    '''
+    if ret is None:
+        # call Tensor.__add__()
+        return lhs + rhs
+    else:
+        if isinstance(rhs, Tensor):
+            singa.Add(lhs.data, rhs.data, ret.data)
+        else:
+            singa.AddFloatWithRet(lhs.data, rhs, ret.data)
+        return ret
+
+
+def sub(lhs, rhs, ret=None):
+    '''Element-wise subtraction.
+
+    Args:
+        lhs (Tensor): lhs tensor
+        rhs (Tensor): rhs tensor
+        ret (Tensor, optional): if not None, the result is stored in it;
+            otherwise, a new Tensor would be created for the result.
+
+    Returns:
+        the result Tensor
+    '''
+    if ret is None:
+        # call Tensor.__sub__()
+        return lhs - rhs
+    else:
+        if isinstance(rhs, Tensor):
+            singa.Sub(lhs.data, rhs.data, ret.data)
+        else:
+            singa.SubFloatWithRet(lhs.data, rhs, ret.data)
+        return ret
+
+
+def eltwise_mult(lhs, rhs, ret=None):
+    '''Element-wise multiplication.
+
+    Args:
+        lhs (Tensor): lhs tensor
+        rhs (Tensor): rhs tensor
+        ret (Tensor, optional): if not None, the result is stored in it;
+            otherwise, a new Tensor would be created for the result.
+
+    Returns:
+        the result Tensor
+    '''
+
+    if ret is None:
+        # call Tensor.__mul__()
+        return lhs * rhs
+    else:
+        if isinstance(rhs, Tensor):
+            singa.EltwiseMult(lhs.data, rhs.data, ret.data)
+        else:
+            singa.EltwiseMultFloatWithRet(lhs.data, rhs, ret.data)
+        return ret
+
+
+def mult(A, B, C=None, alpha=1.0, beta=0.0):
+    '''Do matrix-matrix or matrix-vector multiplication.
+
+    This function returns C = alpha * A * B + beta * C.
+    Currently the cases below are supported:
+
+    case 1 - matrix * vector:
+        A (Tensor): 2d Tensor
+        B (Tensor): 1d Tensor, GEMV would be invoked
+    case 2 - matrix * matrix:
+        A (Tensor): 2d Tensor
+        B (Tensor): 2d Tensor, GEMM would be invoked
+    case 3 - batched matrix * batched matrix:
+        A (Tensor): 3d/4d Tensor
+        B (Tensor): 3d/4d Tensor, batched GEMM would be invoked
+        where the leading (batch) dimension(s) of A and B must be exactly
+        the same, e.g. C{2,3,4,6} = A{2,3,4,5} * B{2,3,5,6}
+
+    Args:
+        A: n-d tensor
+        B: n-d tensor
+        C (Tensor, optional): for storing the result; if None, a new Tensor
+            would be created.
+        alpha (float): scaling factor
+        beta (float): scaling factor
+
+    Returns:
+        the result Tensor
+    '''
+    if C is None:
+        return _call_singa_func(singa.Mult, A.data, B.data)
+    else:
+        singa.MultWithScale(alpha, A.data, B.data, beta, C.data)
+        return C
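+
+
+# A minimal sketch of mult(): case 2 (GEMM) with an explicit output tensor;
+# zeros() is defined at the end of this module.
+def _example_mult():
+    A = from_numpy(np.ones((2, 3), dtype=np.float32))
+    B = from_numpy(np.ones((3, 4), dtype=np.float32))
+    C = zeros((2, 4))
+    mult(A, B, C)        # C = 1.0 * A * B + 0.0 * C, i.e. every element 3.0
+    return to_numpy(C)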
+def einsum(ops, *args):
+    '''Do the einsum calculation between two tensors according to the
+    operands.
+
+    TODO: implement the following functions in cpp (just like the numpy
+    functions):
+        1. sum(A, axis=None)
+        2. repeat(A, repeats)
+        3. transpose(A, axes=None)
+
+    Warning: this function only supports the einsum calculation between two
+    tensors.
+
+    Args:
+        ops (string): the string that specifies the subscripts for
+            summation, such as 'ki,kj->kij'. All 26 lowercase letters can be
+            used here.
+        args (list of array_like): the tensors for the operation; only two
+            tensors are supported here.
+
+    Returns:
+        Singa.Tensor: the output matrix of the einsum calculation
+
+    The best way to understand this function is to try the examples below:
+    A_ = [0,1,2,3,4,5,6,7,8,9,10,11]
+    A = A_.reshape(4,3)
+    B = A_.reshape(3,4)
+
+    Here this einsum calculation is the same as the normal 'mult':
+    Res = einsum('ij,jk->ik',A,B)
+
+    >>> [[ 20  23  26  29]
+         [ 56  68  80  92]
+         [ 92 113 134 155]
+         [128 158 188 218]]
+
+    A_ = [0,1,2,3,4,5,6,7,8,9,10,11]
+    A = A_.reshape(4,3)
+    B = A_.reshape(4,3)
+
+    Here the einsum calculation is the same as the normal 'eltwise_mult':
+    Res = einsum('ki,ki->ki',A,B)
+
+    >>> [[  0   1   4]
+         [  9  16  25]
+         [ 36  49  64]
+         [ 81 100 121]]
+
+    A = [0,1,2,3,4,5,6,7,8,9,10,11]
+    A = A.reshape(4,3)
+
+    Res = einsum('ki,kj->kij',A,A)
+    >>> [[[  0   0   0]
+          [  0   1   2]
+          [  0   2   4]]
+         [[  9  12  15]
+          [ 12  16  20]
+          [ 15  20  25]]
+         [[ 36  42  48]
+          [ 42  49  56]
+          [ 48  56  64]]
+         [[ 81  90  99]
+          [ 90 100 110]
+          [ 99 110 121]]]
+
+    A_ = [0,1,2,3,4,5,6,7,8,9,10,11]
+    A = A_.reshape(3,2,2)
+
+    Res = einsum('kia,kja->kij',A,A)
+    >>> [[[  1   3]
+          [  3  13]]
+         [[ 41  59]
+          [ 59  85]]
+         [[145 179]
+          [179 221]]]
+    '''
+
+    if len(ops) == 0:
+        raise ValueError("No input operands")
+
+    if len(args) != 2:
+        raise ValueError("Currently only two operands are supported")
+    # get the input and output subscripts
+    inputops, outputops = ops.split('->')
+    inputops = inputops.split(',')
+
+    # get the two input tensors
+    A = args[0]
+    B = args[1]
+
+    if A.ndim() != len(inputops[0]) or B.ndim() != len(inputops[1]):
+        raise ValueError("input dim doesn't match operands")
+
+    # the indices that appear in the input but not in the output
+    sums = sorted(list((set(inputops[0]) | set(inputops[1])) - set(outputops)))
+
+    # the indices that A and B use to broadcast to each other
+    broadcast_A = sorted(list(set(inputops[1]) - set(inputops[0])))
+    broadcast_B = sorted(list(set(inputops[0]) - set(inputops[1])))
+    # all the indices in the input
+    outputall = sorted(list(set(inputops[0]) | set(inputops[1])))
+
+    # map indices to axis integers
+    sums = [outputall.index(x) for x in sums]
+    broadcast_idA = [inputops[1].find(x) for x in broadcast_A]
+    broadcast_idB = [inputops[0].find(x) for x in broadcast_B]
+
+    broadcast_a = [B.shape[x] for x in broadcast_idA]
+    broadcast_b = [A.shape[x] for x in broadcast_idB]
+
+    # get the transpose and reshape parameters used in the element-wise
+    # calculation
+    transpose_A = [(list(inputops[0]) + broadcast_A).index(x)
+                   for x in outputall]
+    transpose_B = [(list(inputops[1]) + broadcast_B).index(x)
+                   for x in outputall]
+
+    reshape_A = list(A.shape) + broadcast_a
+    reshape_B = list(B.shape) + broadcast_b
+
+    if len(broadcast_a) == 0:
+        broadcast_a = [1]
+    if len(broadcast_b) == 0:
+        broadcast_b = [1]
+    mult_A = repeat(A, product(broadcast_a))
+    mult_A = mult_A.reshape(reshape_A)
+    mult_A = transpose(mult_A, transpose_A)
+    mult_B = repeat(B, product(broadcast_b))
+    mult_B = mult_B.reshape(reshape_B)
+    mult_B = transpose(mult_B, transpose_B)
+
+    if mult_A.shape != mult_B.shape:
+        raise ValueError("Error: matrix dimension mismatch")
+    res = eltwise_mult(mult_A, mult_B)
+    sum_R = sorted(sums, reverse=True)
+    for i in sum_R:
+        res = sum(res, axis=i)
+    transpose_res = [sorted(list(outputops)).index(x) for x in list(outputops)]
+    res = transpose(res, transpose_res)
+
+    return res
+
+
+def repeat(t, repeats, axis=None):
+    '''Return the repeated tensor.
+
+    Args:
+        t (Tensor): the tensor to be repeated
+        repeats (int or sequence): the number of times each element repeats
+        axis (int): the axis along which to repeat. If it is None, the
+            repeated tensor is flattened. If it is not None, repeats may be
+            a sequence, whose length must match the size of the given axis.
+
+    Returns:
+        the repeated tensor
+    '''
+    ret = t.repeat(repeats, axis)
+    return ret
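+
+
+# A minimal sketch of repeat(), which is expected to mirror numpy.repeat
+# semantics (an assumption; the exact element order is decided by the C++
+# Repeat implementation).
+def _example_repeat():
+    t = from_numpy(np.array([[1, 2], [3, 4]], dtype=np.float32))
+    flat = repeat(t, 2, axis=None)   # flattened, shape (8,)
+    rows = repeat(t, 2, axis=0)      # shape (4, 2)
+    return to_numpy(rows)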
def tensordot(A, B, axes=2):
+    """Returns the tensor multiplication of two tensors along specified
+    axes.
+
+    This is equivalent to computing the dot product along the specified
+    axes, which are treated as one axis by reshaping.
+
+    Args:
+        A: Singa.Tensor
+        B: Singa.Tensor
+        axes:
+            - If it is an integer, then that many axes at the end of ``A``
+              and the beginning of ``B`` are used.
+            - If it is a pair of sequences of integers, then the two
+              sequences specify the lists of axes for ``A`` and ``B``. The
+              corresponding axes are paired for the sum-product.
+
+    Returns:
+        singa.tensor: the tensor product of ``A`` and ``B`` along the axes
+        specified by ``axes``.
+
+    Thanks to numpy.tensordot:
+    https://github.com/numpy/numpy/blob/v1.14.0/numpy/core/numeric.py#L1123-L1306
+    """
+    # When axes is an integer, axes_A and axes_B represent the axes at the
+    # end of A and the beginning of B. For example, when axes is 1 we do the
+    # normal matrix multiplication: if A has shape (3, 2, 4) and B has shape
+    # (4, 2, 5), the result has shape (3, 2, 2, 5). When axes is 2 and A, B
+    # have shapes (3, 2, 4) and (2, 4, 5), the result has shape (3, 5).
+    if type(axes) == int:
+        axes_A = list(range(-axes, 0))
+        axes_B = list(range(0, axes))
+    else:
+        # when axes is a pair of sequences of integers: for example, if A
+        # has shape (3, 2, 4), B has shape (4, 2, 5) and axes is
+        # ([1, 2], [1, 0]), the result has shape (3, 5)
+        axes_A, axes_B = axes
+    if isinstance(axes_A, list):
+        na = len(axes_A)
+        axes_A = list(axes_A)
+    else:
+        axes_A = [axes_A]
+        na = 1
+    if isinstance(axes_B, list):
+        nb = len(axes_B)
+        axes_B = list(axes_B)
+    else:
+        axes_B = [axes_B]
+        nb = 1
+
+    # a_shape and b_shape are the shapes of A and B, while nda and ndb are
+    # their numbers of dimensions
+    a_shape = A.shape
+    nda = A.ndim()
+    b_shape = B.shape
+    ndb = B.ndim()
+    equal = True
+    # check that axes_A and axes_B have the same length and that the paired
+    # dimensions match
+    if na != nb:
+        equal = False
+    else:
+        for k in range(na):
+            if a_shape[axes_A[k]] != b_shape[axes_B[k]]:
+                equal = False
+                break
+            if axes_A[k] < 0:
+                axes_A[k] += nda
+            if axes_B[k] < 0:
+                axes_B[k] += ndb
+    if not equal:
+        raise ValueError("shape-mismatch for sum")
+
+    # do the calculation according to the axes
+    # notin holds the axes of A that are not contracted
+    notin = [k for k in range(nda) if k not in axes_A]
+    newaxes_a = notin + axes_A
+    N2 = 1
+    for axis in axes_A:
+        N2 *= a_shape[axis]
+    N1 = 1
+    for ax in notin:
+        N1 *= a_shape[ax]
+    # newshape_a is the 2d shape used for the multiplication; olda is the
+    # part of the shape that appears in the result
+    newshape_a = (N1, N2)
+    olda = [a_shape[axis] for axis in notin]
+    notin = [k for k in range(ndb) if k not in axes_B]
+    newaxes_b = axes_B + notin
+    N2 = 1
+    for axis in axes_B:
+        N2 *= b_shape[axis]
+    N1 = 1
+    for bx in notin:
+        N1 *= b_shape[bx]
+    newshape_b = (N2, N1)
+    oldb = [b_shape[axis] for axis in notin]
+
+    A = transpose(A, newaxes_a)
+    B = transpose(B, newaxes_b)
+    at = reshape(A, newshape_a)
+    bt = reshape(B, newshape_b)
+
+    res = mult(at, bt)
+    if len(olda + oldb) == 0:
+        olda = [1]
+        oldb = [1]
+    res = res.reshape(tuple(olda + oldb))
+
+    return res
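+
+
+# A minimal sketch of tensordot(); with axes=1 this is an ordinary matrix
+# product, and the pair form names the contracted axes explicitly.
+def _example_tensordot():
+    A = from_numpy(np.ones((3, 4), dtype=np.float32))
+    B = from_numpy(np.ones((4, 5), dtype=np.float32))
+    C = tensordot(A, B, axes=1)      # shape (3, 5), every element 4.0
+    D = tensordot(A, B, ([1], [0]))  # the same contraction, explicit axes
+    return to_numpy(D)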
+def div(lhs, rhs, ret=None):
+    '''Element-wise division.
+
+    Args:
+        lhs (Tensor): lhs tensor
+        rhs (Tensor): rhs tensor
+        ret (Tensor, optional): if not None, the result is stored in it;
+            otherwise, a new Tensor would be created for the result.
+
+    Returns:
+        the result Tensor
+    '''
+    if ret is None:
+        # call Tensor.__div__()
+        return lhs / rhs
+    else:
+        if isinstance(rhs, Tensor):
+            singa.Div(lhs.data, rhs.data, ret.data)
+        else:
+            singa.DivFloatWithRet(lhs.data, rhs, ret.data)
+        return ret
+
+
+def axpy(alpha, x, y):
+    '''Element-wise operation for y += alpha * x.
+
+    Args:
+        alpha (float): scaling factor
+        x (Tensor): a tensor
+        y (Tensor): a tensor
+
+    Returns:
+        y
+    '''
+    singa.Axpy(float(alpha), x.data, y.data)
+    return y
+
+
+def bernoulli(p, t):
+    '''Generate a binary value for each element of t.
+
+    Args:
+        p (float): each element is 1 with probability p, and 0 with 1 - p
+        t (Tensor): the results are put into t
+
+    Returns:
+        t
+    '''
+    singa.Bernoulli(float(p), t.data)
+    return t
+
+
+def gaussian(mean, std, t):
+    '''Generate values following a Gaussian distribution.
+
+    Args:
+        mean (float): the mean of the Gaussian distribution.
+        std (float): the standard deviation of the Gaussian distribution.
+        t (Tensor): the results are put into t
+
+    Returns:
+        t
+    '''
+    singa.Gaussian(float(mean), float(std), t.data)
+    return t
+
+
+def uniform(low, high, t):
+    '''Generate values following a uniform distribution.
+
+    Args:
+        low (float): the lower bound
+        high (float): the upper bound
+        t (Tensor): the results are put into t
+
+    Returns:
+        t
+    '''
+    singa.Uniform(float(low), float(high), t.data)
+    return t
+
+
+def add_column(alpha, v, beta, M):
+    '''Add v to each column of M.
+
+    Denote each column of M as m; then m = alpha * v + beta * m.
+
+    Args:
+        alpha (float): scaling factor
+        v (Tensor): a tensor
+        beta (float): scaling factor
+        M (Tensor): 2d tensor
+
+    Returns:
+        the resulting tensor M
+    '''
+    singa.AddColumnWithScale(float(alpha), float(beta), v.data, M.data)
+    return M
+
+
+def add_row(alpha, v, beta, M):
+    '''Add v to each row of M.
+
+    Denote each row of M as m; then m = alpha * v + beta * m.
+
+    Args:
+        alpha (float): scaling factor
+        v (Tensor): a tensor
+        beta (float): scaling factor
+        M (Tensor): 2d tensor
+
+    Returns:
+        the resulting tensor M
+    '''
+    singa.AddRowWithScale(float(alpha), float(beta), v.data, M.data)
+    return M
+
+
+def sum_columns(M):
+    '''Sum all columns into a single column.
+
+    Args:
+        M (Tensor): the input 2d tensor.
+
+    Returns:
+        a new Tensor as the resulting column.
+    '''
+    assert M.ndim() == 2, 'M.ndim() is supposed to be 2'
+    ret = Tensor((M.shape[0], 1), M.data.device())
+    singa.SumColumns(M.data, ret.data)
+    return ret
+
+
+def sum_rows(M):
+    '''Sum all rows into a single row.
+
+    Args:
+        M (Tensor): the input 2d tensor.
+
+    Returns:
+        a new Tensor as the resulting row.
+    '''
+    assert M.ndim() == 2, 'M.ndim() is supposed to be 2'
+    ret = Tensor((1, M.shape[1]), M.data.device())
+    singa.SumRows(M.data, ret.data)
+    return ret
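+
+
+# A minimal sketch of the row/column helpers above on a small 2d tensor.
+def _example_row_column_ops():
+    M = from_numpy(np.ones((2, 3), dtype=np.float32))
+    v = from_numpy(np.arange(3, dtype=np.float32))
+    add_row(1.0, v, 1.0, M)   # each row m becomes 1.0 * v + 1.0 * m
+    col = sum_columns(M)      # shape (2, 1)
+    return to_numpy(col)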
+
+''' private functions, internally used
+'''
+
+
+def _call_singa_func(_singa_func, *args):
+    '''Call a singa global function that returns a CTensor and create a new
+    python Tensor instance, i.e., Tensor <- singa_func(args...).
+
+    Args:
+        _singa_func: singa CPP API
+        args: args for the singa CPP API
+
+    Returns:
+        new singa tensor
+    '''
+    new_t = Tensor()
+    new_t.data = _singa_func(*args)
+    new_t.shape = tuple(new_t.data.shape())
+    new_t.device = new_t.data.device()
+    new_t.dtype = new_t.data.data_type()
+    return new_t
+
+
+def copy_from_numpy(data, np_array):
+    '''Copy the data from the numpy array; used as a static method.
+
+    Args:
+        data: singa ctensor
+        np_array: source numpy array
+    '''
+    assert np_array.size == data.Size(), \
+        'number of elements should match'
+    if not np_array.ndim == 1:
+        np_array = np_array.flatten()
+    dt = np_array.dtype
+    if dt == np.float32:
+        data.CopyFloatDataFromHostPtr(np_array)
+    elif dt == int or dt == np.int32:
+        data.CopyIntDataFromHostPtr(np_array)
+    else:
+        raise NotImplementedError('Not implemented yet for %s' % dt)
+
+
+def concatenate(tensors, axis):
+    '''Concatenate a list of tensors along the given axis.
+
+    Args:
+        tensors: list of tensors.
+        axis: the axis to concatenate on; all dimensions must be the same
+            except the axis to be concatenated.
+
+    Returns:
+        the new concatenated tensor
+    '''
+    ctensors = singa.VecTensor()
+    for t in tensors:
+        ctensors.append(t.data)
+    return _call_singa_func(singa.ConcatOn, ctensors, axis)
+
+
+def random(shape, device=get_default_device()):
+    '''Return a tensor of the given shape with values drawn uniformly
+    between 0 and 1.
+
+    Args:
+        shape: shape of the generated tensor
+        device: device of the generated tensor; defaults to the host device
+
+    Returns:
+        the new generated tensor
+    '''
+    ret = Tensor(shape, device=device)
+    ret.uniform(0, 1)
+    return ret
+
+
+def zeros(shape, device=get_default_device()):
+    '''Return a tensor of the given shape filled with zeros.'''
+    ret = Tensor(shape, device=device)
+    ret.set_value(0.0)
+    return ret
+
+
+def ones(shape, device=get_default_device()):
+    '''Return a tensor of the given shape filled with ones.'''
+    ret = Tensor(shape, device=device)
+    ret.set_value(1.0)
+    return ret
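+
+
+# A minimal sketch of the constructors above: concatenate a random tensor
+# with a tensor of ones along axis 0.
+def _example_concatenate():
+    a = random((2, 3))
+    b = ones((2, 3))
+    c = concatenate([a, b], 0)   # shape (4, 3)
+    return to_numpy(c)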