This repository has been archived by the owner on Mar 13, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path: titanic_pipeline.yml
72 lines (62 loc) · 1.97 KB
/
titanic_pipeline.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# Titanic pipeline configuration.
# Same example as:
# https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html
# NOTE(review): the nesting below was reconstructed from a copy that had lost
# its indentation -- confirm it against the schema the pipeline loader expects.
source: titanic
dbname: ml_tips
train_sample_rate: 0.8
# Multiply all samples by this factor. The value should be an int.
oversample_n_times: 3
# Multiply positive samples by this factor. The value should be an int.
# oversample_pos_n_times: 2
query_dir: queries
id_column: "rowid"
target_column: "survived"
# stratify: true

numerical_columns:
  - columns:
      - "age"
      - "fare"
    transformer:
      imputer:
        strategy: "median"  # median, mean, constant
        phase: "train"  # train, test, or null (whole dataset to get stats)
      normalizer:
        strategy: "minmax"  # standardize, log1p, minmax
        phase: "train"

categorical_columns:
  - columns:
      - "embarked"
      - "sex"
      - "pclass"
    transformer:
      imputer:
        strategy: "constant"
        phase: "train"
        fill_value: "missing"

vectorizer:
  train_table: "train"
  test_table: "test"
  # whole_table: "whole"  # vectorize all data
  # Options for creating the dense vector required by train_randomforest*
  dense:
    mode: "auto"  # auto or force. Default: auto
    hashing: true  # true or false. Default: true
    # "auto" or an integer giving the maximum cardinality of the
    # categorical columns
    feature_cardinality: "auto"

# Not implemented yet
tuner:
  strategy: "gridsearch"  # gridsearch, randomsearch, python

trainer:
  - name: "train_classifier"
    model_table: "model_lr"
    # source_table: "train"  # Set if you want to use a specific table name
  - name: "train_randomforest_classifier"
    model_table: "model_rf"
    option: "-trees 15 -seed 31"

predictor:
  - name: "predict_classifier"
    model_table: "model_lr"
    output_table: "prediction_lr"
    # target_table: "test"  # Set if you want to use a specific table name
  - name: "predict_randomforest_classifier"
    model_table: "model_rf"
    output_table: "prediction_rf"

evaluator:
  metrics:
    - auc
    - logloss