forked from Reza-B/data_cleaner_dsl
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest.datacleaner
41 lines (28 loc) · 1009 Bytes
/
test.datacleaner
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
// Sample data-cleaning pipeline: loads a CSV, cleans and transforms several
// columns, splits the data, and finally one-hot encodes everything.
// Steps are listed in the order they appear; later steps operate on the
// output of earlier ones (assumed — confirm against the interpreter).
// Load the sample data from "sample_data.csv"
load "sample_data.csv"
// Remove rows with missing values in the 'salary' column
remove_rows_missing salary
// Drop a single row, identified by the number 5
// NOTE(review): whether row numbers are 0- or 1-based is not visible here — confirm.
// To drop additional rows, presumably add one drop_row statement per row — verify.
drop_row 5
// Remove duplicate rows
remove_duplicates
// Fill missing values in the 'salary' column with the column mean
// NOTE(review): rows with a missing 'salary' were already removed above, so
// this step presumably has nothing left to fill — confirm it is intentional.
fill_missing salary with mean
// Normalize the 'salary' column to the range 0 to 1
normalize salary to_range(0, 1)
// Standardize the 'tax' column
standardize tax
// Apply a logarithm transform to the 'experience' column
log_transform experience
// Auto-categorize the 'salary' column into 3 clusters
// (at this point 'salary' has already been normalized by the step above)
auto_categorize salary n_clusters=3
// Split data into 70% training and 30% testing sets
// NOTE(review): the split happens before the remaining cleaning steps below;
// whether those steps also apply to the resulting splits is not visible here — confirm.
split_data train=70 , test=30
// Integrate the value IT into Finance within the 'department' column
// (presumably rewrites IT entries as Finance — confirm against the DSL's integrate semantics)
integrate IT to Finance in department
// Delete outliers in the 'salary' column using the IQR method
delete_outliers salary with IQR
// Drop the 'id' column (only 'id' is dropped here; 'department' is kept)
drop_column id
// Encode all columns with one-hot encoding
encode all with one_hot