-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcnv_rf_classify.R
executable file
·74 lines (59 loc) · 4.12 KB
/
cnv_rf_classify.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
cnv_random_forest_classify <- function(path_to_train,path_to_test,path_to_output){
# Read in training dataset
training <- read.table(path_to_train, sep = "\t");
# Read in test (identified) dataset
test <- read.table(path_to_test, sep = "\t");
# Combine the two so we can do feature scaling and mean normalization
combined <- rbind(training, test);
# Add column name
colnames(combined)<- c("time_stamp","username","sample","control","window_id","gene","avg_cnv_ratio","avg_bowtie_bwa_ratio","bb_std","cnv_ratio_std","cov_std","avg_cov","avg_dup_ratio","gc_perc","allele_freq","read_stats","is_training","het_classification");
# Create output data frame
classified_output <- test;
colnames(classified_output)<- c("time_stamp","username","sample","control","window_id","gene","avg_cnv_ratio","avg_bowtie_bwa_ratio","bb_std","cnv_ratio_std","cov_std","avg_cov","avg_dup_ratio","gc_perc","allele_freq","read_stats","is_training","het_classification");
# Do feature scaling and mean normalization
combined[,"avg_cnv_ratio"] <- scale_n_mean_normalize_column(as.numeric(combined[,"avg_cnv_ratio"]));
combined[,"bb_std"] <- scale_n_mean_normalize_column(as.numeric(combined[,"bb_std"]));
combined[,"cnv_ratio_std"] <- scale_n_mean_normalize_column(as.numeric(combined[,"cnv_ratio_std"]));
combined[,"cov_std"] <- scale_n_mean_normalize_column(as.numeric(combined[,"cov_std"]));
combined[,"avg_cov"] <- scale_n_mean_normalize_column(as.numeric(combined[,"avg_cov"]));
combined[,"avg_dup_ratio"] <- scale_n_mean_normalize_column(as.numeric(combined[,"avg_dup_ratio"]));
combined[,"gc_perc"] <- scale_n_mean_normalize_column(as.numeric(combined[,"gc_perc"]));
combined[,"allele_freq"] <- scale_n_mean_normalize_column(as.numeric(combined[,"allele_freq"]));
combined[,"read_stats"] <- scale_n_mean_normalize_column(as.numeric(combined[,"read_stats"]));
dataTrain = combined[combined$is_training == 1,]
dataTest = combined[combined$is_training == 0,]
dataTrain[,18] <- as.factor(dataTrain[,18]);
dataTest[,18] <- as.factor(dataTest[,18]);
# NOTE: class in the formula below contains the different classes we want to classify. In out case 1 if it is a true het variant
r_formula <- formula(het_classification ~ avg_cnv_ratio + bb_std + cnv_ratio_std + cov_std + avg_cov + avg_dup_ratio + gc_perc + allele_freq + read_stats);
randomForestmodel <- randomForest(r_formula, data=dataTrain);
predict_forest <-predict(randomForestmodel,type="class",newdata=dataTest);
classified_output[,18] <- predict_forest;
write.table(classified_output, path_to_output, sep="\t",row.names=FALSE);
}
scale_n_mean_normalize_column <- function(data_column){
if(max(data_column,na.rm=TRUE) == min(data_column,na.rm=TRUE)){
data_column <- data_column - mean(data_column,na.rm=TRUE);
}else{
data_column <- (data_column - mean(data_column,na.rm=TRUE))/(max(data_column,na.rm=TRUE) - min(data_column,na.rm=TRUE));
}
return (data_column);
}
file_paths = readLines("cnv_rf_data_paths.txt")
training_data <- strsplit(file_paths[grepl("training_data", file_paths)],"=")[[1]][2];
sample_name <- strsplit(file_paths[grepl("sample_name", file_paths)],"=")[[1]][2];
ident_het_path <- strsplit(file_paths[grepl("ident_het_path", file_paths)],"=")[[1]][2];
classified_het_path <- strsplit(file_paths[grepl("classified_het_path", file_paths)],"=")[[1]][2];
R_libs <- strsplit(file_paths[grepl("R_libs", file_paths)],"=")[[1]][2];
cmd <- paste("hadoop", "fs", "-getmerge", ident_het_path, "ident_het.txt", sep=" ");
cmd_response <- try(system(cmd, intern = TRUE));
print(R_libs)
library('randomForest',lib.loc =R_libs);
cnv_random_forest_classify(training_data,"ident_het.txt","classified_het_data.txt")
cmd <- paste("hadoop", "fs", "-put", "classified_het_data.txt", classified_het_path, sep=" ");
cmd_response <- try(system(cmd, intern = TRUE));
# clean up
cmd <- paste("rm", "-rf", "ident_het.txt", sep=" ");
cmd_response <- try(system(cmd, intern = TRUE));
cmd <- paste("rm", "-rf", "classified_het_data.txt", sep=" ");
cmd_response <- try(system(cmd, intern = TRUE));