-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcode.r
161 lines (133 loc) · 4.67 KB
/
code.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# Read data
# The script expects hr.csv in the current working directory (run it from
# the project folder). No setwd() is issued: a hard-coded, machine-specific
# path makes the script fail on every other machine.
getwd()
hrdata <- read.csv(
  "hr.csv",
  header = TRUE,
  # Treat both empty cells and the literal text "NA" as missing.
  na.strings = c("", "NA"),
  # R >= 4.0 reads text columns as character by default; the plots and
  # models below work with the categorical columns as factors.
  stringsAsFactors = TRUE
)
head(hrdata)
# Load libraries
library(ggplot2)
library(caret)
library(irr)
library(dplyr)
library(pvclust)
library(Amelia)
library(fastDummies)
library(caTools)
library(randomForest)
#########################################
## Data Exploration ##
#########################################
# Column types and a per-column statistical summary
str(object = hrdata)
summary(object = hrdata)
# Missing values: count the NA entries in every column
missing_mask <- is.na(hrdata)
colSums(x = missing_mask)
# Employee status: how many rows carry each value of `left`
ggplot(data = hrdata, mapping = aes(x = left)) +
  geom_histogram(bins = 2, binwidth = .5, fill = 'orange') +
  ggtitle('Employment Status Of HR Data') +
  xlab('Status') +
  ylab('Count')
# Distribution by department
# A single bar layer counts the rows per department. The fill colour is a
# fixed setting, so it goes outside aes(); inside aes() the string 'red'
# would be treated as a data value, producing a legend and a default
# palette colour instead of red bars.
ggplot(hrdata, aes(x = sales)) +
  geom_bar(fill = 'red') +
  labs(title = 'Employee Count By Department', x = 'Department', y = 'Count')
# Distribution of satisfaction levels (base-graphics histogram)
satisfaction <- hrdata$satisfaction_level
hist(
  x = satisfaction,
  breaks = 10,
  main = 'Distribution Of Satisfaction Level',
  xlab = 'Satisfaction Level',
  ylab = 'Count',
  col = 'pink'
)
# Projects: cross-tabulate `left` against the number of projects and draw
# the counts as side-by-side bars, one group per project count.
left_by_projects <- table(as.factor(hrdata$left), hrdata$number_project)
barplot(
  height = left_by_projects,
  beside = TRUE,
  col = c("blue", "orange"),
  legend.text = rownames(left_by_projects),
  main = "Employees Left Vs. Projects",
  xlab = "No. of Projects"
)
# Dummy variable creation
# Build hr2: hrdata with the categorical columns `sales` and `salary`
# replaced by one indicator (0/1) column per level.
data.class(hrdata$salary)
categorical_cols <- c("sales", "salary")
# dummy_cols() takes column NAMES in select_columns and returns a new data
# frame, so the result has to be assigned. Every level keeps its own
# indicator column (no reference level is dropped).
hr2 <- dummy_cols(hrdata, select_columns = categorical_cols)
# Drop the original categorical columns so only their indicators remain.
hr2 <- hr2[, !(colnames(hr2) %in% categorical_cols), drop = FALSE]
# View() needs an interactive session; skip it when run via Rscript.
if (interactive()) {
  View(hr2)
}
#########################################
## Logistic Regression ##
#########################################
# Split data into training (70%) and test (30%) samples
set.seed(200)
index <- sample(nrow(hrdata), 0.70 * nrow(hrdata), replace = FALSE)
train <- hrdata[index, ]
test <- hrdata[-index, ]
# Build first model using all variables
mod <- glm(
  left ~ .,
  data = train,
  family = "binomial",
  control = list(maxit = 50)
)
summary(mod)
# Stepwise selection in both directions. The selected model is stored so
# it can be inspected; `mod` (the full model) is left unchanged because
# the prediction step further down uses it.
step_mod <- step(mod, direction = "both")
# Summary of the stepwise-selected model
summary(step_mod)
# Find the best predictors: 95% confidence intervals for the coefficients
confint(mod, level = .95)
# Model predictions: fitted probabilities for the held-out test rows
hr_pred <- predict(mod, test, type = 'response')
# Create a ROC curve (sensitivity vs. specificity) and compute its AUC
modAUC <- colAUC(hr_pred, test$left, plotROC = TRUE)
# colAUC() returns a matrix; take the single value as a plain number.
auc_value <- as.numeric(modAUC)
abline(h = auc_value, col = 'red')
# NOTE(review): the value drawn and printed here is the AUC returned by
# colAUC(), not a classification threshold, although the label calls it a
# cutoff. Confirm the intended wording and how the threshold below was
# chosen.
text(.2, .9, cex = .8,
     labels = paste('Optimal Cutoff: ', round(auc_value, 4)))
# Convert probabilities to class
# 1 indicates the employee left,
# 0 indicates the employee stayed
predclass <- ifelse(hr_pred > .7860, 1, 0)
# Create a confusion matrix
# confusionMatrix() requires factors with identical levels; fixing the
# levels to 0/1 also keeps the table 2x2 if one class is never predicted.
confusionMatrix(
  factor(predclass, levels = c(0, 1)),
  factor(test$left, levels = c(0, 1))
)
#########################################
## Cluster Analysis ##
#########################################
# Run pvclust on the dummy-encoded data and draw the resulting dendrogram
hr_pvclust <- pvclust(data = hr2)
plot(x = hr_pvclust, main = 'HR Data Cluster')
#########################################
## ##
## 4. Anova Tests ##
## ##
#########################################
# ANOVA: Time Spent At Company By Sales
# One-way ANOVA of tenure across departments, Tukey pairwise comparisons,
# and a boxplot of the same relationship.
tenure_by_dept <- aov(formula = time_spend_company ~ sales, data = hrdata)
summary(tenure_by_dept)
TukeyHSD(tenure_by_dept, conf.level = .95)
ggplot(data = hrdata, mapping = aes(x = sales, y = time_spend_company)) +
  geom_boxplot(
    fill = '#4c90ff',
    color = '#2a5fb7',
    outlier.color = 'red',
    outlier.size = .5
  ) +
  ggtitle('ANOVA: Time Spent At Company By Dept.') +
  xlab('Employee Department') +
  ylab('Time Spent At Company')
# ANOVA: satisfaction level by salary
# One-way ANOVA, Tukey pairwise comparisons, and a boxplot per salary level.
satisfaction_by_salary <- aov(formula = satisfaction_level ~ salary,
                              data = hrdata)
summary(satisfaction_by_salary)
TukeyHSD(satisfaction_by_salary)
ggplot(data = hrdata, mapping = aes(x = salary, y = satisfaction_level)) +
  geom_boxplot(
    fill = c('#ff4f7d', '#4fa1ff', '#4fff95'),  # one fill colour per box
    color = '#333333',
    outlier.color = 'red',
    outlier.size = .5
  ) +
  ggtitle('ANOVA: Satisfaction Level By Salary') +
  xlab('Salary Level') +
  ylab('Satisfaction Level')
#########################################
## ##
## Random Forest ##
## ##
#########################################
# Create model
# Categorical predictors must be factors with the same levels in the
# training and test data. If the data were read with text columns as
# character (the default on R >= 4.0), convert them here on copies; when
# the columns are already factors the loop does nothing.
rf_train <- train
rf_test <- test
text_cols <- names(train)[vapply(train, is.character, logical(1))]
for (col in text_cols) {
  shared_levels <- sort(unique(c(train[[col]], test[[col]])))
  rf_train[[col]] <- factor(train[[col]], levels = shared_levels)
  rf_test[[col]] <- factor(test[[col]], levels = shared_levels)
}
rfmod <- randomForest(as.factor(left) ~ ., data = rf_train, ntree = 20)
# print() shows the fitted forest itself; summary() on a randomForest
# object only lists the lengths of its components.
print(rfmod)
rfpred <- predict(rfmod, rf_test)
summary(rfpred)
# View confusion matrix
# The reference must be a factor with the same levels as the predictions.
confusionMatrix(rfpred, factor(test$left, levels = levels(rfpred)))