# Load the requested libraries, installing any that are missing first.
#
# lib: character vector of package names.
# Returns a named logical vector, one element per package, TRUE when the
# package loaded successfully (the result of require()).
pack <- function(lib) {
  # packages requested but not yet present in the library
  new.lib <- lib[!(lib %in% installed.packages()[, 'Package'])]
  # explicit length test instead of relying on 0/  >0 truthiness
  if (length(new.lib) > 0) {
    install.packages(new.lib, dependencies = TRUE)
  }
  # vapply guarantees a named logical vector even for empty input,
  # where sapply would silently return a list
  vapply(lib, require, logical(1), character.only = TRUE)
}
# Packages used throughout the analysis: modeling (partykit, e1071, caret,
# MASS, car, rms), plots/diagnostics (corrplot, ggplot2, cowplot, ggpubr),
# tables (DT, pander), and ROC evaluation (ROCR, pROC).
packages <- c('partykit', 'e1071', 'caret', 'corrplot', 'MASS', 'car', 'DT',
'ggplot2', 'cowplot', 'ggpubr', 'rms', 'pander', 'ROCR', 'pROC')
pack(packages) # run function
## partykit e1071 caret corrplot MASS car DT ggplot2
## TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## cowplot ggpubr rms pander ROCR pROC
## TRUE TRUE TRUE TRUE TRUE TRUE
# Build the working-directory path from shorter segments so no single
# source line runs past the page width.
string1 <- 'C:/Users/lshpaner/OneDrive/Cornell University/Coursework'
string2 <- '/Data Science Certificate Program/CEEM585 '
string3 <- '- Supervised Learning Techniques'
# paste0() is the idiomatic form of paste(..., sep = ''); use '<-' for
# assignment rather than '='
working_dir <- paste0(string1, string2, string3)
# set the working directory by calling function
# NOTE(review): setwd() with a machine-specific absolute path makes the
# script non-portable; prefer project-relative paths where possible.
setwd(working_dir)
# confirm working directory
getwd()
[1] "C:/Users/lshpaner/OneDrive/Cornell University/Coursework/Data Science Certificate Program/CEEM585 - Supervised Learning Techniques"
In this part of the project, you will focus on building a model to
understand who might make a good product technician if hired using
linear discriminant analysis, logit, and ordered logit modeling. The data
set you will be using is in the file HRdata2groups.csv
,
contained in the RStudio instance.
The four performance scores in PerfScore
have been
mapped into two new categories of Satisfactory and Unsatisfactory under
the heading of CollapseScore
. Assume that levels 1 and 2
are unacceptable and levels 3 and 4 are acceptable. Build a linear
discriminant analysis using regression with these two categories as the
dependent variable. The purpose of this question is for you to examine
the independent variables and conclude which one to include in the
regression model. Several are not useful. Remember that when we do this,
only the coefficients in the model are useful. You may use the function
lm()
which has the syntax
lm(dependent variable ~ independent variable 1+ independent variable 2+…, data=frame)
.
This function is part of the package caret: hence you will need to use
the command library(caret)
.
The dataset is inspected and the categorical
classes of Acceptable
and
Unacceptable
are cast to the
Performance Score ( PerfScoreID
) in a new column named
CollapseScore
. However, since supervised learning
models need to learn from a numerical, binarized target column,
a new column of Score
is thus
created. Extraneous or otherwise not useful columns like
Employee ID
,
CollapseScore
, and
Score
are removed such that a
numerical only dataframe is created for subsequent distribution
analysis.
# read in the data
hr_data <- read.csv('HRdata2groups.csv')
# Adding column based on other column:
# display the dataset as an interactive (paginated) DT table
datatable(hr_data)
# cast categorical classes to Performance Score:
# PerfScoreID levels 3-4 -> 'Acceptable'; levels 1-2 -> 'Unacceptable'
hr_data$CollapseScore <- ifelse(hr_data$PerfScoreID >= 3, 'Acceptable',
'Unacceptable')
# numerically binarize these performance scores (Acceptable = 1, else 0)
# so that models can train on a numeric target
hr_data$Score <- ifelse(hr_data$CollapseScore == 'Acceptable', 1, 0)
# scrollX enables horizontal scrolling for the now-wider table
datatable(hr_data, options = list(scrollX = TRUE))
# extract meaningful data (i.e., remove the identifier, the categorical
# label, and the numeric target so only numeric predictors remain)
hr_data_numeric <- subset(hr_data, select = -c(EmpID, CollapseScore, Score))
The histogram distributions below do not yield or
uncover any near-zero-variance predictors, but it is worth noting that
Termd
has only two class
labels. MechanicalApt
and
VerbalApt
exhibit normality;
other variables approach the same trend.
# Plot a histogram for every column of `df` on an x-by-y panel grid and
# report any near-zero-variance predictors (via caret::nearZeroVar).
#
# df: data frame of numeric columns to inspect
# x:  number of panel rows;  y: number of panel columns
nearzerohist <- function(df, x, y) {
  # x rows by y columns & adjust margins
  par(mfrow = c(x, y), mar = c(4, 4, 4, 0))
  # seq_len() is safe even for a zero-column data frame (1:ncol is not)
  for (i in seq_len(ncol(df))) {
    hist(df[, i],
         xlab = names(df[i]),
         main = paste(names(df[i]), ''),
         col = 'gray18')
  }
  # Report near-zero variance predictors by NAME. The original passed the
  # numeric indices through print() inside cat(), which double-printed the
  # indices instead of listing the variable names.
  nearzero_names <- nearZeroVar(df, names = TRUE)
  if (length(nearzero_names) == 0) {
    print('There are no near-zero variance predictors.')
  } else {
    cat('The following near-zero variance predictors exist:',
        paste('\n', nearzero_names))
  }
}
# call the `nearzerohist()` function
nearzerohist(hr_data_numeric, x = 2, y = 3)
## [1] "There are no near-zero variance predictors."
Examining the
Score
column separately yields an imbalanced
dataset where 172 Acceptable
cases outweigh the 21 Unacceptable
classes. However, no solution is rendered for this outcome. The
data is treated as-is.
# Tabulate and plot the class balance of a categorical feature.
#
# feat:  feature or column of interest
# title: plot title
# x:     label for the first bar;  y: label for the second bar
# Returns the contingency table of class counts (printed at the call
# site when the call is not assigned).
class_balance <- function(feat, title, x, y) {
  # parse target variable into table showcasing class distribution
  feat_table <- table(unname(feat)) # generate table for column
  # adjust plot margins, restoring the caller's graphics settings on
  # exit so the function leaves no lasting side effect on par()
  old_par <- par(mar = c(2, 2, 2, 1))
  on.exit(par(old_par), add = TRUE)
  # plot the counts (values) of each respective class on barplot
  barplot(feat_table, main = title, space = c(0), horiz = FALSE,
          names.arg = c(x, y),
          col = c('cornflowerblue', 'brown2'))
  feat_table
}
class_balance(feat = hr_data$CollapseScore, title = 'HR by Class',
x = 'Acceptable', y = 'Unacceptable')
##
## Acceptable Unacceptable
## 172 21
The employee’s hiring status
EmpStatusID
in conjunction
with the employee’s satisfaction
EmpSatisfaction
and average aptitude score are used
in the model.
Averaging the mechanical and verbal scores row
over row creates a new Aptitude
column with these values. Mechanical and verbal aptitude scores
are omitted because of their high between-predictor relationships.
MechanicalApt
vs. VerbalApt
yields an
r = 0.96. Once the scores are averaged and passed into one
column, the problem of multicollinearity is removed.
Termd
is also omitted because
its correlation with EmpStatusID
is r = 0.96.
# Plot the lower-triangular correlation matrix of `df` and print the
# names of predictors whose pairwise correlation exceeds 0.75
# (candidates for removal due to multicollinearity).
#
# df: data frame of numeric predictors
multicollinearity <- function(df) {
  # Examine between-predictor correlations/multicollinearity
  corr <- cor(df, use = 'pairwise.complete.obs')
  corrplot(corr, mar = c(0, 0, 0, 0), method = 'color',
           col = colorRampPalette(c('#FC0320', '#FFFFFF',
                                    '#FF0000'))(100),
           addCoef.col = 'black', tl.srt = 45, tl.col = 'black',
           type = 'lower')
  # Names of variables exceeding the 0.75 threshold. (The original
  # called findCorrelation() twice; the index-only result was unused.)
  highCorr_names <- findCorrelation(corr, cutoff = 0.75, names = TRUE)
  cat(' The following variables should be omitted:',
      paste('\n', unlist(highCorr_names)))
}
# determine multicollinearity
multicollinearity(hr_data[c(1:7)])
## The following variables should be omitted:
## VerbalApt
## MechanicalApt
## Termd
Variance Inflation Factor (VIF) scores confirm similar behavior, exhibiting high multicollinearity once a threshold of five is reached and surpassed. A linear model (lm) is used to test this behavior.
# use a linear model as a proxy to confirm multicollinearity with VIF
model_all <- lm(Score ~ . - CollapseScore, data = hr_data) # remove CollapseScore
# since it is target
# and we are only interested in comparing between-predictor relationships
# use car library to extract VIF and parse it into a pandoc table using the
# linear model as a proxy for analysis
pandoc.table(vif(model_all), style = 'simple', split.table = Inf)
EmpID | Termd | EmpStatusID | PerfScoreID | EmpSatisfaction | MechanicalApt | VerbalApt |
---|---|---|---|---|---|---|
1.058 | 13.74 | 13.93 | 2.785 | 1.096 | 15.91 | 13.94 |
# Column chart of each predictor's VIF score.
vif_scores <- vif(model_all)
# widen the bottom margin so the rotated (las = 2) labels fit
par(mar = c(7.5, 2, 1, 1))
barplot(vif_scores, main = 'VIF Values', horiz = FALSE, col = 'steelblue',
        las = 2)
# dashed reference line: VIF above 5 indicates severe multicollinearity
abline(h = 5, lwd = 3, lty = 2)
# Average the mechanical and verbal scores row over row, since keeping
# both raw scores introduces multicollinearity (r = 0.96 between them).
# Named columns replace the original positional indices (c(6, 7) etc.),
# which would silently break if the column order ever changed.
hr_data$Aptitude <- rowMeans(hr_data[, c('MechanicalApt', 'VerbalApt')],
                             na.rm = TRUE)
# final modeling dataframe: selected predictors plus both encodings of
# the target (categorical CollapseScore and binary Score)
hr_data_final <- hr_data[, c('EmpStatusID', 'EmpSatisfaction',
                             'CollapseScore', 'Score', 'Aptitude')]
# Re-examine between-predictor correlations/multicollinearity on the
# numeric predictors only
highCorr <- findCorrelation(cor(hr_data_final[c('EmpStatusID',
                                                'EmpSatisfaction',
                                                'Aptitude')]),
                            cutoff = 0.75, names = TRUE)
cat(' The following variables should be omitted:',
    paste('\n', unlist(highCorr)))
## The following variables should be omitted:
##
The Score vs. Aptitude scatterplot below exhibits a moderate correlation of r = 0.62. Employee satisfaction exhibits a much weaker relationship of r = 0.26, and there is almost no relationship between Score and Employee Status ID where r = -0.067.
# create function for plotting correlations between variables
# inputs: xvar: independent variable, yvar: dependent variable,
# title: plot title, xlab: x-axis label, ylab: y-axis label
# create function for plotting correlations between variables
# inputs: xvar: independent variable, yvar: dependent variable,
#         title: plot title, xlab: x-axis label, ylab: y-axis label
# note: xvar/yvar are passed as extracted vectors (e.g. df$col), not as
# column names; aes() resolves them from the function's environment --
# presumably they must come from `df` for the fit to make sense (verify
# against callers)
correl_plot <- function(df, xvar, yvar, title, xlab, ylab) {
ggplot(df, aes(x = xvar, y = yvar)) +
ggtitle(title) +
xlab(xlab) + ylab(ylab) +
geom_point(pch = 1) + ylim(0, 1.25) + # open-circle points; y-axis capped at 1.25
geom_smooth(method = 'lm', se = FALSE) + # least-squares trend line, no CI band
theme_classic() +
stat_cor(method = 'pearson', label.x = 0.15, label.y = 0.20) # correl coeff.
}
# create three correlation plots on same grid (one per predictor in the
# final modeling dataframe, each against the binary Score target)
plot1 <- correl_plot(hr_data_final, xvar = hr_data_final$EmpStatusID,
yvar = hr_data_final$Score, title = 'Score vs. EmpStatusID',
xlab = 'EmpStatusID', ylab = 'Score')
plot2 <- correl_plot(hr_data_final, xvar = hr_data_final$EmpSatisfaction,
yvar = hr_data_final$Score,
title = 'Score vs. EmpSatisfaction',
xlab = 'EmpSatisfaction', ylab = 'Score')
plot3 <- correl_plot(hr_data_final, xvar = hr_data_final$Aptitude,
yvar = hr_data_final$Score, title = 'Score vs. Aptitude',
xlab = 'Aptitude', ylab = 'Score')
# plot all correlations together
# NOTE(review): cowplot documents align values 'none', 'h', 'v', 'hv';
# align = '' is undocumented -- confirm it behaves like 'none'
plot_grid(plot1, plot2, plot3, labels = 'AUTO', ncol = 3, align = '')
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'