This project is my attempt to follow the kNN algorithm chapter in Machine Learning with R, applied to NTUSC vote-rate and attendance-rate data.
library(tidyverse)
wbcd <- read_rds("D:/R_programming/Journalism_homework/NTUSC/datalm")
wbcd <- as_tibble(wbcd)
wbcd$Attnd <- ifelse(wbcd$Attnd_Rate >= 0.6, "H", "L")
# table of Attendance Rate
table(wbcd$Attnd)
##
## H L
## 102 84
# recode Attendance Rate as a factor
wbcd$Attnd <- factor(wbcd$Attnd, levels = c("H", "L"),
labels = c("High", "Low"))
# table of proportions with more informative labels
round(prop.table(table(wbcd$Attnd)) * 100, digits = 1)
##
## High Low
## 54.8 45.2
“H” marks student representatives with a high attendance rate and “L” those with a low one. There are 186 observations in total.
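A quick sanity check of the row count (it should match 102 + 84 = 186 from the table above):
# number of observations; should equal 102 + 84 = 186
nrow(wbcd)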
# summarize three numeric features
summary(wbcd[c("support_rate", "vote_rate", "nonobj_Rate")])
## support_rate vote_rate nonobj_Rate
## Min. :0.0000 Min. :0.01000 Min. :0.0000
## 1st Qu.:0.1500 1st Qu.:0.03000 1st Qu.:0.0000
## Median :0.2000 Median :0.04000 Median :0.3200
## Mean :0.2391 Mean :0.06005 Mean :0.2871
## 3rd Qu.:0.2875 3rd Qu.:0.10000 3rd Qu.:0.4700
## Max. :1.0000 Max. :0.20000 Max. :0.8700
The summary above gives a first look at the three numeric features I am going to use.
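A quick plot makes their spread easier to see. A minimal ggplot2 sketch, assuming a tidyr recent enough to provide pivot_longer() (both packages are loaded with the tidyverse above):
# plot the distributions of the three features side by side
wbcd %>%
  select(support_rate, vote_rate, nonobj_Rate) %>%
  pivot_longer(everything(), names_to = "feature", values_to = "value") %>%
  ggplot(aes(value)) +
  geom_histogram(bins = 20) +
  facet_wrap(~ feature, scales = "free_x")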
# create normalization function
normalize <- function(x) {
return ((x - min(x)) / (max(x) - min(x)))
}
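As a quick illustration, the function maps the minimum of a vector to 0 and the maximum to 1:
# quick test of the normalization function
normalize(c(1, 2, 3, 4, 5))  # expected: 0.00 0.25 0.50 0.75 1.00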
# normalize the wbcd data
wbcd_n <- wbcd %>%
select(4:12) %>%
mutate(support_rate = normalize(support_rate),
nonobj_Rate = normalize(nonobj_Rate),
vote_rate = normalize(vote_rate),
college_population_rate = normalize(college_population_rate)) %>%
select(term, support_rate, nonobj_Rate, vote_rate, college_population_rate, Attnd)
# confirm that normalization worked
summary(wbcd_n$support_rate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.1500 0.2000 0.2391 0.2875 1.0000
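With a recent dplyr (>= 1.0.0), the four mutate() calls above could also be written more compactly with across(); a sketch producing the same result under that assumption:
# equivalent, more compact normalization using across()
wbcd_n <- wbcd %>%
  select(4:12) %>%
  mutate(across(c(support_rate, nonobj_Rate, vote_rate, college_population_rate),
                normalize)) %>%
  select(term, support_rate, nonobj_Rate, vote_rate, college_population_rate, Attnd)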
After normalizing, the next step is to split the data into training and test sets.
# create training and test data
wbcd_train <- wbcd_n %>% filter(term != "106-1")
wbcd_test <- wbcd_n %>% filter(term == "106-1")
# create labels for training and test data
wbcd_train_labels <- wbcd_train$Attnd
wbcd_test_labels <- wbcd_test$Attnd
wbcd_train_labels <- as_vector(wbcd_train_labels)
wbcd_test_labels <- as_vector(wbcd_test_labels)
# keep only the feature columns needed for kNN
wbcd_train <- wbcd_train %>% select(2:4)
wbcd_test <- wbcd_test %>% select(2:4)
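Before training, it is worth confirming the split sizes and the class balance of the training labels (a minimal check using the objects created above):
# sanity check: set sizes and training label balance
nrow(wbcd_train)
nrow(wbcd_test)
prop.table(table(wbcd_train_labels))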
The split is complete. The next step is to train a model on the data with the class package, then evaluate its performance with the gmodels package.
# load the "class" library
library(class)
wbcd_test_pred <- knn(train = wbcd_train, test = wbcd_test,
cl = wbcd_train_labels, k = 21)
# load the "gmodels" library
library(gmodels)
prop.table(table(wbcd_test_labels, wbcd_test_pred))
## wbcd_test_pred
## wbcd_test_labels High Low
## High 0.48837209 0.02325581
## Low 0.46511628 0.02325581
# Create the cross tabulation of predicted vs. actual
CrossTable(x = wbcd_test_labels, y = wbcd_test_pred,
prop.chisq = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 43
##
##
## | wbcd_test_pred
## wbcd_test_labels | High | Low | Row Total |
## -----------------|-----------|-----------|-----------|
## High | 21 | 1 | 22 |
## | 0.955 | 0.045 | 0.512 |
## | 0.512 | 0.500 | |
## | 0.488 | 0.023 | |
## -----------------|-----------|-----------|-----------|
## Low | 20 | 1 | 21 |
## | 0.952 | 0.048 | 0.488 |
## | 0.488 | 0.500 | |
## | 0.465 | 0.023 | |
## -----------------|-----------|-----------|-----------|
## Column Total | 41 | 2 | 43 |
## | 0.953 | 0.047 | |
## -----------------|-----------|-----------|-----------|
##
##
It is quite clear that the model performs poorly… but no need to worry; there are still ways to improve it.
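To put a single number on it, the overall accuracy can be computed directly from the predictions; from the cross table above it is (21 + 1) / 43 ≈ 0.51, barely better than guessing. A minimal sketch:
# overall accuracy of the min-max normalized kNN model
mean(as.character(wbcd_test_pred) == as.character(wbcd_test_labels))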
# use the scale() function to z-score standardize a data frame
wbcd_z <- wbcd %>%
select(support_rate, nonobj_Rate, vote_rate, Attnd, term)
wbcd_z_scale <- as.data.frame(scale(wbcd_z[-(4:5)]))
# confirm that the transformation was applied correctly
summary(wbcd_z_scale$support_rate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.4402 -0.5366 -0.2355 0.0000 0.2916 4.5837
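scale() simply applies the z-score formula (x - mean(x)) / sd(x) to each column; done by hand for one column, the summary should match the output above:
# z-score standardization by hand for a single column
summary((wbcd$support_rate - mean(wbcd$support_rate)) / sd(wbcd$support_rate))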
# create training and test datasets
wbcd_z_train <- wbcd_z %>% filter(term != "106-1")
wbcd_z_train <- scale(wbcd_z_train[-(4:5)])
wbcd_z_test <- wbcd_z %>% filter(term == "106-1")
wbcd_z_test <- scale(wbcd_z_test[-(4:5)])
# re-classify test cases
wbcd_test_pred_z <- knn(train = wbcd_z_train, test = wbcd_z_test,
cl = wbcd_train_labels, k = 21)
# Create the cross tabulation of predicted vs. actual
CrossTable(x = wbcd_test_labels, y = wbcd_test_pred_z,
prop.chisq = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 43
##
##
## | wbcd_test_pred_z
## wbcd_test_labels | High | Low | Row Total |
## -----------------|-----------|-----------|-----------|
## High | 17 | 5 | 22 |
## | 0.773 | 0.227 | 0.512 |
## | 0.739 | 0.250 | |
## | 0.395 | 0.116 | |
## -----------------|-----------|-----------|-----------|
## Low | 6 | 15 | 21 |
## | 0.286 | 0.714 | 0.488 |
## | 0.261 | 0.750 | |
## | 0.140 | 0.349 | |
## -----------------|-----------|-----------|-----------|
## Column Total | 23 | 20 | 43 |
## | 0.535 | 0.465 | |
## -----------------|-----------|-----------|-----------|
##
##
As you can see, this time I used z-score standardization instead of min-max normalization, and it seems to work much better. The next step is to try several different values of k.
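Rather than reading each cross table one by one, the candidate k values can also be compared by test-set accuracy in a single pass (a minimal sketch, reusing the z-scored objects above):
# compare test-set accuracy across several values of k
ks <- c(1, 5, 11, 15, 17, 21)
sapply(ks, function(k) {
  pred <- knn(train = wbcd_z_train, test = wbcd_z_test,
              cl = wbcd_train_labels, k = k)
  mean(as.character(pred) == as.character(wbcd_test_labels))
})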
wbcd_test_pred_z <- knn(train = wbcd_z_train, test = wbcd_z_test, cl = wbcd_train_labels, k=1)
CrossTable(x = wbcd_test_labels, y = wbcd_test_pred_z, prop.chisq=FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 43
##
##
## | wbcd_test_pred_z
## wbcd_test_labels | High | Low | Row Total |
## -----------------|-----------|-----------|-----------|
## High | 11 | 11 | 22 |
## | 0.500 | 0.500 | 0.512 |
## | 0.524 | 0.500 | |
## | 0.256 | 0.256 | |
## -----------------|-----------|-----------|-----------|
## Low | 10 | 11 | 21 |
## | 0.476 | 0.524 | 0.488 |
## | 0.476 | 0.500 | |
## | 0.233 | 0.256 | |
## -----------------|-----------|-----------|-----------|
## Column Total | 21 | 22 | 43 |
## | 0.488 | 0.512 | |
## -----------------|-----------|-----------|-----------|
##
##
wbcd_test_pred_z <- knn(train = wbcd_z_train, test = wbcd_z_test, cl = wbcd_train_labels, k=5)
CrossTable(x = wbcd_test_labels, y = wbcd_test_pred_z, prop.chisq=FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 43
##
##
## | wbcd_test_pred_z
## wbcd_test_labels | High | Low | Row Total |
## -----------------|-----------|-----------|-----------|
## High | 16 | 6 | 22 |
## | 0.727 | 0.273 | 0.512 |
## | 0.696 | 0.300 | |
## | 0.372 | 0.140 | |
## -----------------|-----------|-----------|-----------|
## Low | 7 | 14 | 21 |
## | 0.333 | 0.667 | 0.488 |
## | 0.304 | 0.700 | |
## | 0.163 | 0.326 | |
## -----------------|-----------|-----------|-----------|
## Column Total | 23 | 20 | 43 |
## | 0.535 | 0.465 | |
## -----------------|-----------|-----------|-----------|
##
##
wbcd_test_pred_z <- knn(train = wbcd_z_train, test = wbcd_z_test, cl = wbcd_train_labels, k=11)
CrossTable(x = wbcd_test_labels, y = wbcd_test_pred_z, prop.chisq=FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 43
##
##
## | wbcd_test_pred_z
## wbcd_test_labels | High | Low | Row Total |
## -----------------|-----------|-----------|-----------|
## High | 16 | 6 | 22 |
## | 0.727 | 0.273 | 0.512 |
## | 0.696 | 0.300 | |
## | 0.372 | 0.140 | |
## -----------------|-----------|-----------|-----------|
## Low | 7 | 14 | 21 |
## | 0.333 | 0.667 | 0.488 |
## | 0.304 | 0.700 | |
## | 0.163 | 0.326 | |
## -----------------|-----------|-----------|-----------|
## Column Total | 23 | 20 | 43 |
## | 0.535 | 0.465 | |
## -----------------|-----------|-----------|-----------|
##
##
wbcd_test_pred_z <- knn(train = wbcd_z_train, test = wbcd_z_test, cl = wbcd_train_labels, k=15)
CrossTable(x = wbcd_test_labels, y = wbcd_test_pred_z, prop.chisq=FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 43
##
##
## | wbcd_test_pred_z
## wbcd_test_labels | High | Low | Row Total |
## -----------------|-----------|-----------|-----------|
## High | 16 | 6 | 22 |
## | 0.727 | 0.273 | 0.512 |
## | 0.640 | 0.333 | |
## | 0.372 | 0.140 | |
## -----------------|-----------|-----------|-----------|
## Low | 9 | 12 | 21 |
## | 0.429 | 0.571 | 0.488 |
## | 0.360 | 0.667 | |
## | 0.209 | 0.279 | |
## -----------------|-----------|-----------|-----------|
## Column Total | 25 | 18 | 43 |
## | 0.581 | 0.419 | |
## -----------------|-----------|-----------|-----------|
##
##
wbcd_test_pred_z <- knn(train = wbcd_z_train, test = wbcd_z_test, cl = wbcd_train_labels, k=17)
CrossTable(x = wbcd_test_labels, y = wbcd_test_pred_z, prop.chisq=FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 43
##
##
## | wbcd_test_pred_z
## wbcd_test_labels | High | Low | Row Total |
## -----------------|-----------|-----------|-----------|
## High | 18 | 4 | 22 |
## | 0.818 | 0.182 | 0.512 |
## | 0.720 | 0.222 | |
## | 0.419 | 0.093 | |
## -----------------|-----------|-----------|-----------|
## Low | 7 | 14 | 21 |
## | 0.333 | 0.667 | 0.488 |
## | 0.280 | 0.778 | |
## | 0.163 | 0.326 | |
## -----------------|-----------|-----------|-----------|
## Column Total | 25 | 18 | 43 |
## | 0.581 | 0.419 | |
## -----------------|-----------|-----------|-----------|
##
##
wbcd_test_pred_z <- knn(train = wbcd_z_train, test = wbcd_z_test, cl = wbcd_train_labels, k=21)
CrossTable(x = wbcd_test_labels, y = wbcd_test_pred_z, prop.chisq=FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 43
##
##
## | wbcd_test_pred_z
## wbcd_test_labels | High | Low | Row Total |
## -----------------|-----------|-----------|-----------|
## High | 17 | 5 | 22 |
## | 0.773 | 0.227 | 0.512 |
## | 0.739 | 0.250 | |
## | 0.395 | 0.116 | |
## -----------------|-----------|-----------|-----------|
## Low | 6 | 15 | 21 |
## | 0.286 | 0.714 | 0.488 |
## | 0.261 | 0.750 | |
## | 0.140 | 0.349 | |
## -----------------|-----------|-----------|-----------|
## Column Total | 23 | 20 | 43 |
## | 0.535 | 0.465 | |
## -----------------|-----------|-----------|-----------|
##
##
To conclude, I applied the concepts I learned and tried several useful packages, such as dplyr, class, and gmodels, in this project. The script is largely adapted from Machine Learning with R, applied to my own data. It is worth trying on other data next time. I hope you enjoyed it!