Read in and clean data
This code reads in the two CSVs and reduces the data frames to the 53 shared features (dropping columns that contain only NAs and columns that are not in the test set). Lastly, it converts classe to a factor and splits a validation set off the training data.
library(dplyr)
library(caret)
library(rattle)  # for fancyRpartPlot

training <- read.csv("C:/Users/Srodger/Documents/R/coursera/ml/week 4 project/pml-training.csv")
testing <- read.csv("C:/Users/Srodger/Documents/R/coursera/ml/week 4 project/pml-testing.csv")
set.seed(222)

# Coerce the sensor columns to numeric and keep only the measurement columns
testing[, 7:159] <- sapply(testing[, 7:159], as.numeric)
testing <- testing %>% select(roll_belt:problem_id) #%>% sample_frac(.4)
testing <- testing %>% select_if(~ !any(is.na(.)))
training[, 7:159] <- sapply(training[, 7:159], as.numeric)
training <- training %>% select(roll_belt:classe) #%>% sample_frac(.4)

# Keep the same predictors as the test set, with classe in place of problem_id
cols <- colnames(testing)
cols[53] <- "classe"
training <- training %>% select(all_of(cols))
training <- training %>% select_if(~ !any(is.na(.)))
training$classe <- as.factor(training$classe)

# 5-fold cross-validation for all models (assumed settings; consistent with the
# "Number of resamples: 5" reported in the model comparison below)
fitControl <- trainControl(method = "cv", number = 5)

# Carve out the validation set BEFORE overwriting training; otherwise the
# validation rows are drawn from the already-subsetted training data
inTrain <- createDataPartition(y = training$classe, p = 0.6, list = FALSE)
validation <- training[-inTrain, ]
training <- training[inTrain, ]
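A quick sanity check of the split (a sketch I've added; this output is not part of the original report):

# Confirm 53 columns, a roughly 60/40 row split, and the class balance
dim(training)
dim(validation)
table(training$classe)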
Build decision tree, display the tree, and calculate the confusion matrix and statistics.
fit_dt <- train(classe ~ ., method = "rpart", data = training, trControl = fitControl)
fancyRpartPlot(fit_dt$finalModel)

pred_dt <- predict(fit_dt, validation)
confusionMatrix(pred_dt, validation$classe)
## Confusion Matrix and Statistics
##
##           Reference
## Prediction    A    B    C    D    E
##          A 1365  894  842  749  843
##          B    0    0    0    0    0
##          C    0    0    0    0    0
##          D    0    0    0    0    0
##          E    0    0    0    0    0
##
## Overall Statistics
##
##                Accuracy : 0.2909
##                  95% CI : (0.2779, 0.3041)
##     No Information Rate : 0.2909
##     P-Value [Acc > NIR] : 0.5055
##
##                   Kappa : 0
##  Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            1.0000   0.0000   0.0000   0.0000   0.0000
## Specificity            0.0000   1.0000   1.0000   1.0000   1.0000
## Pos Pred Value         0.2909      NaN      NaN      NaN      NaN
## Neg Pred Value            NaN   0.8095   0.8206   0.8404   0.8204
## Prevalence             0.2909   0.1905   0.1794   0.1596   0.1796
## Detection Rate         0.2909   0.0000   0.0000   0.0000   0.0000
## Detection Prevalence   1.0000   0.0000   0.0000   0.0000   0.0000
## Balanced Accuracy      0.5000   0.5000   0.5000   0.5000   0.5000
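The tree degenerates to predicting class A for everything, which usually means caret's default complexity-parameter (cp) grid is too coarse for this problem. A minimal sketch of a wider search, assuming the same fitControl (tuneLength = 10 is an illustrative value, not part of the original run):

# Search 10 cp values instead of caret's default 3
fit_dt2 <- train(classe ~ ., method = "rpart", data = training,
                 trControl = fitControl, tuneLength = 10)
fit_dt2$bestTune  # cp chosen by cross-validation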
Build random forest model, show the most important variables, and calculate the confusion matrix. Lastly, plot the fit and the final model.
# Random forest was trained in a previous run; load the saved model with readRDS
# fit_rf <- train(classe ~ ., method = "rf", data = imputed_train, trControl = fitControl)
# saveRDS(fit_rf, "randf.RData")
fit_rf <- readRDS("C:/Users/Srodger/Documents/R/coursera/ml/week 4 project/randf.RData")
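Loading a saved fit avoids retraining on every knit. The same idea as a reusable cache-or-train pattern (a sketch; the relative path and the use of training rather than imputed_train are my assumptions):

# Train once, then reload from disk on subsequent runs
rf_path <- "randf.RData"  # hypothetical relative path
if (file.exists(rf_path)) {
  fit_rf <- readRDS(rf_path)
} else {
  fit_rf <- train(classe ~ ., method = "rf", data = training, trControl = fitControl)
  saveRDS(fit_rf, rf_path)
}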
caret::varImp(fit_rf)
## rf variable importance
##
##   only 20 most important variables shown (out of 52)
##
##                      Overall
## roll_belt            100.000
## pitch_forearm         59.848
## yaw_belt              52.405
## pitch_belt            43.008
## magnet_dumbbell_y     42.953
## magnet_dumbbell_z     42.696
## roll_forearm          41.300
## accel_dumbbell_y      20.603
## roll_dumbbell         17.468
## magnet_dumbbell_x     17.075
## accel_forearm_x       16.900
## magnet_belt_z         14.481
## accel_dumbbell_z      13.485
## accel_belt_z          13.249
## total_accel_dumbbell  13.077
## magnet_forearm_z      12.653
## magnet_belt_y         11.803
## yaw_arm               10.460
## gyros_belt_z           9.527
## yaw_dumbbell           8.549
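The importance table also has a plot method in caret; a one-line companion view (my addition, not in the original report):

# Dot plot of the 20 most important predictors
plot(caret::varImp(fit_rf), top = 20)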
pred_rf <- predict(fit_rf, validation)
confusionMatrix(pred_rf, validation$classe)
## Confusion Matrix and Statistics
##
##           Reference
## Prediction    A    B    C    D    E
##          A 1365    0    0    0    0
##          B    0  894    0    0    0
##          C    0    0  842    0    0
##          D    0    0    0  749    0
##          E    0    0    0    0  843
##
## Overall Statistics
##
##                Accuracy : 1
##                  95% CI : (0.9992, 1)
##     No Information Rate : 0.2909
##     P-Value [Acc > NIR] : < 2.2e-16
##
##                   Kappa : 1
##  Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            1.0000   1.0000   1.0000   1.0000   1.0000
## Specificity            1.0000   1.0000   1.0000   1.0000   1.0000
## Pos Pred Value         1.0000   1.0000   1.0000   1.0000   1.0000
## Neg Pred Value         1.0000   1.0000   1.0000   1.0000   1.0000
## Prevalence             0.2909   0.1905   0.1794   0.1596   0.1796
## Detection Rate         0.2909   0.1905   0.1794   0.1596   0.1796
## Detection Prevalence   0.2909   0.1905   0.1794   0.1596   0.1796
## Balanced Accuracy      1.0000   1.0000   1.0000   1.0000   1.0000
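Independent of the validation set, the underlying randomForest object carries an out-of-bag error estimate. A sketch for extracting it (assumes method = "rf", so finalModel is a randomForest fit):

# OOB error rate after the final tree has been grown
rf_final <- fit_rf$finalModel
rf_final$err.rate[rf_final$ntree, "OOB"]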
plot(fit_rf, main = "Cross-validation shows 27 predictors give maximum accuracy")

plot(fit_rf$finalModel, main = "Final model showing < 100 trees for best ROI (time vs. accuracy)")

Build boosted model, calculate the confusion matrix, and plot the fit.
# Boosted tree was trained in a previous run; load the saved model with readRDS
# fit_bt <- train(classe ~ ., method = "gbm", data = imputed_train, trControl = fitControl)
# saveRDS(fit_bt, "boostedtree.RData")
fit_bt <- readRDS("C:/Users/Srodger/Documents/R/coursera/ml/week 4 project/boostedtree.RData")
pred_bt <- predict(fit_bt, validation)
confusionMatrix(pred_bt, validation$classe)
## Confusion Matrix and Statistics
##
##           Reference
## Prediction    A    B    C    D    E
##          A 1348   18    0    0    1
##          B   11  857   17    2    5
##          C    4   19  815   25    5
##          D    2    0    9  719    5
##          E    0    0    1    3  827
##
## Overall Statistics
##
##                Accuracy : 0.9729
##                  95% CI : (0.9679, 0.9774)
##     No Information Rate : 0.2909
##     P-Value [Acc > NIR] : < 2.2e-16
##
##                   Kappa : 0.9657
##  Mcnemar's Test P-Value : 0.003126
##
## Statistics by Class:
##
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.9875   0.9586   0.9679   0.9599   0.9810
## Specificity            0.9943   0.9908   0.9862   0.9959   0.9990
## Pos Pred Value         0.9861   0.9608   0.9389   0.9782   0.9952
## Neg Pred Value         0.9949   0.9903   0.9929   0.9924   0.9959
## Prevalence             0.2909   0.1905   0.1794   0.1596   0.1796
## Detection Rate         0.2872   0.1826   0.1737   0.1532   0.1762
## Detection Prevalence   0.2913   0.1901   0.1850   0.1566   0.1771
## Balanced Accuracy      0.9909   0.9747   0.9771   0.9779   0.9900
plot(fit_bt, main = "Highest accuracy comes from a tree depth of 3 and >140 boosting iterations")

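If deeper trees or more iterations were worth exploring, train accepts an explicit tuning grid. A minimal sketch (the grid values are illustrative assumptions, not the settings used above):

# Hypothetical wider grid for gbm; n.minobsinnode kept at a common default
gbmGrid <- expand.grid(interaction.depth = c(3, 5, 7),
                       n.trees = seq(50, 250, by = 50),
                       shrinkage = 0.1,
                       n.minobsinnode = 10)
# fit_bt2 <- train(classe ~ ., method = "gbm", data = training,
#                  trControl = fitControl, tuneGrid = gbmGrid, verbose = FALSE)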
Compare all three models
# compare resampled performance of the three models
results <- resamples(list(DecisionTree=fit_dt, RandomForest=fit_rf, GBM=fit_bt))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: DecisionTree, RandomForest, GBM
## Number of resamples: 5
##
## Accuracy
##                   Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## DecisionTree 0.5044586 0.5050934 0.5053079 0.5107848 0.5178420 0.5212224    0
## RandomForest 0.9868365 0.9881154 0.9893888 0.9891306 0.9906542 0.9906582    0
## GBM          0.9575191 0.9592357 0.9609508 0.9603427 0.9609508 0.9630573    0
##
## Kappa
##                   Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## DecisionTree 0.3519600 0.3536704 0.3548824 0.3610780 0.3713551 0.3735221    0
## RandomForest 0.9833491 0.9849643 0.9865765 0.9862494 0.9881747 0.9881826    0
## GBM          0.9462552 0.9484353 0.9506041 0.9498335 0.9506220 0.9532509    0
dotplot(results)

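Since all three models share the same resampling scheme, caret can also test whether the performance differences are statistically meaningful (this check is my addition, not in the original report):

# Pairwise differences in resampled accuracy and kappa
diffs <- diff(results)
summary(diffs)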
The random forest model is selected for its highest accuracy and kappa. Predicting the test set using the random forest model:
# Apply the same imputation/preprocessing to the test set; train_pre_obj is the
# preProcess object fitted on the training data in an earlier step
testing <- predict(train_pre_obj, testing)
predF <- predict(fit_rf, testing)
predF
## [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E
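For the course quiz, each prediction is typically submitted as its own text file. A sketch (the file-naming convention is an assumption):

# Write one problem_id_N.txt file per prediction (hypothetical naming)
for (i in seq_along(predF)) {
  write.table(predF[i], file = paste0("problem_id_", i, ".txt"),
              quote = FALSE, row.names = FALSE, col.names = FALSE)
}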