Read in and clean data

This code reads in two CSV files and reduces the data frame to 53 columns (52 predictors plus the outcome) by removing columns that contain NAs and columns that are not in the test set. Lastly, the code turns classe into a factor and creates a validation data set from the training data.

# ---- Load the raw data ------------------------------------------------------
training <- read.csv("C:/Users/Srodger/Documents/R/coursera/ml/week 4 project/pml-training.csv")
testing  <- read.csv("C:/Users/Srodger/Documents/R/coursera/ml/week 4 project/pml-testing.csv")
# Fixed seed so that the train/validation partition below is reproducible.
set.seed(222)

# ---- Test set: keep only the fully observed sensor columns -----------------
# lapply() always yields one vector per column, whereas sapply() may simplify
# its result to a matrix or a plain vector depending on the input.
testing[, 7:159] <- lapply(testing[, 7:159], as.numeric)
testing <- testing %>% select(roll_belt:problem_id) # %>% sample_frac(.4)
# Drop every column that contains at least one NA.
testing <- testing %>% select_if(~ !any(is.na(.)))

# ---- Training set: same columns as the cleaned test set --------------------
training[, 7:159] <- lapply(training[, 7:159], as.numeric)
training <- training %>% select(roll_belt:classe) # %>% sample_frac(.4)
# Reuse the test-set column names, swapping the test-only `problem_id` for the
# outcome `classe`. Matching by name avoids depending on the column count.
cols <- colnames(testing)
cols[cols == "problem_id"] <- "classe"
training <- training[, cols, drop = FALSE]

training <- training %>% select_if(~ !any(is.na(.)))
training$classe <- as.factor(training$classe)

# ---- 60/40 train/validation split ------------------------------------------
inTrain <- createDataPartition(y = training$classe, p = 0.6, list = FALSE)
# The validation rows must be extracted BEFORE `training` is overwritten.
# Previously `training` was subset first, so `training[-inTrain, ]` indexed the
# already-reduced frame and every validation row was also a training row
# (data leakage, giving over-optimistic validation accuracy).
# NOTE: validation results rendered in this report must be regenerated.
validation <- training[-inTrain, ]
training <- training[inTrain, ]

This section of code sets the train control for the caret train function to use 5-fold cross-validation (this will be used for all models), enables parallel processing, and does not return the training data, to improve performance. Lastly, it imputes any missing values using k-nearest neighbours.

# Resampling set-up shared by every train() call in this analysis:
# 5-fold cross-validation, allowParallel switched on, and returnData switched
# off so the fitted objects do not carry a copy of the training data.
fitControl <- trainControl(
  method = "cv",
  number = 5,
  returnData = FALSE,
  allowParallel = TRUE
)

# Estimate the k-nearest-neighbour imputation on the training partition only,
# then apply that same fitted transformation to both partitions.
train_pre_obj <- preProcess(x = training, method = "knnImpute")
imputed_train <- predict(object = train_pre_obj, newdata = training)
validation <- predict(object = train_pre_obj, newdata = validation)

Build decision tree, display tree, and calculate confusion matrix and statistics.

# Decision tree (rpart) with the shared 5-fold cross-validation control.
# The tree is fitted on `imputed_train`, i.e. the training data after the
# `train_pre_obj` transformation, because `validation` has been passed through
# that same transformation (caret's knnImpute also centres and scales).
# Fitting on the untransformed `training` produced split thresholds on a
# different scale from `validation`, so every validation row was predicted as
# class A.
# NOTE: the confusion matrix rendered in this report must be regenerated.
fit_dt <- train(classe ~ ., method = "rpart", data = imputed_train, trControl = fitControl)
fancyRpartPlot(fit_dt$finalModel)

# Score the held-out validation set and compare with the true classes.
pred_dt <- predict(fit_dt, validation)
confusionMatrix(pred_dt, validation$classe)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1365  894  842  749  843
##          B    0    0    0    0    0
##          C    0    0    0    0    0
##          D    0    0    0    0    0
##          E    0    0    0    0    0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.2909          
##                  95% CI : (0.2779, 0.3041)
##     No Information Rate : 0.2909          
##     P-Value [Acc > NIR] : 0.5055          
##                                           
##                   Kappa : 0               
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            1.0000   0.0000   0.0000   0.0000   0.0000
## Specificity            0.0000   1.0000   1.0000   1.0000   1.0000
## Pos Pred Value         0.2909      NaN      NaN      NaN      NaN
## Neg Pred Value            NaN   0.8095   0.8206   0.8404   0.8204
## Prevalence             0.2909   0.1905   0.1794   0.1596   0.1796
## Detection Rate         0.2909   0.0000   0.0000   0.0000   0.0000
## Detection Prevalence   1.0000   0.0000   0.0000   0.0000   0.0000
## Balanced Accuracy      0.5000   0.5000   0.5000   0.5000   0.5000

Build Random Forest model, show most important variables, and calculate confusion matrix. Lastly plot fit and final model.

# The random forest was fitted once and cached with saveRDS(); the original
# fitting and caching calls are kept (commented out) for reference.
# fit_rf <- train(classe ~., method="rf", data=imputed_train, trControl = fitControl)
# saveRDS(fit_rf, "randf.RData")

# Restore the cached model and list its variable importance.
fit_rf <- readRDS(
  file = "C:/Users/Srodger/Documents/R/coursera/ml/week 4 project/randf.RData"
)
caret::varImp(object = fit_rf)
## rf variable importance
## 
##   only 20 most important variables shown (out of 52)
## 
##                      Overall
## roll_belt            100.000
## pitch_forearm         59.848
## yaw_belt              52.405
## pitch_belt            43.008
## magnet_dumbbell_y     42.953
## magnet_dumbbell_z     42.696
## roll_forearm          41.300
## accel_dumbbell_y      20.603
## roll_dumbbell         17.468
## magnet_dumbbell_x     17.075
## accel_forearm_x       16.900
## magnet_belt_z         14.481
## accel_dumbbell_z      13.485
## accel_belt_z          13.249
## total_accel_dumbbell  13.077
## magnet_forearm_z      12.653
## magnet_belt_y         11.803
## yaw_arm               10.460
## gyros_belt_z           9.527
## yaw_dumbbell           8.549
# Predict the validation set with the random forest and tabulate the
# predictions against the reference classes.
pred_rf <- predict(object = fit_rf, newdata = validation)
confusionMatrix(data = pred_rf, reference = validation$classe)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1365    0    0    0    0
##          B    0  894    0    0    0
##          C    0    0  842    0    0
##          D    0    0    0  749    0
##          E    0    0    0    0  843
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9992, 1)
##     No Information Rate : 0.2909     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            1.0000   1.0000   1.0000   1.0000   1.0000
## Specificity            1.0000   1.0000   1.0000   1.0000   1.0000
## Pos Pred Value         1.0000   1.0000   1.0000   1.0000   1.0000
## Neg Pred Value         1.0000   1.0000   1.0000   1.0000   1.0000
## Prevalence             0.2909   0.1905   0.1794   0.1596   0.1796
## Detection Rate         0.2909   0.1905   0.1794   0.1596   0.1796
## Detection Prevalence   0.2909   0.1905   0.1794   0.1596   0.1796
## Balanced Accuracy      1.0000   1.0000   1.0000   1.0000   1.0000
# Plot the caret fit object for the random forest.
plot(
  x = fit_rf,
  main = "Cross Validation shows 27 predictors gives max accuracy"
)

# Plot the final model stored inside the caret fit object.
plot(
  x = fit_rf[["finalModel"]],
  main = "Final Model showing < 100 trees for best ROI (time vs accuracy)"
)

Build boosted model, calculate confusion matrix, and plot fit.

# The boosted tree model was fitted once and cached with saveRDS(); the
# original fitting and caching calls are kept (commented out) for reference.
# fit_bt <- train(classe ~., method="gbm", data=imputed_train, trControl = fitControl)
# saveRDS(fit_bt, "boostedtree.RData")

# Restore the cached model.
fit_bt <- readRDS(
  file = "C:/Users/Srodger/Documents/R/coursera/ml/week 4 project/boostedtree.RData"
)

# Predict the validation set and tabulate the predictions against the
# reference classes.
pred_bt <- predict(object = fit_bt, newdata = validation)
confusionMatrix(data = pred_bt, reference = validation$classe)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1348   18    0    0    1
##          B   11  857   17    2    5
##          C    4   19  815   25    5
##          D    2    0    9  719    5
##          E    0    0    1    3  827
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9729          
##                  95% CI : (0.9679, 0.9774)
##     No Information Rate : 0.2909          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9657          
##  Mcnemar's Test P-Value : 0.003126        
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.9875   0.9586   0.9679   0.9599   0.9810
## Specificity            0.9943   0.9908   0.9862   0.9959   0.9990
## Pos Pred Value         0.9861   0.9608   0.9389   0.9782   0.9952
## Neg Pred Value         0.9949   0.9903   0.9929   0.9924   0.9959
## Prevalence             0.2909   0.1905   0.1794   0.1596   0.1796
## Detection Rate         0.2872   0.1826   0.1737   0.1532   0.1762
## Detection Prevalence   0.2913   0.1901   0.1850   0.1566   0.1771
## Balanced Accuracy      0.9909   0.9747   0.9771   0.9779   0.9900
# Plot the caret fit object for the boosted model.
plot(
  x = fit_bt,
  main = " Highest Accuracy comes from a tree depth of 3 and >140 boosting iterations"
)

Compare all three models

# Collect the cross-validation resamples of the three fitted models and
# summarise them side by side.
results <- resamples(
  x = list(
    DecisionTree = fit_dt,
    RandomForest = fit_rf,
    GBM = fit_bt
  )
)
summary(object = results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: DecisionTree, RandomForest, GBM 
## Number of resamples: 5 
## 
## Accuracy 
##                   Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## DecisionTree 0.5044586 0.5050934 0.5053079 0.5107848 0.5178420 0.5212224
## RandomForest 0.9868365 0.9881154 0.9893888 0.9891306 0.9906542 0.9906582
## GBM          0.9575191 0.9592357 0.9609508 0.9603427 0.9609508 0.9630573
##              NA's
## DecisionTree    0
## RandomForest    0
## GBM             0
## 
## Kappa 
##                   Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## DecisionTree 0.3519600 0.3536704 0.3548824 0.3610780 0.3713551 0.3735221
## RandomForest 0.9833491 0.9849643 0.9865765 0.9862494 0.9881747 0.9881826
## GBM          0.9462552 0.9484353 0.9506041 0.9498335 0.9506220 0.9532509
##              NA's
## DecisionTree    0
## RandomForest    0
## GBM             0
# Dot plot of the resampling results for the three models.
dotplot(x = results)

Random Forest model is selected for highest accuracy and kappa. Predicting test set data using random forest model created:

# Apply the transformation fitted on the training data to the test set, so it
# is pre-processed the same way as the data the model was trained on.
testing <- predict(object = train_pre_obj, newdata = testing)

# Final predictions for the test cases from the selected random forest.
predF <- predict(object = fit_rf, newdata = testing)
predF
##  [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E