Code Appendix

This is the code appendix attached to the report Predicting Student Success in a MOOC

library(dplyr)
library(maps)
library(maptools)
library(countrycode)
library(glmnet)
library(nnet)
library(caret)
library(gbm)
library(e1071)
library(randomForest)
library(pROC)

Each of the files are drawn from each run of the course and then compiled later with an additonal variable that describes which run the datum came from.

stats <- read.csv("~/Documents/Entrepreneurship101/Run1/Entrepreneurship_101_1st_run%2Fshow_answer_stats_by_user.csv")
users <- read.csv("~/Documents/Entrepreneurship101/Run1/Entrepreneurship_101_1st_run%2Fperson_course.csv", stringsAsFactors = FALSE)
staff <- read.csv("~/Documents/Entrepreneurship101/Run1/Entrepreneurship_101_1st_run%2Fuser_info_combo.csv")
problems <- read.csv("~/Documents/Entrepreneurship101/Run1/Entrepreneurship_101_1st_run%2Fperson_problem.csv")

stats2 <- read.csv("~/Documents/Entrepreneurship101/Run2/Entrepreneurship_101_2nd_run%2Fshow_answer_stats_by_user.csv")
users2 <- read.csv("~/Documents/Entrepreneurship101/Run2/Entrepreneurship_101_2nd_run%2Fperson_course.csv", stringsAsFactors = FALSE)
staff2 <- read.csv("~/Documents/Entrepreneurship101/Run2/Entrepreneurship_101_2nd_run%2Fuser_info_combo.csv")
problems2 <- read.csv("~/Documents/Entrepreneurship101/Run2/Entrepreneurship_101_2nd_run%2Fperson_problem.csv")

stats3 <- read.csv("~/Documents/Entrepreneurship101/Run3/Entrepreneurship_101_3rd_run%2Fshow_answer_stats_by_user.csv")
users3 <- read.csv("~/Documents/Entrepreneurship101/Run3/Entrepreneurship_101_3rd_run%2Fperson_course.csv", stringsAsFactors = FALSE)
staff3 <- read.csv("~/Documents/Entrepreneurship101/Run3/Entrepreneurship_101_3rd_run%2Fuser_info_combo.csv")
problems3 <- read.csv("~/Documents/Entrepreneurship101/Run3/Entrepreneurship_101_3rd_run%2Fperson_problem.csv")

Once the data are loaded we want to narrow down some predictors. Each of these predictors are selected alongside of the user_id variable so that we can utilize dplyr’s left_join() function to join by user id. The stats dataframe describes the number of problems that the student attempted in the course. The users dataframe describes; whether or not a student viewed the course, whether or not the student became certified, the student’s country code based on their ip address, their latitude and longitude, their year of birth, gender, grade in the course, their time they began the course, the last time they accessed the course, their video plays count, whether they were honor or verified track, and their number of forum posts. I also altered a variable to show their age instead of their year of birth. This was done by taking the computer outputted current year and subtracting the student’s year of birth: mutate(age = as.integer(format(Sys.Date(), "%Y")) - YoB). mutate() will create that new variable and then I just deselect the YoB varaible using, dplyr::select(-YoB).

Some of the user accounts were staff members so I used one of the datasets to track which ones were staff members and then removed those accounts before the analysis.

The problems dataframe was my experiment in checking the importance of whether or not a student completed the first homework assignment. This was done simply by taking the data of all homework assignments completed and just grabbing the first one, indexprob <- which(problems$problem_nid == 1), then selecting the accounts where the student completed that assignment, staff <- staff %>% dplyr::select(user_id, is_staff), and finally assigning this new boolean variable eager that shows whether or not the first assignment was completed for every student.

The students that did not access the forum at all were given NA values. I just assume here that this is equivalent to accessing the forum 0 times and replace all of the NAs with 0’s. The same is true for n_attempted, if the student never viewed the course they received an NA as the number of problems attempted, but this is equivalent to saying they attempted 0 problems.

Finally, I clean up some missing value data points and join all of the dataframes together into one, StudentStats.

xFormData <- function(dfStats, dfUsers, dfStaff, dfProblems) {
# Function to clean data into one data frame for analysis
# All of the variables with 'roles' are just NA
# Viewed: Boolean flag, indicating that the user visited the course at least once
# Certified: Boolean flag, indicating that the user earned a certificate in the course
# -------------------------------------------------------------------------------------
stats <- dfStats %>% dplyr::select(user_id, n_attempted, n_perfect)
users <- dfUsers %>% dplyr::select(user_id, viewed, certified, 
                                   cc_by_ip, latitude, longitude, YoB, 
                                   gender, grade, start_time, last_event, 
                                   nplay_video, mode, nforum_posts) %>% 
  mutate(age = as.integer(format(Sys.Date(), "%Y")) - YoB) %>% 
  dplyr::select(-YoB)
staff <- staff %>% dplyr::select(user_id, is_staff)
staff$is_staff <- as.factor(staff$is_staff)

problems <- dfProblems %>% select(user_id, problem_nid, problem_raw_score)
indexprob <- which(problems$problem_nid == 1)
problems <- problems[indexprob,]
problems <- problems %>% mutate(eager = problem_raw_score == 1)
problems <- problems %>% select(user_id, eager)

users$viewed <- as.factor(users$viewed)
users$certified <- as.factor(users$certified)
# Those students that weren't in the forum data did not access 
# the forum so set nforum = 0
users$nforum_posts[which(is.na(users$nforum_posts))] <- 0
users$gender[users$gender == "o"] <- NA
users$gender[users$gender == ""] <- NA
users$gender[users$gender == "null"] <- NA
users$gender <- as.factor(users$gender)
unique(users$gender)
users$mode <- as.factor(users$mode)
StudentStats <- left_join(users, stats, by = "user_id")
StudentStats <- left_join(StudentStats, staff, by = "user_id")
StudentStats <- left_join(StudentStats, problems, by = "user_id")
staffindex <- which(StudentStats$is_staff == 1)
StudentStats <- StudentStats[-staffindex,]
# The NAs mean the student never viewed the course
StudentStats$n_attempted[is.na(StudentStats$n_attempted)] <- 0 
StudentStats$n_perfect[is.na(StudentStats$n_perfect)] <- 0
return(StudentStats)
}

Now we can run this previous function on each run’s dataframes and we will get 3 dataframes, 1 for each run. Then we simply join those dataframes by a quick rowbind and create another variable that describes which run those data came from.

studentstats1 <- xFormData(stats, users, staff, problems) %>% mutate(Run = "run1")
studentstats2 <- xFormData(stats2, users2, staff2, problems2) %>% mutate(Run = "run2")
studentstats3 <- xFormData(stats3, users3, staff3, problems3) %>% mutate(Run = "run3")
StudentStats <- rbind(studentstats1, studentstats2, studentstats3)
StudentStats$Run <- as.factor(StudentStats$Run)
StudentStats$eager[which(is.na(StudentStats$eager))] <- FALSE
StudentStats$nplay_video[which(is.na(StudentStats$nplay_video))] <- 0
StudentStats$eager <- as.factor(StudentStats$eager)
StudentStats$certified <- as.factor(StudentStats$certified)

Since we are performing predictions I subset the data into a trianing and a test set. I take the first 100,000 rows for the training set and leave the rest for the test set. This way when I go to check how the models are performing I don’t have a skewed sense of how well it’s done. Recall we use the AUC as our metric.

# How many people scored above 60% but did not receive a certificate?
summary(StudentStats[StudentStats$grade > .6,]$certified)
set.seed(890522)
o<-order(runif(dim(StudentStats)[1]))
stats.train <- StudentStats[o[1:100000],]
stats.pred <- StudentStats[o[100001:dim(StudentStats)[1]],]
stats.train$cc_by_ip <- as.factor(stats.train$cc_by_ip)
levels(stats.train$certified) <- c(0, 1)

I separated the character variables out from the numerical ones to take a closer look at the times (which are currently just character vectors).

train_numr = StudentStats[, sapply(StudentStats, is.numeric)]
train_char = StudentStats[, sapply(StudentStats, is.character)]
cat("Numerical column count : ", dim(train_numr)[2], 
    "; Character column count : ", dim(train_char)[2])

Now I just grep for the dates and isolate them into two different vectors. One is the hour of the day pertaining to the student’s start_time and last_event, and the other is simply the date and time of both variables.

train_date = train_char[,grep("2015-01-01", train_char),]
train_char = train_char[, !colnames(train_char) %in% colnames(train_date)]
train_date = sapply(train_date, function(x) strptime(x, "%Y-%m-%d %H:%M:%S"))
train_date = do.call(cbind.data.frame, train_date)

train_time = train_date[,colnames(train_date) %in% c("start_time", "last_event")]
train_time = data.frame(sapply(train_time, function(x) strftime(x, "%H:%M:%S")))
train_hour = as.data.frame(sapply(train_time, 
                                  function(x) as.numeric(as.character(substr( x ,1, 2)))))

This displays a histogram of the hour of the data these actions were logged. It does seem natural that there is a mode during the (after-work hours, or after-school day). The high number of actions at midnight I believe is either ill-founded data inputting or could be a mistake in my own parsing of the data.

par(mfrow=c(1,2))
for(i in 1:2) hist(train_hour[,i], main = paste(colnames(train_hour)[i], 
                                                "hourly"), breaks = c(0:24), 
                   xlab="", ylab="")

I wanted to take a look at where the students were coming from in the world so I grabbed all of the country codes from the cc_by_ip variable and matched them up with a dictionary of country names and country codes I found in an R package called countrycode. The purpose of translating the country codes to country names is that I want to make a heat map of all the students’ locations but the method for rendering the heat map only takes country names.

After translating the country codes to country names I realize that some of the country names from this dictionary do not match the country names from the dictionary used by the heat map package. So I have to clean up some of these names to make them match.

cc <- na.omit(stats.train$cc_by_ip)
cc[cc == ""] <- NA
cc <- na.omit(cc)
ccnames <- countrycode(cc,"iso2c","country.name")
ccnames <- as.character(ccnames)

ccnames[ccnames == "United States"] <- "USA"
ccnames[ccnames == "Russian Federation"] <- "Russia"
ccnames[ccnames == "Venezuela, Bolivarian Republic of"] <- "Venezuela"
ccnames[ccnames == "Taiwan, Province of China"] <- "Taiwan"
ccnames[ccnames == "Korea, Republic of"] <- "South Korea"
ccnames[ccnames == "Viet Nam"] <- "Vietnam"

ccnames <- as.data.frame(ccnames)
colnames(ccnames) <- "cc_by_ip"

Now that I have my list of country names I just render a heat map using the code block below.

mapWORLD <- map('world', fill=TRUE, plot=FALSE)
nms <- sapply(strsplit(mapWORLD$names,  ':'),  function(x)x[1])
WORLDpolygons <- map2SpatialPolygons(mapWORLD,  IDs = nms,  CRS('+proj=longlat'))

mapCountries = function(df, feat){
  dat = data.frame(table(df[,feat]))
  names(dat) = c("country", "value")
  
  idx <- match(unique(nms),  dat$country)
  dat2 <- data.frame(value = dat$value[idx], country = unique(nms))
  row.names(dat2) <- unique(nms) 
  WORLDsp <- SpatialPolygonsDataFrame(WORLDpolygons,  data = dat2)
  spplot(WORLDsp['value'], main=paste(feat, "frequency"), col.regions=rev(heat.colors(21)))
}
mapCountries(ccnames, "cc_by_ip")

Narrow down predictors on the training dataset and the test dataset.

# classes <- class.ind(stats.train$certified)
stats.train.subset <- stats.train %>% select(certified, cc_by_ip, 
                                             age, gender, nplay_video, 
                                             nforum_posts, n_attempted, eager, Run)
stats.pred.subset <- stats.pred %>% select(certified, cc_by_ip, 
                                           age, gender, nplay_video, 
                                           nforum_posts, n_attempted, eager, Run)

stats.train.subset <- na.omit(stats.train.subset)
stats.pred.subset <- na.omit(stats.pred.subset)

Use tuning with 10-fold cross validation to determine optimal hyperparamters based on the ROC metric.

nnetGrid <- expand.grid(.size = 1:10, .decay = c(0, .1, 1, 2))
maxSize <- max(nnetGrid$.size)
numWts <- 1*(maxSize * (length(colnames(stats.train.subset)) + 1) + maxSize + 1)
set.seed(450)
ctrl <- trainControl(method = "cv", number = 10, classProbs = TRUE)
# only use certain predictors described in models later `stats.train.subset[,3:8]`
nnetFit <- train(x = stats.train.subset[,3:8], y = stats.train.subset[,1],
                 method = "nnet",
                 metric = "ROC",
                 preProcess = c("center", "scale", "spatialSign"),
                 tuneGrid = nnetGrid,
                 trace = FALSE,
                 maxit = 2000,
                 MaxNWts = numWts,
                 trControl = ctrl,
                 verbose = TRUE)
# > nnetFit
# Neural Network 
# 
# 86261 samples
#     6 predictor
#     2 classes: 'No', 'Yes' 
# 
# Pre-processing: centered (4), scaled (4), spatial sign transformation (4), ignore (2) 
# Resampling: Cross-Validated (10 fold) 
# Summary of sample sizes: 77636, 77635, 77635, 77634, 77635, 77635, ... 
# Resampling results across tuning parameters:
# ...
# Accuracy was used to select the optimal model using  the largest value.
# The final values used for the model were size = 7 and decay = 0.1

Run the neural net with the tuned hyperparameters. Then check the predictions using the AUC metric.

nn <- nnet(certified ~ age + gender + nplay_video + nforum_posts + eager, 
           data = stats.train.subset, 
           size = 7, 
           decay = .4) # decay slightly changed from tuning due to improved results

nnpredictions <- predict(nn, newdata = stats.pred.subset, type = "class")
table(NNpredictions, stats.pred.subset$certified)

nnpredictions <- as.factor(nnpredictions)
levels(nnpredictions) <- c(0,1)
nnpredictions <- as.numeric(as.vector(nnpredictions))
nnroc <- roc(stats.pred.subset$certified, nnpredictions)
auc(nnroc)

train.svm <- svm(as.factor(certified) ~ age + gender + nplay_video + nforum_posts + eager, 
                 type='C-classification', 
                 kernel='radial', 
                 cachesize=2000, 
                 probability=TRUE, 
                 cost=1, 
                 data = stats.train.subset)

svmpredictions <- predict(train.svm, stats.pred.subset)
table(svmpredictions, stats.pred.subset$certified)

svmpredictions <- as.factor(svmpredictions)
levels(svmpredictions) <- c(0,1)
svmpredictions <- as.numeric(as.vector(svmpredictions))
svmroc <- roc(stats.pred.subset$certified, svmpredictions)
auc(svmroc)

train.Forest <- randomForest(formula = as.factor(certified) ~ age + gender + 
                               nplay_video + nforum_posts + eager, 
                             data = stats.train.subset, 
                             ntree = 2000, 
                             importance = TRUE, 
                             na.action=na.omit)

rfpredictions <- predict(train.Forest, stats.pred.subset)
table(rfpredictions, stats.pred.subset$certified)

rfpredictions <- as.factor(rfpredictions)
levels(rfpredictions) <- c(0,1)
rfpredictions <- as.numeric(as.vector(rfpredictions))
rfroc <- roc(stats.pred.subset$certified, rfpredictions)
auc(rfroc)

Build an ensemble of the three previous models and predict each observation using a voting tactic where if two or more models predict 0, the result is 0, and vice-versa. This did not prove as strong as the SVM on its own.

preds <- data.frame(nnpredictions, svmpredictions, rfpredictions)
preds$jointpred <- round((nnpredictions + svmpredictions + rfpredictions)/3)

jointpredictions <- as.numeric(as.vector(preds$jointpred))
jointroc <- roc(stats.pred.subset$certified, jointpredictions)
auc(jointroc)

Mike Chaykowsky

Code Appendix

You might also enjoy (View all posts)