---
title: "Physics Playground Data"
output: html_notebook
---
# Loading and Preparing the data
```{r setup}
## One-time package installs; uncomment as needed.
## (Typos fixed: these previously read "yinstall.package"/"install.package".)
#install.packages("psych")
#install.packages("googlesheets4")
#install.packages("CPTtools",repos="https://ralmond.r-universe.dev")
library(CPTtools)
library(tidyverse)
library(rpart)
library(psych)
library(googlesheets4)
## Silence the per-group messages from dplyr::summarise().
options(dplyr.summarise.inform = FALSE)
## Google Sheet holding the level-by-level gameplay data.
path <- "https://docs.google.com/spreadsheets/d/1gjtaXUxPFF_IMeT4rm1mPxlcFDNUC1u7cX-qPrUqtK0/edit#gid=1994010518"
library(plotly)
library(rpart.plot)
#install.packages("e1071")
library(caret)
library(ggplot2)
#library(GGally)
```
```{r read}
## Pull the level data from the Google Sheet; literal "NA" strings become NA.
## NOTE(review): read_sheet() may prompt for Google authentication on first run.
ppLevel <- read_sheet(path,na="NA")
```
```{r summary}
## First look at the raw columns before any type coercion.
summary(ppLevel)
```
## Cleaning
```{r munge variable types, warning=FALSE}
## Coerce columns to their proper types (read_sheet leaves many as
## character/list).  warning=FALSE suppresses parse warnings from as.Date.
ppLevel$timestamp <- as.Date(ppLevel$timestamp)
## Plain (unordered) categorical variables.
for (v in c("source", "condition", "full_concept", "concept",
            "level", "level_type")) {
  ppLevel[[v]] <- as.factor(ppLevel[[v]])
}
## A missing badge means no badge was earned on that level.
ppLevel$level_badge[is.na(ppLevel$level_badge)] <- "none"
## Badge is ordinal: none < silver < gold.  (`levels=` spelled out; the
## original relied on partial matching of `level=`.)
ppLevel$level_badge <- ordered(ppLevel$level_badge,
                               levels = c("none", "silver", "gold"))
## NOTE(review): as.Date() drops the time-of-day from these timestamps;
## use as.POSIXct() if sub-day resolution is ever needed.
ppLevel$ls_pa_start <- as.Date(ppLevel$ls_pa_start,format="%Y-%m-%dT%H:%M:%S")
ppLevel$ls_pa_end <- as.Date(ppLevel$ls_pa_end,format="%Y-%m-%dT%H:%M:%S")
```
```{r resummerize}
## Re-summarize after coercion to confirm the factor/date conversions took.
summary(ppLevel)
```
Set up names for groups of variables that share common characteristics.
```{r variable groups}
## Administrative/identifier columns (not used as predictors).
id_vars <- c("row_id","timestamp","uid","source","consent","assent")
## The outcome to be predicted.
target_var <- c("level_badge")
## Categorical predictors.
factor_vars <- c("condition","full_concept","concept","level_revisit_tag")
## Pre/post assessment scores (near/far transfer), already rescaled.
scaled_score_vars <- c("pre_near_by_concept", "pre_far_by_concept", "post_near_by_concept", "post_far_by_concept", "pre_near_by_full_concept",
"pre_far_by_full_concept", "post_near_by_full_concept",
"post_far_by_full_concept")
## Difficulty ratings (rescaled to [0,1] later, in the workingData chunk).
difficulties <- c("comp_difficulty","game_mechanic_difficulty","physics_understanding_difficulty")
## Time-on-task measures, in seconds.
durations <- c("level_duration","ls_pa_duration")
## Learning-support event counts (physics-agent uses, hints, tools).
ls_vars <- c("ls_n_PA_spgbrd","ls_n_PA_lever","ls_n_PA_ramp", "ls_n_PA_pendulum","ls_n_hint","ls_n_game_tools","level_ls_total")
## Counts of game objects placed, by type.
ob_count_vars <- c("ob_id_ramp","ob_id_lever","ob_id_spgbrd", "ob_id_pendulum", "ob_id_weight", "ob_comp_freeform", "ob_comp_pin")
```
# Exploratory Data Analysis
## Working Data
Here I'm removing cases with extremely large durations and ones
with missing data. I'm also scaling the difficulties by their maximum values.
```{r workingData}
## Drop extreme/invalid durations and any row missing a pre/post score,
## then keep only the modeling variables in a readable order:
## target, duration, scores, difficulties, factors, learning-support
## counts, object counts, level.
ppComplete <- ppLevel %>%
  filter(level_duration < 3500 & is.finite(level_duration)) %>%
  ## One if_all() filter replaces eight copy-pasted is.na() filters.
  filter(if_all(all_of(scaled_score_vars), ~ !is.na(.x))) %>%
  select(-row_id, -timestamp, -uid, -consent, -assent,
         -level_wallet_balance, -ls_pa_start, -ls_pa_end) %>%
  select(all_of(target_var), all_of(durations), -ls_pa_duration,
         all_of(scaled_score_vars), all_of(difficulties),
         all_of(factor_vars), all_of(ls_vars),
         all_of(ob_count_vars), level)
## Var Groups: target_var, factor_vars, scaled_score_vars,
## difficulties, durations, ls_vars, ob_count_vars
## Factor variables, we need to do one-hot encoding on each of these.
## (Kept for reference; not currently used.)
# for (v in factor_vars) {
#   ## as.numeric() -1 as keras is zero-base and R is one-based.
#   hotCode <- to_categorical(as.numeric(pull(ppLevel,v))-1)
#   colnames(hotCode) <- paste0(v,1:ncol(hotCode))
#   ppWorking <- cbind(ppWorking,hotCode)
# }
## Difficulties -- rescale to [0,1] by each item's maximum possible value
## (comp runs 0-8, the other two 0-4).
ppComplete <- ppComplete %>%
  mutate(comp_difficulty = comp_difficulty / 8,
         game_mechanic_difficulty = game_mechanic_difficulty / 4,
         physics_understanding_difficulty = physics_understanding_difficulty / 4)
##ob_count_vars
## zero flags (kept for reference; not currently used)
#for (v in ob_count_vars) {
#  vv <- sub("(id|comp)","flag",v)
#  ppWorking[[vv]] <- pull(ppLevel,v) > 0
#}
# for (v in ob_count_vars) {
#   vals <- pull(ppLevel,v)
#   cutpoints <- c(-1,unique(quantile(vals,c(0,.25,.5,.75,1))))
#   onehot <- to_categorical(as.numeric(cut(vals,cutpoints))-1)
#   colnames(onehot) <- paste0(v,1:ncol(onehot))
#   ppWorking <- cbind(ppWorking,onehot)
# }
summary(ppComplete)
```
# Exploratory Plots
## Duration
```{r first plot}
## Histograms of time spent in a level, raw and log scale.
## (ppComplete is already filtered to < 3500 s; the filter below is a
## harmless repeat.)  Axis-label typo "level_duation" fixed.
ppComplete %>%
  filter(level_duration < 3500) %>%
  ggplot(aes(x = (level_duration))) +
  geom_histogram() +
  labs(
    title = "Duration spent in level (filtered).",
    x = "level_duration (in seconds)"
  )
ppComplete %>%
  ggplot(aes(x = log(level_duration))) +
  geom_histogram() +
  labs(
    title = "Duration spent in level (filtered).",
    x = "level_duration (log; in seconds)"
  )
```
```{r facet by trophy}
## Log-duration histograms, one facet row per badge level.
## Axis-label typo "level_duation" fixed.
ppComplete %>%
  ggplot(aes(x = log(level_duration))) +
  geom_histogram() +
  labs(
    title = "Duration spent in level (filtered).",
    x = "level_duration (log; in seconds)"
  ) + facet_grid(rows = vars(level_badge))
```
```{r violin by trophy}
## Violin plot of log duration by badge.  The duration label was
## previously attached to the x axis, but duration is on y; labels fixed
## (and typo "level_duation" corrected).
ppComplete %>%
  ggplot(aes(x = level_badge, y = log(level_duration))) +
  geom_violin() +
  labs(
    title = "Duration spent in level (filtered).",
    x = "level_badge",
    y = "level_duration (log; in seconds)"
  )
```
```{r violin by level}
## Violin plot of log duration by game level.  As above, the duration
## label belongs on the y axis, not x; labels fixed.
ppComplete %>%
  ggplot(aes(x = level, y = log(level_duration))) +
  geom_violin() +
  labs(
    title = "Duration spent in level (filtered).",
    x = "level",
    y = "level_duration (log; in seconds)"
  )
```
```{r violin by all}
## Plot log duration against every column: scatter + smoother for
## numeric columns, violin for factors.  Columns that are neither are
## now skipped explicitly; previously the stale plot from the prior
## iteration would have been reprinted for such columns.
for (v in colnames(ppComplete)) {
  if (is.numeric(pull(ppComplete, v))) {
    P <- ppComplete %>%
      ggplot(aes(x = .data[[v]], y = log(level_duration))) +
      geom_point() + geom_smooth()
  } else if (is.factor(pull(ppComplete, v))) {
    P <- ppComplete %>%
      ggplot(aes(x = .data[[v]], y = log(level_duration))) +
      geom_violin()
  } else {
    next  # neither numeric nor factor: nothing sensible to plot
  }
  print(P + labs(
    title = paste("Duration spent in level by", v),
    x = v,
    y = "level_duration (log; in seconds)"
  ))
}
```
## Object Counts
```{r histograms}
## Histogram of each object-count variable (on the cleaned data).
for (var in ob_count_vars) {
  p <- ggplot(ppComplete, aes(x = .data[[var]])) +
    geom_histogram()
  print(p)
}
## Quartile summaries of the same counts, on the full (uncleaned) data.
for (var in ob_count_vars) {
  qs <- round(quantile(pull(ppLevel, var), c(0, .25, .5, .75, 1)), 0)
  cat(var, "=", paste(qs, collapse = " "), "\n")
}
```
## Pair plots
```{r pair plots}
## Base-graphics scatterplot matrix of duration, level, and the three
## difficulty ratings.
pairs(select(ppComplete,
             level_duration, level, comp_difficulty,
             physics_understanding_difficulty, game_mechanic_difficulty))
```
## Correlation Matrix
```{r corr}
## Correlate log duration with each variable group.  Non-positive
## durations (log undefined) become NA and are handled pairwise.
## The four duplicated cor() calls are replaced by one loop; inside a
## loop the result must be print()ed explicitly to appear in the output.
ppLev1 <- ppComplete %>%
  mutate(log_duration = ifelse(level_duration > 0, log(level_duration), NA))
for (vgroup in list(scaled_score_vars, ob_count_vars, difficulties, ls_vars)) {
  print(cor(pull(ppLev1, log_duration),
            select(ppLev1, all_of(vgroup)),
            use = "pairwise.complete.obs"))
}
```
```{r prepostcors}
ppComplete %>% select(all_of(scaled_score_vars)) %>% cor(use="complete.obs")
```
# Build a Neural Net
```{r load libraries}
## Deep-learning stack.  (tidyverse is already attached in setup; it is
## repeated here so this section can be run on its own.)
## Chunk-label typo "libraryies" fixed.
library(tidyverse)
library(keras)
library(tfruns)
library(tfdatasets)
library(rsample)
```
```{r traintestsplit}
## Keep the target plus the predictors used by the net, then split
## roughly 64/16/20 (train/validation/test) via two nested 80/20 splits.
## NOTE(review): there is no set.seed() call, so the splits (and
## everything downstream) change on each run -- consider seeding for
## reproducibility.
ppWorking <- ppComplete %>%
select(level_badge,level_duration,
all_of(difficulties),
all_of(scaled_score_vars))
testsplit <- initial_split(ppWorking, prop= 4/5)
ppTest <- testing(testsplit)
ppTrain <- training(testsplit)
valsplit <- initial_split(ppTrain, prop= 4/5)
ppVal <- testing(valsplit)
ppTrain <- training(valsplit)
## Report the three sample sizes as a sanity check.
nrow(ppTrain)
nrow(ppVal)
nrow(ppTest)
```
One-hot encoding for the categorical target. (This is not used below, but is kept here for reference.)
```{r encode categorical variables.}
## One-hot encode the badge for each split.  as.numeric() on the ordered
## factor gives 1..3; keras/TF indexing is zero-based, hence the -1.
ppTestBadge <- to_categorical(as.numeric(pull(ppTest,level_badge))-1)
ppTrainBadge <- to_categorical(as.numeric(pull(ppTrain,level_badge))-1)
ppValBadge <- to_categorical(as.numeric(pull(ppVal,level_badge))-1)
```
## Specify model
Convert to tensorflow data.
```{r tensorize}
## Convert a data frame into a shuffled, batched TF dataset.
## level_badge (ordered factor none < silver < gold -> integers 1..3)
## becomes a one-hot target.  tf$one_hot() is zero-based, so the index
## must be shifted down by 1L: without the shift, "gold" (index 3, out
## of range for depth 3) silently encodes as an all-zero vector and
## "none" (column 0) is never used.
df_to_dataset <- function(df, shuffle = TRUE, batch_size = 32) {
  ds <- df %>% mutate(level_badge = as.integer(level_badge)) %>%
    tensor_slices_dataset()
  if (shuffle)
    ds <- ds %>% dataset_shuffle(buffer_size = nrow(df))
  ds %>% dataset_map(function(record) {
    record$level_badge <- tf$one_hot(record$level_badge - 1L, 3L)
    record}) %>%
    dataset_batch(batch_size = batch_size)
}
batch_size <- 50
train_ds <- df_to_dataset(ppTrain, batch_size = batch_size)
## Validation and test sets are not shuffled so rows stay aligned with
## ppVal/ppTest for later scoring.
val_ds <- df_to_dataset(ppVal, shuffle = FALSE, batch_size = batch_size)
test_ds <- df_to_dataset(ppTest, shuffle = FALSE, batch_size = batch_size)
```
```{r check the process, eval=FALSE}
## Peek at one batch to confirm the dataset structure (not run by default).
train_ds %>%
reticulate::as_iterator() %>%
reticulate::iter_next() %>%
str()
```
```{r preprocessing step}
## Declare how each feature feeds the network: numeric columns are
## standardized (mean 0, sd 1); nominal (factor) columns become
## categorical vocabulary columns.  Commented steps are alternatives
## that can be toggled on.
spec <- feature_spec(train_ds, level_badge ~ .) %>%
#step_bucketized_column(level_ls_total,boundaries=c(1)) %>%
step_numeric_column(level_duration, all_of(scaled_score_vars),
normalizer_fn=scaler_standard()) %>%
step_categorical_column_with_vocabulary_list(all_nominal()) %>%
#step_indicator_column(all_nominal())
## identity() terminates the pipe so the steps above can be commented
## in or out freely.
identity()
## Compute normalization statistics and vocabularies from the data.
spec <- fit(spec)
```
```{r ppmodel}
## Feed-forward classifier: spec-driven input features -> 16 relu units
## -> 3-way softmax over the badge categories.
ppmodel <- keras_model_sequential()
ppmodel %>%
  layer_dense_features(dense_features(spec)) %>%
  layer_dense(units = 16, activation = "relu") %>%
  layer_dense(units = 3, activation = "softmax")
## Keras models are modified in place, so compiling in a separate step
## is equivalent to chaining compile() onto the layer pipe.
ppmodel %>% compile(
  loss = 'categorical_crossentropy',
  optimizer = 'adam',
  metrics = c('accuracy')
)
```
```{r train1}
## Train for 15 epochs.  The labels travel inside the TF dataset, so no
## separate y may be supplied; the stray ppTrainBadge argument (invalid
## when x is a dataset) has been removed.
ppmodel %>% fit(dataset_use_spec(train_ds, spec = spec),
                epochs = 15,
                validation_data = dataset_use_spec(val_ds, spec),
                verbose = 2)
```
```{r evaluate1}
## Loss and accuracy on the held-out test set.
score1 <- ppmodel %>%
evaluate(dataset_use_spec(test_ds,spec), verbose = 0)
round(score1,3)
```
```{r predict1}
## Per-row predicted class probabilities on the test set.
## NOTE(review): columns presumably align with badge levels
## none/silver/gold -- verify against the one-hot encoding upstream.
pred1 <- ppmodel %>% predict(dataset_use_spec(test_ds,spec))
head(pred1)
```
## Discrete Predictions
```{r modalPredictions}
## Decode each softmax row to its modal badge.  which.max() is already
## 1-based, matching levels 1:3 directly; the previous "- 1" shift made
## every "none" prediction decode to NA (0 is not in 1:3) and moved the
## other categories down one.
predMode <- ordered(apply(pred1, 1, which.max),
                    levels = 1:3, labels = c("none", "silver", "gold"))
summary(predMode)
```
```{r confusionMatrix}
## Cross-tabulate true badge (rows) against modal prediction (columns),
## then summarize agreement: kappa corrects for chance agreement, lambda
## measures improvement over always guessing the modal class.
pred1CM <- table(ppTest$level_badge,predMode)
pred1CM
cat("Cohen's kappa is ",CPTtools::fcKappa(pred1CM),"\n")
cat("Goodman & Kruskal's lambda is ",CPTtools::gkLambda(pred1CM),"\n")
```
## Probabilistic Predictions
```{r scoreFunction}
## Probability the model assigned to the badge actually earned, pulled
## out with matrix indexing (row i, column = true class of row i).
## Replaces sapply(1:nrow(...)), which is slower and breaks on 0 rows.
probtrue <- pred1[cbind(seq_len(nrow(pred1)),
                        as.integer(ppTest$level_badge))]
head(probtrue)
## Total log-loss (negative log-likelihood) over the test set.
score <- -log(probtrue)
sum(score)
```
The _expected confusion matrix_ is like the regular confusion matrix, except that each row accumulates the predicted probabilities rather than a hard 0/1 vote for the modal class.
```{r ExpectedConfusionMatrix}
## Build the expected confusion matrix: row = true badge, and each row
## accumulates the full predicted-probability vector for its cases.
ppredCM1 <- matrix(0, 3, 3, dimnames = list(levels(ppTest$level_badge),
                                            levels(ppTest$level_badge)))
## seq_len() (not 1:nrow) so the loop is a no-op when there are 0 rows.
for (i in seq_len(nrow(pred1))) {
  w <- as.integer(ppTest$level_badge[i])
  ppredCM1[w, ] <- ppredCM1[w, ] + pred1[i, ]
}
round(ppredCM1, 1)
cat("Cohen's kappa is ",CPTtools::fcKappa(ppredCM1),"\n")
cat("Goodman & Kruskal's lambda is ",CPTtools::gkLambda(ppredCM1),"\n")
```
# CART model
## Fit a model
```{r first pass}
## Classification tree predicting badge from all other working
## variables; model=TRUE keeps the model frame for later plotting.
tree1 <- rpart(level_badge ~ ., data = ppTrain, method = "class",
               model = TRUE)
tree1
```
```{r plot the tree}
## Draw tree1: type=1 labels all nodes, extra=1 adds observation counts,
## under=TRUE puts them below the node, varlen=-10 truncates long
## variable names.  (Chunk-label typo "plotthethree" fixed.)
prp(tree1, type = 1, extra = 1, under = TRUE, split.font = 1, varlen = -10)
```