---
title: "Physic Playground Data"
output: html_notebook
---

# Loading and Preparing the data

```{r setup}
#install.packages("psych")
#yinstall.package("googlesheets4")
#install.package("CPTtools",repos="https://ralmond.r-universe.dev")
library(CPTtools)
library(tidyverse)
library(rpart)
library(psych)
library(googlesheets4)
options(dplyr.summarise.inform = FALSE)
path <- "https://docs.google.com/spreadsheets/d/1gjtaXUxPFF_IMeT4rm1mPxlcFDNUC1u7cX-qPrUqtK0/edit#gid=1994010518"
library(plotly)
library(rpart.plot)

#install.packages("e1071")
library(caret)
library(ggplot2)
#library(GGally)
```

```{r read}
ppLevel <- read_sheet(path,na="NA")
```

```{r summary}
summary(ppLevel)
```
## Cleaning

```{r mung varialble types, warning=FALSE}
ppLevel$timestamp <- as.Date(ppLevel$timestamp)
ppLevel$source <- as.factor(ppLevel$source)
ppLevel$condition <- as.factor(ppLevel$condition)
ppLevel$full_concept <- as.factor(ppLevel$full_concept)
ppLevel$concept <- as.factor(ppLevel$concept)

ppLevel$level <- as.factor(ppLevel$level)
ppLevel$level_type <- as.factor(ppLevel$level_type)
ppLevel$level_badge[is.na(ppLevel$level_badge)] <- "none"
ppLevel$level_badge <-  ordered(ppLevel$level_badge,
                               level=c("none","silver","gold"))
ppLevel$ls_pa_start <- as.Date(ppLevel$ls_pa_start,format="%Y-%m-%dT%H:%M:%S")
ppLevel$ls_pa_end <- as.Date(ppLevel$ls_pa_end,format="%Y-%m-%dT%H:%M:%S")

```

```{r resummerize}
summary(ppLevel)
```

Setup names for variables that have common characteristics.

```{r variable groups}
id_vars <- c("row_id","timestamp","uid","source","consent","assent")
target_var <- c("level_badge")
factor_vars <- c("condition","full_concept","concept","level_revisit_tag")
scaled_score_vars <- c("pre_near_by_concept", "pre_far_by_concept",               "post_near_by_concept", "post_far_by_concept",                 "pre_near_by_full_concept",       
"pre_far_by_full_concept",         "post_near_by_full_concept",
"post_far_by_full_concept")
difficulties <- c("comp_difficulty","game_mechanic_difficulty","physics_understanding_difficulty")
durations <- c("level_duration","ls_pa_duration")
ls_vars <- c("ls_n_PA_spgbrd","ls_n_PA_lever","ls_n_PA_ramp",	"ls_n_PA_pendulum","ls_n_hint","ls_n_game_tools","level_ls_total")
ob_count_vars <- c("ob_id_ramp","ob_id_lever","ob_id_spgbrd",	"ob_id_pendulum",	"ob_id_weight", "ob_comp_freeform",	"ob_comp_pin")

```

# Exploratory Data Analysis


## Working Data

Here I'm pulling out cases with extremely large durations and ones
with missing data.  I'm also scaling the difficulties by their maximum values.


```{r workingData}
## These variables need no translation
ppComplete <- ppLevel %>%
  filter(level_duration < 3500 & is.finite(level_duration)) %>%
  filter(!is.na(pre_near_by_concept)) %>%
  filter(!is.na(pre_far_by_concept)) %>%
  filter(!is.na(post_near_by_concept)) %>%
  filter(!is.na(post_far_by_concept)) %>%
  filter(!is.na(pre_near_by_full_concept)) %>%
  filter(!is.na(pre_far_by_full_concept)) %>%
  filter(!is.na(post_near_by_full_concept)) %>%
  filter(!is.na(post_far_by_full_concept)) %>%
  select(-row_id,-timestamp,-uid,-consent,-assent,
         -level_wallet_balance,-ls_pa_start,-ls_pa_end) %>%
  select(all_of(target_var), all_of(durations), -ls_pa_duration,
         all_of(scaled_score_vars), all_of(difficulties),
         all_of(factor_vars), all_of(ls_vars), 
         all_of(ob_count_vars), level)


## Var Groups: target_var, factor_vars, scaled_score_vars,
## difficulties, durations, ls_vars, ob_count_vars

         
## Factor variables, we need to do one-hot encoding on each of these.
# for (v in factor_vars) {
#   ## as.numeric() -1 as keras is zero-base and R is one-based.
#   hotCode <- to_categorical(as.numeric(pull(ppLevel,v))-1)
#   colnames(hotCode) <- paste0(v,1:ncol(hotCode))
#   ppWorking <- cbind(ppWorking,hotCode)
# }

##difficulties -- scale by max value.
ppComplete$comp_difficulty <- ppComplete$comp_difficulty/8
ppComplete$game_mechanic_difficulty <- ppComplete$game_mechanic_difficulty/4
ppComplete$physics_understanding_difficulty <- ppComplete$physics_understanding_difficulty/4


##ob_count_vars
## zero flags
#for (v in ob_count_vars) {
#  vv <- sub("(id|comp)","flag",v)
#  ppWorking[[vv]] <- pull(ppLevel,v) > 0
#}
# for (v in ob_count_vars) {
#   vals <- pull(ppLevel,v)
#   cutpoints <- c(-1,unique(quantile(vals,c(0,.25,.5,.75,1))))
#   onehot <- to_categorical(as.numeric(cut(vals,cutpoints))-1)
#   colnames(onehot) <- paste0(v,1:ncol(onehot))
#   ppWorking <- cbind(ppWorking,onehot)
# }

summary(ppComplete)

```

# Exploratory Plots


## Duration

```{r first plot}
ppComplete %>%
    filter(level_duration < 3500) %>%
    ggplot(aes(x = (level_duration))) +
    geom_histogram() +
    labs(
        title = "Duration spent in level (filtered).",
        x = "level_duation (in seconds)"
    )
ppComplete %>%
    ggplot(aes(x = log(level_duration))) +
    geom_histogram() +
    labs(
        title = "Duration spent in level (filtered).",
        x = "level_duation (log; in seconds)"
    )
```


```{r facet by trophy}

ppComplete %>%
    ggplot(aes(x = log(level_duration))) +
    geom_histogram() +
    labs(
        title = "Duration spent in level (filtered).",
        x = "level_duation (log; in seconds)"
    ) + facet_grid(rows=vars(level_badge))
```
```{r violin by trophy}

ppComplete %>%
    ggplot(aes(x=level_badge,y = log(level_duration))) +
    geom_violin() +
    labs(
        title = "Duration spent in level (filtered).",
        x = "level_duation (log; in seconds)"
    ) 
```
```{r violin by level}

ppComplete %>%
    ggplot(aes(x=level,y = log(level_duration))) +
    geom_violin() +
    labs(
        title = "Duration spent in level (filtered).",
        x = "level_duation (log; in seconds)"
    ) 
```

```{r violin by all}

for (v in colnames(ppComplete)) {
  if (is.numeric(pull(ppComplete,v))) {
      ppComplete %>%
          ggplot(aes(x=.data[[v]],y = log(level_duration))) +
          geom_point() + geom_smooth() -> P
     
  } else if (is.factor(pull(ppComplete,v))) {
    ppComplete %>%
      ggplot(aes(x=.data[[v]],y = log(level_duration))) +
      geom_violin() -> P
  }
  print(P+ labs(
        title = paste("Duration spent in level by ",v),
        x = v,
        y = "level_duation (log; in seconds)"
      ) )
}
```

## Object Counts

```{r histograms}
for (v in ob_count_vars) {
print(ppComplete %>%
      ggplot(aes(x=.data[[v]])) +geom_histogram()
)}

for(v in ob_count_vars)
  cat(v,"=",paste(round(quantile(pull(ppLevel,v),c(0,.25,.5,.75,1)),0),
                  collapse = " "),"\n")


```

## Pair plots


```{r pair plots}
ppComplete %>%
  select(level_duration, level, comp_difficulty, physics_understanding_difficulty, game_mechanic_difficulty) %>%
  pairs()
```

## Correlation Matrix
```{r corr}
ppComplete %>% 
  mutate(log_duration=ifelse(level_duration>0,log(level_duration),NA)) -> ppLev1

cor(pull(ppLev1,log_duration),
    select(ppLev1,all_of(scaled_score_vars)),
    use="pairwise.complete.obs")
cor(pull(ppLev1,log_duration),
    select(ppLev1,all_of(ob_count_vars)),
    use="pairwise.complete.obs")
cor(pull(ppLev1,log_duration),
    select(ppLev1,all_of(difficulties)),
    use="pairwise.complete.obs")

cor(pull(ppLev1,log_duration),
    select(ppLev1,all_of(ls_vars)),
    use="pairwise.complete.obs")
```

```{r prepostcors}
ppComplete %>% select(all_of(scaled_score_vars)) %>% cor(use="complete.obs")
```


# Build a Neural Net

```{r load libraryies}
library(tidyverse)
library(keras)
library(tfruns)
library(tfdatasets)
library(rsample)
```


```{r traintestsplit}
ppWorking <- ppComplete %>%
  select(level_badge,level_duration,
         all_of(difficulties),
         all_of(scaled_score_vars))


testsplit <- initial_split(ppWorking, prop= 4/5)
ppTest <- testing(testsplit)
ppTrain <- training(testsplit)

valsplit <- initial_split(ppTrain, prop= 4/5)
ppVal <- testing(valsplit)
ppTrain <- training(valsplit)
nrow(ppTrain)
nrow(ppVal)
nrow(ppTest)

```

Hot encoding for categorical variables. (This is not used, but here for reference).

```{r encode categorical variables.}
ppTestBadge <- to_categorical(as.numeric(pull(ppTest,level_badge))-1)
ppTrainBadge <- to_categorical(as.numeric(pull(ppTrain,level_badge))-1)
ppValBadge <- to_categorical(as.numeric(pull(ppVal,level_badge))-1)
```

## Specify model

Convert to tensorflow data.

```{r tensorize}
df_to_dataset <- function(df, shuffle = TRUE, batch_size = 32) {
  ds <- df %>% mutate(level_badge=as.integer(level_badge)) %>%
    tensor_slices_dataset()
  
  if (shuffle)
    ds <- ds %>% dataset_shuffle(buffer_size = nrow(df))
  
  ds %>% dataset_map(function(record) {
      record$level_badge <- tf$one_hot(record$level_badge,3L)
      record}) %>%
    dataset_batch(batch_size = batch_size)
}
batch_size <- 50
train_ds <- df_to_dataset(ppTrain, batch_size = batch_size)
val_ds <- df_to_dataset(ppVal, shuffle = FALSE, batch_size = batch_size)
test_ds <- df_to_dataset(ppTest, shuffle = FALSE, batch_size = batch_size)


```


```{r check the process, eval=FALSE}
train_ds %>% 
  reticulate::as_iterator() %>% 
  reticulate::iter_next() %>% 
  str()
```


```{r preprocessing step}
spec <- feature_spec(train_ds, level_badge ~ .) %>%
  #step_bucketized_column(level_ls_total,boundaries=c(1)) %>%
  step_numeric_column(level_duration, all_of(scaled_score_vars),
                      normalizer_fn=scaler_standard()) %>%
  step_categorical_column_with_vocabulary_list(all_nominal()) %>%
  #step_indicator_column(all_nominal())
  identity()
spec <- fit(spec)
```


```{r ppmodel}

ppmodel <- keras_model_sequential() %>%
  layer_dense_features(dense_features(spec)) %>%
  layer_dense(units = 16, activation = "relu") %>%
  layer_dense(units = 3, activation = "softmax") %>%
  compile(
    loss = 'categorical_crossentropy',
    optimizer = 'adam',
    metrics = c('accuracy')
  )
```

```{r train1}
ppmodel %>% fit(dataset_use_spec(train_ds,spec=spec),
                ppTrainBadge,
               epochs=15,
               validation_data = dataset_use_spec(val_ds,spec),
               verbose=2)
```

```{r evaluate1}
score1 <- ppmodel %>%
    evaluate(dataset_use_spec(test_ds,spec), verbose = 0)

round(score1,3)
```
```{r predict1}
pred1 <- ppmodel %>% predict(dataset_use_spec(test_ds,spec))

head(pred1)
```


## Discrete Predictions

```{r modalPredictions}
predMode <- ordered(apply(pred1,1,which.max)-1,  #Fix for renumbering levels in TF
                    levels=1:3,labels=c("none","silver","gold"))

summary(predMode)
```
```{r confusionMatrix}
pred1CM <- table(ppTest$level_badge,predMode)
pred1CM
cat("Cohen's kappa is ",CPTtools::fcKappa(pred1CM),"\n")
cat("Goodman & Kruskal's lambda is ",CPTtools::gkLambda(pred1CM),"\n")

```


## Probabilistic Predictions

```{r scoreFunction}
probtrue <- sapply(1:nrow(pred1),
                  function (i) {
                    pred1[i,as.numeric(ppTest$level_badge[i])]})
head(probtrue)                  
score <- -log(probtrue)
sum(score)
```
The _expected confusion matrix_ is like the regular confusion matrix, except that we use the prediction probabilities instead of the absolute value.


```{r ExpectedConfusionMatrix}
ppredCM1 <- matrix(0,3,3,dimnames=list(levels(ppTest$level_badge),
                   levels(ppTest$level_badge)))
for (i in 1:nrow(pred1)) {
  w <- as.integer(ppTest$level_badge[i])
  ppredCM1[w,] <- ppredCM1[w,] + pred1[i,]
}
round(ppredCM1,1)
cat("Cohen's kappa is ",CPTtools::fcKappa(ppredCM1),"\n")
cat("Goodman & Kruskal's lambda is ",CPTtools::gkLambda(ppredCM1),"\n")

```
                  

# CART model

## Fit a model


```{r first pass}
tree1 <- ppTrain %>% 
  rpart(level_badge ~ ., data = ., method = "class",
        model=TRUE)
tree1
```

```{r plotthethree}
prp(tree1, type = 1, extra = 1, under = TRUE, split.font = 1, varlen = -10)
```