library("rstan") source("coda2.R") #converts Stan output to coda source("sortmix.R") #Has initial functions and #output sorting code source("mixfuns.R") #calculates WAIC and DIC ## Read in raw data source("simdata.R") ## Script file for running Stan model 1p (with priors) ## This selects the data file DataSet <- K3simdat DataSetName <- "K3 Simulation Stan unordered" ## This selects the essays whichLevel2 <- 1:10 ## Minimum number of entries before we will include it. This should ## be at least 1, and long enough to filter out "null" essays. minLen <- 25 ## Number of components to fit K <- 2 runName <- make.names(paste(DataSetName,K)) ## Control of plotting display plotParams <- TRUE printParams <- TRUE saveGraphs <- TRUE ## Parameters for MCMC n.chains <- 3 chain.frac <- c(1,.8,.8) # Chain starting points will # be calculated from a sample # of this fraction of each # data vector. chain.boot <- FALSE # Should this be a bootstrap # sample (drawn with # replacement)? n.retry <- 5 # Number of times to retry the # sampling before giving up. n.warmup <- 1000 ## Adaptation phase n.sample <- 5000 n.thin <- 1 secondChance <- 2 #Length multiplier in case #first run does not converge mcmc.maxruns <- 2 #maximum number of MCMC runs to try ## Convergence criteria Rhat.max <- 1.2 n_eff.min <- 100 ## Parameters to monitor. Might need a subset of these. ## per component hyperparameters level2.params <- c("alpha0","mu0","beta0","tau0","gamma0") level2.scalarparams <- c("lp__","alphaN") # These parameters don't # depend on K, so don't need sorting. ## parameter which will be used to sort mixture components ## mu0 and alpha are sensible choices. May want to set decreasing to ## true with alpha. stan.model <- "hierModel1p.stan" #Order by mu0/mu model.constrained <- FALSE #If the model contains an #ordered restriction, then we #don't need to do post MCMC #sorting. level2.idparam <- "mu0" decreasing <- FALSE ## per essay/component parameteters level1.params <- c("pi","mu","sigma") level1.idparam <- "mu" # This should correspond to # the level 2 parameter simplex.params <- c("pi","alpha0") #These params are defined on a #simplex, so requires some #care when using gelman.diag #function ## Level 2 hyperparameters mu0m <- 0 #location of mean log group location prior mu0s <- 1000 #scale of mean log group location prior lbeta0m <- 0 #location of sd log group location prior lbeta0s <- 1 #location of sd log group location prior tau0m <- 0 #location of mean log group precision prior tau0s <- 1 #scale of mean log group precision prior lgamma0m <- 0 #location of sd log group precision prior lgamma0s <- 1 #location of sd log group precision prior result1 <- list(dataSetName=DataSetName,K=K,model=stan.model, n.warmup=n.warmup,n.sample=n.sample, n.thin=n.thin,constrained=model.constrained, idparam=level2.idparam,decreasing=decreasing) ## Wrap entire script in a giant tryCatch. The finally clause should ## cause a partial result1 to get written out for debugging in the ## case of a crash somewhere along the line tryCatch({ ### Start of actual script cat("\n\n ****************\n") cat("Cleaning data for ",DataSetName,"@",K,"\n") ## Extract desired data set rawdata <- list() rawdata$Y <- DataSet$Y[whichLevel2,] rawdata$J <- DataSet$J[whichLevel2] ## clean excessively short logs short <- rawdata$J < minLen cat("Removing ", sum(short), "of", length(short),"Level 2 units for length.\n") result1$data <- list(Y=rawdata$Y[!short,], J=rawdata$J[!short],I=sum(!short), alphaNu=sum(!short),K=K, alpha0mu=rep(0.5,K), mu0m=rep(mu0m,K), mu0s=rep(mu0s,K), lbeta0m=rep(lbeta0m,K), lbeta0s=rep(lbeta0s,K), tau0m=rep(tau0m,K), tau0s=rep(tau0s,K), lgamma0m=rep(lgamma0m,K), lgamma0s=rep(lgamma0s,K) ) result1$data$Jmax <- max(result1$data$J) #Precalc for array allocation result1$data$Y <- ifelse(is.na(result1$data$Y),99999,result1$data$Y) #Stan doesn't like NAs ### Calculate Initial Values result1$inits <- vector("list",n.chains) for (d in 1:n.chains) { cat("Calculating initial values for chain ",d,";", DataSetName,"@",K,"\n") result1$inits[[d]] <- hmmInits(d,result1$data$Y,result1$data$J, n.retry=n.retry, decreasing=decreasing) } ## We could pass the hmmInits funciton instead of result1$inits, but this ## method allows us to save the inital values we used. ## This code is probably redundant, but we will use this to test ## later for the first successful run result1$hparams <- NULL result1$scalars <- NULL result1$params <- NULL result1$failedruns <- numeric() result1$converged <- logical() ## We will put this into a while loop which will allow us to ## continue to the second chance if the first one fails to start for ## some reason converged <- FALSE runno <- 0 while (!converged && runno < mcmc.maxruns) { runno <- runno+1 if (runno > 1) { ## Double run length second time through n.sample <- n.sample*secondChance n.warmup <- n.warmup*secondChance result1$n.sample <-c(result1$n.sample,n.sample) result1$n.warmup <-c(result1$n.warmup,n.warmup) } cat("\n\n\n ****************\n") cat("Running Model for ",DataSetName,"@",K, "Attempt", runno,"\n") if (runno > 1) { fit1 <- stan(fit=fit1,data=result1$data, inits=result1$inits, chains=n.chains,iter=n.sample+n.warmup, warmup=n.warmup) } else { fit1 <- stan(stan.model,data=result1$data, inits=result1$inits, chains=n.chains,iter=n.sample+n.warmup, warmup=n.warmup) } ## Extract Level 2 and deviance parameters. run1.level2 <- stan2coda2(fit1,level2.params) run1.scalars <- stan2coda2(fit1,level2.scalarparams) run1.level1 <- stan2coda2(fit1,level1.params,K) ## Stan runs occasionally crash in the middle, returning ## essentially a null fit object. In this case, the extracted ## values should all be null. Check for this case if (is.null(run1.level2[[1]])) { result1$failedruns <- c(result1$failedruns,runno) cat("Run ",runno," crashed; skipping convergence checks.\n") next } ### Identify Components -- If we did not include a constraining ### parameter, then we need to relabel the components. if (!model.constrained) { cat("Labeling components for level 2 model",DataSetName,"@",K,"\n") run1.index <- sortindex(run1.level2[[level2.idparam]],decreasing) run1.level2 <- level2sort(run1.level2,level2.params,run1.index) run1.level1 <- level1sort(run1.level1,level1.params,run1.index,K) } ## bind the sorted components onto the results if (is.null(result1$hparams)) { result1$hparams <- run1.level2 } else { for (param in level2.params) { result1$hparams[[param]] <- mcmcjoin(result1$hparams[[param]], run1.level2[[param]]) } } if (is.null(result1$scalars)) { result1$scalars <- run1.scalars } else { for (param in level2.scalarparams) { result1$scalars[[param]] <- mcmcjoin(result1$scalars[[param]], run1.scalars[[param]]) } } if (is.null(result1$params)) { result1$params <- run1.level1 } else { for (param in level1.params) { result1$params[[param]] <- mcmcjoin(result1$params[[param]], run1.level1[[param]]) } } ### Convergence diagnostics and printout cat("\n\n ****************\n") cat("Convergence diagnostics for ",DataSetName,"@",K, "Run Number", runno,"\n") ## Accumulate these as we go along, to test for convergence Rhatmax <- 0 neffMin <- Inf ## If we have null parameters, that means something went wrong with ## MCMC sampler, we will want to try again anyway. tryCatch({ if (saveGraphs) pdf(paste(runName,runno,"pdf",sep=".")) for (param in level2.scalarparams) { if (printParams) { print(try(summary(run1.scalars[[param]]))) Rhat <-try(gelman.diag(run1.scalars[[param]],autoburnin=FALSE)) print(Rhat) if (!inherits(Rhat, "try-error")) Rhatmax <- max(Rhatmax,Rhat$psrf[,1]) neff <- try(effectiveSize(run1.scalars[[param]])) print(neff) if (!inherits(Rhat, "try-error")) neffMin<- min(neffMin,neff) } } if (plotParams) { ## set up looping so gelman plots and trace plots appear on ## the same page. par(mfrow=c(length(level2.scalarparams),1)) for (param in level2.scalarparams) { try(gelman.plot(run1.scalars[[param]],auto.layout=FALSE)) } par(mfrow=c(length(level2.scalarparams),2)) for (param in level2.scalarparams) { try(plot(run1.scalars[[param]],auto.layout=FALSE)) } } for (param in level2.params) { if (printParams) { print(try(summary(result1$hparams[[param]]))) if (param %in% simplex.params) { ## Drop first column to avoid divide by zero error Rhat <-try(gelman.diag(result1$hparams[[param]][,-1],autoburnin=FALSE)) } else { Rhat <-try(gelman.diag(result1$hparams[[param]],autoburnin=FALSE)) } print(Rhat) if (!inherits(Rhat, "try-error")) Rhatmax <- max(Rhatmax,Rhat$mpsrf) neff <- try(effectiveSize(result1$hparams[[param]])) print(neff) if (!inherits(Rhat, "try-error")) neffMin<- min(neffMin,neff) } if (plotParams) { par(mfrow=c(K,1)) if (param %in% simplex.params) { ## Drop first column to avoid divide by zero error try(gelman.plot(result1$hparams[[param]][,-1],auto.layout=FALSE)) } else { try(gelman.plot(result1$hparams[[param]],auto.layout=FALSE)) } par(mfrow=c(K,1)) try(plot(result1$hparams[[param]])) } } }, finally=if (saveGraphs) dev.off()) ### Automatic convergence check converged <- TRUE if (Rhatmax > Rhat.max) { converged <- FALSE cat("Chains of length ",n.sample,"for ", DataSetName,"@",K,"did not converge in run ",runno,".\n") cat("Maximum Rhat value = ",Rhatmax,".\n") if (printParams) { ## Print some hopefully diagnostic information about what the ## chains look like for (param in level2.scalarparams) { for (chain in 1:length(run1.scalars[[param]])) { cat(param,"[[",chain,"]]\n") print(try(summary(run1.scalars[[param]][[chain]])$stat)) } } for (param in level2.params) { for (chain in 1:length(result1$hparams[[param]])) { cat(param,"[[",chain,"]]\n") print(try(summary(result1$hparams[[param]][[chain]])$stat)) } } } } if (converged && neffMin < n_eff.min) { cat("Chain length of ",n.sample,"for ", DataSetName,"@",K,"were insufficient in run",runno,".\n") cat("Minimum effective MC sample size = ",neffMin,".\n") converged <- FALSE } result1$converged <- c(result1$converged,converged) cat("\n\n\n") } ### End of Run loop. if (!converged) { cat("MCMC run did not converge, proceeding anyway.\n") } ### WAIC and DIC. cat("Calculating model fit indexes for ",DataSetName,"@",K,"\n") sample.pi <- Karray(combineChains(result1$params$pi),K) sample.mu <- Karray(combineChains(result1$params$mu),K) sample.sigma <- Karray(combineChains(result1$params$sigma),K) result1$WAIC <- mchmixWAIC(result1$data$Y,result1$data$J, sample.pi,sample.mu,sample.sigma) print(result1$WAIC) result1$DIC <- mchmixDIC(result1$data$Y,result1$data$J, sample.pi,sample.mu,sample.sigma) print(result1$DIC) cat("Analaysis complete for ",DataSetName,"@",K,"\n") }, error=function(e) print(e), finally= { ### Write out results ## We do this in the finally clause because it that way the ## partial object will get written even if there is an ## error above. assign(runName,result1) outfile <- gzfile(paste(runName,"R","gz",sep="."),open="wt") dump(runName,file=outfile) close(outfile) } )