source('mcmc.pred.R') # OR use this method to put MCMC.pred.R in a path # PC # path = 'C:/MyDrive/MyTopFolder/MySubFolder/' # Linux/MAC # path = '/home/MyName/MyTopFolder/MySubFolder/' # # source(paste0(path,'mcmc.pred.R')) ########################################################## ### ### EXAMPLES OF THE MCMC PREDICTION FUNCTIONS ### ### MCMCpack is required ### ### ### In each case, these routines produce estimates of ### ### Pr( Y | new X, old Y & X, Model & assumptions) ### ### I.e., a FULL predictive scheme. ### ### These are only a FEW of the examples of how to analyze ### predictive results. ANY question that can be asked of the ### observable Y can be answered -- with suitable code! ### ### version 0.22 ### Last touched, 2 August 2018 ### matt@wmbriggs.com ############################################ ### ### REGRESSION ### # download cgpa.csv x <- read.csv("http://wmbriggs.com/public/cgpa.csv") fit = MCMCregress(cgpa~hgpa*sat,data=x) # Replace x[6,] with any new data that has SAME variables as in model # the outcome (here cgpa) can be ANY value and is ignored, but must be there. # newdata = data.frame(hgpa = 1, sat = 600, recomm = 1, cgpa = 4) p = MCMCregress.pred(fit,x[6,]) # single newdata analysis hist(p,200) quantile(p,c(0.05,.5,.95)) mean(p) # Has to be a loop! array, marray, larray, and etc. all convert # the data.frame to a matrix or a unusable list! # R has no utility to step through a data.frame while maintaining class # # all the old x q = matrix(0,nrow(x),3) for(i in 1:nrow(x)){ q[i,]=quantile(MCMCregress.pred(fit,x[i,]),c(0.1,.5,.9)) } plot(x\$sat,q[,2],ylab='cgpa',ylim=c(min(q),max(q))) for(i in 1:nrow(x)){ lines(c(x\$sat[i],x\$sat[i]),c(q[i,1],q[i,3]),col=3) } plot(x\$hgpa,q[,2],ylab='cgpa',ylim=c(min(q),max(q))) for(i in 1:nrow(x)){ lines(c(x\$hgpa[i],x\$hgpa[i]),c(q[i,1],q[i,3]),col=3) } library(plot3D) # try leaving out CI=pi, and zlim=c(-1,5) pi = list(z = q[,c(1,3)]) scatter3D(x\$sat, x\$hgpa, q[,2], xlab='SAT',ylab='HGPA', zlim=c(-1,5),zlab='Median predicted', phi = 0, bty ="g", ticktype = "detailed", CI=pi) # all the old x q = NA g = 3 for(i in 1:nrow(x)){ p = MCMCregress.pred(fit,x[i,]) q[i]= sum(p>=g)/length(p) } plot(x\$sat,q,ylab='Pr(CGPA>g|old data,M)') plot(x\$hgpa,q,ylab='Pr(CGPA>g|old data,M)') scatter3D(x\$sat, x\$hgpa, q, xlab='SAT',ylab='HGPA', zlab=paste0('Pr(CGPA>',g,'|D,M)'), phi = 0, bty ="g", ticktype = "detailed") fit.r = MCMCregress(cgpa~hgpa*sat+recomm,data=x) q.r = NA for(i in 1:nrow(x)){ p = MCMCregress.pred(fit,x[i,]) q.r[i]= sum(p>=g)/length(p) } png('reg04.png',width=720,height=600) plot(x\$recomm, q-q.r, ylim=c(-0.05,0.05)) dev.off() ############################################ ### ### LOGISTIC REGRESSION ### data(birthwt) x=birthwt x\$race = as.factor(x\$race) fit = MCMClogit(low~age+race+smoke, data=x) # single newdata analysis, using one of the old obs as if it were new w = x[1,] w p = MCMClogit.pred(fit,w) p w\$smoke = 1 p = MCMClogit.pred(fit,w) p # all the old x p = NA for(i in 1:nrow(x)){ p[i] = MCMClogit.pred(fit,x[i,]) } plot(x\$age,p,col=x\$race,ylab='Pr(low wt|old data,M)', pch=x\$smoke+1) grid() legend('topright',c('r1','r2','r3','s0','s1'), col=c(1,2,3,1,1), pch = c(3,3,3,1,2), bty='n') ############################################ ### ### MULTINOMIAL REGRESSION ### # The form(ula) is required to be stored for multinomial # because MCMCmnl does not return or store the formula. Nor the data! data(Nethvote) x = Nethvote form = formula('vote ~relig + class + income + educ + age * urban') lv = levels(x[, as.character(form[[2]]) ]) fit = MCMCmnl(form, mcmc.method="IndMH", B0=0, mcmc=5000, thin=10, tune=0.5, baseline='D66',data=x) # or # Create CGPA in DECSION buckets. ALL analyses should be tailored # to the decisions to be made. x <- read.csv("http://wmbriggs.com/public/cgpa.csv") x\$cgpa = as.factor(roundTo(x\$cgpa,1)) levels(x\$cgpa)=c('0-1','0-1','2','3-4','3-4') form = formula('cgpa ~hgpa*sat') lv = levels(x[, as.character(form[[2]]) ]) fit = MCMCmnl(form, mcmc.method="IndMH", B0=0, mcmc=10000, thin=10, tune=0.5,data=x) # Notice that the form(ula) is now passed to prediction method. # Notice that lv(levels) is now passed to prediction method. # The Base category is decided in the MCMCmnl formula, by user or by R's # normal processing of factors. Unfortunately, MCMCmnl does not save nor # pass the base levels of the model, nor the data name, and so cannot be discovered # inside the prediction method. So we have to call it Base. # The new data does NOT have to have all factor levels, so we have to calculate # levels outside MCMCmnl.pred # single new data p = MCMCmnl.pred(fit,form,x[1,],lv) p = MCMCmnl.pred(fit,form,x[1,],lv) for(i in 1:nrow(x)){ # this preserves the proper names for p's columns if(i>1) p=rbind(p,MCMCmnl.pred(fit,form,x[i,],lv)) } p = as.data.frame(p, row.names=FALSE) par(mfrow=c(2,2)) for (i in 1:4){ plot(x\$class,p[,i],main=names(p)[i], ylab='Pr(Vote|D,M)',col=x\$relig+1) #plot(x\$sat,a[,3],main=names(p)[i], ylab='Pr(CGPA|D,M)') } ############################################ ### ### POISSON REGRESSION ### # https://stats.idre.ucla.edu/r/dae/poisson-regression/ x <- read.csv("https://stats.idre.ucla.edu/stat/data/poisson_sim.csv") x <- within(x, { prog <- factor(prog, levels=1:3, labels=c("General", "Academic", "Vocational")) id <- factor(id) }) summary(x) fit = MCMCpoisson(num_awards ~ prog*math,data=x) # single new data p = MCMCpoisson.pred(fit,x[1,]) plot(p,xlab='num_awards', ylab='Pr(num_awards|D,M)') # more complicated, because y is open ended; a guess has to be made in # number of cols of q; if subscript out of bounds, increase size q = matrix(0,nrow(x),16) for(i in 1:nrow(x)){ a = MCMCpoisson.pred(fit,x[i,]) q[i,1:length(a)]=a } par(mfrow=c(2,2)) for (i in levels(x\$prog)){ j = which(x\$prog == i) plot(0:(dim(q)[2]-1),q[1,],xlab='num_awards', ylab='Pr(num_awards|D,M)', main=i,ylim=c(min(q),max(q)), xlim=c(0,8), type='l') for(k in 2:nrow(x)){ lines(0:(dim(q)[2]-1),q[j[k],]) } } x\$math.cut = cut(x\$math, breaks = c(0,45, 52, 59, 80)) par(mfrow=c(2,2)) for (i in levels(x\$math.cut)){ j = which(x\$math.cut == i) plot(0:(dim(q)[2]-1),q[1,],xlab='num_awards', ylab='Pr(num_awards|D,M)', main=i,ylim=c(min(q),max(q)), xlim=c(0,8), type='l') for(k in 2:nrow(x)){ lines(0:(dim(q)[2]-1),q[j[k],]) } } ############################################ ### ### TOBIT REGRESSION ### x <- read.csv("http://wmbriggs.com/public/cgpa.csv") # cgpa can only exist between 0 and 4; the ordinary regression # above exhibits strong probability leakage form = formula('cgpa~hgpa*sat') fit = MCMCtobit(form, below=0,above=4,data=x, mcmc=30000) p=MCMCtobit.pred(fit,form,x[6,],0,4) hist(p,300) quantile(p,c(0.05,.5,.95)) # Has to be a loop! array, marray, larray, and etc. all convert # the data.frame to a matrix or a unusable list! # R has no utility to step through a data.frame while mainting class # all the old x q = matrix(0,nrow(x),3) for(i in 1:nrow(x)){ q[i,]=quantile(MCMCtobit.pred(fit,form,x[i,]),c(0.1,.5,.9)) } plot(x\$sat,q[,2],ylab='cgpa',ylim=c(min(q),max(q))) for(i in 1:nrow(x)){ lines(c(x\$sat[i],x\$sat[i]),c(q[i,1],q[i,3]),col=3) } plot(x\$hgpa,q[,2],ylab='cgpa',ylim=c(min(q),max(q))) for(i in 1:nrow(x)){ lines(c(x\$hgpa[i],x\$hgpa[i]),c(q[i,1],q[i,3]),col=3) } # all the old x q = NA g = 3 for(i in 1:nrow(x)){ p = MCMCtobit.pred(fit,form,x[i,]) q[i]= sum(p>=g)/length(p) } plot(x\$sat,q,ylab='Pr(CGPA>g|old data,M)') plot(x\$hgpa,q,ylab='Pr(CGPA>g|old data,M)')