###############################################################################
# To load this library: use source("https://hasthika.github.io/pipack.txt")  #
# Needs library(ggplot2), library(leaps), and library(glmnet); the table     #
# functions also use knitr::kable, the graph functions plotly::ggplotly.     #
###############################################################################

##################################################################################################################################
#
# pi1sim() simulates the first prediction interval (PI1) with forward selection
# pi2sim() simulates the second prediction interval (PI2) with forward selection
# pi1withlassoridge() simulates the new prediction interval 1 with lasso and ridge
# pi2withlassoridge() simulates the new prediction interval 2 with lasso and ridge
#
# pi1fs() calculates PI1 given dataframes x, y and a future x vector; forward selection is used to build the model
# pigraph() creates a response plot for PI1 given the dataframes x and y; the model is built by forward selection
# get_pi_table() creates a table of PI1s given the dataframes x and y; the model is built by forward selection
#
# pi2graph() creates a response plot for PI2 given the dataframes x and y; the model is built by forward selection
# get_pi2_table() creates a table of PI2s given the dataframes x and y; the model is built by forward selection
# pi2fs() calculates PI2 given dataframes x, y and a future x vector; forward selection is used to build the model
#
##################################################################################################################################

########################################
# This function computes the Frey PI   #
########################################

shorth3 <- function(y, alpha = 0.05){
# computes the shorth(c) interval [Ln,Un] for c = cc
# lists Ln and Un using Frey's correction
n <- length(y)
cc <- ceiling(n * (1 - alpha + 1.12*sqrt(alpha/n)))
cc <- min(n, cc)
sy <- sort(y)
rup <- sy[cc]
rlow <- sy[1]
olen <- rup - rlow
if(cc < n){
  for(j in (cc + 1):n){
    zlen <- sy[j] - sy[j - cc + 1]
    if(zlen < olen) {
      olen <- zlen
      rup <- sy[j]
      rlow <- sy[j - cc + 1]
    }
  }
}
Ln <- rlow; Un <- rup
list(shorth = c(Ln, Un))
}
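# A minimal usage sketch (added; not part of the original examples): shorth3()
# returns the shortest interval covering roughly a 1 - alpha fraction of the
# data, with Frey's small-sample correction.
# set.seed(1)
# z <- rnorm(1000)
# shorth3(z, alpha = 0.05)$shorth  # roughly (-2, 2) for N(0,1) data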
#######################################################################################
# This function simulates the First prediction interval with Forward selection       #
#######################################################################################

pi1sim <- function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1, shift = 9,
                   psi = 0.0, type = 1, alpha = 0.05, nb = 100){
#Needs library(leaps).
#EBIC or min Cp: Pelawa Watagoda and Rupasinghe Arachchige Don (2019)
#bootstrap validation residuals PI.
#Simulates PIs for forward selection variable selection.
#Needs p > 2. Use 1 <= k <= p-1, where k is the number of nonnoise variables.
#Uses five iid error distributions:
# type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors,
# 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors.
#The constant = 1, so there are p = q+1 coefficients:
#beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones and p-k-1 zeroes.
#Multiply x by A: for MVN data this results in a covariance matrix with
#eigenvector c(1, ..., 1)^T corresponding to the largest eigenvalue. As psi
#gets close to 1, the data clusters about the line in the direction of
#(1, ..., 1)^T. See Maronna and Zamar (2002).
#cor(X_i,X_j) = [2 psi + (q-2) psi^2]/[1 + (q-1) psi^2] for i != j,
#when the correlation exists.
set.seed(974)
pilen <- 1:nruns
ps <- pilen
opicov <- 0
splitpilen <- pilen
splitpicov <- 0
q <- p-1
vmax <- min(p, as.integer(n/5))
rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2)
A <- matrix(psi, nrow = q, ncol = q)
diag(A) <- 1
b <- 0 * 1:q
b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1]
vars <- as.vector(1:(p-1))
indx <- 1:n
nH <- ceiling(n/2)
for(i in 1:nruns) {
  x <- matrix(rnorm(n * q), nrow = n, ncol = q)
  x <- x %*% A
  xf <- rnorm(q) %*% A
  if(type == 1) {
    y <- 1 + x %*% b + rnorm(n)
    yf <- 1 + xf %*% b + rnorm(1)
  }
  if(type == 2) {
    y <- 1 + x %*% b + rt(n, df = 3)
    yf <- 1 + xf %*% b + rt(1, df = 3)
  }
  if(type == 3) {
    y <- 1 + x %*% b + rexp(n) - 1
    yf <- 1 + xf %*% b + rexp(1) - 1
  }
  if(type == 4) {
    y <- 1 + x %*% b + runif(n, min = -1, max = 1)
    yf <- 1 + xf %*% b + runif(1, min = -1, max = 1)
  }
  if(type == 5) {
    err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift)
    y <- 1 + x %*% b + err
    ef <- rnorm(1, sd = 1 + rbinom(1, 1, eps) * shift)
    yf <- 1 + xf %*% b + ef
  } #make an MLR data set
  #find sets H and V
  perm <- sample(indx, n)
  H <- perm[1:nH]
  xH <- x[H,]
  yH <- y[H]
  xV <- x[-H,]
  yV <- y[-H]
  #find the forward sel minimum Cp or EBIC model on the training half set H
  tem <- regsubsets(xH, yH, nvmax = vmax, method = "forward")
  out <- summary(tem)
  if(nH < 10*p) { #use the min EBIC model
    xx <- (1:min(length(out$bic), p-1)) + 1
    #EBIC = BIC + 2 log(choose(p, number of coefficients)), up to a constant
    ebic <- out$bic + 2*(-lgamma(xx+1) - lgamma(p-xx+1))
    minebic <- out$which[ebic == min(ebic),]
    #do not need the constant in vin
    vin <- vars[minebic[-1]]
  } else { #if nH >= 10p, use the min Cp model
    mincp <- out$which[out$cp == min(out$cp),]
    #do not need the constant in vin
    vin <- vars[mincp[-1]]
  }
  sub <- lsfit(xH[,vin], yH)
  ps[i] <- length(sub$coef)
  #***********************************************************************
  # Getting the bootstrap samples from the validation set (-H)
  upper <- numeric(nb)
  lower <- numeric(nb)
  for (j in 1:nb) {
    index <- sample.int(n-nH, n-nH, replace = TRUE)
    Bx <- xV[index,]
    By <- yV[index]
    #yfhat does not depend on j and could be computed outside the loop
    yfhat <- sub$coef[1] + xf[vin] %*% sub$coef[-1]
    valres <- By - sub$coef[1] - as.matrix(Bx[,vin]) %*% sub$coef[-1]
    #need as.matrix to prevent an error if vin has 1 variable
    #get asymptotically optimal PI using validation residuals
    vpi <- shorth3(valres, alpha = alpha)$shorth
    upper[j] <- yfhat + vpi[2]
    lower[j] <- yfhat + vpi[1]
  } # end of the bootstrap loop
  up <- mean(upper)
  low <- mean(lower)
  pilen[i] <- up - low
  if(low < yf && up > yf) opicov <- opicov + 1
}
psmn <- mean(ps) - k #0 if subset is selecting the optimal subset
pimnlen <- mean(pilen)
opicov <- opicov/nruns
list(psmn = psmn, opicov = opicov, pimenlen = pimnlen)}
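# A usage sketch (added for illustration): simulate PI1 with one nonnoise
# variable and t3 errors; opicov should be near the nominal 1 - alpha = 0.95.
# out <- pi1sim(n = 100, p = 4, k = 1, nruns = 100, type = 2)
# out$opicov   # observed coverage of the PI
# out$pimenlen # mean PI length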
#######################################################################################
# This function simulates the Second prediction interval with Forward selection      #
#######################################################################################

pi2sim <- function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1, shift = 9,
                   psi = 0.0, type = 1, alpha = 0.05, nb = 100){
#Needs library(leaps).
#EBIC or min Cp: Pelawa Watagoda and Rupasinghe Arachchige Don (2019)
#bootstrap validation residuals split conformal PI.
#Simulates PIs for forward selection variable selection.
#Needs p > 2. Use 1 <= k <= p-1, where k is the number of nonnoise variables.
#Uses five iid error distributions:
# type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors,
# 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors.
#The constant = 1, so there are p = q+1 coefficients:
#beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones and p-k-1 zeroes.
#Multiply x by A: for MVN data this results in a covariance matrix with
#eigenvector c(1, ..., 1)^T corresponding to the largest eigenvalue. As psi
#gets close to 1, the data clusters about the line in the direction of
#(1, ..., 1)^T. See Maronna and Zamar (2002).
#cor(X_i,X_j) = [2 psi + (q-2) psi^2]/[1 + (q-1) psi^2] for i != j,
#when the correlation exists.
set.seed(974)
pilen <- 1:nruns
ps <- pilen
opicov <- 0
splitpilen <- pilen
splitpicov <- 0
q <- p-1
vmax <- min(p, as.integer(n/5))
rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2)
A <- matrix(psi, nrow = q, ncol = q)
diag(A) <- 1
b <- 0 * 1:q
b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1]
vars <- as.vector(1:(p-1))
indx <- 1:n
nH <- ceiling(n/2)
for(i in 1:nruns) {
  x <- matrix(rnorm(n * q), nrow = n, ncol = q)
  x <- x %*% A
  xf <- rnorm(q) %*% A
  if(type == 1) {
    y <- 1 + x %*% b + rnorm(n)
    yf <- 1 + xf %*% b + rnorm(1)
  }
  if(type == 2) {
    y <- 1 + x %*% b + rt(n, df = 3)
    yf <- 1 + xf %*% b + rt(1, df = 3)
  }
  if(type == 3) {
    y <- 1 + x %*% b + rexp(n) - 1
    yf <- 1 + xf %*% b + rexp(1) - 1
  }
  if(type == 4) {
    y <- 1 + x %*% b + runif(n, min = -1, max = 1)
    yf <- 1 + xf %*% b + runif(1, min = -1, max = 1)
  }
  if(type == 5) {
    err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift)
    y <- 1 + x %*% b + err
    ef <- rnorm(1, sd = 1 + rbinom(1, 1, eps) * shift)
    yf <- 1 + xf %*% b + ef
  } #make an MLR data set
  #find sets H and V
  perm <- sample(indx, n)
  H <- perm[1:nH]
  xH <- x[H,]
  yH <- y[H]
  xV <- x[-H,]
  yV <- y[-H]
  #find the forward sel minimum Cp or EBIC model on the training half set H
  tem <- regsubsets(xH, yH, nvmax = vmax, method = "forward")
  out <- summary(tem)
  if(nH < 10*p) { #use the min EBIC model
    xx <- (1:min(length(out$bic), p-1)) + 1
    ebic <- out$bic + 2*(-lgamma(xx+1) - lgamma(p-xx+1))
    minebic <- out$which[ebic == min(ebic),]
    #do not need the constant in vin
    vin <- vars[minebic[-1]]
  } else { #if nH >= 10p, use the min Cp model
    mincp <- out$which[out$cp == min(out$cp),]
    #do not need the constant in vin
    vin <- vars[mincp[-1]]
  }
  sub <- lsfit(xH[,vin], yH)
  ps[i] <- length(sub$coef)
  #***********************************************************************
  # Getting the bootstrap samples from the validation set (-H)
  upper <- numeric(nb)
  lower <- numeric(nb)
  #compute yfhat once; it does not depend on the bootstrap sample
  yfhat <- sub$coef[1] + xf[vin] %*% sub$coef[-1]
  for (j in 1:nb) {
    index <- sample.int(n-nH, n-nH, replace = TRUE)
    Bx <- xV[index,]
    By <- yV[index]
    valres <- By - sub$coef[1] - as.matrix(Bx[,vin]) %*% sub$coef[-1]
    #need as.matrix to prevent an error if vin has 1 variable
    absres <- abs(valres)
    aq <- quantile(absres, 1-alpha)
    upper[j] <- yfhat + aq
    lower[j] <- yfhat - aq
  } # end of the bootstrap loop
  up <- mean(upper)
  low <- mean(lower)
  splitpilen[i] <- up - low
  if(low < yf && up > yf) splitpicov <- splitpicov + 1
}
psmn <- mean(ps) - k #0 if subset is selecting the optimal subset
splitpimnlen <- mean(splitpilen)
splitpicov <- splitpicov/nruns
list(psmn = psmn, splitpicov = splitpicov, splitpimenlen = splitpimnlen)}
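# A usage sketch (added for illustration): the split conformal PI2 under the
# same settings, for comparison with pi1sim() above.
# out2 <- pi2sim(n = 100, p = 4, k = 1, nruns = 100, type = 2)
# out2$splitpicov    # observed coverage
# out2$splitpimenlen # mean PI length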
##############################################################################
# This function simulates the new prediction interval 1 with lasso and ridge #
##############################################################################

pi1withlassoridge <- function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1,
                              shift = 9, psi = 0.0, type = 1, alpha = 0.05, nb = 100){
#Needs library(glmnet). Does lasso and RR.
#Use 1 <= k <= p-1, where k is the number of nonnoise variables.
#Uses the Pelawa Watagoda and Rupasinghe Arachchige Don (2019) bootstrap
#validation residual shorth PI for lasso and RR with 10-fold CV.
#Uses five iid error distributions:
# type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors,
# 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors.
#The constant = 1, so there are p = q+1 coefficients:
#need p > 2, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones, p-k-1 zeroes.
#Multiply x by A: for MVN data this results in a covariance matrix with
#eigenvector c(1, ..., 1)^T corresponding to the largest eigenvalue. As psi
#gets close to 1, the data clusters about the line in the direction of
#(1, ..., 1)^T. See Maronna and Zamar (2002).
#cor(X_i,X_j) = [2 psi + (q-2) psi^2]/[1 + (q-1) psi^2] for i != j,
#when the correlation exists.
set.seed(974) ##need p > 2
claspilen <- 1:nruns
claspicov <- 0
vlaspilen <- 1:nruns
vlaspicov <- 0
vrrpilen <- 1:nruns
vrrpicov <- 0
crrpilen <- 1:nruns
crrpicov <- 0
q <- p-1
rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2)
A <- matrix(psi, nrow = q, ncol = q)
diag(A) <- 1
b <- 0 * 1:q
b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1]
vars <- as.vector(1:(p-1))
dd <- 1:nruns
vdd <- dd
indx <- 1:n
nH <- ceiling(n/2)
for(i in 1:nruns) {
  x <- matrix(rnorm(n * q), nrow = n, ncol = q)
  x <- x %*% A
  xf <- rnorm(q) %*% A
  if(type == 1) {
    y <- 1 + x %*% b + rnorm(n)
    yf <- 1 + xf %*% b + rnorm(1)
  }
  if(type == 2) {
    y <- 1 + x %*% b + rt(n, df = 3)
    yf <- 1 + xf %*% b + rt(1, df = 3)
  }
  if(type == 3) {
    y <- 1 + x %*% b + rexp(n) - 1
    yf <- 1 + xf %*% b + rexp(1) - 1
  }
  if(type == 4) {
    y <- 1 + x %*% b + runif(n, min = -1, max = 1)
    yf <- 1 + xf %*% b + runif(1, min = -1, max = 1)
  }
  if(type == 5) {
    err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift)
    y <- 1 + x %*% b + err
    ef <- rnorm(1, sd = 1 + rbinom(1, 1, eps) * shift)
    yf <- 1 + xf %*% b + ef
  } #make an MLR data set
  # get the validation PI for lasso
  #find sets H and V
  perm <- sample(indx, n)
  H <- perm[1:nH]
  xH <- x[H,]
  yH <- y[H]
  xV <- x[-H,]
  yV <- y[-H]
  #find the 10 fold CV lasso model
  out <- cv.glmnet(xH, yH)
  lam <- out$lambda.min
  vp <- out$nzero[out$lambda == lam] + 1 #d for half set lasso
  vdd[i] <- vp
  #***********************************************************************
  upper <- numeric(nb)
  lower <- numeric(nb)
  for (j in 1:nb) {
    index <- sample.int(n-nH, n-nH, replace = TRUE)
    Bx <- xV[index,]
    By <- yV[index]
    #yfhat does not depend on j and could be computed outside the loop
    yfhat <- predict(out, s = lam, newx = xf)
    valfit <- predict(out, s = lam, newx = Bx)
    valres <- By - valfit
    #get asymptotically optimal PI using validation residuals
    vpi <- shorth3(valres, alpha = alpha)$shorth
    upper[j] <- yfhat + vpi[2]
    lower[j] <- yfhat + vpi[1]
  } # end of the bootstrap loop for lasso
  up <- mean(upper)
  low <- mean(lower)
  vlaspilen[i] <- up - low
  if(low < yf && up > yf) vlaspicov <- vlaspicov + 1
  #***********************************************************************
  #get the validation PI for ridge regression
  out <- cv.glmnet(xH, yH, alpha = 0)
  lam <- out$lambda.min
  yfhat <- predict(out, s = lam, newx = xf)
  upper <- numeric(nb)
  lower <- numeric(nb)
  for (j in 1:nb) {
    index <- sample.int(n-nH, n-nH, replace = TRUE)
    Bx <- xV[index,]
    By <- yV[index]
    valfit <- predict(out, s = lam, newx = Bx)
    valres <- By - valfit
    #get asymptotically optimal PI using validation residuals
    vpi <- shorth3(valres, alpha = alpha)$shorth
    upper[j] <- yfhat + vpi[2]
    lower[j] <- yfhat + vpi[1]
  } # end of the bootstrap loop for ridge
  up <- mean(upper)
  low <- mean(lower)
  vrrpilen[i] <- up - low
  if(low < yf && up > yf) vrrpicov <- vrrpicov + 1
}
vlaspimnlen <- mean(vlaspilen)
vlaspicov <- vlaspicov/nruns
vrrpimnlen <- mean(vrrpilen)
vrrpicov <- vrrpicov/nruns
vdd <- mean(vdd) #lasso and scaled lasso have the same d
list(dvlas = vdd, vlaspicov = vlaspicov, vlaspimenlen = vlaspimnlen,
     vrrpicov = vrrpicov, vrrpimenlen = vrrpimnlen)}
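# A usage sketch (added for illustration; needs library(glmnet)): validation
# residual shorth PIs for lasso and ridge with correlated predictors.
# library(glmnet)
# out <- pi1withlassoridge(n = 100, p = 10, k = 3, nruns = 100, psi = 0.5)
# out$vlaspicov # lasso PI coverage
# out$vrrpicov  # ridge PI coverage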
##############################################################################
# This function simulates the new prediction interval 2 with lasso and ridge #
##############################################################################

pi2withlassoridge <- function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1,
                              shift = 9, psi = 0.0, type = 1, alpha = 0.05, nb = 100){
#Needs library(glmnet). Does lasso and RR.
#Use 1 <= k <= p-1, where k is the number of nonnoise variables.
#Uses the Pelawa Watagoda and Olive (2019) split conformal PI with
#validation residuals for lasso and RR with 10-fold CV.
#Uses five iid error distributions:
# type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors,
# 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors.
#The constant = 1, so there are p = q+1 coefficients:
#need p > 2, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones, p-k-1 zeroes.
#Multiply x by A: for MVN data this results in a covariance matrix with
#eigenvector c(1, ..., 1)^T corresponding to the largest eigenvalue. As psi
#gets close to 1, the data clusters about the line in the direction of
#(1, ..., 1)^T. See Maronna and Zamar (2002).
#cor(X_i,X_j) = [2 psi + (q-2) psi^2]/[1 + (q-1) psi^2] for i != j,
#when the correlation exists.
set.seed(974) ##need p > 2
claspilen <- 1:nruns
claspicov <- 0
vlaspilen <- 1:nruns
vlaspicov <- 0
vrrpilen <- 1:nruns
vrrpicov <- 0
crrpilen <- 1:nruns
crrpicov <- 0
q <- p-1
rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2)
A <- matrix(psi, nrow = q, ncol = q)
diag(A) <- 1
b <- 0 * 1:q
b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1]
vars <- as.vector(1:(p-1))
dd <- 1:nruns
vdd <- dd
indx <- 1:n
nH <- ceiling(n/2)
for(i in 1:nruns) {
  x <- matrix(rnorm(n * q), nrow = n, ncol = q)
  x <- x %*% A
  xf <- rnorm(q) %*% A
  if(type == 1) {
    y <- 1 + x %*% b + rnorm(n)
    yf <- 1 + xf %*% b + rnorm(1)
  }
  if(type == 2) {
    y <- 1 + x %*% b + rt(n, df = 3)
    yf <- 1 + xf %*% b + rt(1, df = 3)
  }
  if(type == 3) {
    y <- 1 + x %*% b + rexp(n) - 1
    yf <- 1 + xf %*% b + rexp(1) - 1
  }
  if(type == 4) {
    y <- 1 + x %*% b + runif(n, min = -1, max = 1)
    yf <- 1 + xf %*% b + runif(1, min = -1, max = 1)
  }
  if(type == 5) {
    err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift)
    y <- 1 + x %*% b + err
    ef <- rnorm(1, sd = 1 + rbinom(1, 1, eps) * shift)
    yf <- 1 + xf %*% b + ef
  } #make an MLR data set
  # get the validation PI for lasso
  #find sets H and V
  perm <- sample(indx, n)
  H <- perm[1:nH]
  xH <- x[H,]
  yH <- y[H]
  xV <- x[-H,]
  yV <- y[-H]
  #find the 10 fold CV lasso model
  out <- cv.glmnet(xH, yH)
  lam <- out$lambda.min
  vp <- out$nzero[out$lambda == lam] + 1 #d for half set lasso
  vdd[i] <- vp
  #***********************************************************************
  upper <- numeric(nb)
  lower <- numeric(nb)
  for (j in 1:nb) {
    index <- sample.int(n-nH, n-nH, replace = TRUE)
    Bx <- xV[index,]
    By <- yV[index]
    #yfhat does not depend on j and could be computed outside the loop
    yfhat <- predict(out, s = lam, newx = xf)
    valfit <- predict(out, s = lam, newx = Bx)
    valres <- By - valfit
    absres <- abs(valres)
    aq <- quantile(absres, 1-alpha)
    upper[j] <- yfhat + aq
    lower[j] <- yfhat - aq
  } # end of the bootstrap loop for lasso
  up <- mean(upper)
  low <- mean(lower)
  claspilen[i] <- up - low
  if(low < yf && up > yf) claspicov <- claspicov + 1
  #***********************************************************************
  #get the validation PI for ridge regression
  out <- cv.glmnet(xH, yH, alpha = 0)
  lam <- out$lambda.min
  yfhat <- predict(out, s = lam, newx = xf)
  upper <- numeric(nb)
  lower <- numeric(nb)
  for (j in 1:nb) {
    index <- sample.int(n-nH, n-nH, replace = TRUE)
    Bx <- xV[index,]
    By <- yV[index]
    valfit <- predict(out, s = lam, newx = Bx)
    valres <- By - valfit
    absres <- abs(valres)
    aq <- quantile(absres, 1-alpha)
    upper[j] <- yfhat + aq
    lower[j] <- yfhat - aq
  } # end of the bootstrap loop for ridge
  up <- mean(upper)
  low <- mean(lower)
  crrpilen[i] <- up - low
  if(low < yf && up > yf) crrpicov <- crrpicov + 1
}
claspimnlen <- mean(claspilen)
claspicov <- claspicov/nruns
crrpimnlen <- mean(crrpilen)
crrpicov <- crrpicov/nruns
vdd <- mean(vdd) #lasso and scaled lasso have the same d
list(dvlas = vdd, claspicov = claspicov, claspimenlen = claspimnlen,
     crrpicov = crrpicov, crrpimenlen = crrpimnlen)}
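# A usage sketch (added for illustration; needs library(glmnet)): split
# conformal PIs for lasso and ridge under the same settings as above.
# out <- pi2withlassoridge(n = 100, p = 10, k = 3, nruns = 100, psi = 0.5)
# out$claspicov # lasso split conformal PI coverage
# out$crrpicov  # ridge split conformal PI coverage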
############################################################################
# pi1fs will calculate the PI1 given dataframes x, y and a future x vector #
# Forward selection will be used to build the model                        #
############################################################################

pi1fs <- function(x, y, xf, alpha = 0.05){
nb <- 100
n <- nrow(x)
p <- ncol(x)
indx <- 1:n
vars <- as.vector(1:p)
nH <- ceiling(n/2)
vmax <- min(p, as.integer(n/5))
#find sets H and V
perm <- sample(indx, n)
H <- perm[1:nH]
xH <- x[H,]
yH <- y[H]
xV <- x[-H,]
yV <- y[-H]
#find the forward sel minimum Cp or EBIC model on the training half set H
tem <- regsubsets(xH, yH, nvmax = vmax, method = "forward")
out <- summary(tem)
if(nH < 10*p) { #use the min EBIC model
  xx <- (1:min(length(out$bic), p-1)) + 1
  ebic <- out$bic + 2*(-lgamma(xx+1) - lgamma(p-xx+1))
  minebic <- out$which[ebic == min(ebic),]
  #do not need the constant in vin
  vin <- vars[minebic[-1]]
} else { #if nH >= 10p, use the min Cp model
  mincp <- out$which[out$cp == min(out$cp),]
  #do not need the constant in vin
  vin <- vars[mincp[-1]]
}
sub <- lsfit(xH[,vin], yH)
#***********************************************************************
# Getting the bootstrap samples from the validation set (-H)
upper <- numeric(nb)
lower <- numeric(nb)
for (j in 1:nb) {
  index <- sample.int(n-nH, n-nH, replace = TRUE)
  Bx <- xV[index,]
  By <- yV[index]
  #yfhat does not depend on j and could be computed outside the loop
  yfhat <- sub$coef[1] + as.matrix(xf[vin]) %*% as.matrix(sub$coef[-1])
  valres <- By - sub$coef[1] - as.matrix(Bx[,vin]) %*% sub$coef[-1]
  #need as.matrix to prevent an error if vin has 1 variable
  #get asymptotically optimal PI using validation residuals
  vpi <- shorth3(valres, alpha = alpha)$shorth
  upper[j] <- yfhat + vpi[2]
  lower[j] <- yfhat + vpi[1]
} # end of the bootstrap loop
up <- mean(upper)
low <- mean(lower)
FSPI <- c(low, up)
FSPI
}

# Example: Use the Auto dataframe from the ISLR library
# library(ISLR)
# data(Auto)
# y <- Auto$mpg[-15]      # Consider the 15th observation to be the future obs
# x <- Auto[-15,-c(1,9)]  # Training data (exclude the 15th observation in x)
# xf <- Auto[15,-c(1,9)]  # Future x value
# pi1fs(x, y, xf, alpha=0.05)
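# Note (added): pi1fs() splits the data into random half sets H and V with
# sample(), so repeated calls give slightly different intervals. For a
# reproducible interval, set the seed first:
# set.seed(1)
# pi1fs(x, y, xf, alpha = 0.05)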
####################################################################
# pigraph will create a response plot given the dataframes x and y #
# model will be created by forward selection                       #
####################################################################

pigraph <- function(x, y, alpha = 0.05){
n <- nrow(x)
p <- ncol(x)
l <- rep(NA, n)
u <- rep(NA, n)
yfhat <- rep(NA, n)
nb <- 100
vmax <- min(p, as.integer(n/5))
indx <- 1:n
vars <- as.vector(1:p)
nH <- ceiling(n/2)
#find sets H and V
perm <- sample(indx, n)
H <- perm[1:nH]
xH <- x[H,]
yH <- y[H]
xV <- x[-H,]
yV <- y[-H]
#find the forward sel minimum Cp or EBIC model on the training half set H
tem <- regsubsets(xH, yH, nvmax = vmax, method = "forward")
out <- summary(tem)
if(nH < 10*p) { #use the min EBIC model
  xx <- (1:min(length(out$bic), p-1)) + 1
  ebic <- out$bic + 2*(-lgamma(xx+1) - lgamma(p-xx+1))
  minebic <- out$which[ebic == min(ebic),]
  #do not need the constant in vin
  vin <- vars[minebic[-1]]
} else { #if nH >= 10p, use the min Cp model
  mincp <- out$which[out$cp == min(out$cp),]
  #do not need the constant in vin
  vin <- vars[mincp[-1]]
}
sub <- lsfit(xH[,vin], yH)
#***********************************************************************
# Getting the bootstrap samples from the validation set (-H)
upper <- numeric(nb)
lower <- numeric(nb)
for (i in 1:n) { #a PI for each observed case
  xf <- x[i,]
  yfhat[i] <- sub$coef[1] + as.matrix(xf[1,vin]) %*% as.matrix(sub$coef[-1])
  for (j in 1:nb) {
    index <- sample.int(n-nH, n-nH, replace = TRUE)
    Bx <- xV[index,]
    By <- yV[index]
    valres <- By - sub$coef[1] - as.matrix(Bx[,vin]) %*% sub$coef[-1]
    #need as.matrix to prevent an error if vin has 1 variable
    #get asymptotically optimal PI using validation residuals
    vpi <- shorth3(valres, alpha = alpha)$shorth
    upper[j] <- yfhat[i] + vpi[2]
    lower[j] <- yfhat[i] + vpi[1]
  } # end of the bootstrap loop
  up <- mean(upper)
  low <- mean(lower)
  l[i] <- low
  u[i] <- up
}
library(latex2exp)
resplot <- ggplot(data.frame(l, u, yfhat, y), aes(x = yfhat, y = l)) +
  geom_abline(slope = 1, intercept = 0, color = "wheat") +
  geom_smooth(linetype = "dashed", color = "orchid4", method = "auto", size = 0.5) +
  geom_smooth(aes(x = yfhat, y = u), linetype = "dashed", color = "orchid4",
              method = "auto", size = 0.5) +
  geom_point(aes(x = yfhat, y = y), color = "red", alpha = 0.5) +
  labs(x = "yhat", y = "y", title = "Response Plot") +
  theme_bw()
plotly::ggplotly(resplot)
}

# Example: Use the Auto dataframe from the ISLR library
# library(ISLR)
# data(Auto)
# y <- Auto$mpg
# x <- Auto[,-c(1,9)]
# pigraph(x, y, alpha = 0.01)
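# Note (added): pigraph() returns an interactive plot via plotly::ggplotly(),
# so the plotly package must be installed along with ggplot2; latex2exp is
# loaded but not otherwise used by this function.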
#########################################################################
# get_pi_table will create a table of PI1s given the dataframes x and y #
# model will be created by forward selection                            #
#########################################################################

get_pi_table <- function(x, y, alpha = 0.05){
n <- nrow(x)
p <- ncol(x)
l <- rep(NA, n)
u <- rep(NA, n)
yfhat <- rep(NA, n)
nb <- 100
vmax <- min(p, as.integer(n/5))
indx <- 1:n
vars <- as.vector(1:p)
nH <- ceiling(n/2)
#find sets H and V
perm <- sample(indx, n)
H <- perm[1:nH]
xH <- x[H,]
yH <- y[H]
xV <- x[-H,]
yV <- y[-H]
#find the forward sel minimum Cp or EBIC model on the training half set H
tem <- regsubsets(xH, yH, nvmax = vmax, method = "forward")
out <- summary(tem)
if(nH < 10*p) { #use the min EBIC model
  xx <- (1:min(length(out$bic), p-1)) + 1
  ebic <- out$bic + 2*(-lgamma(xx+1) - lgamma(p-xx+1))
  minebic <- out$which[ebic == min(ebic),]
  #do not need the constant in vin
  vin <- vars[minebic[-1]]
} else { #if nH >= 10p, use the min Cp model
  mincp <- out$which[out$cp == min(out$cp),]
  #do not need the constant in vin
  vin <- vars[mincp[-1]]
}
sub <- lsfit(xH[,vin], yH)
#***********************************************************************
# Getting the bootstrap samples from the validation set (-H)
upper <- numeric(nb)
lower <- numeric(nb)
for (i in 1:n) { #a PI for each observed case
  xf <- x[i,]
  yfhat[i] <- sub$coef[1] + as.matrix(xf[1,vin]) %*% as.matrix(sub$coef[-1])
  for (j in 1:nb) {
    index <- sample.int(n-nH, n-nH, replace = TRUE)
    Bx <- xV[index,]
    By <- yV[index]
    valres <- By - sub$coef[1] - as.matrix(Bx[,vin]) %*% sub$coef[-1]
    #need as.matrix to prevent an error if vin has 1 variable
    #get asymptotically optimal PI using validation residuals
    vpi <- shorth3(valres, alpha = alpha)$shorth
    upper[j] <- yfhat[i] + vpi[2]
    lower[j] <- yfhat[i] + vpi[1]
  } # end of the bootstrap loop
  up <- mean(upper)
  low <- mean(lower)
  l[i] <- low
  u[i] <- up
}
knitr::kable(data.frame(number = seq(1, n, 1), y = y, yhat = yfhat,
                        lower = l, upper = u))
}

# Example: Use the Auto dataframe from the ISLR library
# library(ISLR)
# data(Auto)
# y <- Auto$mpg
# x <- Auto[,-c(1,9)]
# get_pi_table(x, y, alpha = 0.05)

#############################################################################
# pi2graph will create a response plot for PI2 given the dataframes x and y #
# model will be created by forward selection                                #
#############################################################################

pi2graph <- function(x, y, alpha = 0.05){
n <- nrow(x)
p <- ncol(x)
l <- rep(NA, n)
u <- rep(NA, n)
yfhat <- rep(NA, n)
nb <- 100
vmax <- min(p, as.integer(n/5))
indx <- 1:n
vars <- as.vector(1:p)
nH <- ceiling(n/2)
#find sets H and V
perm <- sample(indx, n)
H <- perm[1:nH]
xH <- x[H,]
yH <- y[H]
xV <- x[-H,]
yV <- y[-H]
#find the forward sel minimum Cp or EBIC model on the training half set H
tem <- regsubsets(xH, yH, nvmax = vmax, method = "forward")
out <- summary(tem)
if(nH < 10*p) { #use the min EBIC model
  xx <- (1:min(length(out$bic), p-1)) + 1
  ebic <- out$bic + 2*(-lgamma(xx+1) - lgamma(p-xx+1))
  minebic <- out$which[ebic == min(ebic),]
  #do not need the constant in vin
  vin <- vars[minebic[-1]]
} else { #if nH >= 10p, use the min Cp model
  mincp <- out$which[out$cp == min(out$cp),]
  #do not need the constant in vin
  vin <- vars[mincp[-1]]
}
sub <- lsfit(xH[,vin], yH)
#***********************************************************************
# Getting the bootstrap samples from the validation set (-H)
upper <- numeric(nb)
lower <- numeric(nb)
for (i in 1:n) { #a PI for each observed case
  xf <- x[i,]
  yfhat[i] <- sub$coef[1] + as.matrix(xf[1,vin]) %*% as.matrix(sub$coef[-1])
  for (j in 1:nb) {
    index <- sample.int(n-nH, n-nH, replace = TRUE)
    Bx <- xV[index,]
    By <- yV[index]
    valres <- By - sub$coef[1] - as.matrix(Bx[,vin]) %*% sub$coef[-1]
    #need as.matrix to prevent an error if vin has 1 variable
    absres <- abs(valres)
    aq <- quantile(absres, 1-alpha)
    upper[j] <- yfhat[i] + aq
    lower[j] <- yfhat[i] - aq
  } # end of the bootstrap loop
  up <- mean(upper)
  low <- mean(lower)
  l[i] <- low
  u[i] <- up
}
library(latex2exp)
resplot <- ggplot(data.frame(l, u, yfhat, y), aes(x = yfhat, y = l)) +
  geom_abline(slope = 1, intercept = 0, color = "wheat") +
  geom_smooth(linetype = "dashed", color = "blue", method = "auto", size = 0.5) +
  geom_smooth(aes(x = yfhat, y = u), linetype = "dashed", color = "blue",
              method = "auto", size = 0.5) +
  geom_point(aes(x = yfhat, y = y), color = "green4", alpha = 0.5) +
  labs(x = "yhat", y = "y", title = "Response Plot For PI2") +
  theme_bw()
plotly::ggplotly(resplot)
}

# Example: Use the Auto dataframe from the ISLR library
# library(ISLR)
# data(Auto)
# y <- Auto$mpg
# x <- Auto[,-c(1,9)]
# pi2graph(x, y, alpha = 0.05)

##########################################################################
# get_pi2_table will create a table of PI2s given the dataframes x and y #
# model will be created by forward selection                             #
##########################################################################

get_pi2_table <- function(x, y, alpha = 0.05){
n <- nrow(x)
p <- ncol(x)
l <- rep(NA, n)
u <- rep(NA, n)
yfhat <- rep(NA, n)
nb <- 100
vmax <- min(p, as.integer(n/5))
indx <- 1:n
vars <- as.vector(1:p)
nH <- ceiling(n/2)
#find sets H and V
perm <- sample(indx, n)
H <- perm[1:nH]
xH <- x[H,]
yH <- y[H]
xV <- x[-H,]
yV <- y[-H]
#find the forward sel minimum Cp or EBIC model on the training half set H
tem <- regsubsets(xH, yH, nvmax = vmax, method = "forward")
out <- summary(tem)
if(nH < 10*p) { #use the min EBIC model
  xx <- (1:min(length(out$bic), p-1)) + 1
  ebic <- out$bic + 2*(-lgamma(xx+1) - lgamma(p-xx+1))
  minebic <- out$which[ebic == min(ebic),]
  #do not need the constant in vin
  vin <- vars[minebic[-1]]
} else { #if nH >= 10p, use the min Cp model
  mincp <- out$which[out$cp == min(out$cp),]
  #do not need the constant in vin
  vin <- vars[mincp[-1]]
}
sub <- lsfit(xH[,vin], yH)
#***********************************************************************
# Getting the bootstrap samples from the validation set (-H)
upper <- numeric(nb)
lower <- numeric(nb)
for (i in 1:n) { #a PI for each observed case
  xf <- x[i,]
  yfhat[i] <- sub$coef[1] + as.matrix(xf[1,vin]) %*% as.matrix(sub$coef[-1])
  for (j in 1:nb) {
    index <- sample.int(n-nH, n-nH, replace = TRUE)
    Bx <- xV[index,]
    By <- yV[index]
    valres <- By - sub$coef[1] - as.matrix(Bx[,vin]) %*% sub$coef[-1]
    #need as.matrix to prevent an error if vin has 1 variable
    absres <- abs(valres)
    aq <- quantile(absres, 1-alpha)
    upper[j] <- yfhat[i] + aq
    lower[j] <- yfhat[i] - aq
  } # end of the bootstrap loop
  up <- mean(upper)
  low <- mean(lower)
  l[i] <- low
  u[i] <- up
}
knitr::kable(data.frame(number = seq(1, n, 1), y = y, yhat = yfhat,
                        lower = l, upper = u))
}

# Example: Use the Auto dataframe from the ISLR library
# library(ISLR)
# data(Auto)
# y <- Auto$mpg
# x <- Auto[,-c(1,9)]
# get_pi2_table(x, y, alpha = 0.05)

############################################################################
# pi2fs will calculate the PI2 given dataframes x, y and a future x vector #
# Forward selection will be used to build the model                        #
############################################################################

pi2fs <- function(x, y, xf, alpha = 0.05){
nb <- 100
n <- nrow(x)
p <- ncol(x)
indx <- 1:n
vars <- as.vector(1:p)
nH <- ceiling(n/2)
vmax <- min(p, as.integer(n/5))
#find sets H and V
perm <- sample(indx, n)
H <- perm[1:nH]
xH <- x[H,]
yH <- y[H]
xV <- x[-H,]
yV <- y[-H]
#find the forward sel minimum Cp or EBIC model on the training half set H
tem <- regsubsets(xH, yH, nvmax = vmax, method = "forward")
out <- summary(tem)
if(nH < 10*p) { #use the min EBIC model
  xx <- (1:min(length(out$bic), p-1)) + 1
  ebic <- out$bic + 2*(-lgamma(xx+1) - lgamma(p-xx+1))
  minebic <- out$which[ebic == min(ebic),]
  #do not need the constant in vin
  vin <- vars[minebic[-1]]
} else { #if nH >= 10p, use the min Cp model
  mincp <- out$which[out$cp == min(out$cp),]
  #do not need the constant in vin
  vin <- vars[mincp[-1]]
}
sub <- lsfit(xH[,vin], yH)
#***********************************************************************
# Getting the bootstrap samples from the validation set (-H)
upper <- numeric(nb)
lower <- numeric(nb)
#compute yfhat once; it does not depend on the bootstrap sample
yfhat <- sub$coef[1] + as.matrix(xf[vin]) %*% as.matrix(sub$coef[-1])
for (j in 1:nb) {
  index <- sample.int(n-nH, n-nH, replace = TRUE)
  Bx <- xV[index,]
  By <- yV[index]
  valres <- By - sub$coef[1] - as.matrix(Bx[,vin]) %*% sub$coef[-1]
  #need as.matrix to prevent an error if vin has 1 variable
  absres <- abs(valres)
  aq <- quantile(absres, 1-alpha)
  upper[j] <- yfhat + aq
  lower[j] <- yfhat - aq
} # end of the bootstrap loop
up <- mean(upper)
low <- mean(lower)
FSPI2 <- c(low, up)
FSPI2
}

# Example: Use the Auto dataframe from the ISLR library
# library(ISLR)
# data(Auto)
# y <- Auto$mpg[-15]      # Consider the 15th observation to be the future obs
# x <- Auto[-15,-c(1,9)]  # Training data (exclude the 15th observation in x)
# xf <- Auto[15,-c(1,9)]  # Future x value
# pi2fs(x, y, xf, alpha=0.05)
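# A comparison sketch (added; assumes the Auto setup from the example above):
# PI1 uses the shorth of the validation residuals while PI2 uses the 1 - alpha
# quantile of their absolute values, so the two intervals can be compared
# directly on the same future case.
# set.seed(1)
# pi1fs(x, y, xf, alpha = 0.05)  # shorth validation-residual PI
# set.seed(1)
# pi2fs(x, y, xf, alpha = 0.05)  # split conformal PI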