",2===_t.childNodes.length),S.parseHTML=function(e,t,n){return"string"!=typeof e?[]:("boolean"==typeof t&&(n=t,t=!1),t||(y.createHTMLDocument?((r=(t=E.implementation.createHTMLDocument("")).createElement("base")).href=E.location.href,t.head.appendChild(r)):t=E),o=!n&&[],(i=N.exec(e))?[t.createElement(i[1])]:(i=xe([e],t,o),o&&o.length&&S(o).remove(),S.merge([],i.childNodes)));var r,i,o},S.fn.load=function(e,t,n){var r,i,o,a=this,s=e.indexOf(" ");return-1").append(S.parseHTML(e)).find(r):e)}).always(n&&function(e,t){a.each(function(){n.apply(this,o||[e.responseText,t,e])})}),this},S.expr.pseudos.animated=function(t){return S.grep(S.timers,function(e){return t===e.elem}).length},S.offset={setOffset:function(e,t,n){var r,i,o,a,s,u,l=S.css(e,"position"),c=S(e),f={};"static"===l&&(e.style.position="relative"),s=c.offset(),o=S.css(e,"top"),u=S.css(e,"left"),("absolute"===l||"fixed"===l)&&-1<(o+u).indexOf("auto")?(a=(r=c.position()).top,i=r.left):(a=parseFloat(o)||0,i=parseFloat(u)||0),m(t)&&(t=t.call(e,n,S.extend({},s))),null!=t.top&&(f.top=t.top-s.top+a),null!=t.left&&(f.left=t.left-s.left+i),"using"in t?t.using.call(e,f):c.css(f)}},S.fn.extend({offset:function(t){if(arguments.length)return void 0===t?this:this.each(function(e){S.offset.setOffset(this,t,e)});var e,n,r=this[0];return r?r.getClientRects().length?(e=r.getBoundingClientRect(),n=r.ownerDocument.defaultView,{top:e.top+n.pageYOffset,left:e.left+n.pageXOffset}):{top:0,left:0}:void 0},position:function(){if(this[0]){var e,t,n,r=this[0],i={top:0,left:0};if("fixed"===S.css(r,"position"))t=r.getBoundingClientRect();else{t=this.offset(),n=r.ownerDocument,e=r.offsetParent||n.documentElement;while(e&&(e===n.body||e===n.documentElement)&&"static"===S.css(e,"position"))e=e.parentNode;e&&e!==r&&1===e.nodeType&&((i=S(e).offset()).top+=S.css(e,"borderTopWidth",!0),i.left+=S.css(e,"borderLeftWidth",!0))}return{top:t.top-i.top-S.css(r,"marginTop",!0),left:t.left-i.left-S.css(r,"marginLeft",!0)}}},offsetParent:function(){return this.map(function(){var e=this.offsetParent;while(e&&"static"===S.css(e,"position"))e=e.offsetParent;return e||re})}}),S.each({scrollLeft:"pageXOffset",scrollTop:"pageYOffset"},function(t,i){var o="pageYOffset"===i;S.fn[t]=function(e){return $(this,function(e,t,n){var r;if(x(e)?r=e:9===e.nodeType&&(r=e.defaultView),void 0===n)return r?r[i]:e[t];r?r.scrollTo(o?r.pageXOffset:n,o?n:r.pageYOffset):e[t]=n},t,e,arguments.length)}}),S.each(["top","left"],function(e,n){S.cssHooks[n]=Fe(y.pixelPosition,function(e,t){if(t)return t=We(e,n),Pe.test(t)?S(e).position()[n]+"px":t})}),S.each({Height:"height",Width:"width"},function(a,s){S.each({padding:"inner"+a,content:s,"":"outer"+a},function(r,o){S.fn[o]=function(e,t){var n=arguments.length&&(r||"boolean"!=typeof e),i=r||(!0===e||!0===t?"margin":"border");return $(this,function(e,t,n){var r;return x(e)?0===o.indexOf("outer")?e["inner"+a]:e.document.documentElement["client"+a]:9===e.nodeType?(r=e.documentElement,Math.max(e.body["scroll"+a],r["scroll"+a],e.body["offset"+a],r["offset"+a],r["client"+a])):void 0===n?S.css(e,t,i):S.style(e,t,n,i)},s,n?e:void 0,n)}})}),S.each(["ajaxStart","ajaxStop","ajaxComplete","ajaxError","ajaxSuccess","ajaxSend"],function(e,t){S.fn[t]=function(e){return this.on(t,e)}}),S.fn.extend({bind:function(e,t,n){return this.on(e,null,t,n)},unbind:function(e,t){return this.off(e,null,t)},delegate:function(e,t,n,r){return this.on(t,e,n,r)},undelegate:function(e,t,n){return 1===arguments.length?this.off(e,"**"):this.off(t,e||"**",n)},hover:function(e,t){return 
this.mouseenter(e).mouseleave(t||e)}}),S.each("blur focus focusin focusout resize scroll click dblclick mousedown mouseup mousemove mouseover mouseout mouseenter mouseleave change select submit keydown keypress keyup contextmenu".split(" "),function(e,n){S.fn[n]=function(e,t){return 0 '); $(tabs[0]).before(tabList); var tabContent = $('
'); $(tabs[0]).before(tabContent); // build the tabset var activeTab = 0; tabs.each(function(i) { // get the tab div var tab = $(tabs[i]); // get the id then sanitize it for use with bootstrap tabs var id = tab.attr('id'); // see if this is marked as the active tab if (tab.hasClass('active')) activeTab = i; // remove any table of contents entries associated with // this ID (since we'll be removing the heading element) $("div#" + tocID + " li a[href='#" + id + "']").parent().remove(); // sanitize the id for use with bootstrap tabs id = id.replace(/[.\/?&!#<>]/g, '').replace(/\s/g, '_'); tab.attr('id', id); // get the heading element within it, grab it's text, then remove it var heading = tab.find('h' + tabLevel + ':first'); var headingText = heading.html(); heading.remove(); // build and append the tab list item var a = $('' + headingText + ''); a.attr('href', '#' + id); a.attr('aria-controls', id); var li = $('
  • '); li.append(a); tabList.append(li); // set it's attributes tab.attr('role', 'tabpanel'); tab.addClass('tab-pane'); tab.addClass('tabbed-pane'); if (fade) tab.addClass('fade'); // move it into the tab content div tab.detach().appendTo(tabContent); }); // set active tab $(tabList.children('li')[activeTab]).addClass('active'); var active = $(tabContent.children('div.section')[activeTab]); active.addClass('active'); if (fade) active.addClass('in'); if (tabset.hasClass("tabset-sticky")) tabset.rmarkdownStickyTabs(); } // convert section divs with the .tabset class to tabsets var tabsets = $("div.section.tabset"); tabsets.each(function(i) { buildTabset($(tabsets[i])); }); };

    Cross-validation

    (a) Simulate data

    # simulate data
    set.seed(1)
    y <- rnorm(100)
    x <- rnorm(100)
    y <- x - 2 * x^2 + rnorm(100)
    
    # scatter plot
    plot(x, y)

    We simulate from the following model:

    \[ y_i = \beta_0 + \beta_1 x_i + \beta_2 x_i^2 + \epsilon_i, \quad \epsilon_i \sim N(0, 1), \qquad \beta_0 = 0,\ \beta_1 = 1,\ \beta_2 = -2. \]

    (c)-(d) Run LOOCV

    Here, I have written a function that performs k-fold CV for the linear regression case. It operates on a data frame and uses the stats::lm functionality:

    # function that computes the CV-error of LS-estimator:
    # - k: (integer) number of folds
    # - df: (data.frame) training data, includes both outcome and predictors
    # - yvar: (character) name of outcome variable in df
    # - formula: (formula) formula argument to stats::lm. Defaults to "yvar ~."
    cv_using_lm <- function(k, df, yvar, formula = as.formula(paste0(yvar, "~."))) {
      
      # number of observations
      N <- nrow(df)
     
      # randomly assign the observations to k folds
      fold <- sample(rep(seq_len(k), length.out = N))
      #table(fold) 
      
      # init vector for storing errors
      err <- numeric(k)
      for (kk in seq_len(k)) {
        
        # indicator for the test-observations in current fold
        inFold <- fold == kk          
          
        # fit model to current train-set 
        fit    <- lm(formula, data = df[!inFold, ])  
        
        # predict current test-set outcomes
        yhat  <- predict(fit, newdata = df[inFold, ]) 
        
        # compute loss  
        y     <- df[[yvar]][inFold]                   
        err[kk] <- sum((y-yhat)**2)/length(y)         
      }
      return(err)
    }
    
    # for each model, store the outcome and predictors in a data frame
    dfs <- list(data.frame(y = y, x))
    dfs[[2]] <- cbind(dfs[[1]], x**2)
    dfs[[3]] <- cbind(dfs[[2]], x**3)
    dfs[[4]] <- cbind(dfs[[3]], x**4)
    
    # estimate prediction error with N-fold CV
    n <- length(y)
    set.seed(007)
    sapply(dfs, function(df) mean(cv_using_lm(n, df, "y")))
    
    # repeat with a different seed to check that the result is unchanged
    set.seed(007+1)
    sapply(dfs, function(df) mean(cv_using_lm(n, df, "y")))
    ## [1] 5.890979 1.086596 1.102585 1.114772
    ## [1] 5.890979 1.086596 1.102585 1.114772

    There is only one way to split the data into N folds of size 1, so the two runs give exactly the same error estimates.
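
    A minimal sketch of why this holds, re-using the fold-assignment line from cv_using_lm: with k = N, every fold contains exactly one observation, so only the fold labels differ between seeds.

    # with k = N, each fold holds exactly one observation, regardless of seed
    set.seed(007);   f1 <- sample(rep(seq_len(100), length.out = 100))
    set.seed(007+1); f2 <- sample(rep(seq_len(100), length.out = 100))
    c(all(table(f1) == 1), all(table(f2) == 1))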

    (e) Which model has the lowest error?

    The model fitting a second-order polynomial has the lowest error. Since we have simulated the data, we know that this is the true model and therefore expect it to fit the test set well in each fold. The training error, on the other hand, always decreases when additional regressors are included.
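
    As a quick check (using the dfs list constructed above), the training error does indeed decrease monotonically as higher-order terms are added:

    # training MSE of each of the four models: decreases as regressors are added
    sapply(dfs, function(df) mean(residuals(lm(y ~ ., data = df))^2))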

    (f) Compare significance of coefficient estimates

    # compute p-vals for the coefficients and store them in a matrix
    p_vals <- matrix(nrow = 5, ncol = length(dfs))
    for (i in seq_along(dfs)) {
      fit <- lm(y~., data = dfs[[i]])
      coeffs <- coefficients(fit)
      sds    <- sqrt(diag(vcov(fit)))
      t_vals <- coeffs/sds
      p_vals[seq_len(i+1), i] <- 2*(1-pt(abs(t_vals), n-length(coeffs)))
    }
    
    rownames(p_vals) <- names(coeffs)
    colnames(p_vals) <- paste0("Model", seq_along(dfs))
    round(p_vals, 3)
    ##             Model1 Model2 Model3 Model4
    ## (Intercept)  0.000  0.476  0.465  0.386
    ## x            0.329  0.000  0.000  0.000
    ## `x^2`           NA  0.000  0.000  0.000
    ## `x^3`           NA     NA  0.769  0.948
    ## `x^4`           NA     NA     NA  0.637

    We see that the coefficients on the first- and second-order terms are significant in all models except the first, where the coefficient on x is not significant. The intercept is significant only in this first model. The higher-order terms are not significant. From the scatterplot above we see a clear non-linear relationship between y and x; no straight line gives a good fit to these data. Therefore, in model 1 we wrongly conclude that the intercept is different from 0 and that the slope is not.
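
    To visualize this, we can overlay the least-squares line on the scatterplot (a quick sketch using the simulated x and y from above):

    # overlay the fitted straight line on the scatterplot
    plot(x, y)
    abline(lm(y ~ x), col = "red")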

    Ex 7.9 (Hastie 2009, p. 259)

    The exercise reads: “Ex. 7.9 For the prostate data of Chapter 3, carry out a best-subset linear regression analysis, as in Table 3.3 (third column from left). Compute the AIC, BIC, five- and tenfold cross-validation, and bootstrap .632 estimates of prediction error. Discuss the results.”

    Data can be downloaded from the book website https://hastie.su.domains/ElemStatLearn/:
    - columns 1-8 are the predictors
    - column 9 is the outcome (lpsa)
    - column 10 divides the data into a test and a train set

    Import data

    # I let the 'here' package help me with relative paths;
    # this command defines the project root directory
    here::i_am("exercises/exercises_4_sol.Rmd") 
    
    # read data from local file
    data <- read.csv(here::here("./data/prostate.txt"), 
                     sep = "\t", 
                     header = TRUE, 
                     stringsAsFactors = TRUE)
    data[1] <- NULL  # remove first column (row-numbers)
    head(data)
    ##       lcavol  lweight age      lbph svi       lcp gleason pgg45       lpsa
    ## 1 -0.5798185 2.769459  50 -1.386294   0 -1.386294       6     0 -0.4307829
    ## 2 -0.9942523 3.319626  58 -1.386294   0 -1.386294       6     0 -0.1625189
    ## 3 -0.5108256 2.691243  74 -1.386294   0 -1.386294       7    20 -0.1625189
    ## 4 -1.2039728 3.282789  58 -1.386294   0 -1.386294       6     0 -0.1625189
    ## 5  0.7514161 3.432373  62 -1.386294   0 -1.386294       6     0  0.3715636
    ## 6 -1.0498221 3.228826  50 -1.386294   0 -1.386294       6     0  0.7654678
    ##   train
    ## 1  TRUE
    ## 2  TRUE
    ## 3  TRUE
    ## 4  TRUE
    ## 5  TRUE
    ## 6  TRUE

    As described in ch. 3.2.1, the predictors are scaled to unit variance and the data is divided into a test and a training set.

    # construct data.frame including scaled predictors and the outcome 
    df <- data.frame(lapply(data[, 1:8], function(x) (x-mean(x))/sd(x)),
                     lpsa = data$lpsa)
    sapply(df, var) # variance of each scaled variable
    
    # split the data into training and test sets using the train indicator
    dfs <- list(train = df[data$train == TRUE, ],
                test  = df[data$train == FALSE, ])
    ##   lcavol  lweight      age     lbph      svi      lcp  gleason    pgg45 
    ## 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 
    ##     lpsa 
    ## 1.332476

    Now, we can reproduce the LS estimates in Table 3.3:

    # fit linear regression model 
    fit <- lm(lpsa ~ ., data = dfs$train)
    summary(fit)
    ## 
    ## Call:
    ## lm(formula = lpsa ~ ., data = dfs$train)
    ## 
    ## Residuals:
    ##      Min       1Q   Median       3Q      Max 
    ## -1.64870 -0.34147 -0.05424  0.44941  1.48675 
    ## 
    ## Coefficients:
    ##             Estimate Std. Error t value Pr(>|t|)    
    ## (Intercept)  2.46493    0.08931  27.598  < 2e-16 ***
    ## lcavol       0.67953    0.12663   5.366 1.47e-06 ***
    ## lweight      0.26305    0.09563   2.751  0.00792 ** 
    ## age         -0.14146    0.10134  -1.396  0.16806    
    ## lbph         0.21015    0.10222   2.056  0.04431 *  
    ## svi          0.30520    0.12360   2.469  0.01651 *  
    ## lcp         -0.28849    0.15453  -1.867  0.06697 .  
    ## gleason     -0.02131    0.14525  -0.147  0.88389    
    ## pgg45        0.26696    0.15361   1.738  0.08755 .  
    ## ---
    ## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    ## 
    ## Residual standard error: 0.7123 on 58 degrees of freedom
    ## Multiple R-squared:  0.6944, Adjusted R-squared:  0.6522 
    ## F-statistic: 16.47 on 8 and 58 DF,  p-value: 2.042e-12

    Best subset selection

    We want to find the best subset of predictors to include in a linear regression model, using different criteria (AIC, BIC, CV, .632-bootstrap).
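
    For reference, the forms of AIC and BIC used in the code below (following ESL ch. 7, with N training observations, d estimated parameters and maximized log-likelihood loglik) are

    \[ \mathrm{AIC} = \frac{2d}{N} - \frac{2}{N}\,\mathrm{loglik}, \qquad \mathrm{BIC} = d \log N - 2\,\mathrm{loglik}. \]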

    We can re-use the CV function we defined in problem 1 to apply 5- and 10-fold CV. To apply the .632 bootstrap estimator, we need a similar function, for example:

    # function that computes the .632 bootstrap error of the LS-estimator:
    # - B: (integer) number of bootstrap samples
    # - df: (data.frame) training data, includes both outcome and predictors
    # - yvar: (character) name of outcome variable in df
    # - formula: (formula) formula argument to stats::lm. Defaults to "yvar ~."
    
    bootstrap_632_err <- function(B, df, yvar, formula = as.formula(paste0(yvar, "~."))) {
      N <- nrow(df)
      
      # compute bootstrap error 
      bootstrap.err.b <- matrix(NA, nrow = B, ncol = 1) 
      for (b in 1:B) {
        set.seed(b) # set seed, useful for data replication
        index <- sample(N, replace = TRUE)
        # the selected observations form the training set
        temp.train.data <- df[index, ]
        # the rest the test set
        temp.test.data <- df[-index, ]
        # fit model on temporary train set
        fit <- lm(formula, data = temp.train.data)
        
        # compute the error
        computeError <- function(model, newdata)
          mean((newdata[ , yvar] - predict(model, newdata = newdata))^2)
        
        # save the error in the matrix initialized above
        bootstrap.err.b[b] <- computeError(fit, temp.test.data)
        
        #if(b%%100 == 0) print(b)
      }
      
      # compute training error: 
      fit <- lm(formula, data = df)
      err.train <- computeError(fit, df)
      
      # return 0.632 bootstrap error
      return(0.368*err.train + 0.632*mean(bootstrap.err.b))
    }
    
    # test function on full training data set
    bootstrap_632_err(1000, dfs$train, "lpsa")
    ## [1] 0.5910481

    Now, we can loop through every possible subset of the predictors and estimate the prediction error with the different methods. Note: there are probably more clever ways to do this, but this is what first came to mind during the unexpected live-coding session in class.

    set.seed(007)
    p <- 8
    N <- nrow(dfs$train)
    err <- list()
    
    # for each subset of size m ...
    for (m in seq_len(p)) {
      
      # list all subset of size m of the p variables
      all_comb <- combn(p, m)
      
      # initialize a matrix for storing the errors 
      tmp <- matrix(nrow = 5, ncol = ncol(all_comb))
      rownames(tmp) <- c("aic", "bic", "cv5", "cv10", "boostrap632")
      err[[m]] <- tmp 
      
      # for each size m subset...
      for (i in seq_len(ncol(all_comb))){
        
        # construct a data.frame containing the outcome and the current predictor variables
        vars <- all_comb[, i]
        temp_df <- data.frame(lpsa = dfs$train$lpsa, dfs$train[,  vars])
        
        # fit the model 
        fit <- lm(lpsa~., data = temp_df)
        d <- m + 2     # number of estimated params: m slopes + intercept + residual variance
        
        # evaluate log-lik 
        logLik <- logLik(fit)
    
        # aic 
        r <- 1
        err[[m]][r, i] <- 2*d/N - 2/N*logLik
        
        # bic 
        r <- r+1
        err[[m]][r, i] <- d*log(N) - 2*logLik
        
        # cv 5-fold
        r <- r+1
        err[[m]][r, i]  <- mean(cv_using_lm(5, temp_df, "lpsa"))
        
        # cv 10-fold
        r <- r+1
        err[[m]][r, i]  <- mean(cv_using_lm(10, temp_df, "lpsa"))
        
        # bootstrap .632
        # (note that I set B = 100 to reduce computation time)
        r <- r+1
        err[[m]][r, i]  <- bootstrap_632_err(B = 100, df = temp_df, yvar = "lpsa") 
      }
    }

    Now, we want to find which subset is best according to each criterion. I do so by collecting the estimates in a data frame and using the dplyr package to find the minimum over all subsets.

    library(dplyr)
    df <- setNames(reshape2::melt(err), c("crit", "comb", "value", "size")) %>% 
      select(crit, size, comb, value) 
    head(df)
    ##          crit size comb       value
    ## 1         aic    1    1   2.4893157
    ## 2         bic    1    1 175.3782322
    ## 3         cv5    1    1   0.7372446
    ## 4        cv10    1    1   0.7277851
    ## 5 boostrap632    1    1   0.7073414
    ## 6         aic    1    2   2.9920134

    Then, I can group the data by criterion and find the minimum error.

    res <- df %>% 
      group_by(crit) %>% 
      arrange(crit) %>% 
      filter(value == min(value))
    head(res)
    ## # A tibble: 5 × 4
    ## # Groups:   crit [5]
    ##   crit         size  comb   value
    ##   <fct>       <int> <int>   <dbl>
    ## 1 aic             7     2   2.20 
    ## 2 bic             2     1 167.   
    ## 3 cv5             6    21   0.575
    ## 4 cv10            7     2   0.628
    ## 5 boostrap632     5    18   0.587

    We see that different subsets (identified by the size and comb variables) attain the lowest error, depending on the criterion used. Finally, we can re-fit each of the “best subset” models chosen by each criterion on the full training data set.

    # fit the full linear regression model 
    fit  <- lm(lpsa ~., dfs$train)
    beta <- coefficients(fit)
    
    # init matrix for storing the coefficient estimates of each subset
    coeffs <- matrix(nrow  = length(beta), ncol = nrow(res) +1)
    coeffs[, 1] <- beta
    colnames(coeffs) <- c("LS", levels(res$crit)[res$crit])
    rownames(coeffs) <- names(beta)
    
    # fit models to each of the chosen subsets
    for (i in 1:nrow(res)) {
      m <- res$size[i]          
      comb <- res$comb[i]
      
      # find the variables included in the subset
      vars <- combn(p, m)[, comb]
      
      # construct a data frame with this subset of variables
      temp_df <- data.frame(lpsa = dfs$train$lpsa, dfs$train[,  vars])
      
      # fit the model and collect coefficient estimates
      fit <- lm(lpsa~., data = temp_df)
      coeffs[c(1, vars+1), i+1] <- coefficients(fit)
    }
    
    # compare the coefficient estimates across criteria
    coeffs
    ##                      LS        aic       bic        cv5       cv10 boostrap632
    ## (Intercept)  2.46493292  2.4668675 2.4773573  2.4464512  2.4668675   2.4478730
    ## lcavol       0.67952814  0.6764486 0.7397137  0.7238923  0.6764486   0.6700038
    ## lweight      0.26305307  0.2652760 0.3163282         NA  0.2652760   0.3173287
    ## age         -0.14146483 -0.1450300        NA         NA -0.1450300          NA
    ## lbph         0.21014656  0.2095349        NA  0.3025434  0.2095349          NA
    ## svi          0.30520060  0.3070936        NA  0.3419951  0.3070936   0.2688665
    ## lcp         -0.28849277 -0.2872242        NA -0.2768828 -0.2872242  -0.2926930
    ## gleason     -0.02130504         NA        NA -0.1109199         NA          NA
    ## pgg45        0.26695576  0.2522850        NA  0.2703702  0.2522850   0.2277896

    Extra: *Compare estimates of the prediction error*

    I first misread the exercise and ignored the “best-subset” part (sorry). Whereas above we used the prediction error estimates for model selection (to find the best predictor subset), we could also be interested in estimating the prediction error of a fixed model. In the following, I apply the different procedures to estimate the prediction error of the full regression model and compare them to the test error obtained on the given test set.

    Test and train:

    # re-fit the full linear regression model on the training data
    fit <- lm(lpsa ~ ., data = dfs$train)
    
    # initialize list for storing the error estimates
    err <- list()
    
    # compute error in training set
    y     <- dfs$train$lpsa # observed training outcomes
    yhat  <- fitted(fit)    # fitted values
    err$train <- mean((y-yhat)**2)
    
    # compute error in test set
    y     <- dfs$test$lpsa 
    yhat  <- predict(fit, newdata = dfs$test)   
    err$test <- mean((y-yhat)**2)

    AIC and BIC:

    We estimate the AIC and BIC by computing the log-likelihood of the fitted model:

    N <- nrow(dfs$train) # number of training samples
    d <- 8 +2            # number of estimated params (incl. intercept and res variance)
    
    # compute log-likelihood of fitted model 
    logL <- as.numeric(logLik(fit)) 
    cat("logLik:", logL)
    
    # compare with "by-hand"-computation
    y     <- dfs$train$lpsa     # observed training outcomes
    yhat  <- fitted(fit)        # fitted values
    s2    <- mean((y-yhat)**2)  # MLE of the res. variance (biased estimator) 
    cat("by-hand:", sum(log(dnorm(y, mean = yhat, sd = sqrt(s2)))))
    
    err$aic <- 2*d/N - 2/N*logL
    err$bic <- d*log(N) - 2*logL
    ## logLik: -70.54079
    ## by-hand: -70.54079

    Cross-validation:

    Use the function from problem 1 to apply CV:

    yvar <- "lpsa"
    err$cv5  <- mean(cv_using_lm(5, dfs$train, yvar))
    err$cv10 <- mean(cv_using_lm(10, dfs$train, yvar))

    Bootstrap .632:

    The .632 bootstrap estimator estimates the prediction error as a weighted average of the training error and the leave-one-out bootstrap estimate. Here, I have re-used the code in r_code_lecture_6.r to compute the bootstrap error:
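
    In formula form (as implemented in bootstrap_632_err above and in the code below), with training error \(\overline{\mathrm{err}}\) and leave-one-out bootstrap error \(\widehat{\mathrm{Err}}^{(1)}\):

    \[ \widehat{\mathrm{Err}}^{(.632)} = 0.368\,\overline{\mathrm{err}} + 0.632\,\widehat{\mathrm{Err}}^{(1)}. \]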

    B <- 1000
    bootstrap.err.b <- matrix(NA, nrow = B, ncol = 1) 
    
    for (b in 1:B) {
      set.seed(b) # set seed, useful for data replication
      index <- sample(N, replace = TRUE)
      # the selected observations form the training set
      temp.train.data <- dfs$train[index, ]
      # the rest the test set
      temp.test.data <- dfs$train[-index, ]
      
      fit <- lm(lpsa ~ ., data = temp.train.data)
      
        # compute the error
      computeError <- function(model, newdata)
        mean((newdata[ , 'lpsa'] - predict(model, newdata = newdata))^2)
      
      # save the error in the matrix initialized above
      bootstrap.err.b[b] <- computeError(fit, temp.test.data)
      
      
      if(b%%100 == 0) print(b)
    }
    
    # compute the average error 
    err$bootstrap <- mean(bootstrap.err.b)
    
    # 0.632 bootstrap
    err$bootstrap_632 <- 0.368*err$train + 0.632*err$bootstrap
    ## [1] 100
    ## [1] 200
    ## [1] 300
    ## [1] 400
    ## [1] 500
    ## [1] 600
    ## [1] 700
    ## [1] 800
    ## [1] 900
    ## [1] 1000

    Compare the error estimates:

    err
    ## $train
    ## [1] 0.4808587
    ## 
    ## $test
    ## [1] 0.5098823
    ## 
    ## $aic
    ## [1] 2.404203
    ## 
    ## $bic
    ## [1] 183.1285
    ## 
    ## $cv5
    ## [1] 0.6002474
    ## 
    ## $cv10
    ## [1] 0.6697008
    ## 
    ## $bootstrap
    ## [1] 0.6794661
    ## 
    ## $bootstrap_632
    ## [1] 0.6063786

    The different error estimates are quite different. This is partly because they estimate different quantities. Normally, we are interested in the test error, that is, the expected prediction error conditional on the particular training data we have (\(Err_\mathcal{T}\) in the notation of Hastie et al.). Both the cross-validation methods and the bootstrap estimate are typically closer to the unconditional error, the expected test error \(Err = \mathrm{E}[Err_\mathcal{T}]\). See ch. 7.12 in the ESL book for a discussion. The AIC, in turn, estimates yet another quantity, namely the in-sample error (\(Err_{in}\)), while the BIC criterion has a Bayesian motivation: finding the most likely model given the data. In other words, a direct comparison of these errors is difficult.
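
    For reference, the conditional and expected test errors are defined in ESL ch. 7.2 as

    \[ \mathrm{Err}_{\mathcal{T}} = \mathrm{E}\big[L(Y, \hat{f}(X)) \mid \mathcal{T}\big], \qquad \mathrm{Err} = \mathrm{E}\big[\mathrm{Err}_{\mathcal{T}}\big], \]

    where \(L\) is the loss (here squared error), the inner expectation is over a new test point \((X, Y)\), and the outer expectation is additionally over training sets \(\mathcal{T}\).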