Bootstrap: Difference between revisions

From 太極
Jump to navigation Jump to search
Line 111: Line 111:
== Bootstrapping Extreme Value Estimators ==
== Bootstrapping Extreme Value Estimators ==
[https://www.tandfonline.com/doi/full/10.1080/01621459.2022.2120400 Bootstrapping Extreme Value Estimators]  de Haan, 2022
[https://www.tandfonline.com/doi/full/10.1080/01621459.2022.2120400 Bootstrapping Extreme Value Estimators]  de Haan, 2022
== R-squared from a regression  ==
[https://www.statmethods.net/advstats/bootstrapping.html R in Action] Nonparametric bootstrapping <syntaxhighlight lang='rsplus'>
# Compute the bootstrapped 95% confidence interval for R-squared in the linear regression
rsq <- function(data, indices, formula) {
  d <- data[indices,] # allows boot to select sample
  fit <- lm(formula, data=d)
  return(summary(fit)$r.square)
} # 'formula' is optional depends on the problem
# bootstrapping with 1000 replications
set.seed(1234)
bootobject <- boot(data=mtcars, statistic=rsq, R=1000,
                  formula=mpg~wt+disp)
plot(bootobject) # or plot(bootobject, index = 1) if we have multiple statistics
ci <- boot.ci(bootobject, conf = .95, type=c("perc", "bca") )
    # default type is "all" which contains c("norm","basic", "stud", "perc", "bca").
    # 'bca' (Bias Corrected and Accelerated) by Efron 1987 uses
    # percentiles but adjusted to account for bias and skewness.
# Level    Percentile            BCa         
# 95%  ( 0.6838,  0.8833 )  ( 0.6344,  0.8549 )
# Calculations and Intervals on Original Scale
# Some BCa intervals may be unstable
ci$bca[4:5] 
# [1] 0.6343589 0.8549305
# the mean is not the same
mean(c(0.6838,  0.8833 ))
# [1] 0.78355
mean(c(0.6344,  0.8549 ))
# [1] 0.74465
summary(lm(mpg~wt+disp, data = mtcars))$r.square
# [1] 0.7809306
</syntaxhighlight>

Revision as of 16:50, 10 July 2023

General

Nonparametric bootstrap

This is the most common bootstrap method

The upstrap Crainiceanu & Crainiceanu, Biostatistics 2018

Parametric bootstrap

Examples

Standard error

  • Standard error from a mean
    foo <- function() mean(sample(x, replace = TRUE))
    set.seed(1234)
    x <- rnorm(300)
    set.seed(1)
    sd(replicate(10000, foo()))
    # [1] 0.05717679
    sd(x)/sqrt(length(x)) # The se of mean is s/sqrt(n)
    # [1] 0.05798401
    
    set.seed(1234)
    x <- rpois(300, 2)
    set.seed(1)
    sd(replicate(10000, foo()))
    # [1] 0.08038607
    sd(x)/sqrt(length(x)) # The se of mean is s/sqrt(n)
    # [1] 0.08183151
    
  • Difference of means from two samples (cf 8.3 The two-sample problem from the book "An introduction to Bootstrap" by Efron & Tibshirani)
    # Define the two samples
    sample1 <- 1:10
    sample2 <- 11:20
    
    # Define the number of bootstrap replicates
    nboot <- 100000
    
    # Initialize a vector to store the bootstrap estimates
    boot_estimates <- numeric(nboot)
    
    # Run the bootstrap
    set.seed(123)
    for (i in seq_len(nboot)) {
      # Resample the data with replacement
      resample1 <- sample(sample1, replace = TRUE)
      resample2 <- sample(sample2, replace = TRUE)
      
      # Compute the difference of means
      boot_estimates[i] <- mean(resample1) - mean(resample2)
    }
    
    # Compute the standard error of the bootstrap estimates
    se_boot <- sd(boot_estimates)
    
    # Print the result
    cat("Bootstrap SE estimate of difference of means:", se_boot, "\n")
    # 1.283541
    
    sd1 <- sd(sample1)
    sd2 <- sd(sample2)
    
    # Calculate the sample sizes
    n1 <- length(sample1)
    n2 <- length(sample2)
    
    # Calculate the true standard error of the difference of means
    se_true <- sqrt((sd1^2/n1) + (sd2^2/n2))
    
    # Print the result
    cat("True SE of difference of means:", se_true, "\n") \
    # 1.354006

Bootstrapping Extreme Value Estimators

Bootstrapping Extreme Value Estimators de Haan, 2022

R-squared from a regression

R in Action Nonparametric bootstrapping

# Compute the bootstrapped 95% confidence interval for R-squared in the linear regression
rsq <- function(data, indices, formula) {
  d <- data[indices,] # allows boot to select sample
  fit <- lm(formula, data=d)
  return(summary(fit)$r.square)
} # 'formula' is optional depends on the problem

# bootstrapping with 1000 replications
set.seed(1234)
bootobject <- boot(data=mtcars, statistic=rsq, R=1000, 
                   formula=mpg~wt+disp)
plot(bootobject) # or plot(bootobject, index = 1) if we have multiple statistics
ci <- boot.ci(bootobject, conf = .95, type=c("perc", "bca") ) 
    # default type is "all" which contains c("norm","basic", "stud", "perc", "bca"). 
    # 'bca' (Bias Corrected and Accelerated) by Efron 1987 uses 
    # percentiles but adjusted to account for bias and skewness.
# Level     Percentile            BCa          
# 95%   ( 0.6838,  0.8833 )   ( 0.6344,  0.8549 )
# Calculations and Intervals on Original Scale
# Some BCa intervals may be unstable
ci$bca[4:5]  
# [1] 0.6343589 0.8549305
# the mean is not the same
mean(c(0.6838,  0.8833 ))
# [1] 0.78355
mean(c(0.6344,  0.8549 ))
# [1] 0.74465
summary(lm(mpg~wt+disp, data = mtcars))$r.square
# [1] 0.7809306