General

• Bootstrap from Wikipedia.
• This contains an overview of different methods for computing bootstrap confidence intervals.
• boot.ci() from the 'boot' package provides a short explanation for different methods for computing bootstrap confidence intervals.
• Typically not useful for correlated (dependent) data.
• bootstrap package. "An Introduction to the Bootstrap" by B. Efron and R. Tibshirani, 1993

Nonparametric bootstrap

This is the most common bootstrap method.

The upstrap Crainiceanu & Crainiceanu, Biostatistics 2018

Parametric bootstrap

• Parametric bootstraps resample from a known distribution whose parameters are estimated from your sample
• In small samples, a parametric bootstrap approach might be preferred, according to Wikipedia
• http://www.math.ntu.edu.tw/~hchen/teaching/LargeSample/notes/notebootstrap.pdf#page=3 No package is used
• A parametric or non-parametric bootstrap?
• https://www.stat.cmu.edu/~cshalizi/402/lectures/08-bootstrap/lecture-08.pdf#page=11
• simulatorZ Bioc package
• Bootstrap R tutorial : Learn about parametric and non-parametric bootstrap through simulation
# Manual parametric bootstrap of the mean (no packages used).
# Simulated dataset, n = 20.
df <- scan(textConnection("3331.608 3549.913 2809.252 2671.465 3383.177 3945.721 3672.601 3922.416 4647.278 4088.246 3718.874 3724.443 3925.457 3112.920 3495.621 3651.779 3240.194 3867.347 3431.015 4163.725"), sep = " ")
sd(df)
# [1] 464.5509
mean(df)
# [1] 3617.653

set.seed(1)
# Draw 1000 samples of size n from N(3617.7, 464.6) -- the sample mean
# and sd rounded to one decimal -- and take the mean of each.
# replicate() preallocates; it avoids growing a vector inside a for loop.
theta_star <- replicate(1000, mean(rnorm(length(df), mean = 3617.7, sd = 464.6)))
theta_boot <- mean(theta_star)  # bootstrap estimate of theta_hat
theta_boot
# [1] 3615.208
boot_se <- sd(theta_star)  # standard error of the estimate
boot_se
# [1] 100.106

# Bootstrap estimate of bias: bootstrap mean minus the observed mean
bias <- theta_boot - mean(df)
bias
# -2.444507
# Normal-approximation 95% CI: theta_boot +/- 1.96 * SE
CI <- c(theta_boot - 1.96 * boot_se, theta_boot + 1.96 * boot_se)
CI
# 3419.000 3811.416


Using the boot package

# Generator passed to boot(..., sim = "parametric"): given the observed
# data and the fitted parameters, returns one simulated dataset of the
# same size drawn from the fitted normal model.
gen_function <- function(x, mle) {
  # x:   observed data (only its length is used)
  # mle: c(mean, sd) parameter estimates for the normal model
  rnorm(n = length(x), mean = mle[1], sd = mle[2])
}

# Statistic passed to boot(): the mean of the selected observations.
# boot() supplies the index vector i; for sim = "parametric" the
# simulated sample arrives as x with i the identity permutation.
theta_star_function <- function(x, i) {
  resampled <- x[i]
  mean(resampled)
}

# boot() lives in the 'boot' package (shipped with R as a recommended
# package); the original snippet never loaded it.
library(boot)

set.seed(1)
# Parametric bootstrap: boot() calls ran.gen(data, mle) R times to
# simulate datasets and applies `statistic` to each one.
B <- boot(data = df,
          sim = "parametric",
          ran.gen = gen_function,
          mle = c(3617.7, 464.6),
          statistic = theta_star_function,
          R = 1000)
B
# PARAMETRIC BOOTSTRAP
# Call:
# boot(data = df, statistic = theta_star_function, R = 1000, sim = "parametric",
#     ran.gen = gen_function, mle = c(3617.7, 464.6))
#
# Bootstrap Statistics :
#     original    bias    std. error
# t1* 3617.653 -2.444507     100.106
plot(B)


Examples

Standard error

• Standard error from a mean
# Bootstrap SE of the sample mean, compared with the analytic s/sqrt(n).
# foo() takes the data as an argument instead of reading a global
# (consistent with the later median example in these notes).
foo <- function(x) mean(sample(x, replace = TRUE))

set.seed(1234)
x <- rnorm(300)
set.seed(1)
sd(replicate(10000, foo(x)))
# [1] 0.05717679
sd(x) / sqrt(length(x))  # analytic SE of the mean: s / sqrt(n)
# [1] 0.05798401

set.seed(1234)
x <- rpois(300, 2)
set.seed(1)
sd(replicate(10000, foo(x)))
# [1] 0.08038607
sd(x) / sqrt(length(x))  # analytic SE of the mean: s / sqrt(n)
# [1] 0.08183151

• Difference of means from two samples (cf 8.3 The two-sample problem from the book "An introduction to Bootstrap" by Efron & Tibshirani)
# Define the two samples
sample1 <- 1:10
sample2 <- 11:20

# Number of bootstrap replicates
nboot <- 100000

# Bootstrap the difference of means: resample each group with
# replacement and record mean(resample1) - mean(resample2).
# replicate() preallocates the result vector; draw order matches the
# original per-iteration loop, so the seed reproduces the same values.
set.seed(123)
boot_estimates <- replicate(nboot,
  mean(sample(sample1, replace = TRUE)) - mean(sample(sample2, replace = TRUE))
)

# Standard error of the bootstrap estimates
se_boot <- sd(boot_estimates)

# Print the result
cat("Bootstrap SE estimate of difference of means:", se_boot, "\n")
# 1.283541

sd1 <- sd(sample1)
sd2 <- sd(sample2)

# Sample sizes
n1 <- length(sample1)
n2 <- length(sample2)

# Analytic SE of a difference of two independent means
se_true <- sqrt((sd1^2/n1) + (sd2^2/n2))

# Print the result (stray trailing backslash removed -- it was a syntax error)
cat("True SE of difference of means:", se_true, "\n")
# 1.354006
• SE of the sample median
# Bootstrap SE of the sample median (no simple closed-form analogue of
# s/sqrt(n) exists for the median).
x <- c(1, 5, 8, 3, 7)
median(x) # 5
foo <- function(x) median(sample(x, replace = TRUE))  # TRUE, not T (T is reassignable)
set.seed(1)
y <- replicate(100, foo(x = x))
sd(y)
# 1.917595
# Crude normal-approximation 95% CI around the sample median
c(median(x) - 1.96 * sd(y), median(x) + 1.96 * sd(y))
# [1] 1.241513 8.758487

• SE for sample relative risk

Bootstrapping Extreme Value Estimators

Bootstrapping Extreme Value Estimators de Haan, 2022

R-squared from a regression

R in Action Nonparametric bootstrapping

# Compute the bootstrapped 95% confidence interval for R-squared in the
# linear regression.  (The original snippet had several statements
# collapsed onto one line; reformatted here.)
library(boot)

# Statistic for boot(): R-squared of the model refitted on the
# resampled rows.  'formula' is forwarded from the boot() call.
rsq <- function(data, indices, formula) {
  d <- data[indices,] # allows boot to select sample
  fit <- lm(formula, data = d)
  return(summary(fit)$r.square)
}

# Bootstrapping with 1000 replications
set.seed(1234)
bootobject <- boot(data = mtcars, statistic = rsq, R = 1000,
                   formula = mpg ~ wt + disp)
plot(bootobject)  # or plot(bootobject, index = 1) if we have multiple statistics

# Default type is "all" = c("norm", "basic", "stud", "perc", "bca").
# 'bca' (Bias Corrected and Accelerated, Efron 1987) uses percentiles
# adjusted to account for bias and skewness.
ci <- boot.ci(bootobject, conf = .95, type = c("perc", "bca"))
# Level     Percentile            BCa
# 95%   ( 0.6838,  0.8833 )   ( 0.6344,  0.8549 )
# Calculations and Intervals on Original Scale
# Some BCa intervals may be unstable
ci$bca[4:5]
# [1] 0.6343589 0.8549305

# The midpoints of the two intervals are not the same
mean(c(0.6838, 0.8833))
# [1] 0.78355
mean(c(0.6344, 0.8549))
# [1] 0.74465

# For comparison, the R-squared fitted on the full data
summary(lm(mpg ~ wt + disp, data = mtcars))$r.square
# [1] 0.7809306