Confidence Intervals

STAT 20: Introduction to Probability and Statistics

Adapted by Gaston Sanchez

Estimating a Population Mean

Sampling Distribution of \(\bar{x}\)

Sampling Distribution and 95% C.I.

By the Central Limit Theorem (approximately, for large \(n\)):

\(\bar{x} \sim N(\mu, \sigma / \sqrt{n})\)


95% Confidence Interval

\(\bar{x} - 1.96 \times \sigma / \sqrt{n}, \ \bar{x} + 1.96 \times \sigma / \sqrt{n}\)


In practice, we usually don’t know \(\sigma\), but we can use the sample SD \(s\) in its place

\(\bar{x} - 1.96 \times s / \sqrt{n}, \ \bar{x} + 1.96 \times s / \sqrt{n}\)

Example: flights data

library(tidyverse)
library(stat20data)

# peek at the first five flights, keeping only a few columns
flights |>
  select(carrier, flight, dep_delay, air_time) |>
  slice_head(n = 5)
# A tibble: 5 × 4
  carrier flight dep_delay air_time
  <chr>    <dbl>     <dbl>    <dbl>
1 UA         521         9      175
2 F9         162       -10      125
3 UA         197        -3      285
4 AA         794        -4      296
5 AA         289       104      271

Population Distribution of air_time

Code
# histogram of air_time across all flights (the population)
ggplot(data = flights, mapping = aes(x = air_time)) +
  geom_histogram(bins = 30, color = "white") +
  labs(title = "Population Distribution Air Time")

Population Mean air_time

Parameter: Mean Airtime (air_time)

# the parameter: mean air_time over the whole population of flights,
# stored as a 1 x 1 tibble with column mu
pop_mean = summarize(flights, mu = mean(air_time))

pop_mean
# A tibble: 1 × 1
     mu
  <dbl>
1  129.

Simple Random Sample (SRS)

# fix the random seed so the same sample is drawn on every run
set.seed(246)

# simple random sample of 500 flights
sampled_flights = slice_sample(flights, n = 500)

Simple Random Sample (SRS)

# fix the random seed so the same sample is drawn on every run
set.seed(246)

# simple random sample of 500 flights
sampled_flights = slice_sample(flights, n = 500)

# first five sampled flights, a few columns only
sampled_flights |>
  select(carrier, flight, dep_delay, air_time) |>
  slice_head(n = 5)
# A tibble: 5 × 4
  carrier flight dep_delay air_time
  <chr>    <dbl>     <dbl>    <dbl>
1 UA         503        -6       81
2 OO         858        29       65
3 WN         991        -1       70
4 OO         348         3       84
5 WN         344        -1       51

Distribution of a single sample

Code
# histogram of air_time within the single sample of flights
ggplot(data = sampled_flights, mapping = aes(x = air_time)) +
  geom_histogram(bins = 30, color = "white") +
  labs(title = "Empirical Distribution Air Time (n = 500)")

Sample Mean air_time

Statistic: Sample Mean Airtime (air_time)

# the statistic: mean air_time in the sample,
# stored as a 1 x 1 tibble with column xbar
xbar = summarize(sampled_flights, xbar = mean(air_time))

xbar
# A tibble: 1 × 1
   xbar
  <dbl>
1  128.

Generating Sampling Distribution

# Draw a fresh sample of 500 flights and record its mean air_time;
# do this 1000 times to build up the sampling distribution
sample_means = replicate(
  n = 1000,
  expr = flights |>
    slice_sample(n = 500) |>
    pull(air_time) |>
    mean()
)

Graphing Sampling Distribution

Code
# histogram of the 1000 simulated sample means
ggplot(data = data.frame(xbar = sample_means), mapping = aes(x = xbar)) +
  geom_histogram(bins = 30, color = "white") +
  labs(
    title = "Sampling Distribution",
    x = "Sample Means"
  )

Estimating a Population Mean

Let’s go back to our statistic \(\bar{x}\)

# pull() extracts air_time as a plain vector, so xbar ends up
# a single number rather than a tibble
xbar = sampled_flights |>
  pull(air_time) |>
  mean()

C.I. for a Population Mean

Let’s go back to our statistic \(\bar{x}\)

# pull() gives you output as a vector (a single number here, not a tibble)
xbar = sampled_flights |>
  summarize(xbar = mean(air_time)) |> pull()


# compute (approximate) Standard Error (SE): s / sqrt(n)
# approximate because the sample SD s stands in for the unknown sigma
s = sampled_flights |>
  summarize(s = sd(air_time)) |> pull()
# take n from the sample itself so it always matches the rows actually drawn
se = s / sqrt(nrow(sampled_flights))

C.I. for a Population Mean

Let’s go back to our statistic \(\bar{x}\)

# pull() gives you output as a vector (a single number here, not a tibble)
xbar = sampled_flights |>
  summarize(xbar = mean(air_time)) |> pull()


# compute (approximate) Standard Error (SE): s / sqrt(n)
# approximate because the sample SD s stands in for the unknown sigma
s = sampled_flights |>
  summarize(s = sd(air_time)) |> pull()
# take n from the sample itself so it always matches the rows actually drawn
se = s / sqrt(nrow(sampled_flights))

# 95% C.I.: xbar plus/minus 1.96 standard errors
c(xbar - 1.96 * se, xbar + 1.96 * se)
[1] 119.9474 135.3846

Estimating a Population Proportion

Sampling Distribution of \(\hat{p}\)

Sampling Distribution and C.I.

By the Central Limit Theorem (approximately, for large \(n\)):

\(\hat{p} \sim N(p, \sqrt{p(1-p)/n})\)


95% Confidence Interval; \(SE = \sqrt{p(1-p)/n}\)

\(\hat{p} - 1.96 \times SE, \hat{p} + 1.96 \times SE\)


In practice, we usually don’t know \(p\), but we can use the sample proportion \(\hat{p}\) in its place

\(SE^* = \sqrt{\hat{p}(1-\hat{p})/n}\)

\(\hat{p} - 1.96 \times SE^*, \ \hat{p} + 1.96 \times SE^*\)

Population Distribution delayed

Code
# bar chart of delayed (dep_delay > 0) across all flights (the population)
ggplot(
  data = mutate(flights, delayed = dep_delay > 0),
  mapping = aes(x = delayed)
) +
  geom_bar() +
  labs(title = "Population Distribution Delayed")

Population Proportion delayed

Parameter: Proportion of delayed flights

# the parameter: share of all flights with dep_delay > 0
# (the mean of a TRUE/FALSE vector is the proportion of TRUEs)
pop_prop = summarize(flights, prop = mean(dep_delay > 0))

pop_prop
# A tibble: 1 × 1
   prop
  <dbl>
1 0.188

Simple Random Sample (SRS)

# fix the random seed so the same sample is drawn on every run
set.seed(246)

# simple random sample of 500 flights, each flagged as delayed
# when its dep_delay is positive
sampled_flights = flights |>
  slice_sample(n = 500) |>
  mutate(delayed = dep_delay > 0)

Simple Random Sample (SRS)

# fix the random seed so the same sample is drawn on every run
set.seed(246)

# simple random sample of 500 flights, each flagged as delayed
# when its dep_delay is positive
sampled_flights = flights |>
  slice_sample(n = 500) |>
  mutate(delayed = dep_delay > 0)

# first five sampled flights with their delayed flag
sampled_flights |>
  select(carrier, flight, delayed) |>
  slice_head(n = 5)
# A tibble: 5 × 3
  carrier flight delayed
  <chr>    <dbl> <lgl>  
1 UA         503 FALSE  
2 OO         858 TRUE   
3 WN         991 FALSE  
4 OO         348 TRUE   
5 WN         344 FALSE  

Distribution of a single sample

Code
# bar chart of delayed within the single sample of flights
ggplot(data = sampled_flights, mapping = aes(x = delayed)) +
  geom_bar() +
  labs(title = "Empirical Distribution Delayed (n = 500)")

Sample Proportion delayed

Statistic: Sample Proportion (delayed)

# the statistic: share of delayed flights in the sample,
# stored as a 1 x 1 tibble with column phat
phat = summarize(sampled_flights, phat = mean(delayed))

phat
# A tibble: 1 × 1
   phat
  <dbl>
1  0.17

Generating Sampling Distribution

# Draw a fresh sample of 500 flights and record its proportion of
# delayed flights (dep_delay > 0); do this 1000 times
sample_props = replicate(
  n = 1000,
  expr = flights |>
    slice_sample(n = 500) |>
    summarize(phat = mean(dep_delay > 0)) |>
    pull(phat)
)

Graphing Sampling Distribution

Code
# histogram of the 1000 simulated sample proportions
ggplot(data = data.frame(phat = sample_props), mapping = aes(x = phat)) +
  geom_histogram(bins = 30, color = "white") +
  labs(
    title = "Sampling Distribution",
    x = "Sample Proportions"
  )

Estimating a Population Proportion

Let’s go back to our statistic \(\hat{p}\)

# pull() extracts delayed as a plain vector, so phat ends up
# a single number rather than a tibble
phat = sampled_flights |>
  pull(delayed) |>
  mean()

C.I. for a Population Proportion

Let’s go back to our statistic \(\hat{p}\)

# pull() gives you output as a vector (a single number here, not a tibble)
phat = sampled_flights |>
  summarize(phat = mean(delayed)) |> pull()


# compute (approximate) Standard Error (SE): sqrt(phat * (1 - phat) / n)
# approximate because phat stands in for the unknown p;
# n is taken from the sample itself so it always matches the rows drawn
se = sqrt(phat * (1 - phat) / nrow(sampled_flights))

C.I. for a Population Proportion

Let’s go back to our statistic \(\hat{p}\)

# pull() gives you output as a vector (a single number here, not a tibble)
phat = sampled_flights |>
  summarize(phat = mean(delayed)) |> pull()


# compute (approximate) Standard Error (SE): sqrt(phat * (1 - phat) / n)
# approximate because phat stands in for the unknown p;
# n is taken from the sample itself so it always matches the rows drawn
se = sqrt(phat * (1 - phat) / nrow(sampled_flights))

# 95% C.I.: phat plus/minus 1.96 standard errors
c(phat - 1.96 * se, phat + 1.96 * se)
[1] 0.1370743 0.2029257