Data Wrangling

STAT 20: Introduction to Probability and Statistics

Adapted by Gaston Sanchez

dplyr

About dplyr

library(tidyverse)   # includes dplyr


The "dplyr" package in R is a fundamental tool for data manipulation, forming a core component of the tidyverse suite of packages.

It provides a consistent set of “verbs” or functions designed to simplify common data wrangling tasks with data frames and tibbles (an enhanced data frame structure).

dplyr verbs

  • slice()

  • select()

  • filter()

  • mutate()

  • arrange()

  • summarize()

  • group_by()

  1. All of these have a data frame as the first argument


  1. All of these produce a data frame as output


  1. Within the function, you can simply refer to columns names unquoted.

Example: Toy Data

wizards <- data.frame(
  name = c("Harry", "Bellatrix", "Hermione", "Draco"),
  house = c("Gryffindor", "Slytherin", "Gryffindor",  "Slytherin"),
  height = c(1.78, 1.57, 1.65, 1.75),
  spells = c(60, 75, 70, 55)
)

Example: Toy Data

wizards <- data.frame(
  name = c("Harry", "Bellatrix", "Hermione", "Draco"),
  house = c("Gryffindor", "Slytherin", "Gryffindor",  "Slytherin"),
  height = c(1.78, 1.57, 1.65, 1.75),
  spells = c(60, 75, 70, 55)
)

wizards
       name      house height spells
1     Harry Gryffindor   1.78     60
2 Bellatrix  Slytherin   1.57     75
3  Hermione Gryffindor   1.65     70
4     Draco  Slytherin   1.75     55

slice()

Isolates particular rows of a data frame by row number.

wizards
       name      house height spells
1     Harry Gryffindor   1.78     60
2 Bellatrix  Slytherin   1.57     75
3  Hermione Gryffindor   1.65     70
4     Draco  Slytherin   1.75     55


slice(wizards, 2)
       name     house height spells
1 Bellatrix Slytherin   1.57     75

slice()

Isolates particular rows of a data frame by row number.

wizards
       name      house height spells
1     Harry Gryffindor   1.78     60
2 Bellatrix  Slytherin   1.57     75
3  Hermione Gryffindor   1.65     70
4     Draco  Slytherin   1.75     55


slice(wizards, 2:4)
       name      house height spells
1 Bellatrix  Slytherin   1.57     75
2  Hermione Gryffindor   1.65     70
3     Draco  Slytherin   1.75     55

slice()

Isolates particular rows of a data frame by row number.

wizards
       name      house height spells
1     Harry Gryffindor   1.78     60
2 Bellatrix  Slytherin   1.57     75
3  Hermione Gryffindor   1.65     70
4     Draco  Slytherin   1.75     55


slice(wizards, -3)
       name      house height spells
1     Harry Gryffindor   1.78     60
2 Bellatrix  Slytherin   1.57     75
3     Draco  Slytherin   1.75     55

select()

Selects variables by name or number.

wizards
       name      house height spells
1     Harry Gryffindor   1.78     60
2 Bellatrix  Slytherin   1.57     75
3  Hermione Gryffindor   1.65     70
4     Draco  Slytherin   1.75     55


select(wizards, house)
       house
1 Gryffindor
2  Slytherin
3 Gryffindor
4  Slytherin

select()

Selects variables by name or number.

wizards
       name      house height spells
1     Harry Gryffindor   1.78     60
2 Bellatrix  Slytherin   1.57     75
3  Hermione Gryffindor   1.65     70
4     Draco  Slytherin   1.75     55


select(wizards, 2)
       house
1 Gryffindor
2  Slytherin
3 Gryffindor
4  Slytherin

select()

Selects variables by name or number.

wizards
       name      house height spells
1     Harry Gryffindor   1.78     60
2 Bellatrix  Slytherin   1.57     75
3  Hermione Gryffindor   1.65     70
4     Draco  Slytherin   1.75     55


select(wizards, house, name)
       house      name
1 Gryffindor     Harry
2  Slytherin Bellatrix
3 Gryffindor  Hermione
4  Slytherin     Draco

select()

Selects variables by name or number.

wizards
       name      house height spells
1     Harry Gryffindor   1.78     60
2 Bellatrix  Slytherin   1.57     75
3  Hermione Gryffindor   1.65     70
4     Draco  Slytherin   1.75     55


select(wizards, -spells)
       name      house height
1     Harry Gryffindor   1.78
2 Bellatrix  Slytherin   1.57
3  Hermione Gryffindor   1.65
4     Draco  Slytherin   1.75

select()

Selects variables by name or number.

wizards
       name      house height spells
1     Harry Gryffindor   1.78     60
2 Bellatrix  Slytherin   1.57     75
3  Hermione Gryffindor   1.65     70
4     Draco  Slytherin   1.75     55


select(wizards, name:height)
       name      house height
1     Harry Gryffindor   1.78
2 Bellatrix  Slytherin   1.57
3  Hermione Gryffindor   1.65
4     Draco  Slytherin   1.75

filter()

Returns rows that meet certain criteria.

wizards
       name      house height spells
1     Harry Gryffindor   1.78     60
2 Bellatrix  Slytherin   1.57     75
3  Hermione Gryffindor   1.65     70
4     Draco  Slytherin   1.75     55


filter(wizards, house == "Gryffindor")
      name      house height spells
1    Harry Gryffindor   1.78     60
2 Hermione Gryffindor   1.65     70

filter()

Returns rows that meet certain criteria.

wizards
       name      house height spells
1     Harry Gryffindor   1.78     60
2 Bellatrix  Slytherin   1.57     75
3  Hermione Gryffindor   1.65     70
4     Draco  Slytherin   1.75     55


filter(wizards, spells < 60)
   name     house height spells
1 Draco Slytherin   1.75     55

filter()

Returns rows that meet certain criteria.

wizards
       name      house height spells
1     Harry Gryffindor   1.78     60
2 Bellatrix  Slytherin   1.57     75
3  Hermione Gryffindor   1.65     70
4     Draco  Slytherin   1.75     55


filter(wizards, name %in% c("Harry", "Hermione"))
      name      house height spells
1    Harry Gryffindor   1.78     60
2 Hermione Gryffindor   1.65     70

filter()

Returns rows that meet certain criteria.

wizards
       name      house height spells
1     Harry Gryffindor   1.78     60
2 Bellatrix  Slytherin   1.57     75
3  Hermione Gryffindor   1.65     70
4     Draco  Slytherin   1.75     55


filter(wizards, 
       name %in% c("Harry", "Hermione"),
       spells < 70)
   name      house height spells
1 Harry Gryffindor   1.78     60

mutate()

Updates an existing variable, or adds a new variable that can be a function of previous variables.

wizards
       name      house height spells
1     Harry Gryffindor   1.78     60
2 Bellatrix  Slytherin   1.57     75
3  Hermione Gryffindor   1.65     70
4     Draco  Slytherin   1.75     55


mutate(wizards, name = toupper(name))
       name      house height spells
1     HARRY Gryffindor   1.78     60
2 BELLATRIX  Slytherin   1.57     75
3  HERMIONE Gryffindor   1.65     70
4     DRACO  Slytherin   1.75     55

mutate()

Updates an existing variable, or adds a new variable that can be a function of previous variables.

wizards
       name      house height spells
1     Harry Gryffindor   1.78     60
2 Bellatrix  Slytherin   1.57     75
3  Hermione Gryffindor   1.65     70
4     Draco  Slytherin   1.75     55


mutate(wizards, height_cm = height * 100)
       name      house height spells height_cm
1     Harry Gryffindor   1.78     60       178
2 Bellatrix  Slytherin   1.57     75       157
3  Hermione Gryffindor   1.65     70       165
4     Draco  Slytherin   1.75     55       175

arrange()

Sort the rows of a data frame by the values of variables.

wizards
       name      house height spells
1     Harry Gryffindor   1.78     60
2 Bellatrix  Slytherin   1.57     75
3  Hermione Gryffindor   1.65     70
4     Draco  Slytherin   1.75     55


arrange(wizards, height)
       name      house height spells
1 Bellatrix  Slytherin   1.57     75
2  Hermione Gryffindor   1.65     70
3     Draco  Slytherin   1.75     55
4     Harry Gryffindor   1.78     60

arrange()

Sort the rows of a data frame by the values of variables.

wizards
       name      house height spells
1     Harry Gryffindor   1.78     60
2 Bellatrix  Slytherin   1.57     75
3  Hermione Gryffindor   1.65     70
4     Draco  Slytherin   1.75     55


arrange(wizards, desc(height))
       name      house height spells
1     Harry Gryffindor   1.78     60
2     Draco  Slytherin   1.75     55
3  Hermione Gryffindor   1.65     70
4 Bellatrix  Slytherin   1.57     75

arrange()

Sort the rows of a data frame by the values of variables.

wizards
       name      house height spells
1     Harry Gryffindor   1.78     60
2 Bellatrix  Slytherin   1.57     75
3  Hermione Gryffindor   1.65     70
4     Draco  Slytherin   1.75     55


arrange(wizards, house)
       name      house height spells
1     Harry Gryffindor   1.78     60
2  Hermione Gryffindor   1.65     70
3 Bellatrix  Slytherin   1.57     75
4     Draco  Slytherin   1.75     55

arrange()

Sort the rows of a data frame by the values of variables.

wizards
       name      house height spells
1     Harry Gryffindor   1.78     60
2 Bellatrix  Slytherin   1.57     75
3  Hermione Gryffindor   1.65     70
4     Draco  Slytherin   1.75     55


arrange(wizards, house, height)
       name      house height spells
1  Hermione Gryffindor   1.65     70
2     Harry Gryffindor   1.78     60
3 Bellatrix  Slytherin   1.57     75
4     Draco  Slytherin   1.75     55

summarize()

Summarize a variable with a statistic.

wizards
       name      house height spells
1     Harry Gryffindor   1.78     60
2 Bellatrix  Slytherin   1.57     75
3  Hermione Gryffindor   1.65     70
4     Draco  Slytherin   1.75     55


summarize(wizards, 
          mean(height))
  mean(height)
1       1.6875

summarize()

Summarize a variable with a statistic.

wizards
       name      house height spells
1     Harry Gryffindor   1.78     60
2 Bellatrix  Slytherin   1.57     75
3  Hermione Gryffindor   1.65     70
4     Draco  Slytherin   1.75     55


summarize(wizards, 
          avg_height = mean(height))
  avg_height
1     1.6875

summarize()

Summarize a variable with a statistic.

wizards
       name      house height spells
1     Harry Gryffindor   1.78     60
2 Bellatrix  Slytherin   1.57     75
3  Hermione Gryffindor   1.65     70
4     Draco  Slytherin   1.75     55


summarize(wizards, 
          avg_height = mean(height),
          sd_height = sd(height))
  avg_height  sd_height
1     1.6875 0.09604686