K07_left_join.R

#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
#                   Intro to the Tidyverse by Colleen O'Briant
#                               Koan #7: left_join
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

# In order to progress:
# 1. Read all instructions carefully.
# 2. When you come to an exercise, fill in the blank, un-comment the line
#    (Ctrl/Cmd Shift C), and execute the code in the console (Ctrl/Cmd Return).
#    If the piece of code spans multiple lines, highlight the whole chunk or
#    simply put your cursor at the end of the last line.
# 3. Save (Ctrl/Cmd S).
# 4. Test that your answers are correct (Ctrl/Cmd Shift T).

#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

# In this exercise, you'll learn about left_join(), which is one way to join two
# tibbles when you have variables that link them together.

# Run this code to get started. Make sure you have an internet connection, or
# reading in data from the web will not work.

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

people <- read_csv("https://raw.githubusercontent.com/cobriant/dplyrmurdermystery/master/data/person.csv")

## Rows: 10011 Columns: 6

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): name, address_street_name
## dbl (4): id, license_id, address_number, ssn
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

drivers_license <- read_csv("https://raw.githubusercontent.com/cobriant/dplyrmurdermystery/master/data/drivers_license.csv")

## Rows: 10007 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): eye_color, hair_color, gender, plate_number, car_make, car_model
## dbl (3): id, age, height
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

income <- read_csv("https://raw.githubusercontent.com/cobriant/dplyrmurdermystery/master/data/income.csv")

## Rows: 7514 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): ssn, annual_income
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

#                               ----- people -----

# You have 3 new datasets in the global environment. The first one we'll work on
# is `people`. Don't worry: all the SSNs, along with all the other information,
# is made up.

# Un-comment the code below (Ctrl/Cmd Shift C) to run and take a look at the
# 'people' dataset:

# view(people)

# To find information about someone named Daniel Dumire, we can filter() on
# name:

people %>%
  filter(name == "Daniel Dumire")

## # A tibble: 1 × 6
##      id name          license_id address_number address_street_name       ssn
##   <dbl> <chr>              <dbl>          <dbl> <chr>                   <dbl>
## 1 27198 Daniel Dumire     953663           1355 Campground Blvd     483739224

# You can also use `str_detect()` to detect a character string for partial
# matches:

people %>%
  filter(str_detect(name, "Daniel"))

## # A tibble: 13 × 6
##       id name                license_id address_number address_street_na…    ssn
##    <dbl> <chr>                    <dbl>          <dbl> <chr>               <dbl>
##  1 13155 Danielle Stutes         782669           2648 Greatness Dr       6.86e8
##  2 15008 Daniele Mcrobbie        459397           1793 Burntlodge Rd      1.20e8
##  3 21314 Daniel Schwendinger     219130            193 Rosegold Rd        1.05e8
##  4 27198 Daniel Dumire           953663           1355 Campground Blvd    4.84e8
##  5 39873 Danielle Funderburg     695950           3340 Mc Gloshen St      4.04e8
##  6 49701 Royce Daniels           787557            375 Cornells Way       2.21e8
##  7 53486 Danielle Palys          385970            902 Belgrave Dr        2.74e8
##  8 53784 Daniella Caramelo       331205           2333 Henry Knox Circle  1.58e8
##  9 64303 Daniela Nishina         892095            526 Hohener St         5.66e8
## 10 72820 Barry Danielian         263073           3971 Prater Ave         6.26e8
## 11 74908 Karrie Daniele          709311           1773 E Kohlman St       5.87e8
## 12 94523 Daniel Ales             681781           2232 Chatter Brook Cir… 3.15e8
## 13 94679 Janine Daniell          796342           1922 Scott Foresman Bl… 2.40e8

# Read the qelp docs on str_detect:
?qelp::str_detect

# The carrot `^` means "starts with":

people %>%
  filter(str_detect(name, "^Daniel"))

## # A tibble: 9 × 6
##      id name                license_id address_number address_street_name    ssn
##   <dbl> <chr>                    <dbl>          <dbl> <chr>                <dbl>
## 1 13155 Danielle Stutes         782669           2648 Greatness Dr        6.86e8
## 2 15008 Daniele Mcrobbie        459397           1793 Burntlodge Rd       1.20e8
## 3 21314 Daniel Schwendinger     219130            193 Rosegold Rd         1.05e8
## 4 27198 Daniel Dumire           953663           1355 Campground Blvd     4.84e8
## 5 39873 Danielle Funderburg     695950           3340 Mc Gloshen St       4.04e8
## 6 53486 Danielle Palys          385970            902 Belgrave Dr         2.74e8
## 7 53784 Daniella Caramelo       331205           2333 Henry Knox Circle   1.58e8
## 8 64303 Daniela Nishina         892095            526 Hohener St          5.66e8
## 9 94523 Daniel Ales             681781           2232 Chatter Brook Circ… 3.15e8

# The dollar sign `$` means "ends with":

people %>%
  filter(str_detect(name, "nson$"))

## # A tibble: 26 × 6
##       id name                license_id address_number address_street_na…    ssn
##    <dbl> <chr>                    <dbl>          <dbl> <chr>               <dbl>
##  1 14591 Jeremiah Johanson       429738           3037 Gaines Blvd        2.76e8
##  2 15740 Kylee Bronson           499686           3737 Weiland St         1.55e8
##  3 15898 Jammie Colbenson        193571           2492 Liberty Mill St    9.48e8
##  4 21624 Otha Mallinson          999509           3150 Trovillion Rd      1.36e8
##  5 26264 Audrea Ellanson         225456           3140 W Fortlee Dr       1.75e8
##  6 27303 Joseph Hutchenson       910671           3457 E Fortlee St       1.03e8
##  7 34155 Allyson Lazenson        814209           2840 Northwestern Dr    9.91e8
##  8 37357 Gwendolyn Anson         269007           2497 Colleen Garden Way 2.99e8
##  9 40825 Leslie Etchinson        847905           3422 Lund Way           5.54e8
## 10 42169 Malisa Christianson     951727            234 Old Amboy Ave      8.40e8
## # … with 16 more rows

#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

# 1. Query the data to find out if there's anyone with your first or last ------
# name using `|`, which is the logical operator for "or". If not, is there
# anyone with the first few letters of your first name, or the last few letters
# of your last name?

#1@

# people %>%
#   filter(str_detect(__, __) | str_detect(__, __))

#@1

# 2. Use 'select()' to find Daniel Dumire's license_id. ------------------------

#2@

# people %>%
#   filter(__) %>%
#   select(__)

#@2

#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

#                          ----- drivers_license -----
#
# 3. Now look for Daniel Dumire in the 'drivers_license' tibble. ---------------
# The problem: there's no 'name' column in that tibble. You have to search
# instead by 'id'. That is, 'license_id' in the 'people' tibble can be matched
# with 'id' in the 'drivers_license' tibble.

#3@

# drivers_license %>%
#   filter(id == __)

#@3

#                               ----- income -----

# 4. Find a person whose name is Bennett Jimmerson and find their income. ------
#   You can follow the same 2-step process as 2 and 3. Which variables
#   can be matched between 'income' and 'people'? (use 'names(income)'
#   and 'names(people)' to quickly remind yourself of the variables in
#   each tibble)

#4@

# __

# __

#@4

#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

#                            ----- left_join() -----
#
# A more efficient way to solve questions like 2, 3, and 4 is to join
# the tibbles by the matching variables, which are called "keys".

# We can join 'people' and 'drivers_license' by 'license_id' = 'id':

people %>%
  left_join(drivers_license, by = c("license_id" = "id"))

## # A tibble: 10,011 × 14
##       id name     license_id address_number address_street_…    ssn   age height
##    <dbl> <chr>         <dbl>          <dbl> <chr>             <dbl> <dbl>  <dbl>
##  1 10000 Christo…     993845            624 Bankhall Ave     7.48e8    46     59
##  2 10007 Kourtne…     861794           2791 Gustavus Blvd    4.78e8    54     74
##  3 10010 Muoi Ca…     385336            741 Northwestern Dr  8.29e8    24     79
##  4 10016 Era Mos…     431897           1987 Wood Glade St    6.15e8    22     75
##  5 10025 Trena H…     550890            276 Daws Hill Way    2.24e8    45     60
##  6 10027 Antione…     439509           2431 Zelham Dr        4.92e8    56     72
##  7 10034 Kyra Bu…     920494           1873 Sleigh Dr        3.32e8    56     69
##  8 10039 Frances…     278151            736 Buswell Dr       8.61e8    87     54
##  9 10095 Leslie …     729987           2772 Camellia Park C… 1.28e8    83     74
## 10 10122 Alva Co…     779002            116 Diversey Circle  1.49e8    52     64
## # … with 10,001 more rows, and 6 more variables: eye_color <chr>,
## #   hair_color <chr>, gender <chr>, plate_number <chr>, car_make <chr>,
## #   car_model <chr>

?qelp::left_join

# To get a complete set of information about Daniel Dumire,
# I join 'people' with 'drivers_license' by the keys the two tibbles
# have in common, then filter on "Daniel Dumire" and select the
# information I'm interested in:

people %>%
  left_join(drivers_license, by = c("license_id" = "id")) %>%
  filter(name == "Daniel Dumire") %>%
  select(name, age, height, eye_color, hair_color, gender, car_make)

## # A tibble: 1 × 7
##   name            age height eye_color hair_color gender car_make
##   <chr>         <dbl>  <dbl> <chr>     <chr>      <chr>  <chr>   
## 1 Daniel Dumire    27     68 amber     grey       female Buick

#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

# 5. Your turn: use left_join() to join 'people' with 'income'. ----------------
# Since the key has the same name in both tibbles ("ssn"), 'by = "ssn"'
# will work.

#5@

# __ %>%
#  __(__, by = "ssn")

#@5

# 6. Copy-paste your code for (5) and add another step, querying the -----------
# resulting tibble to find Daniel Dumire's income.

#6@

# __

#@6

#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

# 7. One more join problem: join 'drivers_license' with 'income'. This is ------
# tricky because they don't share any keys! You'll have to use 'left_join'
# twice and make use of the 'people' tibble.

#7@

# __

#@7

#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

# Last notes about joins:

# We've been joining and then filtering, like this:

people %>%
  left_join(drivers_license, by = c("license_id" = "id")) %>%
  filter(name == "Daniel Dumire")

## # A tibble: 1 × 14
##      id name      license_id address_number address_street_…    ssn   age height
##   <dbl> <chr>          <dbl>          <dbl> <chr>             <dbl> <dbl>  <dbl>
## 1 27198 Daniel D…     953663           1355 Campground Blvd  4.84e8    27     68
## # … with 6 more variables: eye_color <chr>, hair_color <chr>, gender <chr>,
## #   plate_number <chr>, car_make <chr>, car_model <chr>

# But if you're only interested in Daniel Dumire, then you can also filter and
# then join, like this:

people %>%
  filter(name == "Daniel Dumire") %>%
  left_join(drivers_license, by = c("license_id" = "id"))

## # A tibble: 1 × 14
##      id name      license_id address_number address_street_…    ssn   age height
##   <dbl> <chr>          <dbl>          <dbl> <chr>             <dbl> <dbl>  <dbl>
## 1 27198 Daniel D…     953663           1355 Campground Blvd  4.84e8    27     68
## # … with 6 more variables: eye_color <chr>, hair_color <chr>, gender <chr>,
## #   plate_number <chr>, car_make <chr>, car_model <chr>

# left_join() is one of the most useful joins, and it's the only join you'll
# need to solve the murder mystery project, but there are others. 99% of
# the time, you'll only ever need left_join() and inner_join(). Check out the
# dplyr cheat sheet (page 2, right hand side) for more information:
# https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf

#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

# Great work! You're one step closer to tidyverse enlightenment. Make sure to
# return to this topic to meditate on it later.

# If you're ready, you can move on to koan 8: qplot to ggplot.

K07_left_join.R

colleenobriant

2022-08-24