#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# Intro to the Tidyverse by Colleen O'Briant
# Koan #7: left_join
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# In order to progress:
# 1. Read all instructions carefully.
# 2. When you come to an exercise, fill in the blank, un-comment the line
# (Ctrl/Cmd Shift C), and execute the code in the console (Ctrl/Cmd Return).
# If the piece of code spans multiple lines, highlight the whole chunk or
# simply put your cursor at the end of the last line.
# 3. Save (Ctrl/Cmd S).
# 4. Test that your answers are correct (Ctrl/Cmd Shift T).
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# In this exercise, you'll learn about left_join(), which is one way to join two
# tibbles when you have variables that link them together.
# Run this code to get started. Make sure you have an internet connection, or
# reading in data from the web will not work.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
people <- read_csv("https://raw.githubusercontent.com/cobriant/dplyrmurdermystery/master/data/person.csv")
## Rows: 10011 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): name, address_street_name
## dbl (4): id, license_id, address_number, ssn
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
drivers_license <- read_csv("https://raw.githubusercontent.com/cobriant/dplyrmurdermystery/master/data/drivers_license.csv")
## Rows: 10007 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): eye_color, hair_color, gender, plate_number, car_make, car_model
## dbl (3): id, age, height
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
income <- read_csv("https://raw.githubusercontent.com/cobriant/dplyrmurdermystery/master/data/income.csv")
## Rows: 7514 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): ssn, annual_income
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# ----- people -----
# You have 3 new datasets in the global environment. The first one we'll work on
# is `people`. Don't worry: all the SSNs, along with all the other information,
# is made up.
# Un-comment the code below (Ctrl/Cmd Shift C) to run and take a look at the
# 'people' dataset:
# view(people)
# To find information about someone named Daniel Dumire, we can filter() on
# name:
people %>%
filter(name == "Daniel Dumire")
## # A tibble: 1 × 6
## id name license_id address_number address_street_name ssn
## <dbl> <chr> <dbl> <dbl> <chr> <dbl>
## 1 27198 Daniel Dumire 953663 1355 Campground Blvd 483739224
# You can also use `str_detect()` to detect a character string for partial
# matches:
people %>%
filter(str_detect(name, "Daniel"))
## # A tibble: 13 × 6
## id name license_id address_number address_street_na… ssn
## <dbl> <chr> <dbl> <dbl> <chr> <dbl>
## 1 13155 Danielle Stutes 782669 2648 Greatness Dr 6.86e8
## 2 15008 Daniele Mcrobbie 459397 1793 Burntlodge Rd 1.20e8
## 3 21314 Daniel Schwendinger 219130 193 Rosegold Rd 1.05e8
## 4 27198 Daniel Dumire 953663 1355 Campground Blvd 4.84e8
## 5 39873 Danielle Funderburg 695950 3340 Mc Gloshen St 4.04e8
## 6 49701 Royce Daniels 787557 375 Cornells Way 2.21e8
## 7 53486 Danielle Palys 385970 902 Belgrave Dr 2.74e8
## 8 53784 Daniella Caramelo 331205 2333 Henry Knox Circle 1.58e8
## 9 64303 Daniela Nishina 892095 526 Hohener St 5.66e8
## 10 72820 Barry Danielian 263073 3971 Prater Ave 6.26e8
## 11 74908 Karrie Daniele 709311 1773 E Kohlman St 5.87e8
## 12 94523 Daniel Ales 681781 2232 Chatter Brook Cir… 3.15e8
## 13 94679 Janine Daniell 796342 1922 Scott Foresman Bl… 2.40e8
# Read the qelp docs on str_detect:
?qelp::str_detect
# The carrot `^` means "starts with":
people %>%
filter(str_detect(name, "^Daniel"))
## # A tibble: 9 × 6
## id name license_id address_number address_street_name ssn
## <dbl> <chr> <dbl> <dbl> <chr> <dbl>
## 1 13155 Danielle Stutes 782669 2648 Greatness Dr 6.86e8
## 2 15008 Daniele Mcrobbie 459397 1793 Burntlodge Rd 1.20e8
## 3 21314 Daniel Schwendinger 219130 193 Rosegold Rd 1.05e8
## 4 27198 Daniel Dumire 953663 1355 Campground Blvd 4.84e8
## 5 39873 Danielle Funderburg 695950 3340 Mc Gloshen St 4.04e8
## 6 53486 Danielle Palys 385970 902 Belgrave Dr 2.74e8
## 7 53784 Daniella Caramelo 331205 2333 Henry Knox Circle 1.58e8
## 8 64303 Daniela Nishina 892095 526 Hohener St 5.66e8
## 9 94523 Daniel Ales 681781 2232 Chatter Brook Circ… 3.15e8
# The dollar sign `$` means "ends with":
people %>%
filter(str_detect(name, "nson$"))
## # A tibble: 26 × 6
## id name license_id address_number address_street_na… ssn
## <dbl> <chr> <dbl> <dbl> <chr> <dbl>
## 1 14591 Jeremiah Johanson 429738 3037 Gaines Blvd 2.76e8
## 2 15740 Kylee Bronson 499686 3737 Weiland St 1.55e8
## 3 15898 Jammie Colbenson 193571 2492 Liberty Mill St 9.48e8
## 4 21624 Otha Mallinson 999509 3150 Trovillion Rd 1.36e8
## 5 26264 Audrea Ellanson 225456 3140 W Fortlee Dr 1.75e8
## 6 27303 Joseph Hutchenson 910671 3457 E Fortlee St 1.03e8
## 7 34155 Allyson Lazenson 814209 2840 Northwestern Dr 9.91e8
## 8 37357 Gwendolyn Anson 269007 2497 Colleen Garden Way 2.99e8
## 9 40825 Leslie Etchinson 847905 3422 Lund Way 5.54e8
## 10 42169 Malisa Christianson 951727 234 Old Amboy Ave 8.40e8
## # … with 16 more rows
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# 1. Query the data to find out if there's anyone with your first or last ------
# name using `|`, which is the logical operator for "or". If not, is there
# anyone with the first few letters of your first name, or the last few letters
# of your last name?
#1@
# people %>%
# filter(str_detect(__, __) | str_detect(__, __))
#@1
# 2. Use 'select()' to find Daniel Dumire's license_id. ------------------------
#2@
# people %>%
# filter(__) %>%
# select(__)
#@2
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# ----- drivers_license -----
#
# 3. Now look for Daniel Dumire in the 'drivers_license' tibble. ---------------
# The problem: there's no 'name' column in that tibble. You have to search
# instead by 'id'. That is, 'license_id' in the 'people' tibble can be matched
# with 'id' in the 'drivers_license' tibble.
#3@
# drivers_license %>%
# filter(id == __)
#@3
# ----- income -----
# 4. Find a person whose name is Bennett Jimmerson and find their income. ------
# You can follow the same 2-step process as 2 and 3. Which variables
# can be matched between 'income' and 'people'? (use 'names(income)'
# and 'names(people)' to quickly remind yourself of the variables in
# each tibble)
#4@
# __
# __
#@4
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# ----- left_join() -----
#
# A more efficient way to solve questions like 2, 3, and 4 is to join
# the tibbles by the matching variables, which are called "keys".
# We can join 'people' and 'drivers_license' by 'license_id' = 'id':
people %>%
left_join(drivers_license, by = c("license_id" = "id"))
## # A tibble: 10,011 × 14
## id name license_id address_number address_street_… ssn age height
## <dbl> <chr> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 10000 Christo… 993845 624 Bankhall Ave 7.48e8 46 59
## 2 10007 Kourtne… 861794 2791 Gustavus Blvd 4.78e8 54 74
## 3 10010 Muoi Ca… 385336 741 Northwestern Dr 8.29e8 24 79
## 4 10016 Era Mos… 431897 1987 Wood Glade St 6.15e8 22 75
## 5 10025 Trena H… 550890 276 Daws Hill Way 2.24e8 45 60
## 6 10027 Antione… 439509 2431 Zelham Dr 4.92e8 56 72
## 7 10034 Kyra Bu… 920494 1873 Sleigh Dr 3.32e8 56 69
## 8 10039 Frances… 278151 736 Buswell Dr 8.61e8 87 54
## 9 10095 Leslie … 729987 2772 Camellia Park C… 1.28e8 83 74
## 10 10122 Alva Co… 779002 116 Diversey Circle 1.49e8 52 64
## # … with 10,001 more rows, and 6 more variables: eye_color <chr>,
## # hair_color <chr>, gender <chr>, plate_number <chr>, car_make <chr>,
## # car_model <chr>
?qelp::left_join
# To get a complete set of information about Daniel Dumire,
# I join 'people' with 'drivers_license' by the keys the two tibbles
# have in common, then filter on "Daniel Dumire" and select the
# information I'm interested in:
people %>%
left_join(drivers_license, by = c("license_id" = "id")) %>%
filter(name == "Daniel Dumire") %>%
select(name, age, height, eye_color, hair_color, gender, car_make)
## # A tibble: 1 × 7
## name age height eye_color hair_color gender car_make
## <chr> <dbl> <dbl> <chr> <chr> <chr> <chr>
## 1 Daniel Dumire 27 68 amber grey female Buick
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# 5. Your turn: use left_join() to join 'people' with 'income'. ----------------
# Since the key has the same name in both tibbles ("ssn"), 'by = "ssn"'
# will work.
#5@
# __ %>%
# __(__, by = "ssn")
#@5
# 6. Copy-paste your code for (5) and add another step, querying the -----------
# resulting tibble to find Daniel Dumire's income.
#6@
# __
#@6
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# 7. One more join problem: join 'drivers_license' with 'income'. This is ------
# tricky because they don't share any keys! You'll have to use 'left_join'
# twice and make use of the 'people' tibble.
#7@
# __
#@7
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# Last notes about joins:
# We've been joining and then filtering, like this:
people %>%
left_join(drivers_license, by = c("license_id" = "id")) %>%
filter(name == "Daniel Dumire")
## # A tibble: 1 × 14
## id name license_id address_number address_street_… ssn age height
## <dbl> <chr> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 27198 Daniel D… 953663 1355 Campground Blvd 4.84e8 27 68
## # … with 6 more variables: eye_color <chr>, hair_color <chr>, gender <chr>,
## # plate_number <chr>, car_make <chr>, car_model <chr>
# But if you're only interested in Daniel Dumire, then you can also filter and
# then join, like this:
people %>%
filter(name == "Daniel Dumire") %>%
left_join(drivers_license, by = c("license_id" = "id"))
## # A tibble: 1 × 14
## id name license_id address_number address_street_… ssn age height
## <dbl> <chr> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 27198 Daniel D… 953663 1355 Campground Blvd 4.84e8 27 68
## # … with 6 more variables: eye_color <chr>, hair_color <chr>, gender <chr>,
## # plate_number <chr>, car_make <chr>, car_model <chr>
# left_join() is one of the most useful joins, and it's the only join you'll
# need to solve the murder mystery project, but there are others. 99% of
# the time, you'll only ever need left_join() and inner_join(). Check out the
# dplyr cheat sheet (page 2, right hand side) for more information:
# https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# Great work! You're one step closer to tidyverse enlightenment. Make sure to
# return to this topic to meditate on it later.
# If you're ready, you can move on to koan 8: qplot to ggplot.