Unit 2 Practice Test

1 GridWorld

Consider this GridWorld:

┌───┬───┬───┐
│ 3 │ 0 │ 0 │
├───┼───┼───┤
│ 0 │ 2 │ 0 │
├───┼───┼───┤
│ 0 │ 0 │ 0 │
└───┴───┴───┘

For the discount rate 0.9, write down the optimal policy function and the optimal value function, and compute Q(1, east) and Q(1, south).

2 Concepts. Answer each of these questions in 1-2 sentences.

  1. What is the key difference between the Monte Carlo “first decision only” trial and error Q-estimation and the step-by-step Q-learning update?

  2. Explain the roles of exploration versus exploitation in Q-learning.

  3. Explain how an agent’s best response in a given cell may change when preference shocks are introduced.

3 Q-Learning

You are given payoffs(), move(), actions, init_Q(), get_Q(), set_Q(), and animate_episode(). Complete the functions below. When correct, the provided test should pass and the animation should draw.

actions <- c("north", "south", "east", "west", "stay")

payoffs <- function(position) c(3, 0, 0, 0, 2, 0, 0, 0, 0)[position]

move <- function(cell, action) {
  # Deterministic transition function for the 3x3 grid (cells numbered 1..9,
  # row-major). Moves that would leave the grid keep the agent in place.
  # Column position: cells 3, 6, 9 are the east edge (cell %% 3 == 0);
  # cells 1, 4, 7 are the west edge (cell %% 3 == 1).
  switch(
    action,
    stay  = cell,
    south = if (cell <= 6) cell + 3 else cell,
    north = if (cell >= 4) cell - 3 else cell,
    east  = if (cell %% 3 != 0) cell + 1 else cell,
    west  = if (cell %% 3 != 1) cell - 1 else cell
  )
}

init_Q <- function() {
  # Build the full state-action table (9 states x 5 actions, 45 rows)
  # with every Q-value initialized to zero.
  pairs <- expand.grid(
    state = 1:9,
    action = actions,
    stringsAsFactors = FALSE
  )
  pairs %>%
    as_tibble() %>%
    mutate(Q = 0)
}

get_Q <- function(Qtab, s, a) {
  # Look up the Q-value stored for the (state, action) pair (s, a).
  row <- filter(Qtab, state == s, action == a)
  pull(row, Q)
}

set_Q <- function(Qtab, s, a, value) {
  # Overwrite the Q-value for the (state, action) pair (s, a) and return
  # the modified table (Qtab itself is not changed in the caller).
  hit <- Qtab$state == s & Qtab$action == a
  Qtab$Q[hit] <- value
  Qtab
}

greedy_action <- function(Qtab, s) {
  # Return an action with maximal Q-value in state s; when several actions
  # tie for the maximum, pick one of them uniformly at random.
  best_rows <- Qtab %>%
    filter(state == s) %>%
    slice_max(Q, with_ties = TRUE)
  best_rows %>%
    pull(action) %>%
    sample(size = 1)
}
epsilon_greedy_action <- function(Qtab, s, eps) {
  # Epsilon-greedy action selection in state s.
  # With probability eps: explore by choosing uniformly among all actions.
  # With probability 1-eps: exploit via greedy_action() (ties broken randomly).
  #
  # Qtab: Q table with columns state, action, Q (see init_Q()).
  # s:    current state (cell number 1..9).
  # eps:  exploration probability in [0, 1].
  # Returns a single action name from `actions`.
  if (runif(1) < eps) {
    sample(actions, size = 1)
  } else {
    greedy_action(Qtab, s)
  }
}

q_update <- function(Qtab, s, a, r, s_next, alpha = 0.2, gamma = 0.9) {
  # One-step Q-learning update:
  #   Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s_next, a') - Q(s, a))
  #
  # Qtab:   Q table with columns state, action, Q (see init_Q()).
  # s, a:   state visited and action taken.
  # r:      reward received for that step.
  # s_next: resulting state.
  # alpha:  learning rate; gamma: discount factor.
  # Returns the updated Q table.
  q_old <- get_Q(Qtab, s, a)
  # Best available Q-value in the successor state (the bootstrap target).
  q_next_max <- max(Qtab$Q[Qtab$state == s_next])
  set_Q(Qtab, s, a, q_old + alpha * (r + gamma * q_next_max - q_old))
}

run_episode_qlearn <- function(Qtab, eps) {
  # Run one episode of epsilon-greedy Q-learning on the GridWorld.
  # The episode starts in a uniformly random cell; after every step it
  # continues with probability 0.9 (so episodes have expected length 10).
  #
  # Qtab: Q table from init_Q() (or carried over from a previous episode).
  # eps:  exploration probability passed to epsilon_greedy_action().
  # Returns a list with the updated Qtab and a tibble log of the episode
  # (columns t, state, action, payoff, next_state).
  state <- sample(1:9, size = 1)
  game_continues <- TRUE

  states <- c()
  actions_taken <- c()
  rewards <- c()
  next_states <- c()

  while (game_continues) {
    a <- epsilon_greedy_action(Qtab, state, eps)
    s_next <- move(state, a)
    # Reward convention: the payoff of the cell the agent lands in.
    # NOTE(review): the worksheet does not state whether the payoff accrues
    # in the current or the next cell — confirm against the unit's convention.
    r <- payoffs(s_next)

    Qtab <- q_update(Qtab, state, a, r, s_next)

    states <- c(states, state)
    actions_taken <- c(actions_taken, a)
    rewards <- c(rewards, r)
    next_states <- c(next_states, s_next)

    # Geometric continuation: stop with probability 0.1 after each step.
    game_continues <- sample(c(TRUE, FALSE), size = 1, prob = c(0.9, 0.1))
    state <- s_next
  }

  list(
    Qtab = Qtab,
    episode = tibble(
      t = seq_along(states),
      state = states,
      action = actions_taken,
      payoff = rewards,
      next_state = next_states
    )
  )
}

Run and animate:

# Smoke test: once the blanks above are filled in, both functions should run
# without error on a fresh Q table. This only checks types, not correctness.
test_that("epsilon-greedy and q_update run without error", {
  Qtab <- init_Q()
  a <- epsilon_greedy_action(Qtab, 5, eps = 0.3)
  expect_true(a %in% actions)
  Qtab2 <- q_update(Qtab, s = 5, a = "stay", r = 2, s_next = 5)
  expect_true(is.data.frame(Qtab2))
})

animate_episode <- function(episode, payoff_vec = c(3, 0, 0, 0, 2, 0, 0, 0, 0)) {
  # Animate one episode's path through the 3x3 grid as a GIF.
  #
  # episode:    tibble with columns state and next_state, one row per step
  #             (as produced by run_episode_qlearn()).
  # payoff_vec: per-cell payoffs used for tile colors/labels. The default now
  #             matches the GridWorld used throughout this worksheet
  #             (payoffs(): 3 in cell 1, 2 in cell 5) — the previous default
  #             c(0, 0, 0, 1, 2, 1, 0, 0, 0) drew a different grid.
  # Returns a gganimate animation object.

  # Map a cell number (1..9, row-major) to plot coordinates; row 1 is drawn
  # at the top (y = 3).
  xy3 <- function(s) tibble(
    x = ((s - 1) %% 3) + 1,
    y = 3 - ((s - 1) %/% 3)
  )

  grid_df <- tibble(state = 1:9, payoff = payoff_vec) %>%
    bind_cols(xy3(1:9))

  # The visited cells: every step's state plus the final next_state.
  states_path <- c(episode$state, tail(episode$next_state, 1))

  traj <- tibble(
    step = seq_along(states_path),
    state = states_path
  ) %>%
    bind_cols(xy3(states_path)) %>%
    # Small jitter so repeated visits to a cell remain visible as a path.
    mutate(
      x = x + rnorm(n(), sd = 0.05),
      y = y + rnorm(n(), sd = 0.05)
    )

  # Dynamic color map for whatever values are in payoff_vec
  payoff_levels <- sort(unique(payoff_vec))
  payoff_levels_chr <- as.character(payoff_levels)

  # Make a palette (base R) and name it to match factor(payoff) levels
  pal <- grDevices::hcl.colors(length(payoff_levels), palette = "Zissou 1")
  color_map <- stats::setNames(pal, payoff_levels_chr)

  p <- ggplot(grid_df, aes(x, y)) +
    geom_tile(aes(fill = factor(payoff, levels = payoff_levels)), color = NA) +
    geom_text(aes(label = payoff), color = "white", size = 7) +
    scale_fill_manual(values = color_map, guide = "none") +
    geom_path(
      data = traj,
      aes(x = x, y = y, group = 1),
      linewidth = 1,
      color = "black"
    ) +
    coord_equal() +
    theme_void() +
    gganimate::transition_reveal(step)

  gganimate::animate(
    p,
    nframes = nrow(traj),
    fps = 5,
    width = 200,
    height = 200,
    renderer = gganimate::gifski_renderer(loop = TRUE)
  )
}

# Driver: learn from a single episode and render the agent's path.
# set.seed(1) makes the random start cell, action choices, and episode
# length reproducible.
Qtab <- init_Q()
set.seed(1)
out <- run_episode_qlearn(Qtab, eps = 0.3)
anim <- animate_episode(out$episode)
anim

4 RL, IRL, and Binary Choice

  1. What is the reinforcement learning problem?

  2. What is the inverse reinforcement learning problem?

  3. Where might inverse reinforcement learning be especially useful?

  4. What is the difference between a logit and a probit?

5 Markov Chains

Consider a town with two restaurants: a pizza place and a Mexican restaurant.

A study found the following patterns:

  • Of those who eat at the pizza place in a given week, 75% return to the pizza place the following week, and 25% go to the Mexican restaurant the following week.
  • Of those who eat at the Mexican restaurant in a given week, 40% go to the pizza place the following week, and 60% return to the Mexican restaurant the following week.
  1. Write R code to create the transition matrix.
  2. Suppose 100 people go to the Mexican restaurant in the first week and 0 people go to the pizza place. How many people will go to the Mexican restaurant in week 2?
  3. Find the long-run equilibrium using the matrix power operator `%^%` from the package expm.