Unit 2 Practice Test

1 GridWorld

Consider this GridWorld:

┌───┬───┬───┐
│ 3 │ 0 │ 0 │
├───┼───┼───┤
│ 0 │ 2 │ 0 │
├───┼───┼───┤
│ 0 │ 0 │ 0 │
└───┴───┴───┘

For the discount rate 0.9, write down the optimal policy function and the optimal value function, and compute Q(1, east) and Q(1, south).

2 Concepts. Answer each of these questions in 1-2 sentences.

  1. What is the key difference between the Monte Carlo “first decision only” trial and error Q-estimation and the step-by-step Q-learning update?

  2. Explain the roles of exploration versus exploitation in Q-learning.

  3. Explain how an agent’s best response in a given cell may change when preference shocks are introduced.

3 Q-Learning

You are given payoffs(), move(), actions, init_Q(), get_Q(), set_Q(), and animate_episode(). Complete the functions below. When correct, the provided test should pass and the animation should draw.

actions <- c("north", "south", "east", "west", "stay")

payoffs <- function(position) c(3, 0, 0, 0, 2, 0, 0, 0, 0)[position]

move <- function(cell, action) {
  # Deterministic transition function for the 3x3 grid (cells numbered 1..9,
  # row-major). Moves that would leave the grid keep the agent in place.
  # Column position: cells 3, 6, 9 are the east edge (cell %% 3 == 0);
  # cells 1, 4, 7 are the west edge (cell %% 3 == 1).
  switch(
    action,
    stay  = cell,
    south = if (cell <= 6) cell + 3 else cell,
    north = if (cell >= 4) cell - 3 else cell,
    east  = if (cell %% 3 != 0) cell + 1 else cell,
    west  = if (cell %% 3 != 1) cell - 1 else cell
  )
}

init_Q <- function() {
  # Build the full state-action table (9 states x 5 actions, 45 rows)
  # with every Q-value initialized to zero.
  pairs <- expand.grid(
    state = 1:9,
    action = actions,
    stringsAsFactors = FALSE
  )
  pairs %>%
    as_tibble() %>%
    mutate(Q = 0)
}

get_Q <- function(Qtab, s, a) {
  # Look up the Q-value stored for the (state, action) pair (s, a).
  row <- filter(Qtab, state == s, action == a)
  pull(row, Q)
}

set_Q <- function(Qtab, s, a, value) {
  # Overwrite the Q-value for the (state, action) pair (s, a) and return
  # the modified table (Qtab itself is not changed in the caller).
  hit <- Qtab$state == s & Qtab$action == a
  Qtab$Q[hit] <- value
  Qtab
}

greedy_action <- function(Qtab, s) {
  # Return an action with maximal Q-value in state s; when several actions
  # tie for the maximum, pick one of them uniformly at random.
  best_rows <- Qtab %>%
    filter(state == s) %>%
    slice_max(Q, with_ties = TRUE)
  best_rows %>%
    pull(action) %>%
    sample(size = 1)
}
epsilon_greedy_action <- function(Qtab, s, eps) {
  # Epsilon-greedy action selection in state s.
  # With probability eps: explore by choosing uniformly among all actions.
  # With probability 1-eps: exploit via greedy_action() (ties broken randomly).
  #
  # Qtab: Q table with columns state, action, Q (see init_Q()).
  # s:    current state (cell number 1..9).
  # eps:  exploration probability in [0, 1].
  # Returns a single action name from `actions`.
  if (runif(1) < eps) {
    sample(actions, size = 1)
  } else {
    greedy_action(Qtab, s)
  }
}

q_update <- function(Qtab, s, a, r, s_next, alpha = 0.2, gamma = 0.9) {
  # One-step Q-learning update:
  #   Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s_next, a') - Q(s, a))
  #
  # Qtab:   Q table with columns state, action, Q (see init_Q()).
  # s, a:   state visited and action taken.
  # r:      reward received for that step.
  # s_next: resulting state.
  # alpha:  learning rate; gamma: discount factor.
  # Returns the updated Q table.
  q_old <- get_Q(Qtab, s, a)
  # Best available Q-value in the successor state (the bootstrap target).
  q_next_max <- max(Qtab$Q[Qtab$state == s_next])
  set_Q(Qtab, s, a, q_old + alpha * (r + gamma * q_next_max - q_old))
}

run_episode_qlearn <- function(Qtab, eps) {
  # Run one episode of epsilon-greedy Q-learning on the GridWorld.
  # The episode starts in a uniformly random cell; after every step it
  # continues with probability 0.9 (so episodes have expected length 10).
  #
  # Qtab: Q table from init_Q() (or carried over from a previous episode).
  # eps:  exploration probability passed to epsilon_greedy_action().
  # Returns a list with the updated Qtab and a tibble log of the episode
  # (columns t, state, action, payoff, next_state).
  state <- sample(1:9, size = 1)
  game_continues <- TRUE

  states <- c()
  actions_taken <- c()
  rewards <- c()
  next_states <- c()

  while (game_continues) {
    a <- epsilon_greedy_action(Qtab, state, eps)
    s_next <- move(state, a)
    # Reward convention: the payoff of the cell the agent lands in.
    # NOTE(review): the worksheet does not state whether the payoff accrues
    # in the current or the next cell — confirm against the unit's convention.
    r <- payoffs(s_next)

    Qtab <- q_update(Qtab, state, a, r, s_next)

    states <- c(states, state)
    actions_taken <- c(actions_taken, a)
    rewards <- c(rewards, r)
    next_states <- c(next_states, s_next)

    # Geometric continuation: stop with probability 0.1 after each step.
    game_continues <- sample(c(TRUE, FALSE), size = 1, prob = c(0.9, 0.1))
    state <- s_next
  }

  list(
    Qtab = Qtab,
    episode = tibble(
      t = seq_along(states),
      state = states,
      action = actions_taken,
      payoff = rewards,
      next_state = next_states
    )
  )
}

Run and animate:

# Smoke test: once the blanks above are filled in, both functions should run
# without error on a fresh Q table. This only checks types, not correctness.
test_that("epsilon-greedy and q_update run without error", {
  Qtab <- init_Q()
  a <- epsilon_greedy_action(Qtab, 5, eps = 0.3)
  expect_true(a %in% actions)
  Qtab2 <- q_update(Qtab, s = 5, a = "stay", r = 2, s_next = 5)
  expect_true(is.data.frame(Qtab2))
})

animate_episode <- function(episode, payoff_vec = c(3, 0, 0, 0, 2, 0, 0, 0, 0)) {
  # Animate one episode's path through the 3x3 grid as a GIF.
  #
  # episode:    tibble with columns state and next_state, one row per step
  #             (as produced by run_episode_qlearn()).
  # payoff_vec: per-cell payoffs used for tile colors/labels. The default now
  #             matches the GridWorld used throughout this worksheet
  #             (payoffs(): 3 in cell 1, 2 in cell 5) — the previous default
  #             c(0, 0, 0, 1, 2, 1, 0, 0, 0) drew a different grid.
  # Returns a gganimate animation object.

  # Map a cell number (1..9, row-major) to plot coordinates; row 1 is drawn
  # at the top (y = 3).
  xy3 <- function(s) tibble(
    x = ((s - 1) %% 3) + 1,
    y = 3 - ((s - 1) %/% 3)
  )

  grid_df <- tibble(state = 1:9, payoff = payoff_vec) %>%
    bind_cols(xy3(1:9))

  # The visited cells: every step's state plus the final next_state.
  states_path <- c(episode$state, tail(episode$next_state, 1))

  traj <- tibble(
    step = seq_along(states_path),
    state = states_path
  ) %>%
    bind_cols(xy3(states_path)) %>%
    # Small jitter so repeated visits to a cell remain visible as a path.
    mutate(
      x = x + rnorm(n(), sd = 0.05),
      y = y + rnorm(n(), sd = 0.05)
    )

  # Dynamic color map for whatever values are in payoff_vec
  payoff_levels <- sort(unique(payoff_vec))
  payoff_levels_chr <- as.character(payoff_levels)

  # Make a palette (base R) and name it to match factor(payoff) levels
  pal <- grDevices::hcl.colors(length(payoff_levels), palette = "Zissou 1")
  color_map <- stats::setNames(pal, payoff_levels_chr)

  p <- ggplot(grid_df, aes(x, y)) +
    geom_tile(aes(fill = factor(payoff, levels = payoff_levels)), color = NA) +
    geom_text(aes(label = payoff), color = "white", size = 7) +
    scale_fill_manual(values = color_map, guide = "none") +
    geom_path(
      data = traj,
      aes(x = x, y = y, group = 1),
      linewidth = 1,
      color = "black"
    ) +
    coord_equal() +
    theme_void() +
    gganimate::transition_reveal(step)

  gganimate::animate(
    p,
    nframes = nrow(traj),
    fps = 5,
    width = 200,
    height = 200,
    renderer = gganimate::gifski_renderer(loop = TRUE)
  )
}

# Driver: learn from a single episode and render the agent's path.
# set.seed(1) makes the random start cell, action choices, and episode
# length reproducible.
Qtab <- init_Q()
set.seed(1)
out <- run_episode_qlearn(Qtab, eps = 0.3)
anim <- animate_episode(out$episode)
anim

4 RL, IRL, and Binary Choice

  1. What is the reinforcement learning problem?

  2. What is the inverse reinforcement learning problem?

  3. Where might inverse reinforcement learning be especially useful?

  4. What is the difference between a logit and a probit?

5 Markov Chains

Consider a town with two restaurants: a pizza place and a Mexican restaurant.

A study found the following patterns:

  • Of those who eat at the pizza place in a given week, 75% return to the pizza place the following week, and 25% go to the Mexican restaurant the following week.
  • Of those who eat at the Mexican restaurant in a given week, 40% go to the pizza place the following week, and 60% return to the Mexican restaurant the following week.
  1. Write R code to create the transition matrix.
  2. Suppose 100 people go to the Mexican restaurant in the first week and 0 people go to the pizza place. How many people will go to the Mexican restaurant in week 2?
  3. Find the long-run equilibrium using the matrix power operator `%^%` from the package expm.