# Action set available to the agent in the 3x3 GridWorld.
actions <- c("north", "south", "east", "west", "stay")

# Per-cell payoff for the 3x3 grid, cells numbered 1-9 row-major:
# cell 1 pays 3, cell 5 pays 2, all other cells pay 0.
# Vectorized: `position` may be a single cell or a vector of cells.
payoffs <- function(position) {
  reward_by_cell <- c(3, 0, 0, 0, 2, 0, 0, 0, 0)
  reward_by_cell[position]
}
# Apply `action` to `cell` on the 3x3 grid (cells 1-9, row-major:
# 1-2-3 on the top row, 7-8-9 on the bottom row).
#
# Moves that would leave the grid keep the agent in place; "stay" is a
# no-op. Returns the resulting cell number.
#
# Fix over the original: an unrecognized action used to fall through every
# branch and silently return NULL; it now raises an explicit error.
move <- function(cell, action) {
  switch(action,
    stay = cell,
    # south: rows 1-2 (cells 1-6) can move down one row (+3).
    south = if (cell <= 6) cell + 3 else cell,
    # north: rows 2-3 (cells 4-9) can move up one row (-3).
    north = if (cell >= 4) cell - 3 else cell,
    # east: blocked in the right column (cells 3, 6, 9, i.e. cell %% 3 == 0).
    east = if (cell %% 3 != 0) cell + 1 else cell,
    # west: blocked in the left column (cells 1, 4, 7, i.e. cell %% 3 == 1).
    west = if (cell %% 3 != 1) cell - 1 else cell,
    stop("Unknown action: ", action, call. = FALSE)
  )
}
# Build the initial Q-table: one row per (state, action) pair — 9 states
# crossed with the global `actions` vector — with every Q-value set to 0.
# States vary fastest (matching expand.grid's ordering); the result is a
# tibble with columns state, action (character), Q.
init_Q <- function() {
  tibble(
    state = rep(1:9, times = length(actions)),
    action = rep(actions, each = 9),
    Q = 0
  )
}
# Look up the Q-value stored in `Qtab` for state `s` and action `a`.
# Returns the matching entry of the Q column (a numeric vector, normally
# of length 1 since each (state, action) pair appears once).
get_Q <- function(Qtab, s, a) {
  Qtab$Q[Qtab$state == s & Qtab$action == a]
}
# Return a copy of `Qtab` with Q(s, a) overwritten by `value`.
# The input table is not modified in place (copy-on-modify semantics).
set_Q <- function(Qtab, s, a, value) {
  hit <- Qtab$state == s & Qtab$action == a
  Qtab$Q[hit] <- value
  Qtab
}
# Pick a greedy action for state `s`: among the actions whose Q-value is
# maximal in that state, choose one uniformly at random (random
# tie-breaking via sample()).
greedy_action <- function(Qtab, s) {
  best <- Qtab %>%
    filter(state == s) %>%
    slice_max(Q, with_ties = TRUE) %>%
    pull(action)
  sample(best, size = 1)
}
Unit 2 Practice Test
1 GridWorld
Consider this GridWorld:
┌───┬───┬───┐
│ 3 │ 0 │ 0 │
├───┼───┼───┤
│ 0 │ 2 │ 0 │
├───┼───┼───┤
│ 0 │ 0 │ 0 │
└───┴───┴───┘
For the discount rate 0.9, write down the policy function, value function, and compute Q(1, east) and Q(1, south).
2 Concepts. Answer each of these questions in 1-2 sentences.
What is the key difference between the Monte Carlo “first decision only” trial and error Q-estimation and the step-by-step Q-learning update?
Explain the roles of exploration versus exploitation in Q-learning.
Explain how an agent’s best response may change in a certain cell based on preference shocks.
3 Q-Learning
You are given payoffs(), move(), actions, init_Q(), get_Q(), set_Q(), and animate_episode(). Complete the functions below. When correct, the provided test should pass and the animation should draw.
# STUDENT EXERCISE — fill in the blank (___). Do not remove the blank.
# Intended contract: return a single action name from `actions`, chosen
# uniformly at random with probability `eps` (exploration), and otherwise
# via greedy_action(Qtab, s) (exploitation, ties broken randomly).
epsilon_greedy_action <- function(Qtab, s, eps) {
# With probability eps: explore by random action
# With probability 1-eps: exploit by greedy action (ties broken randomly)
___
}
# STUDENT EXERCISE — fill in the blank (___). Do not remove the blank.
# Intended contract: apply one Q-learning update for the transition
# (s, a, r, s_next) with learning rate `alpha` and discount `gamma`,
# and return the updated Q-table (use get_Q()/set_Q()).
q_update <- function(Qtab, s, a, r, s_next, alpha = 0.2, gamma = 0.9) {
# Compute:
# Q_next = Q_old + alpha * (r + gamma * max_a' Q(s_next, a') - Q_old)
# Return updated Qtab
___
}
# STUDENT EXERCISE — fill in the blanks (___). Do not remove them.
# Runs one episode of Q-learning from a uniformly random start state.
# Each step: pick an action (epsilon-greedy), collect the reward for the
# current state, move, update the Q-table, then continue with probability
# 0.9 (i.e. the episode ends each step with probability 0.1).
# Returns a list with the updated Q-table and a tibble log of the episode.
# NOTE(review): the log vectors grow with c() inside the loop — fine for a
# teaching example, but preallocation would be preferred in production code.
run_episode_qlearn <- function(Qtab, eps) {
state <- sample(1:9, size = 1)
game_continues <- TRUE
states <- c()
actions_taken <- c()
rewards <- c()
next_states <- c()
while (game_continues) {
# Blank 1: choose the action via epsilon_greedy_action(Qtab, state, eps).
a <- ___
# Blank 2: reward for the current state, via payoffs(state).
r <- ___
# Blank 3: resulting state, via move(state, a).
s_next <- ___
# Blank 4: updated table, via q_update(Qtab, state, a, r, s_next).
Qtab <- ___
states <- c(states, state)
actions_taken <- c(actions_taken, a)
rewards <- c(rewards, r)
next_states <- c(next_states, s_next)
# Geometric episode length: continue with probability 0.9.
game_continues <- sample(c(TRUE, FALSE), size = 1, prob = c(0.9, 0.1))
state <- s_next
}
list(
Qtab = Qtab,
episode = tibble(
t = 1:length(states),
state = states,
action = actions_taken,
payoff = rewards,
next_state = next_states
)
)
}
Run and animate:
# Smoke test for the student solutions: checks only that both functions
# run and return plausible types, not that the Q-values are correct.
test_that("epsilon-greedy and q_update run without error", {
Qtab <- init_Q()
# Action chosen for state 5 must be one of the five legal actions.
a <- epsilon_greedy_action(Qtab, 5, eps = 0.3)
expect_true(a %in% actions)
# One update step must return a data frame (the updated Q-table).
Qtab2 <- q_update(Qtab, s = 5, a = "stay", r = 2, s_next = 5)
expect_true(is.data.frame(Qtab2))
})
# Animate an episode log (tibble with columns state, next_state, ...) as a
# path drawn over the 3x3 payoff grid, using gganimate. Returns the
# rendered animation object.
# NOTE(review): the default payoff_vec (0,0,0,1,2,1,0,0,0) does not match
# the payoffs() function above (3,0,0,0,2,0,0,0,0) — confirm which grid is
# intended for display.
animate_episode <- function(episode, payoff_vec = c(0, 0, 0, 1, 2, 1, 0, 0, 0)) {
# Map a state 1-9 (row-major from the top-left) to plot coordinates:
# x is the column 1-3, y is flipped so state 1 lands at the top row.
xy3 <- function(s) tibble(
x = ((s - 1) %% 3) + 1,
y = 3 - ((s - 1) %/% 3)
)
grid_df <- tibble(state = 1:9, payoff = payoff_vec) %>%
bind_cols(xy3(1:9))
# Full visited path: every logged state plus the final next_state.
states_path <- c(episode$state, tail(episode$next_state, 1))
traj <- tibble(
step = seq_along(states_path),
state = states_path
) %>%
bind_cols(xy3(states_path)) %>%
mutate(
# Small jitter so repeated visits to the same cell remain visible
# as distinct path segments.
x = x + rnorm(n(), sd = 0.05),
y = y + rnorm(n(), sd = 0.05)
)
# Dynamic color map for whatever values are in payoff_vec
payoff_levels <- sort(unique(payoff_vec))
payoff_levels_chr <- as.character(payoff_levels)
# Make a palette (base R) and name it to match factor(payoff) levels
pal <- grDevices::hcl.colors(length(payoff_levels), palette = "Zissou 1")
color_map <- stats::setNames(pal, payoff_levels_chr)
p <- ggplot(grid_df, aes(x, y)) +
geom_tile(aes(fill = factor(payoff, levels = payoff_levels)), color = NA) +
geom_text(aes(label = payoff), color = "white", size = 7) +
scale_fill_manual(values = color_map, guide = "none") +
geom_path(
data = traj,
aes(x = x, y = y, group = 1),
linewidth = 1,
color = "black"
) +
coord_equal() +
theme_void() +
# Reveal the path one step at a time.
gganimate::transition_reveal(step)
gganimate::animate(
p,
nframes = nrow(traj),
fps = 5,
width = 200,
height = 200,
renderer = gganimate::gifski_renderer(loop = TRUE)
)
}
# Demo driver: initialize the Q-table, run one seeded epsilon-greedy
# episode, and animate the visited path. Requires the student blanks
# above to be completed before it will run.
Qtab <- init_Q()
set.seed(1)
out <- run_episode_qlearn(Qtab, eps = 0.3)
anim <- animate_episode(out$episode)
anim
4 RL, IRL, and Binary Choice
What is the reinforcement learning problem?
What is the inverse reinforcement learning problem?
Where might inverse reinforcement learning be especially useful?
What is the difference between a logit and a probit?
5 Markov Chains
Consider a town with two restaurants: a pizza place and a Mexican restaurant.
A study found the following patterns:
- Of those who eat at the pizza place in a given week, 75% return to the pizza place the following week, and 25% go to the Mexican restaurant the following week.
- Of those who eat at the Mexican restaurant in a given week, 40% go to the pizza place the following week, and 60% return to the Mexican restaurant the following week.
- Write R code to create the transition matrix.
- Suppose 100 people go to the Mexican restaurant in the first week and 0 people go to the pizza place. How many people will go to the Mexican restaurant in week 2?
- Find the long-run equilibrium using the matrix power operator `%^%` from the package `expm`.