library(tidyverse)
actions <- c("north", "south", "east", "west", "stay")
# Deterministic transition function for the 3x3 GridWorld.
# Cells are numbered 1..9 row by row:
#   1 2 3
#   4 5 6
#   7 8 9
# A move that would leave the grid leaves the agent where it is.
move <- function(cell, action) {
  switch(action,
    stay  = cell,
    south = if (cell <= 6) cell + 3 else cell,
    north = if (cell >= 4) cell - 3 else cell,
    east  = if (cell %in% c(1, 2, 4, 5, 7, 8)) cell + 1 else cell,
    west  = if (cell %in% c(2, 3, 5, 6, 8, 9)) cell - 1 else cell
  )
}
# Payoff (reward) for occupying each cell: the middle row pays 1, 2, 1;
# every other cell pays 0. `position` may be a single cell or a vector
# of cells (the lookup is vectorized).
payoffs <- function(position) {
c(0, 0, 0, 1, 2, 1, 0, 0, 0)[position]
}Q-Learning: Exploration vs Exploitation
In the last unit 2 classwork, you used trial-and-error learning to estimate action values using the total payoff from an episode.
Today you will learn Q-Learning, which updates \(Q(s,a)\) one step at a time, using the idea:
“If I take action \(a\) in state \(s\) and then behave optimally from the next state onward, what payoff should I expect?”
The key ideas:
- Action values \(Q(s,a)\) store how good it is to take action \(a\) in state \(s\).
- Exploration vs exploitation: sometimes try random actions (explore) so you can learn, but usually choose the best-known action (exploit).
- Temporal-difference (TD) learning: update \(Q(s,a)\) from a single transition \((s, a, r, s')\) using a bootstrapped target \(r + \gamma \max_{a'} Q(s',a')\).
GridWorld
We’ll use the same GridWorld as before, with no shocks yet:
┌───┬───┬───┐
│ 0 │ 0 │ 0 │
├───┼───┼───┤
│ 1 │ 2 │ 1 │
├───┼───┼───┤
│ 0 │ 0 │ 0 │
└───┴───┴───┘
Use the same move() function to handle transitions:
Part 1: The Q-Learning Update
Q-learning uses this update rule: New estimate = Old estimate + Learning rate x Prediction Error
\[Q(s, a) \leftarrow Q(s, a) + \alpha(r + 0.9 \max_{a'} Q(s', a') - Q(s, a))\]
- \(\alpha \in (0, 1]\) is the learning rate (we’ll use 0.2)
- \(r\) is the payoff (reward) you got this step
- \(s'\) is the next state after taking action \(a\)
Question 1: Numeric Warm-up
Suppose current \(Q(s, a) = 5\), \(\alpha = 0.2\), you take action \(a\) in state \(s\) and get a payoff of \(r = 1\), the next state is \(s'\) and \(\max_{a'} Q(s', a') = 8\). Compute the updated value of \(Q(s, a)\).
# Numeric warm-up: one Q-learning update computed by hand.
Q_old <- 5
alpha <- 0.2
r <- 1
max_next <- 8
# (Exercise: fill in the blank using the update rule from Part 1.)
Q_next <- ___
Q_next == 5.64
Part 2: Q-table helpers
We’ll store Q-values in a tibble with one row per (state, action).
# Build an empty Q-table: one row for every (state, action) pair,
# with all Q values initialized to 0.
init_Q <- function() {
  Qtab <- expand.grid(
    state = 1:9,
    action = actions,
    stringsAsFactors = FALSE
  )
  Qtab %>% mutate(Q = 0)
}
init_Q() state action Q
1 1 north 0
2 2 north 0
3 3 north 0
4 4 north 0
5 5 north 0
6 6 north 0
7 7 north 0
8 8 north 0
9 9 north 0
10 1 south 0
11 2 south 0
12 3 south 0
13 4 south 0
14 5 south 0
15 6 south 0
16 7 south 0
17 8 south 0
18 9 south 0
19 1 east 0
20 2 east 0
21 3 east 0
22 4 east 0
23 5 east 0
24 6 east 0
25 7 east 0
26 8 east 0
27 9 east 0
28 1 west 0
29 2 west 0
30 3 west 0
31 4 west 0
32 5 west 0
33 6 west 0
34 7 west 0
35 8 west 0
36 9 west 0
37 1 stay 0
38 2 stay 0
39 3 stay 0
40 4 stay 0
41 5 stay 0
42 6 stay 0
43 7 stay 0
44 8 stay 0
45 9 stay 0
Helper functions: get_Q() and set_Q:
get_Q(Qtab, s, a) returns the current Q value for a (state, action) pair; set_Q(Qtab, s, a, value) returns an updated Qtab with that value written.
Question 2: use filter and pull to complete get_Q. pull() takes a variable in a tibble returns the column vector, pulling a vector out of a tibble.
# Look up the Q value currently stored for state `s` and action `a`.
# (Exercise: fill the blanks with a filter() call that keeps the single
# matching row, then pull() the Q column to return a plain number.)
get_Q <- function(Qtab, s, a) {
Qtab %>%
___ %>%
___
}
# Return a copy of `Qtab` in which the Q value for the (state, action)
# pair (`s`, `a`) has been overwritten with `value`.
set_Q <- function(Qtab, s, a, value) {
  hit <- Qtab$state == s & Qtab$action == a
  Qtab$Q[hit] <- value
  Qtab
}
library(testthat)
# Sanity check: a value written with set_Q() should round-trip through
# get_Q().
test_that("get_Q and set_Q work", {
Q <- init_Q()
Q <- set_Q(Q, 3, "north", 7)
expect_equal(get_Q(Q, 3, "north"), 7)
})Part 3: Greedy vs epsilon-greedy actions
A greedy policy always chooses the action with the highest current \(Q(s,a)\) (exploitation). An epsilon-greedy policy chooses a random action with some probability like \(\epsilon = 0.3\) (exploration), and with probability \(1 - \epsilon\), chooses a greedy action.
Question 3: finish writing the functions greedy_action and epsilon_greedy_action with epsilon = 0.3.
Qtab <- init_Q()
# Greedy policy: return the action with the highest Q value in state `s`.
# (Exercise: filter to the rows for state `s`, keep the row(s) with the
# maximal Q using slice_max(), pull the action column, and sample() one
# action so that ties are broken uniformly at random.)
greedy_action <- function(Qtab, s) {
# Find the action with the highest Q value.
# If there is a tie, randomly break ties.
# Return the action.
Qtab %>%
filter(___) %>%
slice_max(___) %>%
pull(___) %>%
sample(size = 1)
}
# Epsilon-greedy policy: with probability `eps` choose a uniformly random
# action (explore); otherwise choose the greedy action for state `s`
# (exploit).
#
# Fix: the previous implementation tested sample(1:10, 1) <= eps * 10,
# which only realizes the intended probability when `eps` is a multiple
# of 0.1 (e.g. eps = 0.25 explored with probability 0.2, since only
# draws of 1 or 2 satisfy <= 2.5). A single uniform draw works for any
# eps in [0, 1].
epsilon_greedy_action <- function(Qtab, s, eps) {
  if (runif(1) < eps) {
    # with probability eps, sample randomly from actions (explore)
    sample(actions, size = 1)
  } else {
    # with probability 1 - eps, take the greedy action (exploit)
    greedy_action(Qtab, s)
  }
}
# Verify your function works:
replicate(10, epsilon_greedy_action(Qtab, 5, eps = .3))
Part 4: One episode of Q-Learning
Next we’ll generate an episode, updating Q every step.
Question 4: Write run_episode_qlearn() that:
- Has the episode begin in a random cell (1 to 9)
- After each step, the episode ends with probability 0.1
- If the episode continues, pick an action using epsilon_greedy_action
- Observe the payoff r = payoffs(state)
- Compute the next state s_next = move(state, action)
- Perform the Q-learning update
- Store the transition in a tibble
# One Q-learning (temporal-difference) update for a single transition
# (s, a, r, s_next); returns the updated Q-table.
# (Exercise: fill the blanks to implement
#   Q_next = Q_old + alpha * (r + 0.9 * max_next - Q_old), with alpha = 0.2.)
q_update <- function(Qtab, s, a, r, s_next) {
Q_old <- get_Q(Qtab, s, a)
# Bootstrapped target: the best Q value reachable from the next state.
max_next <- Qtab %>%
filter(state == s_next) %>%
pull(Q) %>%
max()
# Q learning rule:
Q_next <- ___ + 0.2 * (r + 0.9 * ___ - ___)
set_Q(Qtab, s, a, Q_next)
}
# Run one Q-learning episode and return both the updated Q-table and a
# step-by-step log of the episode.
# - Starts in a uniformly random cell (1-9).
# - Chooses each action epsilon-greedily from the current Q-table.
# - Updates Q after every single step (TD learning).
# - After each step the episode ends with probability 0.1.
# NOTE(review): the exact order of RNG draws here determines the seeded
# example output below — do not reorder the random calls.
run_episode_qlearn <- function(Qtab, eps) {
state <- sample(1:9, size = 1)
game_continues <- TRUE
# Per-step logs, grown as the episode unfolds.
states <- c()
actions_taken <- c()
rewards <- c()
next_states <- c()
while (game_continues) {
a <- epsilon_greedy_action(Qtab, state, eps)
r <- payoffs(state)
s_next <- move(state, a)
# TD update from the single transition (state, a, r, s_next).
Qtab <- q_update(Qtab, state, a, r, s_next)
states <- c(states, state)
actions_taken <- c(actions_taken, a)
rewards <- c(rewards, r)
next_states <- c(next_states, s_next)
# Continue with probability 0.9; terminate with probability 0.1.
game_continues <- sample(c(TRUE, FALSE), size = 1, prob = c(0.9, 0.1))
state <- s_next
}
list(
Qtab = Qtab,
episode = tibble(
t = 1:length(states),
state = states,
action = actions_taken,
payoff = rewards,
next_state = next_states
)
)
}
# example use
set.seed(123)
run_episode_qlearn(Qtab, eps = .3)
Here’s a function animate_episode to visualize an episode.
library(gganimate)
# Animate one episode's path through the 3x3 GridWorld with gganimate.
# `episode` is the tibble returned in run_episode_qlearn()$episode;
# `payoff_vec` supplies the tile payoffs/colors and should match the
# payoff function used to generate the episode.
animate_episode <- function(episode, payoff_vec = c(0, 0, 0, 1, 2, 1, 0, 0, 0)) {
# Map a cell number 1..9 to (x, y) plot coordinates, row 1 at the top.
xy3 <- function(s) tibble(
x = ((s - 1) %% 3) + 1,
y = 3 - ((s - 1) %/% 3)
)
grid_df <- tibble(state = 1:9, payoff = payoff_vec) %>%
bind_cols(xy3(1:9))
# Full path of visited cells: every state plus the final next_state.
states_path <- c(episode$state, tail(episode$next_state, 1))
traj <- tibble(
step = seq_along(states_path),
state = states_path
) %>%
bind_cols(xy3(states_path)) %>%
mutate(
# Small jitter so repeat visits to the same cell stay visible.
x = x + rnorm(n(), sd = 0.05),
y = y + rnorm(n(), sd = 0.05)
)
# Dynamic color map for whatever values are in payoff_vec
payoff_levels <- sort(unique(payoff_vec))
payoff_levels_chr <- as.character(payoff_levels)
# Make a palette (base R) and name it to match factor(payoff) levels
pal <- grDevices::hcl.colors(length(payoff_levels), palette = "Zissou 1")
color_map <- stats::setNames(pal, payoff_levels_chr)
p <- ggplot(grid_df, aes(x, y)) +
geom_tile(aes(fill = factor(payoff, levels = payoff_levels)), color = NA) +
geom_text(aes(label = payoff), color = "white", size = 7) +
scale_fill_manual(values = color_map, guide = "none") +
# Draw the (jittered) trajectory on top of the payoff tiles.
geom_path(
data = traj,
aes(x = x, y = y, group = 1),
linewidth = 1,
color = "black"
) +
coord_equal() +
theme_void() +
# Reveal the path one step at a time.
gganimate::transition_reveal(step)
# Render one frame per step; loop = FALSE stops the gif after one pass.
gganimate::animate(
p,
nframes = nrow(traj),
fps = 5,
width = 200,
height = 200,
renderer = gganimate::gifski_renderer(loop = FALSE)
)
}Run this several times to see different episodes:
animate_episode(run_episode_qlearn(Qtab, eps = .3)$episode)
Part 5: Train Q with 1000 episodes
Question 5: Run 1000 episodes, updating Qtab the whole time. Afterward, inspect Qtab for the highest Q value in each state: what stands out? Animate one more episode with eps set to 0 so that you always choose the greedy action.
# Train: run 1000 episodes, carrying the updated Q-table forward each time.
# (Exercise: fill in the loop header.)
Qtab <- init_Q()
for (___) {
out <- run_episode_qlearn(Qtab, eps = .3)
Qtab <- out$Qtab
}
# Inspect the learned values: the highest-Q action in each state.
# (Exercise: e.g. group the table by state, then keep the max-Q row.)
Qtab %>%
___ %>%
___
Run this several times to see different episodes:
animate_episode(run_episode_qlearn(Qtab, eps = 0)$episode)
Part 6: Cliff GridWorld
Question 6: Consider a new GridWorld, which I’ll call the cliff GridWorld. Learn Q with 1000 episodes, inspect Q for the highest Q value in each state, and animate a final episode with epsilon set to 0.
┌─────┬─────┬──────┐
│ 2 │ -10 │ -10 │
├─────┼─────┼──────┤
│ 0 │ 1 │ -10 │
├─────┼─────┼──────┤
│ 0 │ 0 │ 1 │
└─────┴─────┴──────┘
# Cliff GridWorld payoffs, row by row:
#    2  -10  -10
#    0    1  -10
#    0    0    1
# (Exercise: fill in the payoff vector.)
cliff_payoffs <- function(position) {
c(___)[position]
}
# (Exercise: rewrite run_episode_qlearn() so each step uses
# cliff_payoffs() instead of payoffs().)
run_episode_qlearn <- function(Qtab, eps) {
___
}
# Train on the cliff world: 1000 episodes, then inspect the best action
# per state. (Exercise: fill in the loop header and inspection pipeline.)
Qtab <- init_Q()
for (___) {
out <- run_episode_qlearn(Qtab, eps = .3)
Qtab <- out$Qtab
}
Qtab %>%
___ %>%
___
animate_episode(
run_episode_qlearn(Qtab, eps = 0)$episode,
payoff_vec = c(2, -10, -10, 0, 1, -10, 0, 0, 1)
)
Download this assignment
Here’s a link to download this assignment.