NFXP on a GridWorld

In this classwork, we will:

  1. Simulate data for an agent expertly navigating a GridWorld under extreme value type 1 preference shocks
  2. Use the NFXP algorithm to estimate the payoff function
  3. Compare the estimated payoffs to the true payoff function

The Idea

Estimating 9 parameters (a payoff for each cell in the GridWorld) will be a little too heavy for us. Instead, we’ll divide cells into features that we think share a payoff. So for this GridWorld:

┌───┬───┬───┐
│ 0 │ 0 │ 0 │ 
├───┼───┼───┤
│ 1 │ 2 │ 1 │
├───┼───┼───┤
│ 0 │ 0 │ 0 │ 
└───┴───┴───┘

The first feature will be cells 1-3 and cells 7-9, the second will be cells 4 and 6, and the third will be cell 5. So our feature vector will be c(1, 1, 1, 2, 3, 2, 1, 1, 1).

Then we’ll estimate 3 parameters \(\theta_1\), \(\theta_2\), and \(\theta_3\): the payoff for all cells in the first, second, and third feature.

# Feature id for each of the 9 cells (row-major, row 1 on top):
# outer ring = feature 1, left/right of center = feature 2, center = feature 3.
features <- c(1, 1, 1, 2, 3, 2, 1, 1, 1)
# One payoff per feature: theta[k] is the payoff of every cell with feature k.
theta_true <- c(0, 1, 2)
# Which makes the true payoffs in each cell:
theta_true[features]

1. Simulated Data

Refer back to where we did this in Unit 2, Dynamic Programming: Value Iteration. The small changes needed are:

  • adjust for theta and features instead of payoffs,
  • value_iteration needs to return Q as well as V. We’ll use this when we compute choice probabilities in the NFXP section.
library(tidyverse)

# Action labels shared by every function below; this order also fixes the
# column order of the Q matrix built in value_iteration().
actions <- c("north", "south", "east", "west", "stay")

# Deterministic transition function for the 3x3 grid (cells 1-9, row-major,
# row 1 on top). Moves that would leave the grid keep the agent in place.
# An unrecognized action returns NULL, matching switch()'s unmatched case.
move <- function(cell, action) {
  switch(action,
    stay = cell,
    south = if (cell <= 6) cell + 3 else cell,
    north = if (cell >= 4) cell - 3 else cell,
    east = if (cell %in% c(1, 2, 4, 5, 7, 8)) cell + 1 else cell,
    west = if (cell %in% c(2, 3, 5, 6, 8, 9)) cell - 1 else cell
  )
}

# Map per-feature payoffs onto cells: given theta (one payoff per feature)
# and the length-9 `features` index vector, return the length-9 vector of
# cell payoffs. Blank to fill: index theta by features (see theta_true[features]
# in the setup above).
cell_payoffs <- function(theta, features) {
  ___
}

# One-step Q-value: the flow payoff of the current cell plus the discounted
# (0.9) continuation value of the cell reached by taking action `a` in
# state `s`. Blanks to fill: the payoff term and the next-state value
# V[move(s, a)].
q <- function(V, s, a, theta, features) {
  payoffs <- cell_payoffs(theta, features)
  ___ + 0.9 * ___
}

# Initial guess for the value function: a length-9 numeric vector (zeros work).
init_V <- function() ___

# Solve the Bellman equation by value iteration for parameters `theta` and
# cell-to-feature map `features`.
# Returns a list with:
#   V          - length-9 converged value function,
#   Q          - 9 x 5 matrix of Q(s, a), columns named by `actions`,
#   iterations - sweeps performed before convergence (or max_iter).
value_iteration <- function(theta, features, tol = 1e-6, max_iter = 2000) {
  V <- init_V()
  V_old <- ___  # blank: anything far from V so the first sweep runs
  i <- ___      # blank: the iteration counter, starting at 0
  
  # Sweep until the sup-norm change in V drops below tol (or max_iter hits).
  while (max(abs(V - V_old)) > tol && i < max_iter) {
    V_old <- V
    
    V <- map_dbl(
      .x = 1:9,
      .f = function(s) {
        qs <- map_dbl(actions, function(a) q(V_old, s, a, theta, features))
        ___(qs)  # blank: the Bellman operator keeps the best action's value
      }
    )
    
    i <- i + 1
  }
  
  # Recover the full Q(s, a) matrix at the converged V; the NFXP section
  # uses it to compute choice probabilities.
    Q <- map(
    .x = 1:9,
    .f = function(s) {
      map_dbl(actions, function(a) ___)  # blank: q at the converged V
    }
  ) %>%
    do.call(rbind, .)
  
  colnames(Q) <- actions
  
  list(
    V = V,
    Q = Q,
    iterations = i
  )
}

Add Extreme Value Preference Shocks

Similar to what we did in Unit 2, Dynamic Programming: Value Iteration.

# Extreme value type 1 (Gumbel) preference shocks for the logit model.
draw_shocks <- function() {
  # Inverse-CDF draw: if u ~ Uniform(0, 1), then -log(-log(u)) is a
  # standard extreme value type 1 random variable. One shock per action.
  u <- runif(length(actions))
  shocks <- -log(-log(u))
  names(shocks) <- actions
  shocks
}

# Q-value with an additive preference shock on the chosen action:
# payoff(s) + shock(a) + 0.9 * V(s'). Blanks to fill: the next-state
# variable name (s_prime, used two lines below) and the value function
# object being indexed.
q_shock <- function(V, s, a, shocks, theta, features) {
  ___ <- move(s, a)
  payoffs <- cell_payoffs(theta, features)
  payoffs[s] + shocks[a] + 0.9 * ___[s_prime]
}

# Choose the action with the highest shocked Q-value at state s.
# Ties (probability zero under continuous shocks) break uniformly at random.
# Blank to fill: the vector whose maximum we compare against (qs).
greedy_action_given_shocks <- function(V, s, shocks, theta, features) {
  qs <- map_dbl(actions, function(a) {
    q_shock(V, s, a, shocks, theta, features)
  })
  
  sample(actions[which(qs == max(___))], size = 1)
}

Simulate one episode

# Simulate one trajectory: start in a random cell, then repeatedly draw
# shocks, act greedily, and stop with probability 0.1 each period.
# Blank to fill: the set of possible starting states (the nine cells).
simulate_episode <- function(V, theta, features) {
  state <- sample(___, size = 1)
  t <- 1
  out <- list()

  repeat {
    # Fresh EV1 shocks every period; the agent best-responds to them.
    shocks <- draw_shocks()
    action <- greedy_action_given_shocks(V, state, shocks, theta, features)
    next_state <- move(state, action)

    # Record one (t, state, action, next_state) observation.
    out[[t]] <- tibble(
      t = t,
      state = state,
      action = action,
      next_state = next_state
    )

    # Geometric episode length: continue with probability 0.9 (the same
    # 0.9 used as the discount factor in q()).
    if (runif(1) > 0.9) break

    state <- next_state
    t <- t + 1
  }

  bind_rows(out)
}

# Example: solve the DP at the true parameters, then simulate one episode.
simulate_episode(
  V = value_iteration(theta = c(0, 1, 2), features)$V,
  theta = c(0, 1, 2), 
  features = c(1, 1, 1, 2, 3, 2, 1, 1, 1)
  )

Simulate a full data set

# Simulate n independent episodes and stack them into one tibble, tagging
# each row with its episode id.
simulate_dataset <- function(n, theta, features) {
  # Solve the dynamic program once; every episode reuses the same V.
  V <- value_iteration(theta, features)$V

  episodes <- lapply(seq_len(n), function(ep) {
    simulate_episode(V = V, theta = theta, features = features) %>%
      mutate(episode = ep)
  })

  bind_rows(episodes)
}

# Simulate 50 episodes under the true parameters (seeded for reproducibility).
set.seed(123)
gridworld_data <- simulate_dataset(50, c(0, 1, 2), features)

gridworld_data

Visualization

# Plot the grid colored by cell payoff with every episode's trajectory
# overlaid as a jittered, semi-transparent black path.
visualize_data <- function(data, theta, features) {
  payoff_vec <- cell_payoffs(theta, features)
  # Convert cell ids 1-9 (row-major, row 1 on top) to plot coordinates.
  xy3 <- function(s) tibble(
    x = ((s - 1) %% 3) + 1,
    y = 3 - ((s - 1) %/% 3)
  )
  
  grid_df <- tibble(state = 1:9, payoff = payoff_vec) %>%
    bind_cols(xy3(1:9))
  
  # One path per episode: the visited states plus the final next_state,
  # expanded to jittered (x, y) coordinates.
  traj_df <- data %>%
    group_by(episode) %>%
    summarize(
      states_path = list(c(state, dplyr::last(next_state))),
      .groups = "drop"
    ) %>%
    mutate(
      traj = purrr::map2(states_path, episode, function(states_path, ep_id) {
        tibble(
          step = seq_along(states_path),
          state = states_path,
          episode = ep_id
        ) %>%
          bind_cols(xy3(states_path)) %>%
          # Jitter so overlapping paths stay distinguishable.
          mutate(
            x = x + rnorm(n(), sd = 0.15),
            y = y + rnorm(n(), sd = 0.15)
          )
      })
    ) %>%
    select(traj) %>%
    unnest(traj)
  
  # Discrete fill scale: one color per distinct payoff level.
  payoff_levels <- sort(unique(payoff_vec))
  payoff_levels_chr <- as.character(payoff_levels)
  pal <- grDevices::hcl.colors(length(payoff_levels), palette = "Zissou 1")
  color_map <- stats::setNames(pal, payoff_levels_chr)
  
  ggplot(grid_df, aes(x, y)) +
    geom_tile(aes(fill = factor(payoff, levels = payoff_levels)), color = NA) +
    scale_fill_manual(values = color_map, guide = "none") +
    geom_path(
      data = traj_df,
      aes(x = x, y = y, group = episode),
      linewidth = 0.7,
      color = "black",
      alpha = 0.15
    ) +
    geom_text(aes(label = payoff), color = "white", size = 7) +
    coord_equal() +
    theme_void()
}

# Example: overlay all 50 simulated trajectories on the true payoff grid.
visualize_data(gridworld_data, theta = c(0, 1, 2), 
    features = c(1, 1, 1, 2, 3, 2, 1, 1, 1))

2. NFXP Estimation

Now we pretend we don’t know the payoffs: we only observe the gridworld data. Can we estimate \(\theta_1\), \(\theta_2\), and \(\theta_3\) (payoffs for every feature) using NFXP?

The steps are just like in the Rust (1987) replication:

  1. Guess values of \(\theta_1\), \(\theta_2\), and \(\theta_3\)
  2. Solve the Bellman equation for the value function (inner loop)
  3. Compute model-implied choice probabilities
  4. Evaluate the log likelihood of the observed actions
  5. Update parameters to maximize the log likelihood

2.1: Inner Loop: Value Iteration

In Rust, the inner loop solves the Bellman equation for each candidate parameter vector. We’ll do the same here.

We already wrote a value_iteration() function in this document, so we can just reuse that here!

Let’s see what happens when we make some guesses for theta.

# All payoffs equal to 0: V should be the same in every cell.
value_iteration(theta = c(0, 0, 0), features)$V

# All payoffs equal to 1: V shifts up uniformly (by 1 / (1 - 0.9) = 10).
value_iteration(theta = c(1, 1, 1), features)$V

# Only the center cell's feature pays 1: V should peak at cell 5.
value_iteration(theta = c(0, 0, 1), features)$V

2.2: From the Value Function to Choice Probabilities

In Rust, once we solve the dynamic program, we turn the Q values into logit probabilities. We’ll do the same here. For each state \(s\) and action \(a\): \[P(a | s, \theta) = \frac{\exp(Q(s, a))}{\sum_{a' \in A} \exp(Q(s, a'))}\]

# Numerically stable log(sum(exp(x))): shift by the maximum so exp()
# never overflows for large inputs.
#
# @param x Numeric vector.
# @return A scalar, log(sum(exp(x))).
logsumexp <- function(x) {
  m <- max(x)
  # Guard: if every element is -Inf (or x is empty, where max() is -Inf),
  # the shifted form exp(-Inf - -Inf) would produce NaN. Return m directly,
  # which is the correct limit (-Inf; and +Inf when any element is +Inf).
  if (!is.finite(m)) {
    return(m)
  }
  m + log(sum(exp(x - m)))
}

# Model-implied conditional choice probabilities P(a | s, theta): solve the
# DP at theta, then apply the logit formula row by row via the numerically
# stable log-sum-exp. Returns a tibble with a `state` column plus one
# probability column per action. Blank to fill: the element of
# value_iteration()'s return list holding the Q matrix.
choice_prob <- function(theta, features) {
  Q <- value_iteration(theta, features)$___
  
  probs <- matrix(NA, nrow = 9, ncol = length(actions))
  colnames(probs) <- actions
  
  for (s in 1:9) {
    # exp(Q - logsumexp(Q)) is the softmax of the row, computed stably.
    lse <- logsumexp(Q[s, ])
    probs[s, ] <- exp(Q[s, ] - lse)
  }
  
  as_tibble(probs) %>%
    mutate(state = 1:9) %>%
    relocate(state)
}

# Example: choice probabilities under the true parameters.
choice_prob(c(0, 1, 2), features)

2.3: Build the Log Likelihood

Now we compare the model’s predicted choice probabilities to the actions in our gridworld_data.

For each observation \(i\), we observe the current state and the action.

So the log likelihood is: \[LL(\theta) = \sum_{i = 1}^N \log P(a_i | s_i, \theta)\]

We’ll also enforce that theta[1] is equal to 0, because otherwise the parameters are not identified: the algorithm could just as well estimate theta to be (1, 2, 3) as (0, 1, 2), since only payoff differences affect choices. So really, we’re estimating theta[2] and theta[3].

# Rebuild the full parameter vector from the free parameters: theta[1] is
# pinned to 0 for identification, so only the remaining entries are estimated.
make_theta <- function(theta_free) {
  append(theta_free, 0, after = 0)
}

# Log likelihood of the observed actions: sum of log P(a_i | s_i, theta)
# across all rows of `data`, with theta reconstructed from the free
# parameters (theta[1] normalized to 0).
log_lik <- function(theta_free, data, features) {
  theta <- make_theta(theta_free)

  # Model-implied P(a | s) in long format: one row per (state, action).
  long_probs <- choice_prob(theta, features) %>%
    pivot_longer(
      cols = all_of(actions),
      names_to = "action",
      values_to = "p_action"
    )

  # Attach each observation's predicted probability, then sum the logs.
  joined <- left_join(data, long_probs, by = c("state", "action"))
  sum(log(joined$p_action))
}

# Examples: the likelihood should be highest near the true free
# parameters (1, 2).
log_lik(c(0, 0), gridworld_data, features)
log_lik(c(10, 0), gridworld_data, features)
log_lik(c(1, 2), gridworld_data, features)

2.4: Outer Loop: Maximize the Log Likelihood

Just like in Rust, the outer loop searches over values of \(\theta\).

# Re-simulate the data, then maximize the log likelihood over the two free
# parameters (theta[1] is pinned to 0 inside make_theta()).
gridworld_data <- simulate_dataset(n = 50, theta = c(0, 1, 2), features)

# optim() minimizes, so we hand it the NEGATIVE log likelihood.
fit <- optim(
  par = c(0, 0),
  fn = function(theta_free) -log_lik(
    theta_free, 
    gridworld_data, 
    features),
  method = "BFGS",
  control = list(maxit = 500, reltol = 1e-8)
)

# Free-parameter estimates, then the full theta with the leading 0.
fit$par
make_theta(fit$par)

Compare the estimates to the truth. How close did it get with just 50 episodes? Does it get any closer with more episodes?

Finally, change the GridWorld to this “cliff” gridworld:

┌───┬───┬───┐
│ 4 │-5 │-5 │ 
├───┼───┼───┤
│ 0 │ 0 │-5 │
├───┼───┼───┤
│ 0 │ 0 │ 0 │ 
└───┴───┴───┘

How well does NFXP do here? Make sure the first feature is the feature with payoffs 0.

Download this assignment

Here’s a link to download this assignment.