
Added polecart files

2023-02-15 19:24:03 -08:00
parent 4918e72f7e
commit 443fb9bef2
7 changed files with 1381 additions and 0 deletions

polecart/basic/main.py (Executable file, 277 additions)

@@ -0,0 +1,277 @@
import gymnasium as gym
import math
import random
import matplotlib
import matplotlib.pyplot as plt
from collections import deque
from itertools import count
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm
import util
import optimize as optimize
# TODO: Parameter file
# If True, render the environment in a window during training.
human_render = False
# Number of transitions sampled from replay memory per optimization step.
BATCH_SIZE = 128
# Learning rate of target_net.
# Controls how soft our soft update is.
#
# Should be between 0 and 1.
# Large values make target_net track policy_net more quickly.
# Small values do the opposite.
#
# A value of one makes target_net
# change at the same rate as policy_net.
#
# A value of zero makes target_net
# not change at all.
TAU = 0.005
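# Soft update rule (applied after each optimization step in the
# training loop below):
#   target_weights = TAU * policy_weights + (1 - TAU) * target_weights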
# Setup game environment
if human_render:
env = gym.make("CartPole-v1", render_mode = "human")
else:
env = gym.make("CartPole-v1")
# Setup pytorch
compute_device = torch.device(
"cuda" if torch.cuda.is_available() else "cpu"
)
# Number of training episodes.
# It will take a while to process many of these without a GPU,
# but you will not see improvement with few training episodes.
if torch.cuda.is_available():
num_episodes = 600
else:
num_episodes = 50
# Create replay memory.
#
# Transition: a container for naming data (defined in util.py)
# Memory: a deque that holds recent states as Transitions
# Has a fixed length, drops oldest
# element if maxlen is exceeded.
memory = deque([], maxlen=10000)
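# Example entry, as appended in the training loop below:
#   memory.append(util.Transition(state, action, next_state, reward))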
# Outline our network
class DQN(nn.Module):
def __init__(self, n_observations: int, n_actions: int):
super(DQN, self).__init__()
self.layer1 = nn.Linear(n_observations, 128)
self.layer2 = nn.Linear(128, 128)
self.layer3 = nn.Linear(128, n_actions)
# Can be called with one input, or with a batch.
#
# Returns tensor(
# [ Q(s, left), Q(s, right) ], ...
# )
#
# Recall that Q(s, a) is the (expected) return of taking
# action `a` at state `s`
def forward(self, x):
x = F.relu(self.layer1(x))
x = F.relu(self.layer2(x))
return self.layer3(x)
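# Illustrative call (numbers are made up):
#   policy_net(single_state)  ->  tensor([[ 0.12, -0.03 ]])
# i.e. one row of [ Q(s, left), Q(s, right) ] per input state.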
## Create networks and optimizer
# n_actions: size of action space
# - 2 for cartpole: [0, 1] as "left" and "right"
#
# n_observations: size of observation vector
# - 4 for cartpole:
# position, velocity,
# angle, angular velocity
n_actions = env.action_space.n # type: ignore
state, _ = env.reset()
n_observations = len(state)
# policy_net is the network we train directly; it also picks our actions.
# target_net is a slowly-updated copy of policy_net, used in optimize.py
# to compute the Q-value targets. Keeping it semi-fixed stabilizes training.
policy_net = DQN(n_observations, n_actions).to(compute_device)
target_net = DQN(n_observations, n_actions).to(compute_device)
# Both networks start with the same weights
target_net.load_state_dict(policy_net.state_dict())
#
optimizer = optim.AdamW(
policy_net.parameters(),
lr = 1e-4, # Hyperparameter: learning rate
amsgrad = True
)
# Total number of environment steps taken so far, across all episodes.
# Passed to util.select_action to decay the exploration rate.
steps_done = 0
episode_durations = []
# TRAINING LOOP
for ep in range(num_episodes):
# Reset environment and get game state
state, _ = env.reset()
# Conversion
state = torch.tensor(
state,
dtype = torch.float32,
device = compute_device
).unsqueeze(0)
# Iterate until game is over
for t in count():
# Select next action
action = util.select_action(
state,
steps_done = steps_done,
policy_net = policy_net,
device = compute_device,
env = env
)
steps_done += 1
# Perform one step of the environment with this action.
( next_state, # new state
reward, # number: reward as a result of action
terminated, # bool: reached a terminal state (win or loss). If True, must reset.
truncated, # bool: end of time limit. If true, must reset.
_
) = env.step(action.item())
# Conversion
reward = torch.tensor([reward], device = compute_device)
if terminated:
# If the environment reached a terminal state,
# observations are meaningless. Set to None.
next_state = None
else:
# Conversion
next_state = torch.tensor(
next_state,
dtype = torch.float32,
device = compute_device
).unsqueeze(0)
# Add this state transition to memory.
memory.append(
util.Transition(
state,
action,
next_state,
reward
)
)
# Advance to the next state.
state = next_state
# Only train the network if we have enough
# transitions in memory to do so.
if len(memory) >= BATCH_SIZE:
# Run optimizer
optimize.optimize_model(
memory,
# Pytorch params
compute_device = compute_device,
policy_net = policy_net,
target_net = target_net,
optimizer = optimizer,
)
# Soft update target_net weights
target_net_state = target_net.state_dict()
policy_net_state = policy_net.state_dict()
for key in policy_net_state:
target_net_state[key] = (
policy_net_state[key] * TAU +
target_net_state[key] * (1-TAU)
)
target_net.load_state_dict(target_net_state)
# Move on to the next episode once we reach
# a terminal state.
if (terminated or truncated):
print(f"Episode {ep}/{num_episodes}, last duration {t+1}", end="\r" )
episode_durations.append(t + 1)
break
print("Complete.")
durations_t = torch.tensor(episode_durations, dtype=torch.float)
plt.xlabel('Episode')
plt.ylabel('Duration')
plt.plot(durations_t.numpy())
plt.show()
env.close()
# Show off the trained policy in a human-rendered environment.
en = gym.make("CartPole-v1", render_mode = "human")
while True:
state, _ = en.reset()
state = torch.tensor(
state,
dtype=torch.float32,
device=compute_device
).unsqueeze(0)
terminated = False
truncated = False
while not (terminated or truncated):
action = policy_net(state).max(1)[1].view(1, 1)
( state, # new state
reward, # reward as a result of action
terminated, # bool: reached a terminal state (win or loss). If True, must reset.
truncated, # bool: end of time limit. If true, must reset.
_
) = en.step(action.item())
state = torch.tensor(
state,
dtype=torch.float32,
device=compute_device
).unsqueeze(0)
en.render()
en.reset()

polecart/basic/optimize.py (Executable file, 161 additions)

@@ -0,0 +1,161 @@
import random
from collections import deque
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import util
def optimize_model(
memory: deque,
# Pytorch params
compute_device,
policy_net: nn.Module,
target_net: nn.Module,
optimizer,
# Parameters:
#
# BATCH_SIZE is the number of transitions sampled from the replay buffer
# GAMMA is the discount factor as mentioned in the previous section
BATCH_SIZE = 128,
GAMMA = 0.99
):
if len(memory) < BATCH_SIZE:
raise Exception(f"Not enough elements in memory for a batch of {BATCH_SIZE}")
# Get a random sample of transitions
batch = random.sample(memory, BATCH_SIZE)
# Conversion.
# Transposes batch, turning an array of Transitions
# into a Transition of arrays.
batch = util.Transition(*zip(*batch))
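# Illustrative example with a batch of two:
#   [ Transition(s1, a1, n1, r1), Transition(s2, a2, n2, r2) ]
# becomes
#   Transition(state=(s1, s2), action=(a1, a2),
#              next_state=(n1, n2), reward=(r1, r2))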
# Conversion.
# Combine states, actions, and rewards into their own tensors.
state_batch = torch.cat(batch.state)
action_batch = torch.cat(batch.action)
reward_batch = torch.cat(batch.reward)
# Compute a mask of non-final states.
# Each element of this tensor corresponds to an element in the batch.
# True if the next state is non-final, False if it is final.
#
# We use this to select non-final states later.
non_final_mask = torch.tensor(
tuple(map(
lambda s: s is not None,
batch.next_state
))
)
non_final_next_states = torch.cat(
[s for s in batch.next_state if s is not None]
)
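# Illustrative example:
#   batch.next_state = (s1, None, s3)
#   non_final_mask        -> tensor([ True, False, True ])
#   non_final_next_states -> torch.cat([s1, s3])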
# How .gather works:
# if out = a.gather(1, b),
# out[i, j] = a[ i ][ b[i,j] ]
#
# a is "input," b is "index"
# If this doesn't make sense, RTFD.
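# Tiny concrete example (made-up values):
#   a = torch.tensor([[10., 20.], [30., 40.]])
#   b = torch.tensor([[1], [0]])
#   a.gather(1, b)  ->  tensor([[20.], [30.]])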
# Compute Q(s_t, a).
# - Use policy_net to compute Q(s_t) for each state in the batch.
# This gives a tensor of [ Q(state, left), Q(state, right) ]
#
# - Action batch is a tensor that looks like [ [0], [1], [1], ... ]
# listing the action that was taken in each transition.
# 0 => we went left, 1 => we went right.
#
# This aligns nicely with the output of policy_net. We use
# action_batch to index the output of policy_net's prediction.
#
# This gives us a tensor that contains the return we expect to get
# at that state if we follow the model's advice.
state_action_values = policy_net(state_batch).gather(1, action_batch)
# Compute V(s_t+1) for all next states.
# V(s_t+1) = max_a ( Q(s_t+1, a) )
# = the best expected return over all possible actions at state s_t+1.
next_state_values = torch.zeros(BATCH_SIZE, device = compute_device)
# Don't compute gradient for operations in this block.
# If you don't understand what this means, RTFD.
with torch.no_grad():
# Note the use of non_final_mask here.
# States that are final do not have their reward set by the line
# below, so their reward stays at zero.
#
# States that are not final get their predicted value
# set to the best value the model predicts.
#
#
# Expected values of action are selected with the "older" target net,
# and their best reward (over possible actions) is selected with max(1)[0].
next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0]
# Compute the expected Q values (the Bellman target):
#   Q_expected(s, a) = r + GAMMA * max_a' Q_target(s', a')
# This is the value policy_net's prediction Q(s, a) should move toward.
expected_state_action_values = reward_batch + (next_state_values * GAMMA)
# Compute Huber loss between predicted reward and expected reward.
# Pytorch will account for this when we compute the gradient of loss.
#
# loss is a single-element tensor (i.e., a scalar).
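# With Pytorch's default beta = 1.0, the element-wise Huber loss on an
# error x is:
#   0.5 * x**2     if |x| < 1
#   |x| - 0.5      otherwise
# i.e. quadratic near zero, linear for large errors.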
criterion = nn.SmoothL1Loss()
loss = criterion(
state_action_values,
expected_state_action_values.unsqueeze(1)
)
# We can now run a step of backpropagation on our model.
#
# Calling .backward() multiple times accumulates parameter gradients,
# so we reset the stored gradients to zero before each step.
optimizer.zero_grad()
# Compute the gradient of loss with respect to every parameter of
# policy_net. We never use `loss` again directly: backward() stores
# each gradient in the parameter's .grad attribute, which
# optimizer.step() reads below.
loss.backward()
# Prevent vanishing and exploding gradients.
# Forces gradients to be in [-clip_value, +clip_value]
torch.nn.utils.clip_grad_value_( # type: ignore
policy_net.parameters(),
clip_value = 100
)
# Perform a single optimizer step.
#
# Uses the current gradient, which is stored
# in the .grad attribute of the parameter.
optimizer.step()

polecart/basic/util.py (Executable file, 77 additions)

@@ -0,0 +1,77 @@
import matplotlib
import matplotlib.pyplot as plt
import torch
import math
import random
from collections import namedtuple
Transition = namedtuple(
"Transition",
(
"state",
"action",
"next_state",
"reward"
)
)
def select_action(
state,
*,
# Number of steps that have been done
steps_done: int,
# Pytorch parameters
policy_net, # DQN policy network
device, # Compute device, "cuda" or "cpu"
env, # GYM environment instance
# Epsilon parameters
#
# Original docs:
# EPS_START is the starting value of epsilon
# EPS_END is the final value of epsilon
# EPS_DECAY controls the rate of exponential decay of epsilon, higher means a slower decay
EPS_START = 0.9,
EPS_END = 0.05,
EPS_DECAY = 1000
):
"""
Given a state, select an action using an epsilon-greedy policy.
Sometimes use our model, sometimes sample one uniformly.
P(random action) starts at EPS_START and decays to EPS_END.
Decay rate is controlled by EPS_DECAY.
"""
# Random number 0 <= x < 1
sample = random.random()
# Calculate the random-action threshold
eps_threshold = (
EPS_END + (EPS_START - EPS_END) *
math.exp(
-1.0 * steps_done /
EPS_DECAY
)
)
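# With the default parameters above:
#   steps_done = 0     ->  eps_threshold = 0.90
#   steps_done = 1000  ->  eps_threshold = 0.05 + 0.85/e ~= 0.36
#   steps_done -> inf  ->  eps_threshold -> 0.05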
if sample > eps_threshold:
with torch.no_grad():
# t.max(1) returns (values, indices) for each row: the largest
# entry and where it was found. We take the indices, i.e. the
# action with the larger expected return.
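# Illustrative example (made-up Q-values):
#   policy_net(state) = tensor([[0.2, 0.7]])
#   .max(1)  ->  (values=tensor([0.7]), indices=tensor([1]))
#   so we return tensor([[1]]), i.e. action 1.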
return policy_net(state).max(1)[1].view(1, 1)
else:
return torch.tensor(
[ [env.action_space.sample()] ],
device=device,
dtype=torch.long
)