import gymnasium as gym
import math
import random
import matplotlib
import matplotlib.pyplot as plt
from collections import deque
from itertools import count

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm

import util
import optimize as optimize

# TODO: Parameter file

# If True, render the environment in a window while training.
# Rendering is much slower; leave this off for long training runs.
human_render = False

# Number of transitions sampled from replay memory
# for each optimization step.
BATCH_SIZE = 128

# Learning rate of target_net.
# Controls how soft our soft update is.
#
# Should be between 0 and 1.
# Large values make target_net track policy_net more quickly.
# Small values do the opposite.
#
# A value of one makes target_net
# change at the same rate as policy_net.
#
# A value of zero makes target_net
# not change at all.
TAU = 0.005

# Setup game environment
if human_render:
    env = gym.make("CartPole-v1", render_mode = "human")
else:
    env = gym.make("CartPole-v1")

# Setup pytorch
compute_device = torch.device(
    "cuda" if torch.cuda.is_available() else "cpu"
)

# Number of training episodes.
# It will take a while to process many of these without a GPU,
# but you will not see improvement with few training episodes.
if torch.cuda.is_available():
    num_episodes = 600
else:
    num_episodes = 50

# Create replay memory.
#
# Transition: a container for naming data (defined in util.py)
# Memory: a deque that holds recent states as Transitions.
#         Has a fixed length, drops the oldest
#         element if maxlen is exceeded.
memory = deque([], maxlen=10000)


# Outline our network
class DQN(nn.Module):
    def __init__(self, n_observations: int, n_actions: int):
        super(DQN, self).__init__()
        self.layer1 = nn.Linear(n_observations, 128)
        self.layer2 = nn.Linear(128, 128)
        self.layer3 = nn.Linear(128, n_actions)

    # Can be called with one input, or with a batch.
    #
    # Returns tensor(
    #     [ Q(s, left), Q(s, right) ], ...
    # )
    #
    # Recall that Q(s, a) is the (expected) return of taking
    # action `a` at state `s`
    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        return self.layer3(x)


## Create networks and optimizer

# n_actions: size of action space
#   - 2 for cartpole: [0, 1] as "left" and "right"
#
# n_observations: size of observation vector
#   - 4 for cartpole:
#     position, velocity,
#     angle, angular velocity
n_actions = env.action_space.n  # type: ignore
state, _ = env.reset()
n_observations = len(state)

# policy_net is the network we train; it picks actions during play.
# target_net is a slowly-updated copy of policy_net; it computes the
# Q-value targets during optimization, which keeps training stable.
policy_net = DQN(n_observations, n_actions).to(compute_device)
target_net = DQN(n_observations, n_actions).to(compute_device)

# Both networks start with the same weights
target_net.load_state_dict(policy_net.state_dict())

# Optimizer that updates policy_net's weights.
optimizer = optim.AdamW(
    policy_net.parameters(),
    lr = 1e-4,  # Hyperparameter: learning rate
    amsgrad = True
)

# Total number of environment steps taken so far, across all episodes.
# Passed to util.select_action (typically used to anneal exploration).
steps_done = 0

# Length of each completed episode, for plotting.
episode_durations = []


# TRAINING LOOP
for ep in range(num_episodes):

    # Reset environment and get game state
    state, _ = env.reset()

    # Conversion
    state = torch.tensor(
        state,
        dtype = torch.float32,
        device = compute_device
    ).unsqueeze(0)

    # Iterate until game is over
    for t in count():

        # Select next action
        action = util.select_action(
            state,
            steps_done = steps_done,
            policy_net = policy_net,
            device = compute_device,
            env = env
        )
        steps_done += 1

        # Perform one step of the environment with this action.
        (
            next_state,  # new state
            reward,      # number: reward as a result of action
            terminated,  # bool: reached a terminal state (win or loss). If True, must reset.
            truncated,   # bool: end of time limit. If True, must reset.
            _
        ) = env.step(action.item())

        # Conversion
        reward = torch.tensor([reward], device = compute_device)

        if terminated:
            # If the environment reached a terminal state,
            # observations are meaningless. Set to None.
            next_state = None
        else:
            # Conversion
            next_state = torch.tensor(
                next_state,
                dtype = torch.float32,
                device = compute_device
            ).unsqueeze(0)

        # Add this state transition to memory.
        memory.append(
            util.Transition(
                state,
                action,
                next_state,
                reward
            )
        )
        state = next_state

        # Only train the network if we have enough
        # transitions in memory to do so.
        if len(memory) >= BATCH_SIZE:

            # Run optimizer
            optimize.optimize_model(
                memory,

                # Pytorch params
                compute_device = compute_device,
                policy_net = policy_net,
                target_net = target_net,
                optimizer = optimizer,
            )

            # Soft update target_net weights
            target_net_state = target_net.state_dict()
            policy_net_state = policy_net.state_dict()
            for key in policy_net_state:
                target_net_state[key] = (
                    policy_net_state[key] * TAU +
                    target_net_state[key] * (1 - TAU)
                )
            target_net.load_state_dict(target_net_state)

        # Move on to the next episode once we reach
        # a terminal state.
        if (terminated or truncated):
            print(
                f"Episode {ep}/{num_episodes}, last duration {t+1}",
                end = "\r"
            )
            episode_durations.append(t + 1)
            break

print("Complete.")

# Plot how long the agent survived in each episode.
durations_t = torch.tensor(episode_durations, dtype=torch.float)
plt.xlabel('Episode')
plt.ylabel('Duration')
plt.plot(durations_t.numpy())
plt.show()

env.close()


# Show off the trained policy in a human-rendered environment.
demo_env = gym.make("CartPole-v1", render_mode = "human")
while True:
    state, _ = demo_env.reset()
    state = torch.tensor(
        state,
        dtype = torch.float32,
        device = compute_device
    ).unsqueeze(0)

    terminated = False
    truncated = False
    while not (terminated or truncated):
        # Always take the greedy action; no exploration during the demo.
        with torch.no_grad():
            action = policy_net(state).max(1)[1].view(1, 1)

        (
            state,       # new state
            reward,      # reward as a result of action
            terminated,  # bool: reached a terminal state (win or loss). If True, must reset.
            truncated,   # bool: end of time limit. If True, must reset.
            _
        ) = demo_env.step(action.item())

        state = torch.tensor(
            state,
            dtype = torch.float32,
            device = compute_device
        ).unsqueeze(0)

    demo_env.render()
    demo_env.reset()
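

# ---------------------------------------------------------------------------
# Reference sketch: what util.py and optimize.py might provide.
#
# Neither file is shown here, so everything below is an assumption based only
# on how util.Transition, util.select_action, and optimize.optimize_model are
# called above; the actual project files may differ. EPS_START, EPS_END,
# EPS_DECAY, and GAMMA are illustrative values, not taken from this
# repository. This block sits after the infinite demo loop, so it is never
# executed; it is included for reference only.

from collections import namedtuple

# A named container for one state transition.
Transition = namedtuple(
    "Transition",
    ("state", "action", "next_state", "reward")
)

# Assumed exploration schedule: epsilon decays exponentially
# from EPS_START toward EPS_END as steps_done grows.
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 1000

# Assumed discount factor for future rewards.
GAMMA = 0.99


def select_action(state, *, steps_done, policy_net, device, env):
    """Epsilon-greedy: usually the greedy action, occasionally a random one."""
    eps = EPS_END + (EPS_START - EPS_END) * math.exp(-steps_done / EPS_DECAY)
    if random.random() > eps:
        with torch.no_grad():
            # Greedy action: index of the largest predicted Q-value.
            return policy_net(state).max(1)[1].view(1, 1)
    # Random action sampled from the environment's action space.
    return torch.tensor(
        [[env.action_space.sample()]], device = device, dtype = torch.long
    )


def optimize_model(memory, *, compute_device, policy_net, target_net,
                   optimizer, batch_size = BATCH_SIZE):
    """One gradient step of DQN on a random batch from replay memory."""
    transitions = random.sample(memory, batch_size)
    # Convert a batch of Transitions into a Transition of batches.
    batch = Transition(*zip(*transitions))

    # Mask of transitions whose next state is not terminal (not None).
    non_final_mask = torch.tensor(
        [s is not None for s in batch.next_state],
        device = compute_device, dtype = torch.bool
    )
    non_final_next_states = torch.cat(
        [s for s in batch.next_state if s is not None]
    )
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Q(s, a) for the actions that were actually taken.
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # max_a' Q_target(s', a'), with 0 for terminal next states.
    next_state_values = torch.zeros(batch_size, device = compute_device)
    with torch.no_grad():
        next_state_values[non_final_mask] = (
            target_net(non_final_next_states).max(1)[0]
        )

    # Bellman target: r + gamma * max_a' Q_target(s', a').
    expected_state_action_values = reward_batch + GAMMA * next_state_values

    # Huber loss between predicted and target Q-values.
    criterion = nn.SmoothL1Loss()
    loss = criterion(
        state_action_values,
        expected_state_action_values.unsqueeze(1)
    )

    optimizer.zero_grad()
    loss.backward()
    # Clip gradients to keep updates stable.
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100)
    optimizer.step()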