Added polecart files
277 polecart/basic/main.py Executable file
@@ -0,0 +1,277 @@
import gymnasium as gym
import math
import random
import matplotlib
import matplotlib.pyplot as plt

from collections import deque

from itertools import count

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from tqdm import tqdm
import util
import optimize as optimize


# TODO: Parameter file

# If True, render the environment in a window while training.
human_render = False

# Number of transitions sampled from replay memory
# for each optimization step.
BATCH_SIZE = 128

# Learning rate of target_net.
# Controls how soft our soft update is.
#
# Should be between 0 and 1.
# Large values make target_net follow policy_net more closely.
# Small values do the opposite.
#
# A value of one makes target_net
# change at the same rate as policy_net.
#
# A value of zero makes target_net
# not change at all.
TAU = 0.005


# Setup game environment
if human_render:
    env = gym.make("CartPole-v1", render_mode = "human")
else:
    env = gym.make("CartPole-v1")

# Setup pytorch
compute_device = torch.device(
    "cuda" if torch.cuda.is_available() else "cpu"
)


# Number of training episodes.
# It will take a while to process many of these without a GPU,
# but you will not see improvement with few training episodes.
if torch.cuda.is_available():
    num_episodes = 600
else:
    num_episodes = 50


# Create replay memory.
#
# Transition: a container for naming data (defined in util.py)
# Memory: a deque that holds recent states as Transitions.
#     Has a fixed length, drops the oldest
#     element if maxlen is exceeded.
memory = deque([], maxlen=10000)
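# For reference: a deque with maxlen evicts from the opposite end
# once it is full, e.g.
#   d = deque([1, 2, 3], maxlen=3)
#   d.append(4)   # d is now deque([2, 3, 4])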


# Outline our network
class DQN(nn.Module):
    def __init__(self, n_observations: int, n_actions: int):
        super(DQN, self).__init__()
        self.layer1 = nn.Linear(n_observations, 128)
        self.layer2 = nn.Linear(128, 128)
        self.layer3 = nn.Linear(128, n_actions)

    # Can be called with one input, or with a batch.
    #
    # Returns tensor(
    #     [ Q(s, left), Q(s, right) ], ...
    # )
    #
    # Recall that Q(s, a) is the (expected) return of taking
    # action `a` at state `s`.
    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        return self.layer3(x)
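
# Illustrative usage (shapes only; the values here are made up):
#   net = DQN(n_observations=4, n_actions=2)
#   q = net(torch.zeros(1, 4))   # q.shape == (1, 2): [Q(s, left), Q(s, right)]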


## Create networks and optimizer

# n_actions: size of action space
# - 2 for cartpole: [0, 1] as "left" and "right"
#
# n_observations: size of observation vector
# - 4 for cartpole:
#     position, velocity,
#     angle, angular velocity
n_actions = env.action_space.n # type: ignore
state, _ = env.reset()
n_observations = len(state)

# policy_net is the network we train: it selects actions and is
# updated by the optimizer at every step.
# target_net is a slowly-updated copy of policy_net, used to compute
# Q-value targets so that training stays stable.
policy_net = DQN(n_observations, n_actions).to(compute_device)
target_net = DQN(n_observations, n_actions).to(compute_device)

# Both networks start with the same weights
target_net.load_state_dict(policy_net.state_dict())

# AdamW updates the weights of policy_net.
optimizer = optim.AdamW(
    policy_net.parameters(),
    lr = 1e-4, # Hyperparameter: learning rate
    amsgrad = True
)


# Total number of environment steps taken so far.
# Used by util.select_action to decay epsilon.
steps_done = 0


episode_durations = []


# TRAINING LOOP
for ep in range(num_episodes):

    # Reset environment and get game state
    state, _ = env.reset()

    # Conversion
    state = torch.tensor(
        state,
        dtype = torch.float32,
        device = compute_device
    ).unsqueeze(0)


    # Iterate until game is over
    for t in count():

        # Select next action
        action = util.select_action(
            state,
            steps_done = steps_done,
            policy_net = policy_net,
            device = compute_device,
            env = env
        )
        steps_done += 1


        # Perform one step of the environment with this action.
        ( next_state, # new state
          reward,     # number: reward as a result of action
          terminated, # bool: reached a terminal state (win or loss). If True, must reset.
          truncated,  # bool: end of time limit. If true, must reset.
          _
        ) = env.step(action.item())

        # Conversion
        reward = torch.tensor([reward], device = compute_device)

        if terminated:
            # If the environment reached a terminal state,
            # observations are meaningless. Set to None.
            next_state = None
        else:
            # Conversion
            next_state = torch.tensor(
                next_state,
                dtype = torch.float32,
                device = compute_device
            ).unsqueeze(0)


        # Add this state transition to memory.
        memory.append(
            util.Transition(
                state,
                action,
                next_state,
                reward
            )
        )

        # Advance to the new state. This must happen on every step
        # (not only when we train below), otherwise the transitions
        # stored above would pair stale states with new actions.
        state = next_state


        # Only train the network if we have enough
        # transitions in memory to do so.
        if len(memory) >= BATCH_SIZE:

            # Run optimizer
            optimize.optimize_model(
                memory,
                # Pytorch params
                compute_device = compute_device,
                policy_net = policy_net,
                target_net = target_net,
                optimizer = optimizer,
            )


        # Soft update target_net weights
        target_net_state = target_net.state_dict()
        policy_net_state = policy_net.state_dict()
        for key in policy_net_state:
            target_net_state[key] = (
                policy_net_state[key] * TAU +
                target_net_state[key] * (1-TAU)
            )
        target_net.load_state_dict(target_net_state)
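        # For reference, the update applied above is, per parameter:
        #   theta_target <- TAU * theta_policy + (1 - TAU) * theta_target
        # With TAU = 0.005 the target network trails the policy network,
        # moving 0.5% of the way toward it on each step.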

        # Move on to the next episode once we reach
        # a terminal state.
        if (terminated or truncated):
            print(f"Episode {ep}/{num_episodes}, last duration {t+1}", end="\r" )
            episode_durations.append(t + 1)
            break

print("Complete.")

durations_t = torch.tensor(episode_durations, dtype=torch.float)
plt.xlabel('Episode')
plt.ylabel('Duration')
plt.plot(durations_t.numpy())
plt.show()


env.close()
en = gym.make("CartPole-v1", render_mode = "human")

while True:
    state, _ = en.reset()
    state = torch.tensor(
        state,
        dtype=torch.float32,
        device=compute_device
    ).unsqueeze(0)

    terminated = False
    truncated = False
    while not (terminated or truncated):
        action = policy_net(state).max(1)[1].view(1, 1)

        ( state,      # new state
          reward,     # reward as a result of action
          terminated, # bool: reached a terminal state (win or loss). If True, must reset.
          truncated,  # bool: end of time limit. If true, must reset.
          _
        ) = en.step(action.item())

        state = torch.tensor(
            state,
            dtype=torch.float32,
            device=compute_device
        ).unsqueeze(0)

        en.render()
    en.reset()
161 polecart/basic/optimize.py Executable file
@@ -0,0 +1,161 @@
import random
from collections import deque

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import util


def optimize_model(
    memory: deque,

    # Pytorch params
    compute_device,
    policy_net: nn.Module,
    target_net: nn.Module,
    optimizer,

    # Parameters:
    #
    # BATCH_SIZE is the number of transitions sampled from the replay buffer
    # GAMMA is the discount factor for future rewards
    BATCH_SIZE = 128,
    GAMMA = 0.99
):

    if len(memory) < BATCH_SIZE:
        raise Exception(f"Not enough elements in memory for a batch of {BATCH_SIZE}")


    # Get a random sample of transitions
    batch = random.sample(memory, BATCH_SIZE)

    # Conversion.
    # Transposes batch, turning an array of Transitions
    # into a Transition of arrays.
    batch = util.Transition(*zip(*batch))

    # Conversion.
    # Combine states, actions, and rewards into their own tensors.
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)


    # Compute a mask of non-final states.
    # Each element of this tensor corresponds to an element in the batch:
    # True if that transition's next state is non-final,
    # False if it is final.
    #
    # We use this to select non-final states later.
    non_final_mask = torch.tensor(
        tuple(map(
            lambda s: s is not None,
            batch.next_state
        ))
    )

    non_final_next_states = torch.cat(
        [s for s in batch.next_state if s is not None]
    )


    # How .gather works:
    # if out = a.gather(1, b),
    # out[i, j] = a[ i ][ b[i,j] ]
    #
    # a is "input," b is "index"
    # If this doesn't make sense, RTFD.

    # Compute Q(s_t, a).
    # - Use policy_net to compute Q(s_t) for each state in the batch.
    #   This gives a tensor of [ Q(state, left), Q(state, right) ]
    #
    # - Action batch is a tensor that looks like [ [0], [1], [1], ... ]
    #   listing the action that was taken in each transition.
    #   0 => we went left, 1 => we went right.
    #
    # This aligns nicely with the output of policy_net. We use
    # action_batch to index the output of policy_net's prediction.
    #
    # This gives us a tensor that contains the return we expect to get
    # at that state if we follow the model's advice.

    state_action_values = policy_net(state_batch).gather(1, action_batch)
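    # Shape note: policy_net(state_batch) is (BATCH_SIZE, 2) and
    # action_batch is (BATCH_SIZE, 1), so state_action_values is
    # (BATCH_SIZE, 1): the Q-value of the action actually taken in
    # each sampled transition.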


    # Compute V(s_t+1) for all next states.
    # V(s_t+1) = max_a ( Q(s_t+1, a) )
    #          = the maximum reward over all possible actions at state s_t+1.
    next_state_values = torch.zeros(BATCH_SIZE, device = compute_device)

    # Don't compute gradient for operations in this block.
    # If you don't understand what this means, RTFD.
    with torch.no_grad():

        # Note the use of non_final_mask here.
        # States that are final do not have their value set by the line
        # below, so their value stays at zero.
        #
        # States that are not final get their predicted value
        # set to the best value the model predicts.
        #
        # Expected values of actions are computed with the "older" target net,
        # and their best reward (over possible actions) is selected with max(1)[0].
        next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0]


    # Compute the expected Q-values (the TD targets):
    #   expected Q(s_t, a) = r_t + GAMMA * max_a Q_target(s_t+1, a)
    # For final states next_state_values is zero, so the target is just r_t.
    expected_state_action_values = reward_batch + (next_state_values * GAMMA)


    # Compute Huber loss between predicted reward and expected reward.
    # Pytorch will account for this when we compute the gradient of the loss.
    #
    # loss is a single-element tensor (i.e., a scalar).
    criterion = nn.SmoothL1Loss()
    loss = criterion(
        state_action_values,
        expected_state_action_values.unsqueeze(1)
    )
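    # For reference, SmoothL1Loss (Huber loss) with its default beta = 1 is
    #   0.5 * x**2    if |x| < 1
    #   |x| - 0.5     otherwise
    # where x is the difference between the two arguments. It behaves like
    # MSE near zero but grows linearly for large errors, so outliers
    # produce bounded gradients.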


    # We can now run a step of backpropagation on our model.

    # Calling .backward() multiple times will accumulate parameter gradients.
    # Thus, we reset the gradients before each step.
    optimizer.zero_grad()

    # Compute the gradient of loss with respect to every parameter of
    # policy_net and store it in each parameter's .grad attribute.
    # We never use `loss` again: optimizer.step() reads those .grad
    # attributes directly.
    loss.backward()


    # Prevent exploding gradients.
    # Forces gradients to be in [-clip_value, +clip_value]
    torch.nn.utils.clip_grad_value_( # type: ignore
        policy_net.parameters(),
        clip_value = 100
    )

    # Perform a single optimizer step.
    #
    # Uses the current gradient, which is stored
    # in the .grad attribute of each parameter.
    optimizer.step()
77 polecart/basic/util.py Executable file
@@ -0,0 +1,77 @@
import matplotlib
import matplotlib.pyplot as plt

import torch
import math
import random
from collections import namedtuple


Transition = namedtuple(
    "Transition",
    (
        "state",
        "action",
        "next_state",
        "reward"
    )
)
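# For reference, a Transition works like any namedtuple, e.g.
#   t = Transition(state, action, next_state, reward)
#   t.state, t.action                 # field access by name
#   Transition(*zip(*transitions))    # the "transpose" used in optimize.py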


def select_action(
    state,

    *,

    # Number of steps that have been done
    steps_done: int,

    # PyTorch parameters
    policy_net, # DQN policy network
    device,     # Compute device, "cuda" or "cpu"
    env,        # Gym environment instance

    # Epsilon parameters
    #
    # Original docs:
    # EPS_START is the starting value of epsilon
    # EPS_END is the final value of epsilon
    # EPS_DECAY controls the rate of exponential decay of epsilon, higher means a slower decay
    EPS_START = 0.9,
    EPS_END = 0.05,
    EPS_DECAY = 1000
):
    """
    Given a state, select an action using an epsilon-greedy policy.

    Sometimes we use our model, sometimes we sample an action uniformly at random.

    P(random action) starts at EPS_START and decays to EPS_END.
    Decay rate is controlled by EPS_DECAY.
    """

    # Random number 0 <= x < 1
    sample = random.random()

    # Calculate the random-action threshold
    eps_threshold = (
        EPS_END + (EPS_START - EPS_END) *
        math.exp(
            -1.0 * steps_done /
            EPS_DECAY
        )
    )
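    # For reference, with the defaults above the threshold decays as:
    #   steps_done = 0     -> 0.90
    #   steps_done = 1000  -> 0.05 + 0.85 * e**-1 ~= 0.36
    #   steps_done = 5000  -> ~= 0.056
    # i.e. it approaches EPS_END = 0.05 as training goes on.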

    if sample > eps_threshold:
        with torch.no_grad():
            # policy_net(state).max(1) returns the largest Q-value in each
            # row together with its index. That index is the action with
            # the larger expected reward, so we pick it.
            return policy_net(state).max(1)[1].view(1, 1)

    else:
        return torch.tensor(
            [ [env.action_space.sample()] ],
            device=device,
            dtype=torch.long
        )