77 lines
1.5 KiB
Python
Executable File
77 lines
1.5 KiB
Python
Executable File
import matplotlib
|
|
import matplotlib.pyplot as plt
|
|
|
|
import torch
|
|
import math
|
|
import random
|
|
from collections import namedtuple
|
|
|
|
|
|
# One step of agent experience for the replay buffer: the (state, action)
# pair together with the resulting next_state and reward.
Transition = namedtuple(
    "Transition",
    ["state", "action", "next_state", "reward"],
)
|
|
|
|
|
|
def select_action(
    state,
    *,
    # Number of environment steps taken so far; drives the epsilon decay.
    steps_done: int,
    # TF parameters
    policy_net,  # DQN policy network
    device,  # Render device, "gpu" or "cpu"
    env,  # GYM environment instance
    # Epsilon schedule:
    #   EPS_START is the starting value of epsilon,
    #   EPS_END is the final value of epsilon,
    #   EPS_DECAY controls the rate of exponential decay
    #   (higher means a slower decay).
    EPS_START = 0.9,
    EPS_END = 0.05,
    EPS_DECAY = 1000
):
    """
    Choose an action for `state` with an epsilon-greedy policy.

    With probability epsilon a uniformly random action is sampled from
    the environment's action space; otherwise the greedy action under
    `policy_net` is taken.  Epsilon starts at EPS_START and decays
    exponentially toward EPS_END at a rate set by EPS_DECAY.

    Returns a (1, 1) long tensor holding the chosen action index.
    """
    # Exploration probability for the current step (exponential decay).
    threshold = EPS_END + (EPS_START - EPS_END) * math.exp(
        -1.0 * steps_done / EPS_DECAY
    )

    # Exploration branch: draw once in [0, 1) and compare against the
    # decayed threshold; at or below it, sample a uniform random action.
    if random.random() <= threshold:
        return torch.tensor(
            [[env.action_space.sample()]],
            device=device,
            dtype=torch.long,
        )

    # Exploitation branch: greedy action under the current policy.
    with torch.no_grad():
        # max over dim 1 reduces across actions; `.indices` is the argmax,
        # i.e. the action with the largest predicted Q-value.
        return policy_net(state).max(dim=1).indices.view(1, 1)