Added report (filesystem cleanup)
@@ -0,0 +1,199 @@
\documentclass{article}

\usepackage{geometry}
\geometry{
paper = letterpaper,
top = 25mm,
bottom = 30mm,
left = 30mm,
right = 30mm,
headheight = 75mm,
footskip = 15mm,
headsep = 75mm,
}

\usepackage[
left = ``,
right = '',
leftsub = `,
rightsub = '
]{dirtytalk}


\usepackage{tcolorbox}
\usepackage{fancyhdr}
\pagestyle{fancy}
\fancyhf{}
\renewcommand{\headrulewidth}{0mm}
\fancyfoot[C]{\thepage}


\usepackage{adjustbox} % For title
\usepackage{xcolor} % Colored text
\usepackage{titlesec} % Section customization
\usepackage{graphicx} % For images
\usepackage{hyperref} % Clickable references and PDF metadata
\usepackage{fontspec} % Powerful fonts, for XeTeX
\usepackage{biblatex} % Citations
\usepackage{enumitem} % List customization
\usepackage{multicol}
\addbibresource{sources.bib}
%\usepackage{amsmath}
%\usepackage{amssymb}

\graphicspath{ {./images/} }

\hypersetup{
colorlinks=true,
citecolor=black,
filecolor=black,
linkcolor=black,
urlcolor=blue,
pdftitle={Celeste-AI},
pdfauthor={Mark},
pdfcreator={Mark with XeLaTeX}
}


%\frenchspacing
\renewcommand*{\thefootnote}{\arabic{footnote}}

\setmainfont{PTAstraSerif}[
Path = ./Astra/,
Extension = .ttf,
UprightFont = *-Regular,
SmallCapsFont = *-Regular,
BoldFont = *-Bold,
ItalicFont = *-Italic,
BoldItalicFont = *-BoldItalic,
WordSpace = {1.1, 1.2, 1}
]

\renewcommand{\labelitemi}{$-$}
\renewcommand{\labelitemii}{$-$}
\setlist{nosep}
\setlength\parindent{0mm}


% Arguments of \titleformat:
% 1: command to modify
% 2: format of label and text
% 3: label text
% 4: horizontal sep between label and text
% 5: before code
% 6: after code
\titleformat
{\section}
{\centering\large\bfseries}
{Part \thesection:}
{1ex}
{}
[]


\newcommand{\tag}[1]{
\tcbox[
nobeforeafter,
colback=white!90!cyan,
colframe=black!90!cyan,
leftrule = 0.2mm,
rightrule = 0.2mm,
toprule = 0.2mm,
bottomrule = 0.2mm,
left = 0.5mm,
right = 0.5mm,
top = 0.5mm,
bottom = 0.5mm
]{#1}
}

% 5 - 7 pages
% TNR, 1 in margins
%

% However, while describing methods and results, I want each individual to emphasize the methods that they learned and used in the project (this is broadly interpreted: it could be things like learning new methods, learning how to code something new, learning how to collect and polish data, skills like learning how to read papers, or visualization tools). Projects are a great way to get hands-on experience and learn from your peers, so I also want to hear about what you gained from doing the project! It is perfectly reasonable for different people to have different strengths, I have no objection to this. I want to hear what challenges YOU faced, how you overcame them, and what you were able to take away from doing this project!


% 2. Each group should also submit a copy of their code (a general working code is fine, you don't have to resubmit the code each time you change a line).


% Good practices for the project report:
%
% Use figures and tables freely
% Make your figures nice
% Add a short description to figures and tables
%
% Acknowledge anyone that has helped you, as well as cite any references that you have used. You can add an acknowledgement section after the contributions statement.

% Lastly, it is good practice to make sure all your results are reproducible. To do this, you need to tell people exactly what parameters you used to generate each plot. If this list is small, you can include it in the figure caption, or you can include it in the text body or in the Appendix.

\begin{document}

\thispagestyle{empty}


\begin{adjustbox}{minipage=0.7\textwidth, margin=0pt \smallskipamount, center}
\begin{center}

\rule{\linewidth}{0.2mm}\\

\huge
Celeste--AI \\
\normalsize
\vspace{1ex}
Mark Ponomarenko\footnotemark[1], Timothy Chang, Ricardo Parada, Kelly Chang.
\rule{\linewidth}{0.2mm} \\

\end{center}
\end{adjustbox}


% Hack to get the footnote in the title at the bottom of the page.
\phantom{\footnotemark{}}
\footnotetext{Wrote this paper.}

\section{Abstract}
% 10ish line summary

From \textit{Super Mario Bros} \cite{pt-mario} and \textit{Atari} \cite{atari} to \textit{Go} \cite{alphago} and even \textit{StarCraft} \cite{sc2ai}, various forms of machine learning have been used to create game-playing algorithms. A common technique used for this task is reinforcement learning, especially deep Q-learning. In this paper, we present a novel attempt to use these reinforcement-learning techniques to solve the first stage of \textit{Celeste Classic} \cite{celesteclassic}.

\input{parts/background}
\input{parts/introduction}
\input{parts/methods}
\input{parts/results}
\input{parts/conclusion}

\section{Contribution Statement}

\subsection*{Ricardo:}
\tag{code} \tag{hypothesis} \tag{model design} \tag{literature review} \tag{research} \tag{report}

\subsection*{Mark:}
\tag{code} \tag{model design} \tag{report} \tag{literature review} \tag{plots}

\subsection*{Timothy:}
\tag{code} \tag{hypothesis} \tag{model design} \tag{research} \tag{code debugging} \tag{report}

\subsection*{Kelly:}
\tag{code} \tag{hypothesis} \tag{model design} \tag{organization} \tag{report} \tag{presentation}


\vfill

\printbibliography[keyword={site}, title={References: Sites}]
\printbibliography[keyword={article}, title={References: Articles}]

\vfill

\section{Appendix}

Our code is available at \url{https://git.betalupi.com/Mark/celeste-ai}.

\end{document}
@@ -0,0 +1,15 @@
\section{Background}
% what other people did that is closely related to yours.

Our work is heavily based on the research of Mnih et al. in \textit{Human-Level Control through Deep Reinforcement Learning} \cite{humanlevel}. The algorithm we developed to solve \textit{Celeste Classic} uses deep Q-learning supported by replay memory, with a modified reward system and explore-exploit probability. This is very similar to the architecture presented by Mnih et al.

\vspace{2mm}

The greatest difference between our approach and the approach of \textit{Human-Level Control} is the input space and neural network type. Mnih et al. use a convolutional neural network, which takes the game screen as input. This requires a significant number of training epochs and a lot of computation time, and was thus an unreasonable approach for us. We instead used a plain fully-connected neural network with two inputs: the player's x- and y-coordinates.

\vspace{2mm}

Another project similar to ours is AiSpawn's \textit{AI Learns to Speedrun Celeste} \cite{aispawn}. Here, AiSpawn completes the same task we do---solving \textit{Celeste Classic}---but he uses a completely different, evolution-based approach.

\vfill
\pagebreak
@@ -0,0 +1,59 @@
\section{Conclusion}
% What is the answer to the question?

Using the methods described above, we were able to successfully train a Q-learning agent to play \textit{Celeste Classic}.

\vspace{2mm}

The greatest limitation of our model is its slow training speed. It took the model 4000 episodes to complete the first stage, which translates to about 8 hours of training time. A simple evolutionary algorithm, such as the one presented in \textit{AI Learns to Speedrun Celeste} \cite{aispawn}, would likely outperform our Q-learning agent, since such an algorithm is much better suited to incremental tasks like this one.

\vspace{2mm}

We could further develop this model by making it more autonomous---specifically, by training it on raw pixel data rather than curated \texttt{(player\_x, player\_y)} tuples. This modification would \textit{significantly} slow down training, and is therefore best left out of a project with a ten-week time limit.


\vspace{5mm}

While developing our model, we encountered a few questions that we could not resolve. The first of these is the effect of position scaling, which is visible in the graphs below. Note that the colors are inconsistent between the graphs because we refactored our graphing tools after the right graph was generated.

\vspace{5mm}

\begin{minipage}{0.5\textwidth}
\begin{center}
\includegraphics[width=0.9\textwidth]{goodprediction}

\vspace{1mm}
\begin{minipage}{0.9\textwidth}
\raggedright
\say{Best-action} plot after 500 training episodes with position rescaled to the range $[0, 1]$.
\end{minipage}
\end{center}
\end{minipage}\hfill
\begin{minipage}{0.48\textwidth}
\begin{center}
\includegraphics[width=0.9\textwidth]{badprediction}

\vspace{1mm}
\begin{minipage}{0.9\textwidth}
\raggedright
\say{Best-action} plot after 500 training episodes with position in the original range $[0, 128]$.
\end{minipage}
\end{center}
\end{minipage}

\vspace{5mm}

In these graphs, we see that, without changing the model, the scaling of the input values has a \textit{significant} effect on the model's performance. Large inputs cause a \say{zoomed-out linear fanning} effect in the rightmost graph, while the left graph, with rescaled values, shows a much more reasonable \say{blob} pattern.

\vspace{2mm}

In addition to this, we found that re-centering the game's coordinate system so that \texttt{(0, 0)} is in the center of the stage rather than the top-left corner also has a significant effect on the model's performance. Without centering, the model trains perfectly well; with centering, our loss grows uncontrollably and the model fails to converge.

\vspace{5mm}

In both of these cases, the results are surprising. In theory, re-scaling or re-centering the data should not affect the performance of the model: during training, the network's weights should simply adjust to the different input ranges. We do not have an explanation for this behavior, and would be glad to find one.

\vfill
\pagebreak
@@ -0,0 +1,76 @@
\section{Introduction}
% Detailed summary of the problem.
% Discuss why addressing this problem is important.

\textit{Celeste} \cite{celestegame} is a fairly successful 2018 platformer, known for high-quality level design, a vibrant speedrunning\footnotemark{} community, and brutally difficult progression. It is based on \textit{Celeste Classic}, a 4-day game jam project by the same authors. There are a few reasons we chose to create an agent for \textit{Celeste Classic}:

\footnotetext{\textit{speedrunning:} a competition where participants try to complete a game as quickly as possible, often abusing bugs and design mistakes.}

\vspace{4mm}

\noindent
\begin{minipage}{0.5\textwidth}
\noindent
1: \textit{Celeste Classic} is designed for humans, unlike the environments from, for example, the \texttt{gymnasium} \cite{gymnasium} library.

\noindent
2: It runs on the PICO-8 \cite{pico8}, which allows us to modify its code. This gives us a reliable way to interface with the game, which is not true of \textit{Celeste} (2018) --- writing a wrapper for the full game would take a significant amount of time.

\noindent
3: The action space of \textit{Celeste Classic} is small, especially when ineffective actions are pruned.
\end{minipage}\hfill
\begin{minipage}{0.48\textwidth}
\begin{center}
\includegraphics[width=0.9\textwidth]{celeste}

\vspace{1mm}
\begin{minipage}{0.8\textwidth}
The first stage of \textit{Celeste} (2018), showing the player dashing to the upper-right.
\end{minipage}
\end{center}
\end{minipage}

\vspace{5mm}

\noindent
When we started this project, our goal was to develop an agent that would learn to finish the first stage of this game. The player starts in the bottom-left corner of the stage and needs to reach the top right. If the agent touches the spikes at the bottom of the stage, the game is reset and the agent must try again.

To achieve this end, our agent selects one of nine actions (listed below; an illustrative enumeration of these actions follows the figure) at every time step. It does this using a Q-learning algorithm, which is described in detail later in this paper.

\noindent
\begin{minipage}{0.5\textwidth}
Possible actions:
\begin{itemize}
\item \texttt{left}: move left
\item \texttt{right}: move right
\item \texttt{jump-l}: jump left
\item \texttt{jump-r}: jump right
\item \texttt{dash-l}: dash left
\item \texttt{dash-r}: dash right
\item \texttt{dash-u}: dash up
\item \texttt{dash-ru}: dash right-up
\item \texttt{dash-lu}: dash left-up
\end{itemize}
\end{minipage}\hfill
\begin{minipage}{0.48\textwidth}
\vspace{3ex}
\begin{center}
\includegraphics[width=0.48\textwidth]{jump}
\includegraphics[width=0.48\textwidth]{dash}

\vspace{1mm}
\begin{minipage}{0.9\textwidth}
The first stage of \textit{Celeste Classic}. Two possible actions our agent can take are shown: \texttt{jump-r} followed by \texttt{dash-lu}.
\end{minipage}
\end{center}
\end{minipage}

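\vspace{5mm}

Purely as an illustration (these identifiers mirror the list above and are not taken from our code), the action space can be written as a small enumeration:

\begin{verbatim}
from enum import Enum

class Action(Enum):
    # The nine discrete actions our agent can choose from.
    LEFT    = 0   # move left
    RIGHT   = 1   # move right
    JUMP_L  = 2   # jump left
    JUMP_R  = 3   # jump right
    DASH_L  = 4   # dash left
    DASH_R  = 5   # dash right
    DASH_U  = 6   # dash up
    DASH_RU = 7   # dash right-up
    DASH_LU = 8   # dash left-up
\end{verbatim}
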
\vfill{}

This task has no direct practical applications. However, by developing an agent that completes this task, we will explore possible techniques and modifications to the traditional DQN algorithm, and we will learn how a simple machine learning model can be adjusted for a rather complicated task.

\vfill
\pagebreak
@@ -0,0 +1,106 @@
\section{Methods}
% Detailed description of methods used or developed.

Our solution to \textit{Celeste Classic} consists of two major parts: the \textit{interface} and the \textit{agent}. The first provides a high-level interface to the game, and the second uses deep Q-learning techniques to control the player.


\subsection{Interface}

The interface component does not contain any machine-learning logic. Its primary job is to send input to \textit{Celeste Classic} and to receive game state from it. We send input by emulating keypresses with the standard X11 utility \texttt{xdotool}. A minor consequence of this is that our agent can only be run in a Linux environment, but this could be remedied with a bit of extra code.

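\vspace{2mm}

As an illustration of this step, a keypress can be emulated with a short helper like the one below. This is a minimal sketch: the function name and key choices are placeholders, not taken from our code.

\begin{verbatim}
import subprocess
import time

def press_key(key: str, hold_s: float = 0.03) -> None:
    """Emulate a keypress by shelling out to xdotool.

    `key` is an X11 keysym, e.g. "Left", "Right", "x" (jump), "c" (dash).
    """
    subprocess.run(["xdotool", "keydown", key], check=True)  # press...
    time.sleep(hold_s)                                       # ...hold briefly...
    subprocess.run(["xdotool", "keyup", key], check=True)    # ...then release.
\end{verbatim}
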
\vspace{2mm}

We receive game state by abusing the PICO-8's debugging features. Since PICO-8 games are plain text files, we were able to modify the code of \textit{Celeste Classic} with a few well-placed debug-print statements. The interface captures this text, parses it, and feeds it to our model.

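\vspace{2mm}

The parsing step can be sketched as follows. The line format shown here is an assumption made for illustration, not the format our patched cart actually prints.

\begin{verbatim}
def parse_state(line: str) -> dict:
    """Parse a printed line such as "x:12,y:96,dead:0" into a dictionary."""
    state = {}
    for field in line.strip().split(","):
        key, value = field.split(":")
        state[key] = int(value)
    return state

# Example of the assumed format:
assert parse_state("x:12,y:96,dead:0") == {"x": 12, "y": 96, "dead": 0}
\end{verbatim}
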
\vspace{2mm}

The final component of the interface is timing. First, we modified \textit{Celeste Classic} so that it only advances a frame when a key is pressed. This lets the agent run on in-game time, which wouldn't be possible otherwise: \textit{Celeste} normally runs at 30 fps, and the hardware we used to train our model cannot compute gradients that quickly.

Second, we added a \say{frame skip} mechanism to the interface, which tells the game to run a certain number of frames---often many more than one---after the agent selects an action. The benefit of this is twofold. First, it prevents our model from training on redundant information: the game's state does not change significantly over consecutive frames. Second, frame skipping allows transitions to more directly reflect the consequences of an action.

For example, say the agent chooses to dash upwards. Due to the way \textit{Celeste} is designed, the player cannot take any other action until that dash is complete. Our frame-skip mechanism runs the game until the dash is complete, returning a new state only when a new action can be taken.

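\vspace{2mm}

A rough sketch of this frame-skip loop is shown below. The three callables are hypothetical stand-ins for the interface pieces described above; only the control flow is meant to be illustrative.

\begin{verbatim}
def step(action, send_action, advance_frame, read_state, max_frames: int = 60):
    """Apply `action`, then run frames until a new action can be taken.

    `send_action`, `advance_frame`, and `read_state` are hypothetical
    stand-ins for our interface: emulate the keypress, tell the patched
    cart to run one frame, and return the parsed game state.
    """
    send_action(action)
    state = read_state()
    for _ in range(max_frames):
        advance_frame()
        state = read_state()
        if not state.get("action_in_progress", 0):
            break          # e.g. the dash has finished
    return state
\end{verbatim}
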
\subsection{Agent}

The agent we trained to solve \textit{Celeste Classic} is a plain deep Q-learning agent. A neural network estimates the reward of taking each possible action at a given state, and the agent selects the action with the highest predicted reward. This network is a four-layer fully-connected net with 128 nodes in each hidden layer and a ReLU activation on each hidden node. It has two input nodes, the player's x- and y-position, and nine output nodes, each corresponding to an action the agent can take.

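\vspace{2mm}

A minimal PyTorch sketch of a network matching this description follows. We read \say{four-layer} as four linear layers; the variable name is illustrative.

\begin{verbatim}
import torch.nn as nn

# 2 inputs (player x, y), three hidden layers of 128 ReLU units,
# and 9 outputs (one predicted value per action).
q_network = nn.Sequential(
    nn.Linear(2, 128), nn.ReLU(),
    nn.Linear(128, 128), nn.ReLU(),
    nn.Linear(128, 128), nn.ReLU(),
    nn.Linear(128, 9),
)
\end{verbatim}
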
\subsubsection{Reward}

\noindent
\begin{minipage}{0.58\textwidth}
During training, the agent receives a reward of 10 whenever it reaches a checkpoint (shown at right) or completes the stage. If the agent skips a checkpoint, it gets extra reward for each checkpoint it skipped. For example, jumping from point 1 to point 3 would give the agent a reward of 20. A short sketch of this scheme follows the figure.

\vspace{2mm}

These checkpoints are distributed close enough together to keep the agent progressing, but far enough apart to give it a challenge. Points 4 and 5 are particularly interesting in this respect. When we trained an agent without point 4, it would often reach the ledge and fall off, getting no reward.

\vspace{2mm}

Despite many thousands of training epochs, that agent was unable to finish the stage. Though the ledge under point 4 is fairly easy to reach from either point 2 or point 3, it is highly unlikely that an untrained agent would make it from point 2 all the way to point 5 without the extra reward at point 4.

\end{minipage}\hfill
\begin{minipage}{0.4\textwidth}
\begin{center}
\includegraphics[width=0.9\textwidth]{points}

\vspace{1mm}
\begin{minipage}{0.8\textwidth}
Locations of non-final checkpoints
\end{minipage}
\end{center}
\end{minipage}

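\vspace{5mm}

The following is a minimal sketch of this reward scheme, assuming checkpoints are numbered in the order shown in the figure; the constant and function names are our illustrative choices, not identifiers from our code.

\begin{verbatim}
CHECKPOINT_REWARD = 10

def reward(prev_checkpoint: int, new_checkpoint: int) -> int:
    """Reward for advancing from one furthest-reached checkpoint to another.

    Skipped checkpoints still count: going from point 1 straight to
    point 3 yields 2 * CHECKPOINT_REWARD = 20.
    """
    gained = new_checkpoint - prev_checkpoint
    return CHECKPOINT_REWARD * max(gained, 0)
\end{verbatim}
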
\vfill
\pagebreak


\subsubsection{Exploration Probability}

At every step, we use the Q network to predict the expected reward of each of the nine actions. Naturally, the best action to take is the one with the highest predicted reward. To encourage exploration, we instead take a random action with a probability given by
$$
P(c) = \epsilon_1 + (\epsilon_0 - \epsilon_1) e^{-c / d},
$$
where $\epsilon_0$ is the initial random probability, $\epsilon_1$ is the final random probability, and $d$ is the rate at which $P(c)$ decays to $\epsilon_1$. The parameter $c$ is a rather unusual \say{time} parameter: it counts the number of times the agent has reached the next checkpoint.

\vspace{2mm}

Usually, such $\epsilon$ policies depend on the number of training steps completed. For many applications, this makes sense: as a model trains for more iterations, it begins to perform better, and thus has less need to explore. In our case, that doesn't work: we need to explore until we find a way to reach a checkpoint, and rely on the model's predictions once we have found one. Therefore, instead of computing $P$ with respect to a simple iteration counter, we compute it with respect to $c$.

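\vspace{2mm}

A small sketch of this policy follows; the constants shown are placeholders rather than the exact values we trained with.

\begin{verbatim}
import math
import random

EPS_START, EPS_END, DECAY = 0.9, 0.05, 50.0  # placeholder constants

def explore_probability(c: int) -> float:
    """P(c) = eps_1 + (eps_0 - eps_1) * exp(-c / d)."""
    return EPS_END + (EPS_START - EPS_END) * math.exp(-c / DECAY)

def choose_action(q_values, c: int) -> int:
    """Pick a random action with probability P(c), else the greedy one."""
    if random.random() < explore_probability(c):
        return random.randrange(len(q_values))
    return max(range(len(q_values)), key=lambda a: q_values[a])
\end{verbatim}
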
\subsubsection{Target Network, Replay Memory}

To keep the training process stable, we use a \textit{target network} as described in \textit{Human-Level Control} \cite{humanlevel}. However, instead of periodically hard-resetting the target network to the Q network, we use a soft update defined by the following rule, where $W_Q$ and $W_T$ are the weights of the Q network and the target network, respectively:
$$
W_T \leftarrow 0.05\, W_Q + 0.95\, W_T
$$

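\vspace{2mm}

In PyTorch, a soft update of this form can be sketched as follows. Here \texttt{q\_network} and \texttt{target\_network} are assumed to be two copies of the model sketched earlier, and the factor 0.05 plays the role usually called $\tau$.

\begin{verbatim}
TAU = 0.05  # soft-update rate

def soft_update(q_network, target_network, tau: float = TAU) -> None:
    """Move the target network's weights slightly toward the Q network's."""
    q_state = q_network.state_dict()
    t_state = target_network.state_dict()
    for key in t_state:
        t_state[key] = tau * q_state[key] + (1.0 - tau) * t_state[key]
    target_network.load_state_dict(t_state)
\end{verbatim}
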
\vspace{2mm}

We also use the \textit{replay memory} from the same paper, with a batch size of 100 and a total capacity of 50,000 transitions. Our model is optimized using Adam with a learning rate of 0.001.

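\vspace{2mm}

A minimal sketch of such a replay memory follows; the \texttt{Transition} field names are our illustrative choice.

\begin{verbatim}
import random
from collections import deque, namedtuple

Transition = namedtuple("Transition", ["state", "action", "reward", "next_state"])

class ReplayMemory:
    """Fixed-capacity buffer of transitions, sampled uniformly at random."""

    def __init__(self, capacity: int = 50_000):
        self.buffer = deque(maxlen=capacity)  # oldest transitions fall off

    def push(self, *args) -> None:
        self.buffer.append(Transition(*args))

    def sample(self, batch_size: int = 100):
        return random.sample(self.buffer, batch_size)
\end{verbatim}
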
\subsubsection{Bellman Equation}

Our goal is to train our model to approximate the value function $Q(s, a)$, which tells us the value of taking action $a$ at state $s$. This approximation can then be used to choose the best action at each state. We define $Q$ using the Bellman equation:
$$
Q(s, a) = r(s) + \gamma \max_{a'} Q(s', a'),
$$
where $r(s)$ is the reward at state $s$, $s'$ is the state we reach when we perform action $a$ at state $s$, and $\gamma$ is a discount factor that makes present reward more valuable than future reward. In our model, we set $\gamma = 0.9$.

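\vspace{2mm}

Putting these pieces together, a single training step can be sketched roughly as below, reusing the hypothetical \texttt{ReplayMemory}, networks, and constants from the previous sketches. This is an illustration of the update described above, not a copy of our training code (terminal-state handling is omitted for brevity).

\begin{verbatim}
import torch
import torch.nn.functional as F

GAMMA = 0.9

def train_step(memory, q_network, target_network, optimizer, batch_size=100):
    """One DQN update: regress Q(s, a) toward r + gamma * max_a' Q_target(s', a')."""
    batch = memory.sample(batch_size)
    # States are assumed to be stored as (x, y) tuples.
    states      = torch.tensor([t.state for t in batch], dtype=torch.float32)
    actions     = torch.tensor([t.action for t in batch]).unsqueeze(1)
    rewards     = torch.tensor([t.reward for t in batch], dtype=torch.float32)
    next_states = torch.tensor([t.next_state for t in batch], dtype=torch.float32)

    predicted = q_network(states).gather(1, actions).squeeze(1)
    with torch.no_grad():
        target = rewards + GAMMA * target_network(next_states).max(dim=1).values

    loss = F.smooth_l1_loss(predicted, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()

# In our setup the optimizer would be Adam with a learning rate of 0.001:
# optimizer = torch.optim.Adam(q_network.parameters(), lr=0.001)
\end{verbatim}
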
\vfill
\pagebreak
@@ -0,0 +1,22 @@
\section{Results}
% The results of applying the methods to the data set.
% Also discuss why the results make sense, possible implications.

After sufficient training, our model consistently completed the first stage of \textit{Celeste Classic}. It took 4000 training episodes to achieve this result.

\vspace{2mm}

The figure below summarizes our model's performance during training. The color of each pixel in a plot is determined by the action with the highest predicted value at that position, and the path the agent takes through the stage is shown in white. The agent completes the stage in the \say{4000 Episodes} plot, and fails to complete it within the allotted time limit in all the rest. Training the model for more than 4000 episodes did not have a significant effect on the agent's behavior.

\begin{center}
\includegraphics[width=\textwidth]{plots}
\end{center}

A few things are interesting about these results. First, we see that the best-action patterns in the plots above do not resemble the shape of the stage. At points the agent does not visit, the predicted best action does not resemble the action an intelligent human player would take. This is because the model is never trained on those points; the predictions there are a side effect of the training steps applied to the points on the agent's path.

\vspace{2mm}

Second, the plots above clearly show the effect of our modified explore/exploit policy. We can see that the first few segments of the agent's path are the same in each plot, and the more the agent trains, the longer this repeated prefix becomes. This is a direct result of our explore/exploit policy: our agent stops exploring sections of the stage it can reliably complete, and therefore repeats paths that work.

\vfill
\pagebreak
@@ -0,0 +1,113 @@
@article{humanlevel,
author = {Mnih, Volodymyr and Kavukcuoglu, Koray and Silver, David and Rusu, Andrei A. and Veness, Joel and Bellemare, Marc G. and Graves, Alex and Riedmiller, Martin and Fidjeland, Andreas K. and Ostrovski, Georg and Petersen, Stig and Beattie, Charles and Sadik, Amir and Antonoglou, Ioannis and King, Helen and Kumaran, Dharshan and Wierstra, Daan and Legg, Shane and Hassabis, Demis},
title = {Human-level control through deep reinforcement learning},
journal = {Nature},
issn = {00280836},
month = feb,
number = 7540,
pages = {529--533},
publisher = {Nature Publishing Group},
url = {http://dx.doi.org/10.1038/nature14236},
volume = 518,
year = 2015,
keywords = {article}
}


@article{atari,
author = {Mnih, Volodymyr and Kavukcuoglu, Koray and Silver, David and Graves, Alex and Antonoglou, Ioannis and Wierstra, Daan and Riedmiller, Martin},
title = {Playing Atari with Deep Reinforcement Learning},
url = {http://arxiv.org/abs/1312.5602},
year = 2013,
keywords = {article}
}

@article{alphago,
author = {Silver, David and Schrittwieser, Julian and Simonyan, Karen and Antonoglou, Ioannis and Huang, Aja and Guez, Arthur and Hubert, Thomas and Baker, Lucas and Lai, Matthew and Bolton, Adrian and Chen, Yutian and Lillicrap, Timothy and Hui, Fan and Sifre, Laurent and van den Driessche, George and Graepel, Thore and Hassabis, Demis},
title = {Mastering the game of Go without human knowledge},
journal = {Nature},
pages = {354--359},
publisher = {Macmillan Publishers Limited},
url = {http://dx.doi.org/10.1038/nature24270},
volume = 550,
year = 2017,
keywords = {article}
}


@online{sc2ai,
title = {SC2 AI Arena},
url = {https://sc2ai.net},
addendum = {Accessed 2023-02-25},
keywords = {site}
}

@online{pt-mario,
author = {Feng, Yuansong and Subramanian, Suraj and Wang, Howard and Guo, Steven},
title = {Train a Mario-playing RL Agent},
url = {https://pytorch.org/tutorials/intermediate/mario_rl_tutorial.html},
addendum = {Accessed 2023-02-25},
keywords = {site}
}

@online{pt-cart,
author = {Paszke, Adam and Towers, Mark},
title = {Reinforcement Learning (DQN) Tutorial},
url = {https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html},
addendum = {Accessed 2023-02-25},
keywords = {site}
}

@online{celestegame,
title = {Celeste},
url = {https://www.celestegame.com},
addendum = {Accessed 2023-02-25},
keywords = {site},
year = 2018
}

@online{celesteclassic,
title = {Celeste Classic},
url = {https://www.lexaloffle.com/bbs/?pid=11722},
addendum = {Accessed 2023-02-25},
keywords = {site},
year = 2015
}

@online{pico8,
title = {PICO-8},
url = {https://www.lexaloffle.com/pico-8.php},
addendum = {Accessed 2023-02-25},
keywords = {site}
}

@online{gymnasium,
title = {Gymnasium},
url = {https://github.com/Farama-Foundation/Gymnasium},
addendum = {Accessed 2023-02-25},
keywords = {site}
}

@online{aispawn,
author = {AiSpawn},
title = {AI Learns to Speedrun Celeste},
url = {https://www.youtube.com/watch?v=y8g1AcTYovg},
organization = {YouTube},
addendum = {Accessed 2023-02-22},
keywords = {site}
}