diff --git a/celeste/celeste.py b/celeste/celeste.py
index 1168ab1..851c30d 100755
--- a/celeste/celeste.py
+++ b/celeste/celeste.py
@@ -49,6 +49,8 @@ class Celeste:
 
 		# Initialize variables
 		self.internal_status = {}
+		self.before_out = None
+		self.last_point_frame = 0
 
 		# Score system
 		self.frame_counter = 0
@@ -166,6 +168,9 @@ class Celeste:
 		self.internal_status = {}
 		self.next_point = 0
 		self.frame_counter = 0
+		self.before_out = None
+		self.resetting = True
+		self.last_point_frame = 0
 
 		self.keypress("Escape")
 		self.keystring("run")
@@ -185,13 +190,12 @@ class Celeste:
 
 		# Get state, call callback, wait for state
 		# One line => one frame.
 
-		before_out = None
-
 		it = iter(self.process.stdout.readline, "")
 		for line in it:
 			l = line.decode("utf-8")[:-1].strip()
 
+			self.resetting = False
 
 			# This should only occur at game start
 			if l in ["!RESTART"]:
@@ -206,7 +210,7 @@ class Celeste:
 				key, val = entry.split(":")
 				self.internal_status[key] = val
 
-			
+
 
 
 			# Update checkpoints
@@ -221,6 +225,7 @@ class Celeste:
 			if dist <= 4 and y == ty:
 				print(f"Got point {self.next_point}")
 				self.next_point += 1
+				self.last_point_frame = self.frame_counter
 
 				# Recalculate distance to new point
 				tx, ty = self.target_points[self.status["stage"]][self.next_point]
@@ -229,9 +234,14 @@ class Celeste:
 					(y-ty)*(y-ty)
 				)
 
+			# Timeout if we spend too long between points
+			elif self.frame_counter - self.last_point_frame > 40:
+				self.internal_status["dc"] = str(int(self.internal_status["dc"]) + 1)
+
 			self.dist = dist
-			
-			# Call step callback
-			if before_out is not None:
-				after(self, before_out)
-			before_out = before(self)
\ No newline at end of file
+
+			# Call step callbacks
+			if self.before_out is not None:
+				after(self, self.before_out)
+			if not self.resetting:
+				self.before_out = before(self)
\ No newline at end of file
diff --git a/celeste/main.py b/celeste/main.py
index aa972b0..9ad288d 100644
--- a/celeste/main.py
+++ b/celeste/main.py
@@ -42,9 +42,9 @@ EPS_DECAY = 1000
 BATCH_SIZE = 128
 # Learning rate of target_net.
 # Controls how soft our soft update is.
-# 
+#
 # Should be between 0 and 1.
-# Large values 
+# Large values
 # Small values do the opposite.
 #
 # A value of one makes target_net
@@ -174,7 +174,7 @@ def optimize_model():
 		raise Exception(f"Not enough elements in memory for a batch of {BATCH_SIZE}")
 
 
-	
+
 
 	# Get a random sample of transitions
 	batch = random.sample(memory, BATCH_SIZE)
@@ -238,13 +238,13 @@ def optimize_model():
 	# V(s_t+1) = max_a ( Q(s_t+1, a) )
 	# = the maximum reward over all possible actions at state s_t+1.
 	next_state_values = torch.zeros(BATCH_SIZE, device = compute_device)
-	
+
 	# Don't compute gradient for operations in this block.
 	# If you don't understand what this means, RTFD.
 	with torch.no_grad():
-		
+
 		# Note the use of non_final_mask here.
-		# States that are final do not have their reward set by the line 
+		# States that are final do not have their reward set by the line
 		# below, so their reward stays at zero.
 		#
 		# States that are not final get their predicted value
@@ -274,7 +274,7 @@ def optimize_model():
 		expected_state_action_values.unsqueeze(1)
 	)
 
-	
+
 
 
 	# We can now run a step of backpropagation on our model.
@@ -362,10 +362,18 @@ def on_state_after(celeste, before_out):
 
 		if state["next_point"] == next_state["next_point"]:
 			reward = state["dist"] - next_state["dist"]
+
+			if reward > 0:
+				reward = 1
+			elif reward < 0:
+				reward = -1
+			else:
+				reward = 0
 
 		else:
 			# Score for reaching a point
 			reward = 10
 
+
 	pt_reward = torch.tensor([reward], device = compute_device)
 
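For context, the two behavioural changes in this patch are a sign-clipped progress reward in on_state_after and a 40-frame timeout between checkpoints in celeste.py (the timeout bumps internal_status["dc"], which the surrounding code appears to treat as a death counter). The standalone sketch below restates that logic outside the patch; clip_progress_reward and timed_out are hypothetical helper names introduced only for illustration and do not exist in celeste.py or main.py, while the sign-clipping and the 40-frame limit mirror the diff above.

# Illustrative sketch only; these helpers are not part of the patch.

def clip_progress_reward(prev_dist: float, new_dist: float) -> int:
	"""Sign-clip the change in distance to the next checkpoint."""
	delta = prev_dist - new_dist
	if delta > 0:
		return 1       # moved closer to the checkpoint
	elif delta < 0:
		return -1      # moved away from it
	return 0           # no net progress

def timed_out(frame_counter: int, last_point_frame: int, limit: int = 40) -> bool:
	"""True when too many frames have passed since the last checkpoint was reached."""
	return frame_counter - last_point_frame > limit

if __name__ == "__main__":
	assert clip_progress_reward(10.0, 7.5) == 1
	assert clip_progress_reward(7.5, 10.0) == -1
	assert clip_progress_reward(7.5, 7.5) == 0
	assert timed_out(frame_counter=100, last_point_frame=50)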