update 1_get_expert_trajectory.py

HuangJiaLian · Aug 24, 2021 · b2f048c · b2f048c
1 parent 9414186
commit b2f048c
Showing 1 changed file with 48 additions and 29 deletions.
diff --git a/1_get_expert_trajectory.py b/1_get_expert_trajectory.py
@@ -1,20 +1,31 @@
-from tensorflow.keras import models, layers, optimizers
+'''
+@Description: 
+@Author: Jack Huang
+@Github: https://github.com/HuangJiaLian
+@Date: 2019-10-10 19:27:08
+@LastEditors: Jack Huang
+@LastEditTime: 2021-08-24 19:24:52
+'''
+
 import gym 
 import numpy as np 
 import os, shutil
 
-# A very easy policy used for finding expert trajectories
+# A simple policy used for finding expert trajectories
 def get_action(state):
+    # If moving to the left, full throttle reverse
     if state[-1] < 0:
-        action = 0 # full throttle reverse
+        action = 0
+    # If moving to the right, full throttle forward
     elif state[-1] > 0:
-        action = 2 # full throttle forward
+        action = 2 
+    # Zero throttle
     else:
-        action = 1 # zero throttle
+        action = 1 
     return action
 
 
-# Function used for saveing expert trajectories
+# Saving expert trajectories
 def open_file_and_save(path, name, data):
     try:
         with open(os.path.join(path,name), 'ab') as f_handle:
@@ -24,76 +35,84 @@ def open_file_and_save(path, name, data):
             np.savetxt(f_handle, data, fmt='%s')
 
 
-
-def main(env_name):
+def main():
     # Initialise gym environment
-    env = gym.make(env_name)
+    env = gym.make("MountainCar-v0")
     ob_space = env.observation_space
     action_space = env.action_space.n 
 
     # Number of expert trajectories
     episodes = 20
 
-    # The maximum steps in one episode
+    # The maximum number of steps in one episode, which ensures that the
+    # mountain car task is a finite Markov decision process (MDP).
     max_steps = 200
-    scores = []
+
+    # A list to store the return (i.e., the sum of the rewards
+    # collected in an episode) of each episode.
+    return_Gs = []
 
-    # Path for storing trajectories
+    # Create path for storing trajectories
     path = 'exp_traj'
     if os.path.exists(path) != True:
         os.makedirs(path)
     else:
-        # Delete previous trajectories
+        # Delete previous trajectories before creating a new one
         shutil.rmtree(path) 
         os.makedirs(path)
 
     for episode in range(episodes):
+        print('Episode:{}'.format(episode+1))
+        # Get the initial state of environment
         state = env.reset()
-        score = 0
+        return_G = 0
         done = False 
 
         # Containers for the data collected during this episode
         observations = []
         actions = []
-        returns = []
         for step in range(max_steps):
-
-            # Uncomment rendering if run on a server
-            # env.render()
+            # env.render() # Uncomment rendering if run on a server 
+            # Get action of expert according to the new state
             action = get_action(state=state)
 
             # Record states and actions
             observations.append(state)
             actions.append(action)
 
-            # Interface with gym environment
-            next_state, reward, done, _ = env.step(action)
-            score += reward
+            # Interact with gym environment, and 
+            # obtain a new reward and a new state
+            next_state, next_reward, done, _ = env.step(action)
+            return_G += next_reward
             state = next_state
+
+            # If the car reaches the goal, end the episode.
             if done:
                 break
+
         # Record the return (the sum of the rewards in this episode)
-        print('Return:', score)
-        scores.append(score)    
+        print('Return:{}'.format(return_G))
+        return_Gs.append(return_G)    
 
-        # Get the next states of current states
+        # The next states, S_{t+1}
         next_observations = observations[1:]
         observations = observations[:-1]
         actions = actions[:-1]
 
+        # Save as csv files
         next_observations = np.reshape(next_observations, newshape=[-1] + list(ob_space.shape))
         observations = np.reshape(observations, newshape=[-1] + list(ob_space.shape))
         actions = np.array(actions).astype(dtype=np.int32)
-
         open_file_and_save(path = 'exp_traj', name = 'next_observations.csv', data = next_observations)
         open_file_and_save(path = 'exp_traj', name = 'observations.csv', data = observations)
         open_file_and_save(path = 'exp_traj', name = 'actions.csv', data = actions)
-
 
-    scores = np.array(scores).astype(dtype=np.int32)
-    open_file_and_save(path = 'exp_traj', name = 'returns.csv', data = scores)
+    # Save returns 
+    return_Gs = np.array(return_Gs).astype(dtype=np.int32)
+    open_file_and_save(path = 'exp_traj', name = 'returns.csv', data = return_Gs)
     env.close()
+    print('Done: {} expert trajectories obtained.'.format(episodes))
 
 
 if __name__ == "__main__":
-    main(env_name="MountainCar-v0")
+    main()