
Commit

Update to support TensorFlow v2; modify 2_airl.py
Jack Huang committed Sep 21, 2021
1 parent b2f048c commit ec707ef
Showing 5 changed files with 125 additions and 133 deletions.
148 changes: 70 additions & 78 deletions 2_airl.py
@@ -4,10 +4,11 @@
@Github: https://github.com/HuangJiaLian
@Date: 2019-10-10 19:27:08
@LastEditors: Jack Huang
@LastEditTime: 2019-11-18 19:24:52
@LastEditTime: 2021-08-24 19:24:52
'''

import tensorflow as tf
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
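# Note: importing tensorflow.compat.v1 and calling disable_v2_behavior()
# lets the original TF1-style graph code (tf.placeholder, tf.Session,
# tf.train.Saver) run unchanged on a TensorFlow 2 installation by turning
# off eager execution.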
import numpy as np
import gym, os
import algo.generator as gen
@@ -42,9 +43,11 @@ def drawRewards(D, episode, path):
plt.clf()

def main():
# Env
# Mountain car env setting
env = gym.make('MountainCar-v0')
ob_space = env.observation_space
action_space = env.action_space
print(ob_space, action_space)

# For Reinforcement Learning
Policy = gen.Policy_net('policy', env)
@@ -54,90 +57,89 @@ def main():
# For Inverse Reinforcement Learning
D = dis.Discriminator(env)

# Load Experts Demonstration
# Load expert trajectories
expert_observations = np.genfromtxt('exp_traj/observations.csv')
next_expert_observations = np.genfromtxt('exp_traj/next_observations.csv')
expert_actions = np.genfromtxt('exp_traj/actions.csv', dtype=np.int32)

# Expert returns are only used to show the mean score, not for training
expert_returns = np.genfromtxt('exp_traj/returns.csv')
mean_expert_return = np.mean(expert_returns)

max_episode = 24000
max_episode = 10000
# The maximum number of steps in one episode, to make sure the mountain
# car task is a finite Markov decision process (MDP).
max_steps = 200
saveReturnEvery = 100
num_expert_tra = 20

# Logger used to record the training process
# Used only to record the training process
train_logger = log.logger(logger_name='AIRL_MCarV0_Training_Log',
logger_path='./trainingLog/', col_names=['Episode', 'Actor(D)', 'Expert Mean(D)','Actor','Expert Mean'])
logger_path='./trainingLog/', col_names=['Episode', 'Actor(D)',
'Expert Mean(D)','Actor','Expert Mean'])

# Saver to save all the variables
# Model saver
model_save_path = './model/'
model_name = 'airl'
saver = tf.train.Saver(max_to_keep=int(max_episode/100))

with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
obs = env.reset()
# do NOT use original rewards to update policy
for episode in range(max_episode):
if episode % 100 == 0:
print('Episode ', episode)

observations = []
actions = []
rewards = []
v_preds = []

# Iterate over every step of this episode
obs = env.reset()
# Interact with the environment until reaching
# the terminal state or the maximum number of steps.
for step in range(max_steps):
# if episode % 100 == 0:
# env.render()

obs = np.stack([obs]).astype(dtype=np.float32)
act, v_pred = Policy.get_action(obs=obs, stochastic=True)
act = np.asscalar(act)
v_pred = np.asscalar(v_pred)
# act, v_pred = Policy.get_action(obs=obs, stochastic=True)
act, v_pred = Old_Policy.get_action(obs=obs, stochastic=True)


# Interact with the environment
next_obs, reward, done, info = env.step(act)
next_obs, reward, done, _ = env.step(act)

observations.append(obs)
actions.append(act)
# The reward here is not used to update the network; it is only
# used to record the true performance.
# DO NOT use original rewards to update policy
rewards.append(reward)
v_preds.append(v_pred)

if done:
v_preds_next = v_preds[1:] + [0] # the state after the terminal state has value 0
# the state after the terminal state has value 0
v_preds_next = v_preds[1:] + [0]
break
else:
obs = next_obs

# Once the episode is over, use the collected data to train the networks

# Prepare the data
# The expert data is already prepared
# The generator's data
# convert list to numpy array for feeding tf.placeholder

# Data preparation
# Data for generator: convert list to numpy array for feeding tf.placeholder
next_observations = observations[1:]
observations = observations[:-1]
actions = actions[:-1]

next_observations = np.reshape(next_observations, newshape=[-1] + list(ob_space.shape))
observations = np.reshape(observations, newshape=[-1] + list(ob_space.shape))
actions = np.array(actions).astype(dtype=np.int32)
# Get the G's probabilities
probabilities = get_probabilities(policy=Policy, observations=observations, actions=actions)
# Get the experts' probabilities
expert_probabilities = get_probabilities(policy=Policy, observations=expert_observations, actions=expert_actions)

# G's probabilities
probabilities = get_probabilities(policy=Policy, \
observations=observations, actions=actions)
# Experts' probabilities
expert_probabilities = get_probabilities(policy=Policy, \
observations=expert_observations, actions=expert_actions)

# numpy's log uses base e (natural logarithm)
log_probabilities = np.log(probabilities)
log_expert_probabilities = np.log(expert_probabilities)

# Prepare data for the discriminator
if D.only_position:
observations_for_d = (observations[:,0]).reshape(-1,1)
next_observations_for_d = (next_observations[:,0]).reshape(-1,1)
@@ -146,103 +148,93 @@ def main():
log_probabilities_for_d = log_probabilities.reshape(-1,1)
log_expert_probabilities_for_d = log_expert_probabilities.reshape(-1,1)

# Arrange the data neatly

obs, obs_next, acts, path_probs = \
observations_for_d, next_observations_for_d, actions, log_probabilities
observations_for_d, next_observations_for_d, \
actions.reshape(-1,1), log_probabilities.reshape(-1,1)
expert_obs, expert_obs_next, expert_acts, expert_probs = \
expert_observations_for_d, next_expert_observations_for_d, expert_actions, log_expert_probabilities
expert_observations_for_d, next_expert_observations_for_d, \
expert_actions.reshape(-1,1), log_expert_probabilities.reshape(-1,1)


acts = acts.reshape(-1,1)
expert_acts = expert_acts.reshape(-1,1)

path_probs = path_probs.reshape(-1,1)
expert_probs = expert_probs.reshape(-1,1)

# Train the discriminator to obtain the reward function
# print('Train D')
# The amounts of generator and expert data are unbalanced here
# This could probably be optimized
# The amounts of generator and expert data are unbalanced here; could this be optimized?
# Train discriminator
batch_size = 32
for i in range(1):
# Sample a batch from the generator
# Sample generator
nobs_batch, obs_batch, act_batch, lprobs_batch = \
sample_batch(obs_next, obs, acts, path_probs, batch_size=batch_size)

# Sample a batch from the expert
# Sample expert
nexpert_obs_batch, expert_obs_batch, expert_act_batch, expert_lprobs_batch = \
sample_batch(expert_obs_next, expert_obs, expert_acts, expert_probs, batch_size=batch_size)
sample_batch(expert_obs_next, expert_obs, expert_acts, \
expert_probs, batch_size=batch_size)

# The first half are negative samples (label 0); the second half are positive samples (label 1)
# Label generator samples as 0, meaning the discriminator should
# treat the generator's behavior as non-expert;
# label expert samples as 1, meaning the discriminator should
# treat the expert's behavior as optimal.
labels = np.zeros((batch_size*2, 1))
labels[batch_size:] = 1.0

# Concatenate them and feed them into the network for training
obs_batch = np.concatenate([obs_batch, expert_obs_batch], axis=0)
nobs_batch = np.concatenate([nobs_batch, nexpert_obs_batch], axis=0)
# If the reward depends only on the state, the line below is not used
act_batch = np.concatenate([act_batch, expert_act_batch], axis=0)
lprobs_batch = np.concatenate([lprobs_batch, expert_lprobs_batch], axis=0)
D.train(obs_t = obs_batch,
nobs_t = nobs_batch,
lprobs = lprobs_batch,
labels = labels)

if episode % 50 == 0:
drawRewards(D=D, episode=episode, path='./trainingLog/')

# output of this discriminator is reward
# The output of the discriminator is used as the reward
if D.score_discrim == False:
d_rewards = D.get_scores(obs_t=observations_for_d)
else:
d_rewards = D.get_l_scores(obs_t=observations_for_d, nobs_t=next_observations_for_d, lprobs=log_probabilities_for_d)
d_rewards = D.get_l_scores(obs_t=observations_for_d, \
nobs_t=next_observations_for_d, lprobs=log_probabilities_for_d)
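# In AIRL the discriminator is typically parameterized as
# D(s,a,s') = exp(f(s,s')) / (exp(f(s,s')) + pi(a|s)), so that
# log D - log(1 - D) = f(s,s') - log pi(a|s) acts as the learned reward.
# Assuming D.get_l_scores follows this form, lprobs supplies the
# log pi(a|s) term, while D.get_scores returns the state-only score.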

d_rewards = np.reshape(d_rewards, newshape=[-1]).astype(dtype=np.float32)
d_actor_return = np.sum(d_rewards)
# print(d_actor_return)
# Sum rewards to get the return: just for tracking returns over time.
d_actor_return = np.sum(d_rewards)

# d_expert_return: just for tracking
if D.score_discrim == False:
expert_d_rewards = D.get_scores(obs_t=expert_observations_for_d)
else:
expert_d_rewards = D.get_l_scores(obs_t=expert_observations_for_d, nobs_t= next_expert_observations_for_d,lprobs= log_expert_probabilities_for_d )
expert_d_rewards = D.get_l_scores(obs_t=expert_observations_for_d, \
nobs_t= next_expert_observations_for_d,lprobs= log_expert_probabilities_for_d )
expert_d_rewards = np.reshape(expert_d_rewards, newshape=[-1]).astype(dtype=np.float32)
d_expert_return = np.sum(expert_d_rewards)/num_expert_tra
# print(d_expert_return)

######################
# Start Logging #
######################

#** Start Logging **#: used only to track information
train_logger.add_row_data([episode, d_actor_return, d_expert_return,
sum(rewards), mean_expert_return], saveFlag=True)
if episode % saveReturnEvery == 0:
train_logger.plotToFile(title='Return')
###################
# End logging #
###################
#** End logging **#

gaes = PPO.get_gaes(rewards=d_rewards, v_preds=v_preds, v_preds_next=v_preds_next)
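# PPO.get_gaes presumably implements Generalized Advantage Estimation:
# A_t = sum_{l>=0} (gamma*lambda)^l * delta_{t+l}, with
# delta_t = r_t + gamma*V(s_{t+1}) - V(s_t), where the discriminator
# scores d_rewards play the role of the environment reward r_t.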
gaes = np.array(gaes).astype(dtype=np.float32)
# gaes = (gaes - gaes.mean()) / gaes.std()
v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

# Train the policy to obtain a better policy
inp = [observations, actions, gaes, d_rewards, v_preds_next]

# if episode % 4 == 0:
# PPO.assign_policy_parameters()

PPO.assign_policy_parameters()

if episode % 4 == 0:
PPO.assign_policy_parameters()
# PPO.assign_policy_parameters()
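# assign_policy_parameters presumably copies the current policy weights
# into Old_Policy; PPO's surrogate objective uses the probability ratio
# pi(a|s) / pi_old(a|s), and with this change the snapshot is refreshed
# every 4 episodes instead of every episode.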

for epoch in range(10):
sample_indices = np.random.randint(low=0, high=observations.shape[0],
size=32) # indices are in [low, high)
sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp] # sample training data
sample_indices = np.random.randint(low=0, high=observations.shape[0],size=32)
# sample training data
sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp]
PPO.train(obs=sampled_inp[0],
actions=sampled_inp[1],
gaes=sampled_inp[2],
rewards=sampled_inp[3],
v_preds_next=sampled_inp[4])
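# PPO.train presumably optimizes the clipped surrogate objective
# L = E[ min(r_t * A_t, clip(r_t, 1-eps, 1+eps) * A_t) ] together with a
# value-function loss, using 10 epochs of minibatches of size 32 drawn
# from this episode's trajectory.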
# Save the whole model

# Save model
if episode > 0 and episode % 100 == 0:
saver.save(sess, os.path.join(model_save_path, model_name), global_step=episode)

19 changes: 10 additions & 9 deletions 3_load_airl.py
@@ -4,10 +4,11 @@
@Github: https://github.com/HuangJiaLian
@Date: 2019-10-10 19:27:08
@LastEditors: Jack Huang
@LastEditTime: 2019-11-21 22:10:00
@LastEditTime: 2021-08-24 22:10:00
'''

import tensorflow as tf
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import numpy as np
import gym, os
import argparse
@@ -51,7 +52,7 @@ def main():
num_expert_tra = 20

# Saver to save all the variables
model_save_path = './modelGAN/'
model_save_path = './model/'
model_name = 'airl'
saver = tf.train.Saver()
ckpt = tf.train.get_checkpoint_state(model_save_path)
@@ -95,15 +96,15 @@ def main():
# Iterate over every step of this episode
obs = env.reset()
for step in range(max_steps):
# if episode % 100 == 0:
# env.render()
if episode % 100 == 0:
env.render()
obs = np.stack([obs]).astype(dtype=np.float32)

# When testing, setting stochastic=False gives better performance
act, v_pred = Policy.get_action(obs=obs, stochastic=True)
# act, v_pred = Policy.get_action(obs=obs, stochastic=False)
act = np.asscalar(act)
v_pred = np.asscalar(v_pred)
# act, v_pred = Policy.get_action(obs=obs, stochastic=True)
act, v_pred = Policy.get_action(obs=obs, stochastic=False)
# act = act.item()
# v_pred = v_pred.item()

# Interact with the environment
next_obs, reward, done, info = env.step(act)
