This repository has been archived by the owner on Aug 4, 2024. It is now read-only.

Refine learning #2

Open · wants to merge 7 commits into master

29 changes: 29 additions & 0 deletions check_walking.py
@@ -0,0 +1,29 @@
#!/usr/bin/env python3

import gym
import envs
from time import sleep

env = gym.make('GankenKun-v0')
state = env.reset()

step = 0
while True:
    # Fixed schedule of goal offsets [x, y, th] fed to the walk controller
    if 0 <= step <= 10:
        action = [1.0, 0.0, 0.0]
    if 10 <= step <= 20:
        action = [0.0, 1.0, 0.0]
    if 20 <= step <= 30:
        action = [-1.0, 0.0, 0.0]
    if 30 <= step <= 40:
        action = [0.0, -1.0, 0.0]
    if 40 <= step <= 50:
        action = [0.0, 0.0, 1.0]
    if 50 <= step <= 60:
        action = [0.0, 0.0, -1.0]
    if 60 <= step <= 70:
        action = [0.0, 0.0, 0.0]
    next_state, reward, done, info = env.step(action)
    step += 1
    sleep(1)
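
check_walking.py drives the environment open-loop with hand-written goal offsets; dqn_train.py instead samples from a small discrete action set. For reference, a minimal random-policy rollout over that set might look like the sketch below (the offsets are copied from the updated actions_list in dqn_train.py and the 250-step cap matches its max_step; everything else is an assumption for illustration).

import random
import gym
import envs  # registers GankenKun-v0, as in check_walking.py

# Discrete goal offsets [dx, dy, dth] copied from actions_list in dqn_train.py
actions_list = [[0.1, 0.0, 0.0], [-0.1, 0.0, 0.0],
                [0.0, 0.06, 0.0], [0.0, -0.06, 0.0],
                [0.0, 0.1, 1.0], [0.0, -0.1, -1.0]]

env = gym.make('GankenKun-v0')
state = env.reset()
for _ in range(250):                      # max_step in dqn_train.py
    action = random.choice(actions_list)  # uniform random policy
    state, reward, done, info = env.step(action)
    if done:
        break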

22 changes: 13 additions & 9 deletions dqn_train.py
100644 → 100755
@@ -1,3 +1,5 @@
#!/usr/bin/env python3

import os
import random

@@ -13,15 +15,15 @@

def train():
# setup ===========================
max_episode = 1000 # maximum number of episodes to run during training
max_episode = 10000 # maximum number of episodes to run during training
max_step = 250 # maximum number of steps per episode
n_warmup_steps = 10000 # number of warm-up steps
n_warmup_steps = 250 # number of warm-up steps
interval = 1 # interval at which the model and results are written out
actions_list = [[0.1, 0.0, 0.0], [-0.1, 0.0, 0.0], [0.0, 0.06, 0.0], [0.0, -0.06, 0.0], [0.0, 0.0, 0.4], [0.0, 0.0, -0.4]] # list of possible action values
gamma = 0.99 # discount factor
epsilon = 0.1 # ε-greedy parameter
actions_list = [[0.1, 0.0, 0.0], [-0.1, 0.0, 0.0], [0.0, 0.06, 0.0], [0.0, -0.06, 0.0], [0.0, 0.1, 1.0], [0.0, -0.1, -1.0]] # list of possible action values
gamma = 0.9 # discount factor
epsilon = 0.05 # ε-greedy parameter
memory_size = 10000
batch_size = 32
batch_size = 4
result_dir = os.path.join('./result/dqn', now_str())
x = []
x_reward = []
@@ -71,6 +73,7 @@ def train():
step = 0
if total_step > n_warmup_steps:
break
print(total_step)
memory = memory[-memory_size:]
print('warming up {:,} steps... done.'.format(n_warmup_steps))

@@ -100,14 +103,16 @@

memory.append((state, action, c_reward, next_state, done))
episode_reward_list.append(c_reward)

exps = random.sample(memory, batch_size)
loss, td_error = q_network.update_on_batch(exps)
loss_list.append(loss)
td_list.append(td_error)

q_network.sync_target_network(soft=0.01)
q_network.sync_target_network(soft=0.001)
state = next_state
memory = memory[-memory_size:]
# print(len(memory))

# end of episode
if step >= max_step or done:
@@ -116,7 +121,7 @@ def train():
reward_sum = np.sum(episode_reward_list)
loss_avg = np.mean(loss_list)
td_error_avg = np.mean(td_list)
print("{}episode reward_avg:{} loss:{} td_error:{}".format(num_episode, reward_sum, loss_avg, td_error_avg))
print("{}episode reward_avg:{} loss:{} td_error:{}".format(num_episode, reward_avg, loss_avg, td_error_avg))
if num_episode % interval == 0:
model_path = os.path.join(result_dir, 'episode_{}.h5'.format(num_episode))
q_network.main_network.save(model_path)
@@ -136,7 +141,6 @@ def train():
print("last_10_reward:{}".format(last_10_avg))
x_reward.append(num_episode)
y_last10_reward.append(last_10_avg)


if num_episode >= max_episode:
episode_loop = False
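
dqn_train.py calls q_network.update_on_batch(exps) and q_network.sync_target_network(soft=...), whose implementations are outside this diff. Below is a hedged sketch of the two mechanisms the retuned hyperparameters affect, ε-greedy exploration with epsilon = 0.05 and a soft (Polyak) target-network update with soft = 0.001; the function names and the Polyak interpretation of sync_target_network are assumptions, not the repository's actual API.

import random
import numpy as np

def select_action(q_values, epsilon, actions_list):
    # Epsilon-greedy: explore with probability epsilon, otherwise act greedily.
    if random.random() < epsilon:
        return random.randrange(len(actions_list))
    return int(np.argmax(q_values))

def soft_update(target_weights, main_weights, soft=0.001):
    # Polyak averaging: the target network slowly tracks the main network.
    return [soft * w_main + (1.0 - soft) * w_tgt
            for w_main, w_tgt in zip(main_weights, target_weights)]

With epsilon = 0.05 roughly one action in twenty is random, and with soft = 0.001 the target network trails the main network ten times more slowly than with the previous value of 0.01.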
22 changes: 11 additions & 11 deletions envs/GankenKun_pybullet/GankenKun/foot_step_planner.py
@@ -19,21 +19,21 @@ def calculate(self, goal_x, goal_y, goal_th, current_x, current_y, current_th, n
step_x = abs(goal_x - current_x )/self.max_stride_x
step_y = abs(goal_y - current_y )/self.max_stride_y
step_th = abs(goal_th - current_th)/self.max_stride_th
max_step = max(step_x, step_y, step_th)
max_step = max(step_x, step_y, step_th, 1)
stride_x = (goal_x - current_x )/max_step
stride_y = (goal_y - current_y )/max_step
stride_th = (goal_th - current_th)/max_step

# first step
foot_step = []
if status == 'start':
foot_step += [[0.0, current_x, current_y, current_th, 'both']]
foot_step += [[0.0, current_x, current_y, current_th, 'both', 0]]
time += self.period * 2.0
if next_support_leg == 'right':
foot_step += [[time, current_x, -self.width+current_y, current_th, next_support_leg]]
foot_step += [[time, current_x, -self.width+current_y, current_th, next_support_leg, -self.width]]
next_support_leg = 'left'
elif next_support_leg == 'left':
foot_step += [[time, current_x, self.width+current_y, current_th, next_support_leg]]
foot_step += [[time, current_x, self.width+current_y, current_th, next_support_leg, self.width]]
next_support_leg = 'right'

# walking
@@ -50,10 +50,10 @@ def calculate(self, goal_x, goal_y, goal_th, current_x, current_y, current_th, n
next_y = current_y + stride_y
next_th = current_th + stride_th
if next_support_leg == 'right':
foot_step += [[time, next_x, next_y-self.width, next_th, next_support_leg]]
foot_step += [[time, next_x, next_y-self.width, next_th, next_support_leg, -self.width]]
next_support_leg = 'left'
elif next_support_leg == 'left':
foot_step += [[time, next_x, next_y+self.width, next_th, next_support_leg]]
foot_step += [[time, next_x, next_y+self.width, next_th, next_support_leg, self.width]]
next_support_leg = 'right'
current_x, current_y, current_th = next_x, next_y, next_th

@@ -62,16 +62,16 @@ def calculate(self, goal_x, goal_y, goal_th, current_x, current_y, current_th, n
if not status == 'stop':
time += self.period
if next_support_leg == 'right':
foot_step += [[time, next_x, next_y-self.width, next_th, next_support_leg]]
foot_step += [[time, next_x, next_y-self.width, next_th, next_support_leg, -self.width]]
elif next_support_leg == 'left':
foot_step += [[time, next_x, next_y+self.width, next_th, next_support_leg]]
foot_step += [[time, next_x, next_y+self.width, next_th, next_support_leg, self.width]]
time += self.period
next_support_leg = 'both'
foot_step += [[time, next_x, next_y, next_th, next_support_leg]]
foot_step += [[time, next_x, next_y, next_th, next_support_leg, 0]]
time += 2.0
foot_step += [[time, next_x, next_y, next_th, next_support_leg]]
foot_step += [[time, next_x, next_y, next_th, next_support_leg, 0]]
time += 100.0
foot_step += [[time, next_x, next_y, next_th, next_support_leg]]
foot_step += [[time, next_x, next_y, next_th, next_support_leg, 0]]

return foot_step
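
The change in this file is that every foot-step entry grows from five elements [time, x, y, th, support_leg] to six, the new last element recording the lateral offset applied for that step (±self.width during single support, 0 during double support). GankenKun_env.step() below reads it as self.foot_step[0][5]. A minimal, hedged sketch of consuming the extended entry follows; the example numbers and field names are illustrative assumptions, only the field order comes from the diff.

# One extended foot-step entry; 0.044 stands in for self.width here.
foot_step = [[0.7, 0.0, -0.044, 0.0, 'right', -0.044]]

t, x, y, th, support_leg, lateral_offset = foot_step[0]
if support_leg in ('left', 'right'):
    # Subtract the stored lateral shift to recover the body-path target,
    # mirroring self.foot_step[0][2] - self.foot_step[0][5] in GankenKun_env.step().
    body_y_goal = y - lateral_offset
else:  # 'both' during double support; the stored offset is 0
    body_y_goal = y
print(body_y_goal)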

76 changes: 47 additions & 29 deletions envs/GankenKun_pybullet/GankenKun_env.py
@@ -15,7 +15,7 @@
import gym
from gym import error, spaces, utils
from gym.utils import seeding

import math

class GankenKunEnv(gym.Env):
def __init__(self):
@@ -27,17 +27,21 @@ def __init__(self):
self.y_threshold = 3.5
self.ball_x_threshold = 4.5
self.ball_y_threshold = 3.0
self.ball_not_touch_period = 0

TIME_STEP = 0.001
# physicsClient = p.connect(p.GUI)
physicsClient = p.connect(p.DIRECT)
TIME_STEP = 0.01
physicsClient = p.connect(p.GUI)
# physicsClient = p.connect(p.DIRECT)
p.setGravity(0, 0, -9.8)
p.setTimeStep(TIME_STEP)
p.setPhysicsEngineParameter(numSubSteps=10)
p.configureDebugVisualizer(p.COV_ENABLE_GUI,0)

# planeId = p.loadURDF("envs/GankenKun_pybullet/URDF/plane.urdf", [0, 0, 0])
self.FieldId = p.loadSDF("envs/GankenKun_pybullet/SDF/field/soccerfield.sdf")
self.RobotId = p.loadURDF("envs/GankenKun_pybullet/URDF/gankenkun.urdf", [0, 0, 0])
self.BallId = p.loadSDF("envs/GankenKun_pybullet/SDF/ball/ball.sdf")
self.ball_pos = [0.3, 0.0]

self.index = {p.getBodyInfo(self.RobotId)[0].decode('UTF-8'):-1,}
for id in range(p.getNumJoints(self.RobotId)):
@@ -65,37 +69,37 @@ def __init__(self):
x_goal, y_goal, th_goal = 0.1, 0.0, 0.0
self.foot_step = self.walk.setGoalPos([x_goal, y_goal, th_goal])
# print(p.getBasePositionAndOrientation(BallId[0]))
self.ball_delta_length = 0.0

def step(self, action):
j = 0
#print()
while p.isConnected():
j += 1
if j >= 10:
self.joint_angles,lf,rf,xp,n = self.walk.getNextPos()
j = 0
if n == 0:
if (len(self.foot_step) <= 6):
x_goal = self.foot_step[0][1] + action[0]
y_goal = self.foot_step[0][2] + action[1]
th_goal = self.foot_step[0][3] + action[2]

self.foot_step = self.walk.setGoalPos([x_goal, y_goal, th_goal])
else:
self.foot_step = self.walk.setGoalPos()
#if you want new goal, please send position
self.joint_angles,lf,rf,xp,n = self.walk.getNextPos()
if n == 0:
if self.foot_step[0][4] == "left":
x_goal = self.foot_step[0][1] + action[0]
y_goal = self.foot_step[0][2] - self.foot_step[0][5] + action[1]
th_goal = self.foot_step[0][3] + action[2]
self.foot_step = self.walk.setGoalPos([x_goal, y_goal, th_goal])
break
else:
self.foot_step = self.walk.setGoalPos()
for id in range(p.getNumJoints(self.RobotId)):
qIndex = p.getJointInfo(self.RobotId, id)[3]
if qIndex > -1:
p.setJointMotorControl2(self.RobotId, id, p.POSITION_CONTROL, self.joint_angles[qIndex-7])

p.stepSimulation()
# sleep(TIME_STEP) # delete -> speed up

x, y, _ = p.getBasePositionAndOrientation(self.RobotId)[0]
roll, pitch, yaw = p.getEulerFromQuaternion(p.getBasePositionAndOrientation(self.RobotId)[1])
ball_x, ball_y, _ = p.getBasePositionAndOrientation(self.BallId[0])[0]
self.state = [x, y, yaw, ball_x, ball_y]
ball_lx, ball_ly = ball_x - x, ball_y - y
goal_lx, goal_ly = self.goal_pos[0] - x, self.goal_pos[1] - y
ball_rx, ball_ry = ball_lx * math.cos(yaw) + ball_ly * math.sin(yaw), - ball_lx * math.sin(yaw) + ball_ly * math.cos(yaw)
goal_rx, goal_ry = goal_lx * math.cos(yaw) + goal_ly * math.sin(yaw), - goal_lx * math.sin(yaw) + goal_ly * math.cos(yaw)

self.state = [goal_rx, goal_ry, yaw, ball_rx, ball_ry]

done = bool(
abs(x) > self.x_threshold
@@ -108,19 +112,31 @@ def step(self, action):
if not done:
dx, dy = ball_x - x, ball_y - y
ball_distance = math.sqrt(dx**2 + dy**2)
reward += math.floor(-10.0 * ball_distance)/10
ball_goal_distance = math.sqrt((goal_x - ball_x)**2 + (goal_y - ball_y)**2)
reward += math.floor(-10.0 * ball_goal_distance)/10
reward += - ball_distance / 10
ball_goal_prev_distance = math.sqrt((goal_x + 1.0 - self.ball_pos[0])**2 + (goal_y - self.ball_pos[1])**2)
ball_goal_distance = math.sqrt((goal_x + 1.0 - ball_x)**2 + (goal_y - ball_y)**2)
if self.ball_delta_length == 0.0:
reward += max((- ball_goal_distance + ball_goal_prev_distance)*10, 0)
self.ball_delta_length = - ball_goal_distance + ball_goal_prev_distance
if ball_goal_distance == ball_goal_prev_distance:
self.ball_not_touch_period += 1
if self.ball_not_touch_period > 40:
self.ball_not_touch_period = 0
done = True
else:
self.ball_not_touch_period = 0
self.ball_pos[0] = ball_x
self.ball_pos[1] = ball_y
elif ball_x > goal_x and self.goal_leftpole < ball_y < self.goal_rightpole:
reward = 500
else:
reward = -500
reward = 10
done = True

# print("action: "+str(action)+", reward: "+str(reward))
return self.state, reward, done, {}

def reset(self):
p.resetBasePositionAndOrientation(self.RobotId, [0, 0, 0], [0, 0, 0, 1.0])
p.resetBasePositionAndOrientation(self.BallId[0], [0.2, 0, 0.1], [0, 0, 0, 1.0])
p.resetBasePositionAndOrientation(self.BallId[0], [0.3, 0, 0.1], [0, 0, 0, 1.0])
x_goal, y_goal, th_goal = 0.1, 0.0, 0.0
del self.walk, self.pc
self.pc = preview_control(0.01, 1.0, 0.27)
Expand All @@ -130,6 +146,8 @@ def reset(self):
roll, pitch, yaw = p.getEulerFromQuaternion(p.getBasePositionAndOrientation(self.RobotId)[1])
ball_x, ball_y, _ = p.getBasePositionAndOrientation(self.BallId[0])[0]
self.state = [x, y, yaw, ball_x, ball_y]
self.ball_pos[0] = 0.3
self.ball_pos[1] = 0.0
return np.array(self.state)
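
The new observation built in step() is expressed in the robot frame: the goal and ball positions are taken relative to the robot and rotated by the yaw angle to give goal_rx, goal_ry, ball_rx, ball_ry. A small standalone check of that rotation, using the same formula as the diff (the helper name is an assumption):

import math

def to_robot_frame(px, py, robot_x, robot_y, yaw):
    # Rotate a world-frame point into the robot frame (same math as GankenKun_env.step).
    lx, ly = px - robot_x, py - robot_y
    rx = lx * math.cos(yaw) + ly * math.sin(yaw)
    ry = -lx * math.sin(yaw) + ly * math.cos(yaw)
    return rx, ry

# A ball 1 m straight ahead of a robot facing +y (yaw = pi/2) lands on the robot's forward axis.
print(to_robot_frame(0.0, 1.0, 0.0, 0.0, math.pi / 2))  # ~(1.0, 0.0)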

