Commit

not great, this one
abelsalm committed May 14, 2024
1 parent f7ad6ef commit 2295bb1
Showing 11 changed files with 233 additions and 59 deletions.
Binary file modified __pycache__/agent.cpython-312.pyc
Binary file modified __pycache__/env.cpython-312.pyc
Binary file modified __pycache__/equations.cpython-312.pyc
Binary file modified __pycache__/visualize.cpython-312.pyc
4 changes: 2 additions & 2 deletions agent.py
@@ -2,7 +2,7 @@
# first agent that works with deepqn, we will change methods later
# import: basically PyTorch libraries and NumPy
import torch as T
from deepql import Actor, Critic, DeepQN
from deepql import DeepQN

import numpy as np

@@ -51,7 +51,7 @@ def store_transition(self, state, action, reward, state_, done):
# here we use the epsilon-greedy strategy
def choose_action(self, observation):
if np.random.random() > self.epsilon:
state = T.tensor([observation]).to(self.Q_eval.device).float()
state = T.tensor(np.array(observation), dtype=T.float32).to(self.Q_eval.device)
actions = self.Q_eval.forward(state)
action = T.argmax(actions).item()

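
For context, a minimal sketch of the epsilon-greedy selection this hunk touches, assuming the Agent keeps its Q-network in Q_eval and exposes its number of discrete actions as n_actions (that attribute name is an assumption, not taken from this diff):

import numpy as np
import torch as T

def choose_action(agent, observation):
    # Exploit with probability 1 - epsilon, otherwise explore at random.
    if np.random.random() > agent.epsilon:
        # np.array(observation) avoids the slow list-of-ndarray conversion,
        # and dtype=T.float32 matches the network's weights.
        state = T.tensor(np.array(observation), dtype=T.float32).to(agent.Q_eval.device)
        actions = agent.Q_eval.forward(state)
        return T.argmax(actions).item()
    return np.random.randint(agent.n_actions)  # hypothetical attribute
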
73 changes: 33 additions & 40 deletions env.py
@@ -3,6 +3,7 @@
import equations
import visualize as vis


# flake8: noqa

class Env():
@@ -12,13 +13,11 @@ def __init__(self):
# 13 states: [x, y, z, vx, vy, vz, qx, qy, qz, qw, wx, wy, wz]
self.action_space = []
for i in range(4):
for j in range(2):
for k in range(2):
a = np.zeros(4)
if i != 3:
a[i] = 0.01*((-1)**k)
a[3] = 0.01*j
self.action_space.append(a)
for k in range(2):
a = np.zeros(4)
if i != 3:
a[i] = 0.02*((-1)**k)
self.action_space.append(a)

self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(13,), dtype=np.float32)
# number of timesteps in one episode
@@ -43,7 +42,8 @@ def step(self, action):
F = np.array([Fx, Fy, Fz])
L = np.array([0, 0, L])
r, v = equations.CW_finite_diff(r, v, equations.n0, equations.dt, F)
q, w = equations.q_and_w(q, w, L, equations.dt)
'''q, w = equations.q_and_w(q, w, L, equations.dt)'''
q, w = [1, 0, 0, 0], [0, 0, 0]
self.state = np.concatenate((r, v, q, w))
self.episode_length -= 1
done = self.episode_length == 0
@@ -57,26 +57,33 @@ def reward(self, state, action):
r = 0
# on y axis
if self.state[1] > 25:
r += abs((10000 - self.state[1]))/50000 # for y, needs to come close but not too much
elif self.state[1] < 25 and self.state[1] > 10:
r += 10000
else:
r -= 10000
r -= abs(self.state[1])/5000 # for y, needs to come close but not too much
elif self.state[1] < 1000 and self.state[1] > 10:
r = 1
if self.state[1] < 500:
r += 2
if self.state[1] < 100:
r += 3

# stay on the orbit for x and z axis
if self.state[0] > 50:
r -= (self.state[0] - 50)/100
elif self.state[2] > 50:
r -= (self.state[2] - 50)/100
if self.state[0] > 200:
r -= (self.state[0] - 100)/10
elif self.state[2] > 200:
r -= (self.state[2] - 100)/10

if np.linalg.norm(action[3:6]) > 0.3:
r -= 100

reward = r

# globally, distance to the satellite
if np.linalg.norm(self.state[:3]) > 15 and np.linalg.norm(self.state[:3]) < 50:
r += 1000
elif np.linalg.norm(self.state[:3]) < 15:
r -= 1000
'''if np.linalg.norm(self.state[:3]) > 15 and np.linalg.norm(self.state[:3]) < 50:
r += 1000'''
'''if np.linalg.norm(self.state[:3]) < 15:
r -= 1000'''

# then economy of propellant
r -= np.linalg.norm(action[:3])
'''# then economy of propellant
r -= np.linalg.norm(action[:3])'''

'''# orientation constraint
q_norm = self.state[6:10] / np.linalg.norm(self.state[6:10])
@@ -88,32 +95,18 @@ def reward(self, state, action):
if constraint < 0:
r -= 50'''

return r
return reward

# test the environment and visualize the results
env = Env()
print(env.reset())
indices = [[0, 10000, 0]]
for i in range(70):
if i%20 >= 10:
a = 1
else:
a = -1
state, r, ok = env.step([0.1, -0.2, 0.01, 0.1])
indices.append([state[0], state[1], state[2]])
for i in range(30):
if i%20 >= 10:
a = 1
else:
a = -1
state, r, ok = env.step([0, 0.1, 0., -0.1])
indices.append([state[0], state[1], state[2]])
indices = [[0, 100, 0]]
for i in range(100):
if i%20 >= 10:
a = 1
else:
a = -1
state, r, ok = env.step([-0.2, 0.4, -0.1, 0.])
state, r, ok = env.step([0.1, -0.2, 0.01, 0.])
indices.append([state[0], state[1], state[2]])

vis.visualize(indices)
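
To make the new discrete action set explicit, the snippet below rebuilds the table exactly as the updated __init__ loop does: ±0.02 thrust along each of x, y, z, plus two identical all-zero "coast" entries from the i == 3 pass, for 8 actions in total (matching actions=8 passed to the Agent in test_dql_on_spacecraft.py). This is just a reading of the committed loop, not new behaviour:

import numpy as np

action_space = []
for i in range(4):
    for k in range(2):
        a = np.zeros(4)                  # [Fx, Fy, Fz, 4th component], 4th left at 0 in this commit
        if i != 3:
            a[i] = 0.02 * ((-1) ** k)    # +0.02 then -0.02 on axis i
        action_space.append(a)

for idx, a in enumerate(action_space):
    print(idx, a)                        # indices 6 and 7 are duplicate zero actions
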
6 changes: 3 additions & 3 deletions equations.py
@@ -3,10 +3,10 @@
# flake8: noqa

# constants
n0 = 2 * np.pi / (24 * 60 * 60) # Earth's angular velocity (rad/s)
n0 = 1 #2 * np.pi / (24 * 60 * 60) # Earth's angular velocity (rad/s)
m = 3000 # mass (kg)
dt = 5 * 60 # dt in seconds
total_time = 2 * 24 * 60 * 60 # total simulated time in seconds
dt = 20 * 60 # dt in seconds
total_time = 4 * 24 * 60 * 60 # total simulated time in seconds
num_steps = total_time//dt # iterations
R = 1.5 # radius of the chaser
H = 8 # height of the chaser
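
A quick check of what the new constants imply, using the same integer division as equations.py's num_steps:

dt = 20 * 60                   # 1200 s per step
total_time = 4 * 24 * 60 * 60  # 345600 s, i.e. 4 days
num_steps = total_time // dt   # 288 steps
print(num_steps)               # 288

If num_steps is what env.py uses as the episode length, each episode now runs 288 steps of 20 simulated minutes each.
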
31 changes: 26 additions & 5 deletions test_dql_on_spacecraft.py
@@ -2,36 +2,57 @@
from agent import Agent
import numpy as np
import visualize as vis
import equations as eq
import time

# flake8: noqa

env = Env()
agent = Agent(gamma=0.99, epsilon=1.0, lr=0.003, dim_input=13, batch_size=64, actions=16,
agent = Agent(gamma=0.98, epsilon=1.0, lr=0.003, dim_input=13, batch_size=64, actions=8,
memory_max=10000, epsilon_min=0.08, epsilon_down=2e-4)
scores = []
eps_history = []
n_games = 50


indices_dernier = []
for i in range(n_games):
done = False
score = 0
observation = env.reset()
indices_derniers = []
while not done:
action = agent.choose_action(observation)
observation_, reward, done = env.step(env.action_space[action])
score += reward
agent.store_transition(observation, action, reward, observation_, done)
agent.learn()
observation = observation_
if i >= n_games-1:
indices_10_derniers = indices_dernier.append(env.state[0:3])
'''if i >= n_games -10:'''
indices_derniers.append(env.state[0:3])
scores.append(score)
eps_history.append(agent.epsilon)

vis.visualize(indices_derniers)

avg_score = np.mean(scores[-10:])
print('episode', i, 'score %.2f' % score, 'average score %.2f' % avg_score, 'epsilon %.2f' % agent.epsilon)
print(env.state)
time.sleep(2)

vis.visualize(indices_dernier)
'''for i in range(10):
vis.visualize(indices_derniers[i*eq.num_steps:(i+1)*eq.num_steps])'''

# test the rl algo and display the results of the agent
env = Env()
for i in range(1, 6):
state = env.reset()
done = False
score = 0
observation = env.reset()[0:6]
while not done:
action = agent.choose_action(observation)
observation_, reward, done = env.step(action)
observation = observation_
score += reward

print('Episode:{} Score:{}'.format(i, score))
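
One way to collect and plot only the final episode's trajectory with the existing helpers, assuming vis.visualize accepts a list of [x, y, z] points as in the env.py self-test (a sketch under those assumptions, not the committed script):

from env import Env
from agent import Agent
import visualize as vis

env = Env()
agent = Agent(gamma=0.98, epsilon=1.0, lr=0.003, dim_input=13, batch_size=64, actions=8,
              memory_max=10000, epsilon_min=0.08, epsilon_down=2e-4)
n_games = 50

last_trajectory = []                      # positions from the final episode only
for i in range(n_games):
    done = False
    observation = env.reset()
    while not done:
        action = agent.choose_action(observation)
        observation_, reward, done = env.step(env.action_space[action])
        agent.store_transition(observation, action, reward, observation_, done)
        agent.learn()
        observation = observation_
        if i == n_games - 1:
            last_trajectory.append(env.state[0:3])

vis.visualize(last_trajectory)            # one plot, after training has finished
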
6 changes: 3 additions & 3 deletions test_with_lunlan.py
@@ -7,10 +7,10 @@

env = gym.make('LunarLander-v2')
agent = Agent(gamma=0.99, epsilon=1.0, lr=0.003, dim_input=8, batch_size=64, actions=4,
memory_max=10000, epsilon_min=0.03, epsilon_down=2e-5)
memory_max=10000, epsilon_min=0.03, epsilon_down=4e-5)
scores = []
eps_history = []
n_games = 50
n_games = 500

for i in range(n_games):
done = False
@@ -42,7 +42,7 @@
observation = env.reset()[0]
while not done:
action = agent.choose_action(observation)
observation_, reward, done, info1, info2 = env.step(action)
observation_, reward, done = env.step(action)
observation = observation_
score += reward
if score > 200: