
Commit

env runs fine now, finally!!
abelsalm committed May 12, 2024
1 parent 5f8d01b commit 533b270
Showing 8 changed files with 38 additions and 40 deletions.
Binary file modified __pycache__/agent.cpython-312.pyc
Binary file modified __pycache__/deepql.cpython-312.pyc
Binary file modified __pycache__/env.cpython-312.pyc
Binary file modified __pycache__/equations.cpython-312.pyc
8 changes: 5 additions & 3 deletions agent.py
@@ -51,7 +51,7 @@ def store_transition(self, state, action, reward, state_, done):
# here we use the epsilon-greedy strategy
def choose_action(self, observation):
if np.random.random() > self.epsilon:
- state = T.tensor([observation]).to(self.Q_eval.device)
+ state = T.tensor([observation]).to(self.Q_eval.device).float()
actions = self.Q_eval.forward(state)
action = T.argmax(actions).item()

@@ -97,7 +97,9 @@ def learn(self):
self.epsilon = self.epsilon_min


- class ChaserAgent():
+
+ # actor critic strategy, later...
+ '''class ChaserAgent():
def __init__(self, gamma, epsilon, lr, dim_input, batch_size, dim_action,
memory_max=10000, epsilon_min=0.01, epsilon_down=5e-4):
# all of the hyperparameters
@@ -187,4 +189,4 @@ def learn(self):
if self.epsilon > self.epsilon_min:
self.epsilon -= self.epsilon_down
else:
- self.epsilon = self.epsilon_min
+ self.epsilon = self.epsilon_min'''
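
For context, a minimal standalone sketch (not part of the commit) of why the added .float() cast in choose_action matters: NumPy observations default to float64, while the Q-network's torch.nn.Linear weights default to float32, so forwarding the tensor without the cast raises a dtype mismatch. The net and obs names below are placeholders, not the repository's.

# Illustrative only: reproduces the dtype issue the .float() cast avoids
import numpy as np
import torch as T
import torch.nn as nn

net = nn.Linear(13, 8)            # stand-in for Q_eval (13 states -> 8 discrete actions)
obs = np.zeros(13)                # NumPy defaults to float64
state = T.tensor([obs]).float()   # cast to float32 so it matches the layer weights
q_values = net(state)             # without .float(), this forward pass errors out
action = T.argmax(q_values).item()
print(action)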
36 changes: 25 additions & 11 deletions env.py
@@ -7,8 +7,17 @@
class Env():
def __init__(self):
# 4 actions: [Fx, Fy, Fz, L]
- self.action_space = spaces.Box(low=-0.3, high=0.3, shape=(4,), dtype=np.float32)
+ self.conbtinuous_action_space = spaces.Box(low=-0.3, high=0.3, shape=(4,), dtype=np.float32)
# 13 states: [x, y, z, vx, vy, vz, qx, qy, qz, qw, wx, wy, wz]
+ self.action_space = []
+ for i in range(4):
+ for j in range(2):
+ a = np.zeros(4)
+ if i != 3:
+ a[i] = 0.3
+ a[3] = 0.3*j
+ self.action_space.append(a)
+
self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(13,), dtype=np.float32)
# number of timesteps in one episode
self.episode_length = equations.num_steps
@@ -18,6 +27,11 @@ def __init__(self):
def reset(self):
# reset the state to the initial state
self.state = np.zeros(13)
+ self.state[1] = 10000 # initial y position
+ self.state[6] = 1 # initial quaternion
+ self.state[10] = 10**(-5) # initial angular velocity
+ self.state[11] = 10**(-5)
+ self.state[12] = 10**(-5)
return self.state

def step(self, action):
@@ -38,11 +52,11 @@ def reward(self, state, action):
r = 0
# on y axis
if self.state[1] > 25:
- r += (10000 - self.state[1])/10000 # for y, needs to come close but not too much
+ r += (10000 - self.state[1])/1000000 # for y, needs to come close but not too much
elif self.state[1] < 25 and self.state[1] > 10:
- r += 50
+ r += 10
else:
- r -= 50
+ r -= 10

# stay on the orbit for x and z axis
if self.state[0] > 50:
@@ -52,26 +66,26 @@

# globally, distance to the satellite
if np.linalg.norm(self.state[:3]) > 15 and np.linalg.norm(self.state[:3]) < 50:
- r += 50
+ r += 10
elif np.linalg.norm(self.state[:3]) < 15:
- r -= 50
+ r -= 10

# then economy of propellant
r -= np.linalg.norm(action[:3])

- # orientation constraint
+ '''# orientation constraint
q_norm = self.state[6:10] / np.linalg.norm(self.state[6:10])
d = q_norm[1:]
dot_product = np.dot(d, self.state[:3])
norm_r = np.linalg.norm(self.state[:3])
constraint = -dot_product / norm_r
constraint -= np.cos(np.deg2rad(20))
if constraint < 0:
- r -= 50
+ r -= 50'''

return r

env = Env()
print(env.reset())
- for i in range(10):
- print(env.step([0.1, 0, 0, 0]))
+ for i in range(100):
+ print(env.step([0., -0.2, 0., 0.]))
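
As a side note (not in the commit itself), the nested loop added to Env.__init__ enumerates 8 discrete [Fx, Fy, Fz, L] actions, which is what the actions=8 argument passed to Agent in test_dql_on_spacecraft.py refers to. A standalone sketch of the resulting action set:

# Illustrative only: mirrors the action_space construction added in Env.__init__
import numpy as np

action_space = []
for i in range(4):
    for j in range(2):
        a = np.zeros(4)
        if i != 3:
            a[i] = 0.3        # +0.3 thrust along axis i; i == 3 means no thrust
        a[3] = 0.3 * j        # last component (L) toggled between 0 and 0.3
        action_space.append(a)

for k, a in enumerate(action_space):
    print(k, a)
# prints 8 vectors: thrust on x, y, z or none, each with L = 0 or 0.3,
# e.g. index 0 -> [0.3 0. 0. 0.] and index 7 -> [0. 0. 0. 0.3]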
32 changes: 7 additions & 25 deletions test_dql_on_spacecraft.py
@@ -1,51 +1,33 @@
from env import Env
- from agent import ChaserAgent
+ from agent import Agent
import numpy as np

# flake8: noqa

env = Env()
- agent = ChaserAgent(gamma=0.99, epsilon=1.0, lr=0.003, dim_input=13, batch_size=64, dim_action=4,
- memory_max=10000, epsilon_min=0.03, epsilon_down=2e-5)
+ agent = Agent(gamma=0.99, epsilon=1.0, lr=0.003, dim_input=13, batch_size=64, actions=8,
+ memory_max=10000, epsilon_min=0.05, epsilon_down=2e-3)
scores = []
eps_history = []
- n_games = 500
+ n_games = 50

for i in range(n_games):
done = False
score = 0
observation = env.reset()[0]
while not done:
action = agent.choose_action(observation)
- observation_, reward, done = env.step(action)
+ observation_, reward, done = env.step(env.action_space[action])
score += reward
agent.store_transition(observation, action, reward, observation_, done)
agent.learn()
observation = observation_
agent.step += 1
scores.append(score)
eps_history.append(agent.epsilon)

avg_score = np.mean(scores[-20:])
- print('episode', i, 'score %.2f' % score, 'average score %.2f' % avg_score, 'epsilon %.2f' % agent.epsilon)
+ if (i + 1) % 25 == 0:
+ print('episode', i, 'score %.2f' % score, 'average score %.2f' % avg_score, 'epsilon %.2f' % agent.epsilon)
+ print(env.state)


- # test the rl algo and display the results of the agent
- env = Env()
- avg_score = 0
- for i in range(100):
- state = env.reset()
- done = False
- score = 0
- observation = env.reset()[0]
- while not done:
- action = agent.choose_action(observation)
- observation_, reward, done = env.step(action)
- observation = observation_
- score += reward
- if score > 200:
- avg_score += 1
- print('Episode:{} Score:{}'.format(i, score))
- print('Average score:', avg_score/100)
- env.close()
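
One implication of the new hyperparameters worth noting (an observation, not something in the commit): since learn() in agent.py decrements epsilon by epsilon_down on each call until epsilon_min is reached, the switch to epsilon_min=0.05 and epsilon_down=2e-3 means exploration anneals to its floor after roughly 475 learning steps.

# Back-of-the-envelope check, assuming one learn() call per environment step
epsilon_start, epsilon_min, epsilon_down = 1.0, 0.05, 2e-3
steps_to_floor = (epsilon_start - epsilon_min) / epsilon_down
print(round(steps_to_floor))  # ~475 steps before epsilon stays at 0.05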
2 changes: 1 addition & 1 deletion test_with_lunlan.py
@@ -10,7 +10,7 @@
memory_max=10000, epsilon_min=0.03, epsilon_down=2e-5)
scores = []
eps_history = []
- n_games = 500
+ n_games = 50

for i in range(n_games):
done = False
