From 88c0c0c57cc38c6ba824a8f08cb7960b82965195 Mon Sep 17 00:00:00 2001
From: nrontsis
Date: Sun, 11 Nov 2018 17:44:25 +0000
Subject: [PATCH] #14 Connection with MATLAB works.

---
 .../matlab-environment/pendulum_learn.m    |  6 +-
 .../matlab-environment/policy_wrapper.m    |  5 ++
 .../matlab-environment/settings_pendulum.m | 10 ++-
 examples/pendulum-matlab/runme.py          | 70 +++++++++++++++++++
 4 files changed, 87 insertions(+), 4 deletions(-)
 create mode 100644 examples/pendulum-matlab/matlab-environment/policy_wrapper.m

diff --git a/examples/pendulum-matlab/matlab-environment/pendulum_learn.m b/examples/pendulum-matlab/matlab-environment/pendulum_learn.m
index e58107b..2613194 100644
--- a/examples/pendulum-matlab/matlab-environment/pendulum_learn.m
+++ b/examples/pendulum-matlab/matlab-environment/pendulum_learn.m
@@ -20,6 +20,7 @@
 basename = 'pendulum_';           % filename used for saving data
 
 % 2. Initial J random rollouts
+
 for jj = 1:J
   [xx, yy, realCost{jj}, latent{jj}] = ...
     rollout(gaussian(mu0, S0), struct('maxU',policy.maxU), H, plant, cost);
@@ -30,14 +31,15 @@
   end
 end
+
 
 mu0Sim(odei,:) = mu0; S0Sim(odei,odei) = S0;
 mu0Sim = mu0Sim(dyno); S0Sim = S0Sim(dyno,dyno);
 
 % 3. Controlled learning (N iterations)
-for j = 1:N
+for j = 1:1
   trainDynModel;   % train (GP) dynamics model
   learnPolicy;     % learn policy
-  applyController; % apply controller to system
+  % applyController; % apply controller to system
   disp(['controlled trial # ' num2str(j)]);
   if plotting.verbosity > 0;      % visualization of trajectory
     if ~ishandle(1); figure(1); else set(0,'CurrentFigure',1); end; clf(1);
diff --git a/examples/pendulum-matlab/matlab-environment/policy_wrapper.m b/examples/pendulum-matlab/matlab-environment/policy_wrapper.m
new file mode 100644
index 0000000..608bd30
--- /dev/null
+++ b/examples/pendulum-matlab/matlab-environment/policy_wrapper.m
@@ -0,0 +1,5 @@
+function u = policy_wrapper(mu,s)
+    global policy
+    u = policy.fcn(policy,mu,s);
+end
+
diff --git a/examples/pendulum-matlab/matlab-environment/settings_pendulum.m b/examples/pendulum-matlab/matlab-environment/settings_pendulum.m
index 4ce2cce..67510ff 100644
--- a/examples/pendulum-matlab/matlab-environment/settings_pendulum.m
+++ b/examples/pendulum-matlab/matlab-environment/settings_pendulum.m
@@ -23,7 +23,7 @@
 
 % include some paths
 try
-  rd = '../../';
+  rd = '../pilcoV0.9/';
   addpath([rd 'base'],[rd 'util'],[rd 'gp'],[rd 'control'],[rd 'loss']);
 catch
 end
@@ -84,6 +84,7 @@
 plant.prop = @propagated;
 
 % 4. Set up the policy structure
+global policy
 policy.fcn = @(policy,m,s)conCat(@congp,@gSat,policy,m,s);% controller
                                                           % representation
 policy.maxU = 2.5;                                        % max. amplitude of
@@ -137,4 +138,9 @@
 % 9. Some initializations
 x = []; y = [];
 fantasy.mean = cell(1,N); fantasy.std = cell(1,N);
-realCost = cell(1,N); M = cell(N,1); Sigma = cell(N,1);
\ No newline at end of file
+realCost = cell(1,N); M = cell(N,1); Sigma = cell(N,1);
+
+% Things copied from pendulum_learn
+basename = 'pendulum_';           % filename used for saving data
+mu0Sim(odei,:) = mu0; S0Sim(odei,odei) = S0;
+mu0Sim = mu0Sim(dyno); S0Sim = S0Sim(dyno,dyno);
\ No newline at end of file
diff --git a/examples/pendulum-matlab/runme.py b/examples/pendulum-matlab/runme.py
index 1d73c00..b3447cd 100644
--- a/examples/pendulum-matlab/runme.py
+++ b/examples/pendulum-matlab/runme.py
@@ -1,7 +1,13 @@
+# Call inside python as
+# exec(open("runme.py").read())
+# to avoid MATLAB closing if the script ends/crashes.
 import matlab.engine
 import os
 import urllib.request
 import zipfile
+import numpy as np
+import gym
+import time
 
 if not os.path.isdir("pilcov0.9"):
     print("Matlab implementation not found in current path.")
@@ -11,3 +17,67 @@
     zip_ref.extractall("./")
     zip_ref.close()
     print("Done!")
+
+
+def convert_to_matlab(x):
+    dtheta = x[2]
+    cos_theta = x[0]
+    sin_theta = x[1]
+
+    theta = np.arctan2(sin_theta, cos_theta)
+    return np.array([dtheta, theta, sin_theta, cos_theta])
+
+env = gym.make('Pendulum-v0')
+
+def rollout(policy, timesteps):
+    X = []; Y = []
+    env.reset()
+    x = convert_to_matlab(env.step([0])[0])
+    for timestep in range(timesteps):
+        env.render()
+        u = policy(np.array([x[0], x[2], x[3]]))
+        x_new, _, done, _ = env.step(u)
+        x_new = convert_to_matlab(x_new)  # x_new -> dtheta, theta, sin(theta), cos(theta)
+        if done: break
+        X.append(np.hstack((x, u)))
+        Y.append(x_new[0:2])  # Y -> dtheta, theta
+        x = x_new
+    return np.stack(X), np.stack(Y)
+
+def random_policy(x):
+    return env.action_space.sample()
+
+eng = matlab.engine.start_matlab("-desktop")
+# dir_path = os.path.dirname(os.path.realpath(__file__)) + "/matlab-environment"
+dir_path = "matlab-environment"
+eng.cd(dir_path, nargout=0)
+
+def matlab_policy(x):
+    n = x.shape[0]
+    s = np.zeros((n,n))
+    u = eng.policy_wrapper(matlab.double(x[:, None].tolist()), matlab.double(s.tolist()), nargout=1)
+    return np.array([u])
+
+# Initial random rollouts to generate a dataset
+X,Y = rollout(policy=random_policy, timesteps=40)
+for i in range(1,3):
+    X_, Y_ = rollout(policy=random_policy, timesteps=40)
+    X = np.vstack((X, X_))
+    Y = np.vstack((Y, Y_))
+
+eng.settings_pendulum(nargout=0)
+for rollouts in range(10):
+    print("Rollout #", rollouts + 1)
+    eng.workspace['j'] = rollouts + 1
+    eng.workspace['x'] = matlab.double(X.tolist())
+    eng.workspace['y'] = matlab.double(Y.tolist())
+    eng.trainDynModel(nargout=0)
+    start = time.time()
+    eng.learnPolicy(nargout=0)
+    end = time.time()
+    print("Learning of policy done in ", end - start, " seconds.")
+    if rollouts > 8:
+        import pdb; pdb.set_trace()
+    X_new, Y_new = rollout(policy=matlab_policy, timesteps=100)
+    # Update dataset
+    X = np.vstack((X, X_new)); Y = np.vstack((Y, Y_new))
\ No newline at end of file
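
Note (not part of the patch): the Python-to-MATLAB call path that this patch sets up can be exercised on its own. The sketch below is a minimal, hedged example distilled from runme.py and policy_wrapper.m; it assumes the MATLAB Engine API for Python is installed, that pilcoV0.9 has already been downloaded next to matlab-environment (the top of runme.py handles this), and that settings_pendulum.m has been run so the global `policy` exists (either with its random initial parameters or after learnPolicy). The state vector [0.0, 0.0, -1.0] is an assumed example value following the [dtheta, sin(theta), cos(theta)] convention used by matlab_policy above.

import matlab.engine
import numpy as np

# Start a MATLAB session and move into the example's MATLAB code directory.
eng = matlab.engine.start_matlab()
eng.cd("matlab-environment", nargout=0)
eng.settings_pendulum(nargout=0)   # populates the global `policy` read by policy_wrapper.m

x = np.array([0.0, 0.0, -1.0])     # assumed example state: [dtheta, sin(theta), cos(theta)], pendulum hanging down
s = np.zeros((3, 3))               # zero state covariance, i.e. a deterministic query
# policy_wrapper expects a column-vector mean and a covariance matrix.
u = eng.policy_wrapper(matlab.double(x[:, None].tolist()),
                       matlab.double(s.tolist()),
                       nargout=1)
print(u)                           # control signal, saturated by gSat to policy.maxU

To run the full learning loop instead, the header comment added to runme.py suggests starting an interactive Python session and calling exec(open("runme.py").read()), so the MATLAB engine and its workspace survive if the script finishes or crashes.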