From 88c0c0c57cc38c6ba824a8f08cb7960b82965195 Mon Sep 17 00:00:00 2001
From: nrontsis
Date: Sun, 11 Nov 2018 17:44:25 +0000
Subject: [PATCH] #14 Connection with MATLAB works.

---
 .../matlab-environment/pendulum_learn.m    |  6 +-
 .../matlab-environment/policy_wrapper.m    |  5 ++
 .../matlab-environment/settings_pendulum.m | 10 ++-
 examples/pendulum-matlab/runme.py          | 70 +++++++++++++++++++
 4 files changed, 87 insertions(+), 4 deletions(-)
 create mode 100644 examples/pendulum-matlab/matlab-environment/policy_wrapper.m

diff --git a/examples/pendulum-matlab/matlab-environment/pendulum_learn.m b/examples/pendulum-matlab/matlab-environment/pendulum_learn.m
index e58107b..2613194 100644
--- a/examples/pendulum-matlab/matlab-environment/pendulum_learn.m
+++ b/examples/pendulum-matlab/matlab-environment/pendulum_learn.m
@@ -20,6 +20,7 @@
 basename = 'pendulum_';           % filename used for saving data
 
 % 2. Initial J random rollouts
+
 for jj = 1:J
   [xx, yy, realCost{jj}, latent{jj}] = ...
     rollout(gaussian(mu0, S0), struct('maxU',policy.maxU), H, plant, cost);
@@ -30,14 +31,15 @@
   end
 end
+
 
 mu0Sim(odei,:) = mu0; S0Sim(odei,odei) = S0;
 mu0Sim = mu0Sim(dyno); S0Sim = S0Sim(dyno,dyno);
 
 % 3. Controlled learning (N iterations)
-for j = 1:N
+for j = 1:1
   trainDynModel;   % train (GP) dynamics model
   learnPolicy;     % learn policy
-  applyController; % apply controller to system
+  % applyController; % apply controller to system
   disp(['controlled trial # ' num2str(j)]);
   if plotting.verbosity > 0;      % visualization of trajectory
     if ~ishandle(1); figure(1); else set(0,'CurrentFigure',1); end; clf(1);
diff --git a/examples/pendulum-matlab/matlab-environment/policy_wrapper.m b/examples/pendulum-matlab/matlab-environment/policy_wrapper.m
new file mode 100644
index 0000000..608bd30
--- /dev/null
+++ b/examples/pendulum-matlab/matlab-environment/policy_wrapper.m
@@ -0,0 +1,5 @@
+function u = policy_wrapper(mu,s)
+    global policy
+    u = policy.fcn(policy,mu,s);
+end
+
diff --git a/examples/pendulum-matlab/matlab-environment/settings_pendulum.m b/examples/pendulum-matlab/matlab-environment/settings_pendulum.m
index 4ce2cce..67510ff 100644
--- a/examples/pendulum-matlab/matlab-environment/settings_pendulum.m
+++ b/examples/pendulum-matlab/matlab-environment/settings_pendulum.m
@@ -23,7 +23,7 @@
 
 % include some paths
 try
-  rd = '../../';
+  rd = '../pilcoV0.9/';
   addpath([rd 'base'],[rd 'util'],[rd 'gp'],[rd 'control'],[rd 'loss']);
 catch
 end
@@ -84,6 +84,7 @@
 plant.prop = @propagated;
 
 % 4. Set up the policy structure
+global policy
 policy.fcn = @(policy,m,s)conCat(@congp,@gSat,policy,m,s);% controller
                                                           % representation
 policy.maxU = 2.5;                                        % max. amplitude of
@@ -137,4 +138,9 @@
 % 9. Some initializations
 x = []; y = [];
 fantasy.mean = cell(1,N); fantasy.std = cell(1,N);
-realCost = cell(1,N); M = cell(N,1); Sigma = cell(N,1);
\ No newline at end of file
+realCost = cell(1,N); M = cell(N,1); Sigma = cell(N,1);
+
+% Things copied from pendulum_learn
+basename = 'pendulum_';           % filename used for saving data
+mu0Sim(odei,:) = mu0; S0Sim(odei,odei) = S0;
+mu0Sim = mu0Sim(dyno); S0Sim = S0Sim(dyno,dyno);
\ No newline at end of file
diff --git a/examples/pendulum-matlab/runme.py b/examples/pendulum-matlab/runme.py
index 1d73c00..b3447cd 100644
--- a/examples/pendulum-matlab/runme.py
+++ b/examples/pendulum-matlab/runme.py
@@ -1,7 +1,13 @@
+# Call inside python as
+# exec(open("runme.py").read())
+# to avoid MATLAB closing if the script ends/crashes.
 import matlab.engine
 import os
 import urllib.request
 import zipfile
+import numpy as np
+import gym
+import time
 
 if not os.path.isdir("pilcov0.9"):
     print("Matlab implementation not found in current path.")
@@ -11,3 +17,67 @@
     zip_ref.extractall("./")
     zip_ref.close()
     print("Done!")
+
+
+def convert_to_matlab(x):
+    dtheta = x[2]
+    cos_theta = x[0]
+    sin_theta = x[1]
+
+    theta = np.arctan2(sin_theta, cos_theta)
+    return np.array([dtheta, theta, sin_theta, cos_theta])
+
+env = gym.make('Pendulum-v0')
+
+def rollout(policy, timesteps):
+    X = []; Y = []
+    env.reset()
+    x = convert_to_matlab(env.step([0])[0])
+    for timestep in range(timesteps):
+        env.render()
+        u = policy(np.array([x[0], x[2], x[3]]))
+        x_new, _, done, _ = env.step(u)
+        x_new = convert_to_matlab(x_new)  # x_new -> dtheta, theta, sin(theta), cos(theta)
+        if done: break
+        X.append(np.hstack((x, u)))
+        Y.append(x_new[0:2])  # Y -> dtheta, theta
+        x = x_new
+    return np.stack(X), np.stack(Y)
+
+def random_policy(x):
+    return env.action_space.sample()
+
+eng = matlab.engine.start_matlab("-desktop")
+# dir_path = os.path.dirname(os.path.realpath(__file__)) + "/matlab-environment"
+dir_path = "matlab-environment"
+eng.cd(dir_path, nargout=0)
+
+def matlab_policy(x):
+    n = x.shape[0]
+    s = np.zeros((n,n))
+    u = eng.policy_wrapper(matlab.double(x[:, None].tolist()), matlab.double(s.tolist()), nargout=1)
+    return np.array([u])
+
+# Initial random rollouts to generate a dataset
+X,Y = rollout(policy=random_policy, timesteps=40)
+for i in range(1,3):
+    X_, Y_ = rollout(policy=random_policy, timesteps=40)
+    X = np.vstack((X, X_))
+    Y = np.vstack((Y, Y_))
+
+eng.settings_pendulum(nargout=0)
+for rollouts in range(10):
+    print("Rollout #", rollouts + 1)
+    eng.workspace['j'] = rollouts + 1
+    eng.workspace['x'] = matlab.double(X.tolist())
+    eng.workspace['y'] = matlab.double(Y.tolist())
+    eng.trainDynModel(nargout=0)
+    start = time.time()
+    eng.learnPolicy(nargout=0)
+    end = time.time()
+    print("Learning of policy done in ", end - start, " seconds.")
+    if rollouts > 8:
+        import pdb; pdb.set_trace()
+    X_new, Y_new = rollout(policy=matlab_policy, timesteps=100)
+    # Update dataset
+    X = np.vstack((X, X_new)); Y = np.vstack((Y, Y_new))
\ No newline at end of file
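
Note (not part of the patch): the Python-to-MATLAB call path that this patch sets up can be exercised on its own. The sketch below is a minimal, hedged example distilled from runme.py and policy_wrapper.m; it assumes the MATLAB Engine API for Python is installed, that pilcoV0.9 has already been downloaded next to matlab-environment (the top of runme.py handles this), and that settings_pendulum.m has been run so the global `policy` exists (either with its random initial parameters or after learnPolicy). The state vector [0.0, 0.0, -1.0] is an assumed example value following the [dtheta, sin(theta), cos(theta)] convention used by matlab_policy above.

import matlab.engine
import numpy as np

# Start a MATLAB session and move into the example's MATLAB code directory.
eng = matlab.engine.start_matlab()
eng.cd("matlab-environment", nargout=0)
eng.settings_pendulum(nargout=0)   # populates the global `policy` read by policy_wrapper.m

x = np.array([0.0, 0.0, -1.0])     # assumed example state: [dtheta, sin(theta), cos(theta)], pendulum hanging down
s = np.zeros((3, 3))               # zero state covariance, i.e. a deterministic query
# policy_wrapper expects a column-vector mean and a covariance matrix.
u = eng.policy_wrapper(matlab.double(x[:, None].tolist()),
                       matlab.double(s.tolist()),
                       nargout=1)
print(u)                           # control signal, saturated by gSat to policy.maxU

To run the full learning loop instead, the header comment added to runme.py suggests starting an interactive Python session and calling exec(open("runme.py").read()), so the MATLAB engine and its workspace survive if the script finishes or crashes.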