diff --git a/.gitignore b/.gitignore index 4a920f2..81ae497 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,5 @@ pilco.egg-info __pycache__ *.swp octave-workspace -*.egg \ No newline at end of file +*.egg +examples/pendulum-matlab/pilcoV0.9/** \ No newline at end of file diff --git a/examples/pendulum-matlab/matlab-envinroment/draw_pendulum.m b/examples/pendulum-matlab/matlab-envinroment/draw_pendulum.m new file mode 100644 index 0000000..5b5f4cd --- /dev/null +++ b/examples/pendulum-matlab/matlab-envinroment/draw_pendulum.m @@ -0,0 +1,81 @@ +%% draw_pendulum.m +% *Summary:* Draw the pendulum system with reward, applied torque, +% and predictive uncertainty of the tips of the pendulums +% +% function draw_pendulum(theta, torque, cost, text1, text2, M, S) +% +% +% *Input arguments:* +% +% theta1 angle of inner pendulum +% theta2 angle of outer pendulum +% f1 torque applied to inner pendulum +% f2 torque applied to outer pendulum +% cost cost structure +% .fcn function handle (it is assumed to use saturating cost) +% .<> other fields that are passed to cost +% text1 (optional) text field 1 +% text2 (optional) text field 2 +% M (optional) mean of state +% S (optional) covariance of state +% +% +% Copyright (C) 2008-2013 by +% Marc Deisenroth, Andrew McHutchon, Joe Hall, and Carl Edward Rasmussen. +% +% Last modified: 2013-03-18 + + +function draw_pendulum(theta, torque, cost, text1, text2, M, S) +%% Code + +l = 0.6; +xmin = -1.2*l; +xmax = 1.2*l; +umax = 0.5; +height = 0; + +% Draw pendulum +pendulum = [0, 0; l*sin(theta), -l*cos(theta)]; +clf; hold on +plot(pendulum(:,1), pendulum(:,2),'r','linewidth',4) + +% plot ellipses around tips of pendulum (if M, S exist) +try + if max(max(S))>0 + err = linspace(-1,1,100)*sqrt(S(2,2)); + plot(l*sin(M(2)+2*err),-l*cos(M(2)+2*err),'b','linewidth',1) + plot(l*sin(M(2)+err),-l*cos(M(2)+err),'b','linewidth',2) + plot(l*sin(M(2)),-l*cos(M(2)),'b.','markersize',20) + end +catch +end + +% Draw useful information +% target location +plot(0,l,'k+','MarkerSize',20); +plot([xmin, xmax], [-height, -height],'k','linewidth',2) +% joint +plot(0,0,'k.','markersize',24) +plot(0,0,'y.','markersize',14) +% tip of pendulum +plot(l*sin(theta),-l*cos(theta),'k.','markersize',24) +plot(l*sin(theta),-l*cos(theta),'y.','markersize',14) +plot(0,-2*l,'.w','markersize',0.005) +% applied torque +plot([0 torque/umax*xmax],[-0.5, -0.5],'g','linewidth',10); +% immediate reward +reward = 1-cost.fcn(cost,[0, theta]',zeros(2)); +plot([0 reward*xmax],[-0.7, -0.7],'y', 'linewidth',10); +text(0,-0.5,'applied torque') +text(0,-0.7,'immediate reward') +if exist('text1','var') + text(0,-0.9, text1) +end +if exist('text2','var') + text(0,-1.1, text2) +end + +set(gca,'DataAspectRatio',[1 1 1],'XLim',[xmin xmax],'YLim',[-2*l 2*l]); +axis off; +drawnow; \ No newline at end of file diff --git a/examples/pendulum-matlab/matlab-envinroment/draw_rollout_pendulum.m b/examples/pendulum-matlab/matlab-envinroment/draw_rollout_pendulum.m new file mode 100644 index 0000000..4aad70f --- /dev/null +++ b/examples/pendulum-matlab/matlab-envinroment/draw_rollout_pendulum.m @@ -0,0 +1,29 @@ +%% draw_rollout_pendulum.m +% *Summary:* Script to draw a trajectory of the most recent pendulum trajectory +% +% Copyright (C) 2008-2013 by +% Marc Deisenroth, Andrew McHutchon, Joe Hall, and Carl Edward Rasmussen. +% +% Last modified: 2013-03-27 +% +%% High-Level Steps +% # For each time step, plot the observed trajectory + +%% Code + +% Loop over states in trajectory +for r = 1:size(xx,1) + cost.t = r; + if exist('j','var') && ~isempty(M{j}) + draw_pendulum(latent{j}(r,2), latent{j}(r,end), cost, ... + ['trial # ' num2str(j+J) ', T=' num2str(H*dt) ' sec'], ... + ['total experience (after this trial): ' num2str(dt*size(x,1)) ... + ' sec'], M{j}(:,r), Sigma{j}(:,:,r)); + else + draw_pendulum(latent{jj}(r,2), latent{jj}(r,end), cost, ... + ['(random) trial # ' num2str(1) ', T=' num2str(H*dt) ' sec'], ... + ['total experience (after this trial): ' num2str(dt*size(x,1)) ... + ' sec']) + end + pause(dt); +end \ No newline at end of file diff --git a/examples/pendulum-matlab/matlab-envinroment/dynamics_pendulum.m b/examples/pendulum-matlab/matlab-envinroment/dynamics_pendulum.m new file mode 100644 index 0000000..51ea331 --- /dev/null +++ b/examples/pendulum-matlab/matlab-envinroment/dynamics_pendulum.m @@ -0,0 +1,44 @@ +%% dynamics_pendulum.m +% *Summary:* Implements ths ODE for simulating the pendulum dynamics, where +% an input torque f can be applied +% +% function dz = dynamics_pendulum(t,z,u) +% +% +% *Input arguments:* +% +% t current time step (called from ODE solver) +% z state [2 x 1] +% u (optional): torque f(t) applied to pendulum +% +% *Output arguments:* +% +% dz if 3 input arguments: state derivative wrt time +% +% Note: It is assumed that the state variables are of the following order: +% dtheta: [rad/s] angular velocity of pendulum +% theta: [rad] angle of pendulum +% +% A detailed derivation of the dynamics can be found in: +% +% M.P. Deisenroth: +% Efficient Reinforcement Learning Using Gaussian Processes, Appendix C, +% KIT Scientific Publishing, 2010. +% +% +% Copyright (C) 2008-2013 by +% Marc Deisenroth, Andrew McHutchon, Joe Hall, and Carl Edward Rasmussen. +% +% Last modified: 2013-03-18 + +function dz = dynamics_pendulum(t,z,u) +%% Code + +l = 1; % [m] length of pendulum +m = 1; % [kg] mass of pendulum +g = 9.82; % [m/s^2] acceleration of gravity +b = 0.01; % [s*Nm/rad] friction coefficient + +dz = zeros(2,1); +dz(1) = ( u(t) - b*z(1) - m*g*l*sin(z(2))/2 ) / (m*l^2/3); +dz(2) = z(1); \ No newline at end of file diff --git a/examples/pendulum-matlab/matlab-envinroment/loss_pendulum.m b/examples/pendulum-matlab/matlab-envinroment/loss_pendulum.m new file mode 100644 index 0000000..6131ead --- /dev/null +++ b/examples/pendulum-matlab/matlab-envinroment/loss_pendulum.m @@ -0,0 +1,110 @@ +%% loss_pendulum.m +% *Summary:* Pendulum loss function; the loss is +% $1-\exp(-0.5*d^2*a)$, where $a>0$ and $d^2$ is the squared difference +% between the actual and desired position of the tip of the pendulum. +% The mean and the variance of the loss are computed by averaging over the +% Gaussian distribution of the state $p(x) = \mathcal N(m,s)$ with mean $m$ +% and covariance matrix $s$. +% Derivatives of these quantities are computed when desired. +% +% +% function [L, dLdm, dLds, S2] = loss_pendulum(cost, m, s) +% +% +% *Input arguments:* +% +% cost cost structure +% .p lengths of the pendulum [1 x 1 ] +% .width array of widths of the cost (summed together) +% .expl (optional) exploration parameter +% .angle (optional) array of angle indices +% .target target state [D x 1 ] +% m mean of state distribution [D x 1 ] +% s covariance matrix for the state distribution [D x D ] +% +% *Output arguments:* +% +% L expected cost [1 x 1 ] +% dLdm derivative of expected cost wrt. state mean vector [1 x D ] +% dLds derivative of expected cost wrt. state covariance matrix [1 x D^2] +% S2 variance of cost [1 x 1 ] +% +% Copyright (C) 2008-2014 by +% Marc Deisenroth, Andrew McHutchon, Joe Hall, and Carl Edward Rasmussen. +% +% Last modified: 2014-01-09 +% +%% High-Level Steps +% # Precomputations +% # Define static penalty as distance from target setpoint +% # Trigonometric augmentation +% # Calculate loss + +function [L, dLdm, dLds, S2] = loss_pendulum(cost, m, s) +%% Code +if isfield(cost,'width'); cw = cost.width; else cw = 1; end +if ~isfield(cost,'expl') || isempty(cost.expl); b = 0; else b = cost.expl; end + + +% 1. Some precomputations +D0 = size(s,2); % state dimension +D1 = D0 + 2*length(cost.angle); % state dimension (with sin/cos) + +M = zeros(D1,1); M(1:D0) = m; S = zeros(D1); S(1:D0,1:D0) = s; +Mdm = [eye(D0); zeros(D1-D0,D0)]; Sdm = zeros(D1*D1,D0); +Mds = zeros(D1,D0*D0); Sds = kron(Mdm,Mdm); + +% 2. Define static penalty as distance from target setpoint +ell = cost.p; +Q = zeros(D1); Q(D0+1:D0+2,D0+1:D0+2) = eye(2)*ell^2; + +% 3. Trigonometric augmentation +if D1-D0 > 0 + target = [cost.target(:); ... + gTrig(cost.target(:), zeros(numel(cost.target)), cost.angle)]; + + i = 1:D0; k = D0+1:D1; + [M(k), S(k,k), C, mdm, sdm, Cdm, mds, sds, Cds] = ... + gTrig(M(i),S(i,i),cost.angle); + [S, Mdm, Mds, Sdm, Sds] = ... + fillIn(S,C,mdm,sdm,Cdm,mds,sds,Cds,Mdm,Sdm,Mds,Sds,i,k,D1); +end + +% 4. Calculate loss +L = 0; dLdm = zeros(1,D0); dLds = zeros(1,D0*D0); S2 = 0; +for i = 1:length(cw) % scale mixture of immediate costs + cost.z = target; cost.W = Q/cw(i)^2; + [r, rdM, rdS, s2, s2dM, s2dS] = lossSat(cost, M, S); + + L = L + r; S2 = S2 + s2; + dLdm = dLdm + rdM(:)'*Mdm + rdS(:)'*Sdm; + dLds = dLds + rdM(:)'*Mds + rdS(:)'*Sds; + + if (b~=0 || ~isempty(b)) && abs(s2)>1e-12 + L = L + b*sqrt(s2); + dLdm = dLdm + b/sqrt(s2) * ( s2dM(:)'*Mdm + s2dS(:)'*Sdm )/2; + dLds = dLds + b/sqrt(s2) * ( s2dM(:)'*Mds + s2dS(:)'*Sds )/2; + end +end + +% normalize +n = length(cw); L = L/n; dLdm = dLdm/n; dLds = dLds/n; S2 = S2/n; + +% Fill in covariance matrix...and derivatives ---------------------------- +function [S, Mdm, Mds, Sdm, Sds] = ... + fillIn(S,C,mdm,sdm,Cdm,mds,sds,Cds,Mdm,Sdm,Mds,Sds,i,k,D) +X = reshape(1:D*D,[D D]); XT = X'; % vectorised indices +I=0*X; I(i,i)=1; ii=X(I==1)'; I=0*X; I(k,k)=1; kk=X(I==1)'; +I=0*X; I(i,k)=1; ik=X(I==1)'; ki=XT(I==1)'; + +Mdm(k,:) = mdm*Mdm(i,:) + mds*Sdm(ii,:); % chainrule +Mds(k,:) = mdm*Mds(i,:) + mds*Sds(ii,:); +Sdm(kk,:) = sdm*Mdm(i,:) + sds*Sdm(ii,:); +Sds(kk,:) = sdm*Mds(i,:) + sds*Sds(ii,:); +dCdm = Cdm*Mdm(i,:) + Cds*Sdm(ii,:); +dCds = Cdm*Mds(i,:) + Cds*Sds(ii,:); + +S(i,k) = S(i,i)*C; S(k,i) = S(i,k)'; % off-diagonal +SS = kron(eye(length(k)),S(i,i)); CC = kron(C',eye(length(i))); +Sdm(ik,:) = SS*dCdm + CC*Sdm(ii,:); Sdm(ki,:) = Sdm(ik,:); +Sds(ik,:) = SS*dCds + CC*Sds(ii,:); Sds(ki,:) = Sds(ik,:); diff --git a/examples/pendulum-matlab/matlab-envinroment/pendulum_learn.m b/examples/pendulum-matlab/matlab-envinroment/pendulum_learn.m new file mode 100644 index 0000000..e58107b --- /dev/null +++ b/examples/pendulum-matlab/matlab-envinroment/pendulum_learn.m @@ -0,0 +1,46 @@ +%% pendulum_learn.m +% *Summary:* Script to learn a controller for the pendulum swingup +% +% Copyright (C) 2008-2013 by +% Marc Deisenroth, Andrew McHutchon, Joe Hall, and Carl Edward Rasmussen. +% +% Last modified: 2013-03-27 +% +%% High-Level Steps +% # Load parameters +% # Create J initial trajectories by applying random controls +% # Controlled learning (train dynamics model, policy learning, policy +% application) + +%% Code + +% 1. Initialization +clear all; close all; +settings_pendulum; % load scenario-specific settings +basename = 'pendulum_'; % filename used for saving data + +% 2. Initial J random rollouts +for jj = 1:J + [xx, yy, realCost{jj}, latent{jj}] = ... + rollout(gaussian(mu0, S0), struct('maxU',policy.maxU), H, plant, cost); + x = [x; xx]; y = [y; yy]; % augment training sets for dynamics model + if plotting.verbosity > 0; % visualization of trajectory + if ~ishandle(1); figure(1); else set(0,'CurrentFigure',1); end; clf(1); + draw_rollout_pendulum; + end +end + +mu0Sim(odei,:) = mu0; S0Sim(odei,odei) = S0; +mu0Sim = mu0Sim(dyno); S0Sim = S0Sim(dyno,dyno); + +% 3. Controlled learning (N iterations) +for j = 1:N + trainDynModel; % train (GP) dynamics model + learnPolicy; % learn policy + applyController; % apply controller to system + disp(['controlled trial # ' num2str(j)]); + if plotting.verbosity > 0; % visualization of trajectory + if ~ishandle(1); figure(1); else set(0,'CurrentFigure',1); end; clf(1); + draw_rollout_pendulum; + end +end \ No newline at end of file diff --git a/examples/pendulum-matlab/matlab-envinroment/settings_pendulum.m b/examples/pendulum-matlab/matlab-envinroment/settings_pendulum.m new file mode 100644 index 0000000..4ce2cce --- /dev/null +++ b/examples/pendulum-matlab/matlab-envinroment/settings_pendulum.m @@ -0,0 +1,140 @@ +%% settings_pendulum.m +% *Summary:* Script set up the pendulum scenario +% +% Copyright (C) 2008-2013 by +% Marc Deisenroth, Andrew McHutchon, Joe Hall, and Carl Edward Rasmussen. +% +% Last modified: 2013-05-24 +% +%% High-Level Steps +% # Define state and important indices +% # Set up scenario +% # Set up the plant structure +% # Set up the policy structure +% # Set up the cost structure +% # Set up the GP dynamics model structure +% # Parameters for policy optimization +% # Plotting verbosity +% # Some array initializations + +%% Code + +warning('off','all'); format short; format compact; + +% include some paths +try + rd = '../../'; + addpath([rd 'base'],[rd 'util'],[rd 'gp'],[rd 'control'],[rd 'loss']); +catch +end + +rand('seed',5); randn('seed',13); + + + +% 1. Define state and important indices + +% 1a. Full state representation (including all augmentations) +% 1 dtheta angular velocity of inner pendulum +% 2 theta2 angle inner pendulum +% 3 sin(theta) complex representation ... +% 4 cos(theta) ... of angle of pendulum +% 5 u torque applied to the pendulum + +% 1b. Important indices +% odei indicies for the ode solver +% augi indicies for variables augmented to the ode variables +% dyno indicies for the output from the dynamics model and indicies to loss +% angi indicies for variables treated as angles (using sin/cos representation) +% dyni indicies for inputs to the dynamics model +% poli indicies for variables that serve as inputs to the policy +% difi indicies for training targets that are differences (rather than values) + +odei = [1 2]; % varibles for the ODE solver +augi = []; % variables to be augmented +dyno = [1 2]; % variables to be predicted (and known to loss) +angi = [2]; % angle variables +dyni = [1 3 4]; % variables that serve as inputs to the dynamics GP +poli = [1 3 4]; % variables that serve as inputs to the policy +difi = [1 2]; % variables that are learned via differences + +% 2. Set up the scenario +dt = 0.1; % [s] sampling time +T = 4; % [s] prediction time +H = ceil(T/dt); % prediction steps (optimization horizon) +mu0 = [0 0]'; % initial state mean +S0 = 0.01*eye(2); % initial state variance +N = 10; % number of policy optimizations +J = 1; % no. of inital training rollouts (of length H) +K = 1; % number of initial states for which we optimize +nc = 20; % size of controller training set + +% 3. Set up the plant structure +plant.dynamics = @dynamics_pendulum; % dynamics ODE function +plant.noise = diag([0.1^2 0.01^2]); % measurement noise +plant.dt = dt; +plant.ctrl = @zoh; % controler is zero-order-hold +plant.odei = odei; % indices of the varibles for the ODE solver +plant.augi = augi; % indices of augmented variables +plant.angi = angi; +plant.poli = poli; +plant.dyno = dyno; +plant.dyni = dyni; +plant.difi = difi; +plant.prop = @propagated; + +% 4. Set up the policy structure +policy.fcn = @(policy,m,s)conCat(@congp,@gSat,policy,m,s);% controller + % representation +policy.maxU = 2.5; % max. amplitude of + % torque +[mm ss cc] = gTrig(mu0, S0, plant.angi); % represent angles +mm = [mu0; mm]; cc = S0*cc; ss = [S0 cc; cc' ss]; % in complex plane +policy.p.inputs = gaussian(mm(poli), ss(poli,poli), nc)'; % init. location of + % basis functions +policy.p.targets = 0.1*randn(nc, length(policy.maxU)); % init. policy targets + % (close to zero) +policy.p.hyp = log([1 0.7 0.7 1 0.01]'); % initialize + % hyper-parameters + + + +% 5. Set up the cost structure +cost.fcn = @loss_pendulum; % cost function +cost.gamma = 1; % discount factor +cost.p = 1.0; % length of pendulum +cost.width = 0.5; % cost function width +cost.expl = 0; % exploration parameter +cost.angle = plant.angi; % angle variables in cost +cost.target = [0 pi]'; % target state + + +% 6. Set up the GP dynamics model structure +dynmodel.fcn = @gp0d; % function for GP predictions +dynmodel.train = @train; % function to train dynamics model +dynmodel.induce = zeros(300,0,1); % shared inducing inputs (sparse GP) +trainOpt = [300 500]; % defines the max. number of line searches + % when training the GP dynamics models + % trainOpt(1): full GP, + % trainOpt(2): sparse GP (FITC) + +% 7. Parameters for policy optimization +opt.length = 75; % max. number of line searches +opt.MFEPLS = 30; % max. number of function evaluations + % per line search +opt.verbosity = 1; % verbosity: specifies how much + % information is displayed during + % policy learning. Options: 0-3 +opt.method = 'BFGS'; % optimization algorithm. Options: + % 'BFGS' (default), 'LBFGS', 'CG' + + +% 8. Plotting verbosity +plotting.verbosity = 1; % 0: no plots + % 1: some plots + % 2: all plots + +% 9. Some initializations +x = []; y = []; +fantasy.mean = cell(1,N); fantasy.std = cell(1,N); +realCost = cell(1,N); M = cell(N,1); Sigma = cell(N,1); \ No newline at end of file diff --git a/examples/pendulum-matlab/runme.py b/examples/pendulum-matlab/runme.py new file mode 100644 index 0000000..1d73c00 --- /dev/null +++ b/examples/pendulum-matlab/runme.py @@ -0,0 +1,13 @@ +import matlab.engine +import os +import urllib.request +import zipfile + +if not os.path.isdir("pilcov0.9"): + print("Matlab implementation not found in current path.") + print("Attempting to download now") + urllib.request.urlretrieve("http://mlg.eng.cam.ac.uk/pilco/release/pilcoV0.9.zip", "pilcoV0.9.zip") + zip_ref = zipfile.ZipFile("pilcoV0.9.zip", 'r') + zip_ref.extractall("./") + zip_ref.close() + print("Done!")