Commit 85ea697
#13 Added restart methods for model and controller. #9 Consistent good performance on the pendulum-v0 environment
kyr-pol committed Nov 7, 2018
1 parent ab4a57c commit 85ea697
Showing 3 changed files with 81 additions and 27 deletions.
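
Both additions follow the same restart-and-keep-best pattern: snapshot the current parameters and objective, randomly re-initialise, re-run the optimiser, and keep the restart only if the objective improved (GP log-likelihood for the dynamics model, predicted return for the controller). A minimal Python sketch of that pattern, using hypothetical get_params/set_params/perturb/optimize/score callables rather than the repository's API:

def restart_and_keep_best(get_params, set_params, perturb, optimize, score, restarts=3):
    # Illustrative sketch only; the methods added below operate on GPflow 1.x parameters.
    best_params, best_score = get_params(), score()
    for _ in range(restarts):
        set_params(perturb())            # random re-initialisation of the trainable parameters
        optimize()                       # re-run the optimiser from the new starting point
        new_score = score()
        if new_score > best_score:       # keep the restart only if it beats the stored optimum
            best_params, best_score = get_params(), new_score
        else:
            set_params(best_params)      # otherwise roll back to the stored parameters
    return best_score
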
42 changes: 18 additions & 24 deletions examples/inverted_pendulum.py
@@ -24,7 +24,7 @@ def rollout(env, pilco, policy, timesteps, verbose=False, random=False, SUBS=1):
x_new, _, done, _ = env.step(u)
if done: break
# env.render()
# x_new += [0.001*np.random.randn(), 0.001*np.random.randn(), 0.001*np.random.randn()]
#x_new += 0.001 * (np.random.rand()-0.5)
if verbose:
print("Action: ", u)
print("State : ", x_new)
@@ -61,9 +61,9 @@ def make_env(env_id, **kwargs):
maxiter = 50 # number of iterations for the controller optimisation
max_action = 1.0
target = np.array(np.zeros(state_dim)) # goal state, passed to the reward function
weights = np.diag(np.ones(state_dim)) # weights of the reward function
weights = np.eye(state_dim) # weights of the reward function
m_init = np.reshape(np.zeros(state_dim), (1,state_dim)) # initial state mean
S_init = 0.1 * np.diag(np.ones(state_dim)) # initial state variance
S_init = 0.1 * np.eye(state_dim) # initial state variance
T = 60 # horizon length in timesteps
J = 5 # number of initial rollouts with random actions
N = 10 # number of iterations
@@ -86,7 +86,7 @@ def make_env(env_id, **kwargs):
# NEEDS a different initialisation than the one in gym (change the reset() method),
# to (m_init, S_init)
SUBS=3
bf = 20
bf = 30
maxiter=50
max_action=2.0
target = np.array([1.0, 0.0, 0.0])
@@ -95,8 +95,8 @@ def make_env(env_id, **kwargs):
S_init = np.diag([0.01, 0.05, 0.01])
T = 40
J = 4
N = 15
restarts = False
N = 8
restarts = True
elif env_id == 'CartPole-v0':
env = gym.make('env_id')
# Takes discrete actions, crashes for some reason.
@@ -120,9 +120,9 @@ def make_env(env_id, **kwargs):
control_dim = 1
max_action=1.0 # actions for these environments are discrete
target = np.array([0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
weights = np.diag(np.ones(state_dim))
m_init = np.reshape(np.array([0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), (1,11))
S_init = 0.1 * np.diag(np.ones(state_dim))
weights = np.eye(state_dim)
m_init = np.array([0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])[None, :]
S_init = 0.1 * np.eye(state_dim)
S_init[6,6] = 2
S_init[7,7] = 2
T = 15
@@ -150,7 +150,8 @@ def make_env(env_id, **kwargs):
return env, parameters

def run(env_id, **kwargs ):
with tf.Session(graph=tf.Graph()):
sess = tf.Session(graph=tf.Graph())
with tf.Session() as sess:
# Make env
env, parameters = make_env(env_id, **kwargs)
SUBS, bf, maxiter, max_action, target, weights, m_init, S_init, T, J, N, restarts = \
@@ -164,14 +165,14 @@ def run(env_id, **kwargs ):
# Initial random rollouts to generate a dataset
X,Y = rollout(env, None, policy=policy, timesteps=T, random=True, SUBS=SUBS)
for i in range(1,J):
X_, Y_ = rollout(env, None, policy=policy, timesteps=T, random=True, SUBS=SUBS)
X_, Y_ = rollout(env, None, policy=policy, timesteps=T, random=True, SUBS=SUBS, verbose=True)
X = np.vstack((X, X_))
Y = np.vstack((Y, Y_))

state_dim = Y.shape[1]
control_dim = X.shape[1] - state_dim
controller = RbfController(state_dim=state_dim, control_dim=control_dim, num_basis_functions=bf, max_action=max_action)
# controller = LinearController(state_dim=state_dim, control_dim=control_dim, max_action=max_action)
# controller = LinearController(state_dim=state_dim, control_dim=control_dim, max_action=max_action)

R = ExponentialReward(state_dim=state_dim, t=target, W=weights)

@@ -189,27 +190,20 @@ def run(env_id, **kwargs ):
for rollouts in range(N):
print("**** ITERATION no", rollouts, " ****")
pilco.optimize(maxiter=maxiter)
pilco.mgpr.try_restart(sess, restarts=5)
if restarts: pilco.restart_controller(sess, restarts=3)
print("No of ops:", len(tf.get_default_graph().get_operations()))

# Predict the trajectory, to check model's accuracy
for i in range(1,T):
x_pred[i,:], s_pred[i,:,:], rr[i] = pil_predict_wrapper(pilco, m_init, S_init, i)
# for i in range(1,T):
# x_pred[i,:], s_pred[i,:,:], rr[i] = pil_predict_wrapper(pilco, m_init, S_init, i)

X_new, Y_new = rollout(env, pilco, policy=policy, timesteps=T, verbose=False, SUBS=SUBS)
X_new, Y_new = rollout(env, pilco, policy=policy, timesteps=T, verbose=True, SUBS=SUBS)

# Update dataset
X = np.vstack((X, X_new)); Y = np.vstack((Y, Y_new))
pilco.mgpr.set_XY(X, Y)

# RESTARTS model and controller, to avoid local minima
if restarts and ((rollouts+1) % 4 == 0):
c2 = RbfController(state_dim=state_dim, control_dim=control_dim, num_basis_functions=bf, max_action=max_action)
p2 = PILCO(X, Y, controller=c2, horizon=T, reward=R, m_init=m_init, S_init=S_init)
p2.optimize(maxiter=2*maxiter)
_ ,_ , rr2 = pil_predict_wrapper(p2, m_init, S_init, T)
# if the predicted reward is higher,replaces the previous model/controller
if rr2 > rr[T-1]:
pilco = p2
return X, parameters

if __name__ == '__main__':
32 changes: 31 additions & 1 deletion pilco/models/mgpr.py
@@ -1,7 +1,8 @@
import tensorflow as tf
import gpflow
float_type = gpflow.settings.dtypes.float_type

import numpy as np
import copy

class MGPR(gpflow.Parameterized):
def __init__(self, X, Y, name=None):
Expand All @@ -18,6 +19,8 @@ def create_models(self, X, Y):
self.models = []
for i in range(self.num_outputs):
kern = gpflow.kernels.RBF(input_dim=X.shape[1], ARD=True)
# kern.lengthscales.prior = gpflow.priors.Gamma(1,10)
# kern.variance.prior = gpflow.priors.Gamma(1,10)
#TODO: Maybe fix noise for better conditioning
# kern.variance = 0.001
# kern.variance.trainable = False
@@ -117,6 +120,33 @@ def predict_given_factorizations(self, m, s, iK, beta):

return tf.transpose(M), S, tf.transpose(V)

def try_restart(self, session, restarts=1):
for r in range(restarts):
for i in range(len(self.models)):
model = self.models[i]
optimizer = self.optimizers[i]
store_values = model.read_values(session=session)
previous_ll = model.compute_log_likelihood()
# reinitialize all kernel parameters
model.kern.lengthscales = 1 + 0.01 * np.random.normal(size=model.kern.lengthscales.shape)
model.kern.variance = 1 + 0.01 * np.random.normal(size=model.kern.variance.shape)
model.likelihood.variance = 1 + 0.001 * np.random.normal()

optimizer._optimizer.minimize(session=optimizer._model.enquire_session(None),
feed_dict=optimizer._gen_feed_dict(optimizer._model, None),
step_callback=None)

ll = model.compute_log_likelihood()
if previous_ll > ll:
# set values back to what they were
model.assign(store_values, session=session)
print("Restoring previous values")
else:
store_values = model.read_values(session=session)
print('Successful model restart')
previous_ll = ll


def centralized_input(self, m):
return self.X - m

34 changes: 32 additions & 2 deletions pilco/models/pilco.py
@@ -2,6 +2,7 @@
import tensorflow as tf
import gpflow
import pandas as pd
import copy

from .mgpr import MGPR
from .smgpr import SMGPR
@@ -41,7 +42,7 @@ def __init__(self, X, Y, num_induced_points=None, horizon=30, controller=None,
else:
self.m_init = m_init
self.S_init = S_init

self.optimizer = None

@gpflow.name_scope('likelihood')
@@ -66,7 +67,7 @@ def optimize(self, maxiter=50):
step_callback=None)
else:
self.optimizer = gpflow.train.ScipyOptimizer(method="L-BFGS-B")
self.optimizer.minimize(self, disp=True, maxiter=maxiter)
self.optimizer.minimize(self, disp=True, maxiter=maxiter, anchor=False)

end = time.time()
print("Finished with Controller's optimization in %.1f seconds" % (end - start))
@@ -129,3 +130,32 @@ def propagate(self, m_x, s_x):
# While-loop requires the shapes of the outputs to be fixed
M_x.set_shape([1, self.state_dim]); S_x.set_shape([self.state_dim, self.state_dim])
return M_x, S_x

def restart_controller(self, session, restarts=1):
values = self.read_values(session=session)
old_reward = copy.deepcopy(self.compute_return())
for r in range(restarts):
self.controller.models[0].X.assign(0.1 * np.random.normal(size=self.controller.models[0].X.shape))
self.controller.models[0].Y.assign(0.1 * np.random.normal(size=self.controller.models[0].Y.shape))
self.controller.models[0].kern.lengthscales.assign(0.1 * np.random.normal(size=self.controller.models[0].kern.lengthscales.shape) + 1)
# self.controller.models[0].kern.lengthscales.trainable = True
print(old_reward)
self.optimizer._optimizer.minimize(session=self.optimizer._model.enquire_session(None),
feed_dict=self.optimizer._gen_feed_dict(self.optimizer._model, None),
step_callback=None)
reward = copy.deepcopy(self.compute_return())
print(old_reward)
print(reward)
if old_reward > reward:
# set values back to what they were
print("Restoring controller values")
self.assign(values, session=session)
print(self.compute_return())
else:
print('Successful restart')
values = self.read_values(session=session)
old_reward = reward

@gpflow.autoflow()
def compute_return(self):
return self._build_likelihood()
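
The new compute_return is an autoflow wrapper around _build_likelihood, so the expected cumulative reward over the horizon can be read out as a plain number and compared before and after a restart, which is the comparison restart_controller makes internally. A hypothetical usage, reusing the pilco and sess objects from the example script:

reward_before = pilco.compute_return()        # predicted return of the current controller
pilco.restart_controller(sess, restarts=3)    # keeps whichever controller predicts the higher return
reward_after = pilco.compute_return()
print(reward_before, reward_after)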
