train_network.py

import numpy as np
import sys
sys.path.append("game/")

import skimage
from skimage import transform, color, exposure

import keras
from keras.models import Sequential, Model, load_model
from keras.layers import Dense, Flatten, Activation, Input
from keras.layers.convolutional import Convolution2D
from keras.optimizers import RMSprop
import keras.backend as K
from keras.callbacks import LearningRateScheduler, History
import tensorflow as tf

import pygame
import wrapped_flappy_bird as game

import scipy.misc
import scipy.stats as st

import threading
import time
import math

GAMMA = 0.99                #discount value
BETA = 0.01                 #regularisation coefficient
IMAGE_ROWS = 85
IMAGE_COLS = 84
IMAGE_CHANNELS = 4
LEARNING_RATE = 7e-4
EPISODE = 0
THREADS = 16
t_max = 5  
const = 1e-5
T = 0

episode_r = []
episode_state = np.zeros((0, IMAGE_ROWS, IMAGE_COLS, IMAGE_CHANNELS))
episode_output = []
episode_critic = []

ACTIONS = 2
a_t = np.zeros(ACTIONS)

#loss function for policy output
def logloss(y_true, y_pred):     #policy loss
	return -K.sum( K.log(y_true*y_pred + (1-y_true)*(1-y_pred) + const), axis=-1) 
	# BETA * K.sum(y_pred * K.log(y_pred + const) + (1-y_pred) * K.log(1-y_pred + const))   #regularisation term

#loss function for critic output
def sumofsquares(y_true, y_pred):        #critic loss
	return K.sum(K.square(y_pred - y_true), axis=-1)

#function buildmodel() to define the structure of the neural network in use 
def buildmodel():
	print("Model building begins")

	model = Sequential()
	keras.initializers.RandomUniform(minval=-0.1, maxval=0.1, seed=None)

	S = Input(shape = (IMAGE_ROWS, IMAGE_COLS, IMAGE_CHANNELS, ), name = 'Input')
	h0 = Convolution2D(16, kernel_size = (8,8), strides = (4,4), activation = 'relu', kernel_initializer = 'random_uniform', bias_initializer = 'random_uniform')(S)
	h1 = Convolution2D(32, kernel_size = (4,4), strides = (2,2), activation = 'relu', kernel_initializer = 'random_uniform', bias_initializer = 'random_uniform')(h0)
	h2 = Flatten()(h1)
	h3 = Dense(256, activation = 'relu', kernel_initializer = 'random_uniform', bias_initializer = 'random_uniform') (h2)
	P = Dense(1, name = 'o_P', activation = 'sigmoid', kernel_initializer = 'random_uniform', bias_initializer = 'random_uniform') (h3)
	V = Dense(1, name = 'o_V', kernel_initializer = 'random_uniform', bias_initializer = 'random_uniform') (h3)

	model = Model(inputs = S, outputs = [P,V])
	rms = RMSprop(lr = LEARNING_RATE, rho = 0.99, epsilon = 0.1)
	model.compile(loss = {'o_P': logloss, 'o_V': sumofsquares}, loss_weights = {'o_P': 1., 'o_V' : 0.5}, optimizer = rms)
	return model

#function to preprocess an image before giving as input to the neural network
def preprocess(image):
	image = skimage.color.rgb2gray(image)
	image = skimage.transform.resize(image, (IMAGE_ROWS, IMAGE_COLS), mode = 'constant')	
	image = skimage.exposure.rescale_intensity(image, out_range=(0,255))
	image = image.reshape(1, image.shape[0], image.shape[1], 1)
	return image

# initialize a new model using buildmodel() or use load_model to resume training an already trained model
model = buildmodel()
#model = load_model("saved_models/model_updates3900", custom_objects={'logloss': logloss, 'sumofsquares': sumofsquares})
model._make_predict_function()
graph = tf.get_default_graph()

intermediate_layer_model = Model(inputs=model.input, outputs=model.get_layer('o_P').output)


a_t[0] = 1 #index 0 = no flap, 1= flap
#output of network represents probability of flap

game_state = []
for i in range(0,THREADS):
	game_state.append(game.GameState(30000))


def runprocess(thread_id, s_t):
	global T
	global a_t
	global model

	t = 0
	t_start = t
	terminal = False
	r_t = 0
	r_store = []
	state_store = np.zeros((0, IMAGE_ROWS, IMAGE_COLS, IMAGE_CHANNELS))
	output_store = []
	critic_store = []
	s_t = s_t.reshape(1, s_t.shape[0], s_t.shape[1], s_t.shape[2])

	while t-t_start < t_max and terminal == False:
		t += 1
		T += 1
		intermediate_output = 0	

		with graph.as_default():
			out = model.predict(s_t)[0]			
			intermediate_output = intermediate_layer_model.predict(s_t)
		no = np.random.rand()
		a_t = [0,1] if no < out else [1,0]  #stochastic action
		#a_t = [0,1] if 0.5 <y[0] else [1,0]  #deterministic action

		x_t, r_t, terminal = game_state[thread_id].frame_step(a_t)
		x_t = preprocess(x_t)

		with graph.as_default():
			critic_reward = model.predict(s_t)[1]

		y = 0 if a_t[0] == 1 else 1

		r_store = np.append(r_store, r_t)
		state_store = np.append(state_store, s_t, axis = 0)
		output_store = np.append(output_store, y)
		critic_store = np.append(critic_store, critic_reward)
		
		s_t = np.append(x_t, s_t[:, :, :, :3], axis=3)
		print("Frame = " + str(T) + ", Updates = " + str(EPISODE) + ", Thread = " + str(thread_id) + ", Output = "+ str(intermediate_output))
	
	if terminal == False:
		r_store[len(r_store)-1] = critic_store[len(r_store)-1]
	else:
		r_store[len(r_store)-1] = -1
		s_t = np.concatenate((x_t, x_t, x_t, x_t), axis=3)
	
	for i in range(2,len(r_store)+1):
		r_store[len(r_store)-i] = r_store[len(r_store)-i] + GAMMA*r_store[len(r_store)-i + 1]

	return s_t, state_store, output_store, r_store, critic_store

#function to decrease the learning rate after every epoch. In this manner, the learning rate reaches 0, by 20,000 epochs
def step_decay(epoch):
	decay = 3.2e-8
	lrate = LEARNING_RATE - epoch*decay
	lrate = max(lrate, 0)
	return lrate

class actorthread(threading.Thread):
	def __init__(self,thread_id, s_t):
		threading.Thread.__init__(self)
		self.thread_id = thread_id
		self.next_state = s_t

	def run(self):
		global episode_output
		global episode_r
		global episode_critic
		global episode_state

		threadLock.acquire()
		self.next_state, state_store, output_store, r_store, critic_store = runprocess(self.thread_id, self.next_state)
		self.next_state = self.next_state.reshape(self.next_state.shape[1], self.next_state.shape[2], self.next_state.shape[3])

		episode_r = np.append(episode_r, r_store)
		episode_output = np.append(episode_output, output_store)
		episode_state = np.append(episode_state, state_store, axis = 0)
		episode_critic = np.append(episode_critic, critic_store)

		threadLock.release()

states = np.zeros((0, IMAGE_ROWS, IMAGE_COLS, 4))

#initializing state of each thread
for i in range(0, len(game_state)):
	image = game_state[i].getCurrentFrame()
	image = preprocess(image)
	state = np.concatenate((image, image, image, image), axis=3)
	states = np.append(states, state, axis = 0)

while True:	
	threadLock = threading.Lock()
	threads = []
	for i in range(0,THREADS):
		threads.append(actorthread(i,states[i]))

	states = np.zeros((0, IMAGE_ROWS, IMAGE_COLS, 4))

	for i in range(0,THREADS):
		threads[i].start()

	#thread.join() ensures that all threads fininsh execution before proceeding further
	for i in range(0,THREADS):
		threads[i].join()

	for i in range(0,THREADS):
		state = threads[i].next_state
		state = state.reshape(1, state.shape[0], state.shape[1], state.shape[2])
		states = np.append(states, state, axis = 0)

	e_mean = np.mean(episode_r)
	#advantage calculation for each action taken
	advantage = episode_r - episode_critic
	print("backpropagating")

	lrate = LearningRateScheduler(step_decay)
	callbacks_list = [lrate]

	weights = {'o_P':advantage, 'o_V':np.ones(len(advantage))}	
	#backpropagation
	history = model.fit(episode_state, [episode_output, episode_r], epochs = EPISODE + 1, batch_size = len(episode_output), callbacks = callbacks_list, sample_weight = weights, initial_epoch = EPISODE)

	episode_r = []
	episode_output = []
	episode_state = np.zeros((0, IMAGE_ROWS, IMAGE_COLS, IMAGE_CHANNELS))
	episode_critic = []

	f = open("rewards.txt","a")
	f.write("Update: " + str(EPISODE) + ", Reward_mean: " + str(e_mean) + ", Loss: " + str(history.history['loss']) + "\n")
	f.close()

	if EPISODE % 50 == 0: 
		model.save("saved_models/model_updates" +	str(EPISODE)) 
	EPISODE += 1