Commit 0322e6c
ai/qlearning.py: creation
David Arroyo Menéndez committed Jun 28, 2019
1 parent e59ae76

Showing 1 changed file: ai/qlearning.py (263 additions, 0 deletions)
@@ -0,0 +1,263 @@
import numpy as np
import pylab as pl
import networkx as nx

# Step 2: Defining and visualising the graph
edges = [(0, 1), (1, 5), (5, 6), (5, 4), (1, 2),
         (1, 3), (9, 10), (2, 4), (0, 6), (6, 7),
         (8, 9), (7, 8), (1, 7), (3, 9)]

goal = 10
G = nx.Graph()
G.add_edges_from(edges)
pos = nx.spring_layout(G)
nx.draw_networkx_nodes(G, pos)
nx.draw_networkx_edges(G, pos)
nx.draw_networkx_labels(G, pos)
pl.show()

# Step 3: Defining the reward system for the bot

MATRIX_SIZE = 11
M = np.matrix(np.ones(shape=(MATRIX_SIZE, MATRIX_SIZE)))
M *= -1
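# M is the reward matrix: -1 marks a missing edge, 0 a traversable edge,
# and 100 an edge that lands on the goal node.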

for point in edges:
    print(point)
    if point[1] == goal:
        M[point] = 100
    else:
        M[point] = 0

    # reverse of point
    if point[0] == goal:
        M[point[::-1]] = 100
    else:
        M[point[::-1]] = 0

# add goal point round trip
M[goal, goal] = 100
print(M)

# Step 4: Defining some utility functions to be used in the training

Q = np.matrix(np.zeros([MATRIX_SIZE, MATRIX_SIZE]))

gamma = 0.75
# discount factor: how heavily future rewards are weighted
initial_state = 1
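# Training applies the Q-learning update with learning rate 1:
# Q(state, action) = R(state, action) + gamma * max(Q(action, :))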

# Determines the available actions for a given state
def available_actions(state):
    current_state_row = M[state, ]
    available_action = np.where(current_state_row >= 0)[1]
    return available_action

available_action = available_actions(initial_state)

# Chooses one of the available actions at random
def sample_next_action(available_actions_range):
    next_action = int(np.random.choice(available_actions_range))
    return next_action


action = sample_next_action(available_action)

# Updates the Q-matrix according to the path chosen
def update(current_state, action, gamma):
    max_index = np.where(Q[action, ] == np.max(Q[action, ]))[1]
    if max_index.shape[0] > 1:
        max_index = int(np.random.choice(max_index))
    else:
        max_index = int(max_index)
    max_value = Q[action, max_index]
    Q[current_state, action] = M[current_state, action] + gamma * max_value
    if np.max(Q) > 0:
        return np.sum(Q / np.max(Q) * 100)
    else:
        return 0

update(initial_state, action, gamma)

# Step 5: Training and evaluating the bot using the Q-Matrix

scores = []
for i in range(1000):
    current_state = np.random.randint(0, int(Q.shape[0]))
    available_action = available_actions(current_state)
    action = sample_next_action(available_action)
    score = update(current_state, action, gamma)
    scores.append(score)

# print("Trained Q matrix:")
# print(Q / np.max(Q)*100)
# You can uncomment the above two lines to view the trained Q matrix

# Testing
current_state = 0
steps = [current_state]

while current_state != goal:
    next_step_index = np.where(Q[current_state, ] == np.max(Q[current_state, ]))[1]
    if next_step_index.shape[0] > 1:
        next_step_index = int(np.random.choice(next_step_index))
    else:
        next_step_index = int(next_step_index)
    steps.append(next_step_index)
    current_state = next_step_index

print("Most efficient path:")
print(steps)
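# With enough training episodes the greedy walk above typically recovers the
# shortest route 0 -> 1 -> 3 -> 9 -> 10, though this is not guaranteed on
# every run.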

pl.plot(scores)
pl.xlabel('Number of iterations')
pl.ylabel('Reward gained')
pl.show()

# Step 6: Defining and visualizing the new graph with the environmental clues
# Defining the locations of the police and the drug traces
police = [2, 4, 5]
drug_traces = [3, 8, 9]

G = nx.Graph()
G.add_edges_from(edges)
mapping = {0: '0 - Detective', 1: '1', 2: '2 - Police', 3: '3 - Drug traces',
           4: '4 - Police', 5: '5 - Police', 6: '6', 7: '7', 8: '8 - Drug traces',
           9: '9 - Drug traces', 10: '10 - Drug racket location'}

H = nx.relabel_nodes(G, mapping)
pos = nx.spring_layout(H)
nx.draw_networkx_nodes(H, pos, node_size=[200] * 11)  # one size entry per node
nx.draw_networkx_edges(H, pos)
nx.draw_networkx_labels(H, pos)
pl.show()

# Step 7: Defining some utility functions for the training process

Q = np.matrix(np.zeros([MATRIX_SIZE, MATRIX_SIZE]))
env_police = np.matrix(np.zeros([MATRIX_SIZE, MATRIX_SIZE]))
env_drugs = np.matrix(np.zeros([MATRIX_SIZE, MATRIX_SIZE]))
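# env_police and env_drugs count how often each (state, action) transition
# encountered police or drug traces during training.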
initial_state = 1

# Same as above
def available_actions(state):
    current_state_row = M[state, ]
    av_action = np.where(current_state_row >= 0)[1]
    return av_action

# Same as above
def sample_next_action(available_actions_range):
    next_action = int(np.random.choice(available_actions_range))
    return next_action

def collect_environmental_data(action):
    found = []
    if action in police:
        found.append('p')
    if action in drug_traces:
        found.append('d')
    return found

available_action = available_actions(initial_state)
action = sample_next_action(available_action)

# Same as the earlier update(), but also records environmental clues
def update(current_state, action, gamma):
    max_index = np.where(Q[action, ] == np.max(Q[action, ]))[1]
    if max_index.shape[0] > 1:
        max_index = int(np.random.choice(max_index))
    else:
        max_index = int(max_index)
    max_value = Q[action, max_index]
    Q[current_state, action] = M[current_state, action] + gamma * max_value
    environment = collect_environmental_data(action)
    if 'p' in environment:
        env_police[current_state, action] += 1
    if 'd' in environment:
        env_drugs[current_state, action] += 1
    if np.max(Q) > 0:
        return np.sum(Q / np.max(Q) * 100)
    else:
        return 0

update(initial_state, action, gamma)

# Determines the available actions, taking the environmental evidence into account
def available_actions_with_env_help(state):
    current_state_row = M[state, ]
    av_action = np.where(current_state_row >= 0)[1]

    # if there are multiple routes, dis-favor anything negative
    env_pos_row = env_matrix_snap[state, av_action]

    if np.sum(env_pos_row < 0):
        # drop the negative directions from av_action, if any remain
        temp_av_action = av_action[np.array(env_pos_row)[0] >= 0]
        if len(temp_av_action) > 0:
            av_action = temp_av_action
    return av_action
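# NOTE: env_matrix_snap is used above but never defined in this commit. One
# assumed (not original) definition that makes police-heavy moves score
# negative would be:
# env_matrix_snap = np.copy(env_drugs) - np.copy(env_police)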


scores = []
for i in range(1000):
    current_state = np.random.randint(0, int(Q.shape[0]))
    available_action = available_actions(current_state)
    action = sample_next_action(available_action)
    score = update(current_state, action, gamma)
    scores.append(score)

# print environmental matrices
print('Police Found')
print(env_police)
print('')
print('Drug traces Found')
print(env_drugs)
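
# You can uncomment the block below to retrain using the environmental help
# (it requires env_matrix_snap to be defined first; see the note above):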

# scores = []
# for i in range(1000):
#     current_state = np.random.randint(0, int(Q.shape[0]))
#     available_action = available_actions_with_env_help(current_state)
#     action = sample_next_action(available_action)
#     score = update(current_state, action, gamma)
#     scores.append(score)

# pl.plot(scores)
# pl.xlabel('Number of iterations')
# pl.ylabel('Reward gained')
# pl.show()
