hcope_debug.py
import numpy as np
import random
from scipy.optimize import minimize

# TODO
# Add Sanity Check (a sketch is provided after the estimator below)
def hcope_estimator(d_pre, d_post, pi_b, pi_e, delta):
"""
d_pre : float, size = (dataset_split,)
Trajectory rewards from the behavior policy
d_post : float, size = (dataset_size - dataset_split, )
Trajectory rewards from the behavior policy
delta : float, size = scalar
1-delta is the confidence of the estimator
pi_b : Probabilities for respective trajectories in behaviour policy
pi_e : Probabilities for respective trajectories in evaluation policy
RETURNS: lower bound for the mean, mu as per Theorem 1 of Thomas et al. High Confidence Off-Policy Evaluation
"""
    d_pre = np.asarray(d_pre)
    d_post = np.asarray(d_post)
    n_post = len(d_post)
    n_pre = len(d_pre)
    if n_post <= 1:
        raise ValueError("The post-estimation split must contain more than one trajectory")

    # Estimate the truncation threshold c that maximizes the lower bound,
    # using only the pre-estimation split d_pre
    c_initial = 4.0

    def f(x):
        # minimize passes x as a length-1 array; work with the scalar
        x = np.ravel(x)[0]
        # Truncated importance-weighted returns on the pre split
        Y = np.asarray([min((d_pre[i] * pi_e[i]) / pi_b[i], x) for i in range(n_pre)], dtype=float)
        # Empirical mean
        EM = np.sum(Y) / n_pre
        # Second term
        term2 = (7. * x * np.log(2 / delta)) / (3 * (n_post - 1))
        # Third term
        term3 = np.sqrt((2 * np.log(2 / delta)) / (n_post * n_pre * (n_pre - 1))
                        * (n_pre * np.sum(np.square(Y)) - np.square(np.sum(Y))))
        # Negated, so that minimizing f maximizes the predicted lower bound;
        # f is only piecewise smooth, so BFGS yields an approximate maximizer
        return -EM + term2 + term3

    c_estimate = minimize(f, np.array([c_initial]), method='BFGS').x[0]
    # Use the estimated c for computing the maximum lower bound; the bound
    # admits a per-trajectory threshold c_i, so broadcast the single estimate
    c = np.full((n_post,), c_estimate, dtype=float)
    # pi_b and pi_e cover the whole dataset, so the post split starts at offset n_pre
    Y = np.asarray([min((d_post[i] * pi_e[n_pre + i]) / pi_b[n_pre + i], c[i]) for i in range(n_post)], dtype=float)
    # Empirical mean
    EM = np.sum(Y / c) / np.sum(1 / c)
    # Second term
    term2 = (7. * n_post * np.log(2 / delta)) / (3 * (n_post - 1) * np.sum(1 / c))
    # Third term
    term3 = np.sqrt((2 * np.log(2 / delta)) / (n_post - 1)
                    * (n_post * np.sum(np.square(Y / c)) - np.square(np.sum(Y / c)))) / np.sum(1 / c)
    # Final estimate
    return EM - term2 - term3
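
# A minimal input sanity check, as asked for by the TODO at the top of the
# file. This is a sketch, not part of the original estimator: it validates the
# assumptions the bound relies on (strictly positive behavior probabilities,
# finite rewards, delta in (0, 1), one probability per trajectory). It is
# intended to be called before hcope_estimator.
def sanity_check_inputs(d_pre, d_post, pi_b, pi_e, delta):
    d_pre, d_post = np.asarray(d_pre, dtype=float), np.asarray(d_post, dtype=float)
    pi_b, pi_e = np.asarray(pi_b, dtype=float), np.asarray(pi_e, dtype=float)
    if not (0.0 < delta < 1.0):
        raise ValueError("delta must lie in (0, 1)")
    if len(pi_b) != len(pi_e) or len(pi_b) != len(d_pre) + len(d_post):
        raise ValueError("pi_b and pi_e need one entry per trajectory")
    if np.any(pi_b <= 0.0) or np.any(pi_e < 0.0):
        raise ValueError("behavior probabilities must be strictly positive and evaluation probabilities non-negative")
    if not (np.all(np.isfinite(d_pre)) and np.all(np.isfinite(d_post))):
        raise ValueError("trajectory rewards must be finite")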
'''
Sample test of the performance lower bound given by HCOPE -
Thomas, Philip S., Georgios Theocharous, and Mohammad Ghavamzadeh. "High-Confidence Off-Policy Evaluation." AAAI. 2015.
'''
if __name__ == "__main__":
    dataset_size = 100
    # Rewards obtained per trajectory
    dataset = random.sample(range(100), dataset_size)
    d_pre = dataset[:int(0.05 * dataset_size)]
    d_post = dataset[int(0.05 * dataset_size):]
    # Ensure small divergence between the two policies
    pi_b = np.random.uniform(low=0.02, high=0.1, size=(dataset_size,))
    pi_e = np.random.uniform(low=0.001, high=0.006, size=(dataset_size,)) + pi_b
    delta_confidence = 0.1
    sanity_check_inputs(d_pre, d_post, pi_b, pi_e, delta_confidence)
    print('Performance of behaviour policy: ', np.sum(dataset) / len(dataset))
    print('Estimated lower bound performance: ', hcope_estimator(d_pre, d_post, pi_b, pi_e, delta_confidence))
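    # A sketch of an extra check (not in the original script): tightening the
    # confidence (smaller delta) should generally loosen the returned bound
    for d in (0.05, 0.1, 0.2):
        print('delta = %.2f -> estimated lower bound: %.3f' % (d, hcope_estimator(d_pre, d_post, pi_b, pi_e, d)))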