COMP90051 Project 2
In [ ]:
# Do not edit. These are the only imports permitted.
%matplotlib inline
import numpy as np
from numpy.linalg import inv
import matplotlib.pyplot as plt
from abc import ABC, abstractmethod
1. Implementing ε-Greedy and UCB
In [ ]:
class MAB(ABC):
    """
    Abstract class that represents a multi-armed bandit (MAB)
    """

    @abstractmethod
    def play(self, tround, context):
        """
        Play a round

        Arguments
        =========
        tround : int
            positive integer identifying the round
        context : 1D float array, shape (self.ndims * self.narms), optional
            context given to the arms

        Returns
        =======
        arm : int
            the positive integer arm id for this round
        """

    @abstractmethod
    def update(self, arm, reward, context):
        """
        Updates the internal state of the MAB after a play

        Arguments
        =========
        arm : int
            a positive integer arm id in {1, ..., self.narms}
        reward : float
            reward received from arm
        context : 1D float array, shape (self.ndims * self.narms), optional
            context given to arms
        """
In [ ]:
class EpsGreedy(MAB):
    """
    Epsilon-Greedy multi-armed bandit

    Arguments
    =========
    narms : int
        number of arms
    epsilon : float
        explore probability
    Q0 : float, optional
        initial value for the arms
    """

    def __init__(self, narms, epsilon, Q0=np.inf):

    def play(self, tround, context=None):

    def update(self, arm, reward, context=None):
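Below is one possible implementation sketch for the skeleton above. It assumes 1-indexed arm ids, sample-mean value estimates, optimistic initialisation via Q0, and uniform random tie-breaking; it is an illustration, not the reference solution.
In [ ]:
# Sketch only (one possible implementation; assumptions noted in the text above).
class EpsGreedy(MAB):
    def __init__(self, narms, epsilon, Q0=np.inf):
        self.narms = narms
        self.epsilon = epsilon
        self.Q = np.full(narms, Q0, dtype=float)   # current value estimates
        self.counts = np.zeros(narms)              # number of plays per arm
        self.sums = np.zeros(narms)                # summed rewards per arm

    def play(self, tround, context=None):
        if np.random.random() < self.epsilon:
            # explore: choose an arm uniformly at random
            return np.random.randint(self.narms) + 1
        # exploit: choose uniformly among arms with the highest estimate
        best = np.flatnonzero(self.Q == self.Q.max())
        return np.random.choice(best) + 1

    def update(self, arm, reward, context=None):
        i = arm - 1
        self.counts[i] += 1
        self.sums[i] += reward
        self.Q[i] = self.sums[i] / self.counts[i]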
In [ ]:
class UCB(MAB):
    """
    Upper Confidence Bound (UCB) multi-armed bandit

    Arguments
    =========
    narms : int
        number of arms
    rho : float
        positive real explore-exploit parameter
    Q0 : float, optional
        initial value for the arms
    """

    def __init__(self, narms, rho, Q0=np.inf):

    def play(self, tround, context=None):

    def update(self, arm, reward, context=None):
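A possible UCB1-style sketch, assuming the index is the empirical mean plus sqrt(rho * ln(t) / n_a), with unplayed arms held at the optimistic value Q0 and ties broken at random; again an illustration rather than the reference solution.
In [ ]:
# Sketch only: UCB1-style index with exploration weight rho (an assumption).
class UCB(MAB):
    def __init__(self, narms, rho, Q0=np.inf):
        self.narms = narms
        self.rho = rho
        self.Q0 = Q0
        self.counts = np.zeros(narms)   # number of plays per arm
        self.means = np.zeros(narms)    # empirical mean reward per arm

    def play(self, tround, context=None):
        # unplayed arms keep the optimistic initial value Q0
        index = np.full(self.narms, self.Q0, dtype=float)
        played = self.counts > 0
        index[played] = (self.means[played]
                         + np.sqrt(self.rho * np.log(tround) / self.counts[played]))
        best = np.flatnonzero(index == index.max())
        return np.random.choice(best) + 1

    def update(self, arm, reward, context=None):
        i = arm - 1
        self.counts[i] += 1
        # incremental update of the empirical mean
        self.means[i] += (reward - self.means[i]) / self.counts[i]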
2. The Basic Thompson Bandit
In [ ]:
class BetaThompson(MAB):
    """
    Beta-Bernoulli Thompson sampling multi-armed bandit

    Arguments
    =========
    narms : int
        number of arms
    alpha0 : float, optional
        positive real prior hyperparameter
    beta0 : float, optional
        positive real prior hyperparameter
    """

    def __init__(self, narms, alpha0=1.0, beta0=1.0):

    def play(self, tround, context=None):

    def update(self, arm, reward, context=None):
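A possible Beta-Bernoulli Thompson sampling sketch: each arm keeps a Beta(alpha, beta) posterior over its success probability, one sample per arm is drawn each round, and the arm with the largest sample is played. It assumes rewards in [0, 1] treated as (fractional) successes; not necessarily the intended solution.
In [ ]:
# Sketch only: Beta-Bernoulli Thompson sampling with Beta(alpha0, beta0) priors.
class BetaThompson(MAB):
    def __init__(self, narms, alpha0=1.0, beta0=1.0):
        self.narms = narms
        self.alphas = np.full(narms, alpha0, dtype=float)   # posterior successes
        self.betas = np.full(narms, beta0, dtype=float)     # posterior failures

    def play(self, tround, context=None):
        # draw one sample per arm from its posterior and play the largest
        samples = np.random.beta(self.alphas, self.betas)
        best = np.flatnonzero(samples == samples.max())
        return np.random.choice(best) + 1

    def update(self, arm, reward, context=None):
        i = arm - 1
        self.alphas[i] += reward          # observed success mass
        self.betas[i] += 1.0 - reward     # observed failure mass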
3. Off-Policy Evaluation
In [ ]:
def offlineEvaluate(mab, arms, rewards, contexts, nrounds=None):
    """
    Offline evaluation of a multi-armed bandit

    Arguments
    =========
    mab : instance of MAB
    arms : 1D int array, shape (nevents,)
        integer arm id for each event
    rewards : 1D float array, shape (nevents,)
        reward received for each event
    contexts : 2D float array, shape (nevents, mab.narms * nfeatures)
        contexts presented to the arms (stacked horizontally)
        for each event.
    nrounds : int, optional
        number of matching events to evaluate `mab` on.

    Returns
    =======
    out : 1D float array
        rewards for the matching events
    """
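A possible implementation sketch of the evaluator, assuming the intended method is the replay (rejection-sampling) estimator of Li et al. (2011): events are streamed in order, and an event counts only when the bandit picks the same arm that was logged.
In [ ]:
# Sketch only: replay / rejection-sampling off-policy evaluator (assumed method).
def offlineEvaluate(mab, arms, rewards, contexts, nrounds=None):
    out = []
    tround = 1
    for arm, reward, context in zip(arms, rewards, contexts):
        if nrounds is not None and len(out) >= nrounds:
            break
        # ask the bandit which arm it would play on this event's context
        chosen = mab.play(tround, context)
        if chosen == arm:
            # matching event: feed the logged reward back and record it
            mab.update(arm, reward, context)
            out.append(reward)
            tround += 1
    return np.array(out)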
In [ ]:
mab = EpsGreedy(10, 0.05)
results_EpsGreedy = offlineEvaluate(mab, arms, rewards, contexts, 800)
print('EpsGreedy average reward', np.mean(results_EpsGreedy))
In [ ]:
mab = UCB(10, 1.0)
results_UCB = offlineEvaluate(mab, arms, rewards, contexts, 800)
print('UCB average reward', np.mean(results_UCB))
In [ ]:
mab = BetaThompson(10, 1.0, 1.0)
results_BetaThompson = offlineEvaluate(mab, arms, rewards, contexts, 800)
print('BetaThompson average reward', np.mean(results_BetaThompson))
4. Contextual Bandits – LinUCB
In [ ]:
class LinUCB(MAB):
    """
    Contextual multi-armed bandit (LinUCB)

    Arguments
    =========
    narms : int
        number of arms
    ndims : int
        number of dimensions for each arm's context
    alpha : float
        positive real explore-exploit parameter
    """

    def __init__(self, narms, ndims, alpha):

    def play(self, tround, context):

    def update(self, arm, reward, context):
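A possible LinUCB sketch following the disjoint-model algorithm of Li et al. (2010): each arm keeps its own ridge-regression statistics, and the flat context of length ndims * narms is assumed to hold arm 1's features first, then arm 2's, and so on. This is one plausible reading of the interface, not the required solution.
In [ ]:
# Sketch only: LinUCB with disjoint linear models (Li et al., 2010), assuming
# the flat context is ordered arm-by-arm.
class LinUCB(MAB):
    def __init__(self, narms, ndims, alpha):
        self.narms = narms
        self.ndims = ndims
        self.alpha = alpha
        # per-arm ridge regression statistics: A_a = I + sum x x^T, b_a = sum r x
        self.A = np.array([np.eye(ndims) for _ in range(narms)])
        self.b = np.zeros((narms, ndims))

    def play(self, tround, context):
        x = np.reshape(context, (self.narms, self.ndims))
        p = np.zeros(self.narms)
        for a in range(self.narms):
            A_inv = inv(self.A[a])
            theta = A_inv @ self.b[a]
            # predicted reward plus an upper-confidence term
            p[a] = theta @ x[a] + self.alpha * np.sqrt(x[a] @ A_inv @ x[a])
        best = np.flatnonzero(p == p.max())
        return np.random.choice(best) + 1

    def update(self, arm, reward, context):
        x = np.reshape(context, (self.narms, self.ndims))[arm - 1]
        self.A[arm - 1] += np.outer(x, x)
        self.b[arm - 1] += reward * x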
In [ ]:
mab = LinUCB(10, 10, 1.0)
results_LinUCB = offlineEvaluate(mab, arms, rewards, contexts, 800)
print('LinUCB average reward', np.mean(results_LinUCB))
5. Contextual Bandits – LinThompson
In [ ]:
class LinThompson(MAB):
    """
    Contextual Thompson sampled multi-armed bandit (LinThompson)

    Arguments
    =========
    narms : int
        number of arms
    ndims : int
        number of dimensions for each arm's context
    v : float
        positive real explore-exploit parameter
    """

    def __init__(self, narms, ndims, v):

    def play(self, tround, context):

    def update(self, arm, reward, context):
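A possible LinThompson sketch, assuming Thompson sampling for linear payoffs in the style of Agrawal and Goyal (2013) with a single shared parameter vector: each round a parameter is sampled from a Gaussian posterior scaled by v, and the arm whose context scores highest under the sample is played. The shared-parameter choice is an assumption; a per-arm variant would also fit the interface.
In [ ]:
# Sketch only: Thompson sampling with linear payoffs (assumed to follow
# Agrawal & Goyal, 2013), sharing one parameter vector across arms.
class LinThompson(MAB):
    def __init__(self, narms, ndims, v):
        self.narms = narms
        self.ndims = ndims
        self.v = v
        self.B = np.eye(ndims)           # posterior precision matrix
        self.f = np.zeros(ndims)         # accumulated reward-weighted contexts
        self.mu_hat = np.zeros(ndims)    # posterior mean

    def play(self, tround, context):
        x = np.reshape(context, (self.narms, self.ndims))
        # sample a parameter vector from the scaled Gaussian posterior
        mu_tilde = np.random.multivariate_normal(self.mu_hat,
                                                 self.v ** 2 * inv(self.B))
        scores = x @ mu_tilde
        best = np.flatnonzero(scores == scores.max())
        return np.random.choice(best) + 1

    def update(self, arm, reward, context):
        x = np.reshape(context, (self.narms, self.ndims))[arm - 1]
        self.B += np.outer(x, x)
        self.f += reward * x
        self.mu_hat = inv(self.B) @ self.f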
In [ ]:
mab = LinThompson(10, 10, 1.0)
results_LinThompson = offlineEvaluate(mab, arms, rewards, contexts, 800)
print('LinThompson average reward', np.mean(results_LinThompson))
6. Evaluation
6.A.
In [ ]:
6.B.
In [ ]: