COMP90051 Project 2
In [ ]:
# Do not edit. These are the only imports permitted.
%matplotlib inline
import numpy as np
from numpy.linalg import inv
import matplotlib.pyplot as plt
from abc import ABC, abstractmethod

1. Implementing ε-Greedy and UCB
In [ ]:
class MAB(ABC):
    """
    Abstract class that represents a multi-armed bandit (MAB)
    """

    @abstractmethod
    def play(self, tround, context):
        """
        Play a round

        Arguments
        =========
        tround : int
            positive integer identifying the round

        context : 1D float array, shape (self.ndims * self.narms), optional
            context given to the arms

        Returns
        =======
        arm : int
            the positive integer arm id for this round
        """

    @abstractmethod
    def update(self, arm, reward, context):
        """
        Updates the internal state of the MAB after a play

        Arguments
        =========
        arm : int
            a positive integer arm id in {1, ..., self.narms}

        reward : float
            reward received from arm

        context : 1D float array, shape (self.ndims * self.narms), optional
            context given to arms
        """
In [ ]:
class EpsGreedy(MAB):
    """
    Epsilon-Greedy multi-armed bandit

    Arguments
    =========
    narms : int
        number of arms

    epsilon : float
        explore probability

    Q0 : float, optional
        initial value for the arms
    """

    def __init__(self, narms, epsilon, Q0=np.inf):
        ...  # to be implemented

    def play(self, tround, context=None):
        ...  # to be implemented

    def update(self, arm, reward, context=None):
        ...  # to be implemented
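
For reference, a minimal sketch of one possible completion of the skeleton above (the name EpsGreedySketch is illustrative, not part of the assignment): keep per-arm play counts and running mean rewards, explore a uniformly random arm with probability epsilon, and otherwise exploit the highest estimate with random tie-breaking. Unplayed arms keep the optimistic initial value Q0.
In [ ]:
# Illustrative sketch only; one possible implementation, not the official solution.
class EpsGreedySketch(MAB):
    def __init__(self, narms, epsilon, Q0=np.inf):
        self.narms = narms
        self.epsilon = epsilon
        self.Q = np.full(narms, Q0)      # value estimates, optimistic by default
        self.counts = np.zeros(narms)    # number of plays per arm

    def play(self, tround, context=None):
        if np.random.random() < self.epsilon:
            return np.random.randint(self.narms) + 1    # explore: uniform random arm
        best = np.flatnonzero(self.Q == self.Q.max())   # exploit: argmax, random tie-break
        return np.random.choice(best) + 1               # arm ids are 1-based

    def update(self, arm, reward, context=None):
        i = arm - 1
        self.counts[i] += 1
        if self.counts[i] == 1:
            self.Q[i] = reward                          # replace the Q0 placeholder
        else:
            self.Q[i] += (reward - self.Q[i]) / self.counts[i]  # incremental running mean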

In [ ]:
class UCB(MAB):
    """
    Upper Confidence Bound (UCB) multi-armed bandit

    Arguments
    =========
    narms : int
        number of arms

    rho : float
        positive real explore-exploit parameter

    Q0 : float, optional
        initial value for the arms
    """

    def __init__(self, narms, rho, Q0=np.inf):
        ...  # to be implemented

    def play(self, tround, context=None):
        ...  # to be implemented

    def update(self, arm, reward, context=None):
        ...  # to be implemented
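
Again for reference, a sketch of one possible UCB1-style completion (UCBSketch is an illustrative name): each played arm's index is its mean reward plus the exploration bonus sqrt(rho * ln(t) / n_a), while unplayed arms keep the initial value Q0 so every arm gets tried at least once when Q0 is infinite.
In [ ]:
# Illustrative sketch only; one possible implementation, not the official solution.
class UCBSketch(MAB):
    def __init__(self, narms, rho, Q0=np.inf):
        self.narms = narms
        self.rho = rho
        self.Q0 = Q0
        self.means = np.zeros(narms)    # running mean reward per arm
        self.counts = np.zeros(narms)   # plays per arm

    def play(self, tround, context=None):
        ucb = np.full(self.narms, self.Q0)       # unplayed arms keep the initial value
        played = self.counts > 0
        ucb[played] = (self.means[played]
                       + np.sqrt(self.rho * np.log(tround) / self.counts[played]))
        best = np.flatnonzero(ucb == ucb.max())  # argmax with random tie-break
        return np.random.choice(best) + 1        # arm ids are 1-based

    def update(self, arm, reward, context=None):
        i = arm - 1
        self.counts[i] += 1
        self.means[i] += (reward - self.means[i]) / self.counts[i]  # running mean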

2. The Basic Thompson Bandit
In [ ]:
class BetaThompson(MAB):
    """
    Beta-Bernoulli Thompson sampling multi-armed bandit

    Arguments
    =========
    narms : int
        number of arms

    alpha0 : float, optional
        positive real prior hyperparameter

    beta0 : float, optional
        positive real prior hyperparameter
    """

    def __init__(self, narms, alpha0=1.0, beta0=1.0):
        ...  # to be implemented

    def play(self, tround, context=None):
        ...  # to be implemented

    def update(self, arm, reward, context=None):
        ...  # to be implemented
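
A sketch of one possible Beta-Bernoulli completion (BetaThompsonSketch is an illustrative name), assuming binary rewards in {0, 1} as the model requires: each arm keeps a Beta posterior, one sample is drawn per arm per round, the largest sample wins, and the played arm's posterior is updated conjugately.
In [ ]:
# Illustrative sketch only; one possible implementation, not the official solution.
# Assumes Bernoulli rewards in {0, 1}, as the Beta-Bernoulli model requires.
class BetaThompsonSketch(MAB):
    def __init__(self, narms, alpha0=1.0, beta0=1.0):
        self.narms = narms
        self.alpha = np.full(narms, alpha0)   # prior + observed successes
        self.beta = np.full(narms, beta0)     # prior + observed failures

    def play(self, tround, context=None):
        samples = np.random.beta(self.alpha, self.beta)   # one posterior draw per arm
        best = np.flatnonzero(samples == samples.max())
        return np.random.choice(best) + 1                 # arm ids are 1-based

    def update(self, arm, reward, context=None):
        i = arm - 1
        self.alpha[i] += reward          # conjugate Beta update on success
        self.beta[i] += 1.0 - reward     # and on failure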

3. Off-Policy Evaluation
In [ ]:
def offlineEvaluate(mab, arms, rewards, contexts, nrounds=None):
    """
    Offline evaluation of a multi-armed bandit

    Arguments
    =========
    mab : instance of MAB

    arms : 1D int array, shape (nevents,)
        integer arm id for each event

    rewards : 1D float array, shape (nevents,)
        reward received for each event

    contexts : 2D float array, shape (nevents, mab.narms*nfeatures)
        contexts presented to the arms (stacked horizontally)
        for each event.

    nrounds : int, optional
        number of matching events to evaluate `mab` on.

    Returns
    =======
    out : 1D float array
        rewards for the matching events
    """
    ...  # to be implemented
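
A sketch of one possible replay-style evaluator in the spirit of Li et al.'s off-policy method (offlineEvaluateSketch is an illustrative name): stream the logged events in order; whenever the bandit would have chosen the same arm as the logged policy, feed it the logged reward and record that reward; silently discard all other events.
In [ ]:
# Illustrative sketch only; one possible implementation, not the official solution.
def offlineEvaluateSketch(mab, arms, rewards, contexts, nrounds=None):
    out = []      # rewards observed at matching events
    tround = 1
    for arm, reward, context in zip(arms, rewards, contexts):
        chosen = mab.play(tround, context)
        if chosen == arm:                 # keep only events where the choices agree
            mab.update(arm, reward, context)
            out.append(reward)
            tround += 1
            if nrounds is not None and tround > nrounds:
                break                     # stop after nrounds matching events
    return np.array(out)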

In [ ]:
mab = EpsGreedy(10, 0.05)
results_EpsGreedy = offlineEvaluate(mab, arms, rewards, contexts, 800)
print('EpsGreedy average reward', np.mean(results_EpsGreedy))
In [ ]:
mab = UCB(10, 1.0)
results_UCB = offlineEvaluate(mab, arms, rewards, contexts, 800)
print('UCB average reward', np.mean(results_UCB))
In [ ]:
mab = BetaThompson(10, 1.0, 1.0)
results_BetaThompson = offlineEvaluate(mab, arms, rewards, contexts, 800)
print('BetaThompson average reward', np.mean(results_BetaThompson))

4. Contextual Bandits – LinUCB
In [ ]:
class LinUCB(MAB):
    """
    Contextual multi-armed bandit (LinUCB)

    Arguments
    =========
    narms : int
        number of arms

    ndims : int
        number of dimensions for each arm's context

    alpha : float
        positive real explore-exploit parameter
    """

    def __init__(self, narms, ndims, alpha):
        ...  # to be implemented

    def play(self, tround, context):
        ...  # to be implemented

    def update(self, arm, reward, context):
        ...  # to be implemented
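
A sketch of one possible LinUCB completion with disjoint per-arm linear models, following Li et al. (2010) (LinUCBSketch is an illustrative name). It assumes the flat context vector reshapes arm-major into one ndims-dimensional row per arm; each arm keeps a ridge-regression design matrix A and response vector b, and the played index is the predicted reward plus alpha times the confidence width.
In [ ]:
# Illustrative sketch only; one possible implementation, not the official solution.
class LinUCBSketch(MAB):
    def __init__(self, narms, ndims, alpha):
        self.narms = narms
        self.ndims = ndims
        self.alpha = alpha
        self.A = [np.eye(ndims) for _ in range(narms)]    # per-arm d x d design matrices
        self.b = [np.zeros(ndims) for _ in range(narms)]  # per-arm response vectors

    def play(self, tround, context):
        # assumed layout: context flattens to one row of ndims features per arm
        x = np.reshape(context, (self.narms, self.ndims))
        p = np.zeros(self.narms)
        for a in range(self.narms):
            Ainv = inv(self.A[a])
            theta = Ainv @ self.b[a]                      # ridge-regression estimate
            p[a] = theta @ x[a] + self.alpha * np.sqrt(x[a] @ Ainv @ x[a])
        best = np.flatnonzero(p == p.max())               # random tie-break
        return np.random.choice(best) + 1                 # arm ids are 1-based

    def update(self, arm, reward, context):
        x = np.reshape(context, (self.narms, self.ndims))[arm - 1]
        self.A[arm - 1] += np.outer(x, x)
        self.b[arm - 1] += reward * x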

In [ ]:
mab = LinUCB(10, 10, 1.0)
results_LinUCB = offlineEvaluate(mab, arms, rewards, contexts, 800)
print('LinUCB average reward', np.mean(results_LinUCB))

5. Contextual Bandits – LinThompson
In [ ]:
class LinThompson(MAB):
    """
    Contextual Thompson sampled multi-armed bandit (LinThompson)

    Arguments
    =========
    narms : int
        number of arms

    ndims : int
        number of dimensions for each arm's context

    v : float
        positive real explore-exploit parameter
    """

    def __init__(self, narms, ndims, v):
        ...  # to be implemented

    def play(self, tround, context):
        ...  # to be implemented

    def update(self, arm, reward, context):
        ...  # to be implemented
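
A sketch of one possible LinThompson completion in the spirit of Agrawal and Goyal (2013), using a single shared linear model as one design choice (LinThompsonSketch is an illustrative name; per-arm models are an equally defensible reading of the spec). Each round samples a parameter vector from a Gaussian posterior scaled by v and plays the arm whose context scores highest under the sample.
In [ ]:
# Illustrative sketch only; one possible implementation, not the official solution.
# Design choice (an assumption): a single shared model across arms.
class LinThompsonSketch(MAB):
    def __init__(self, narms, ndims, v):
        self.narms = narms
        self.ndims = ndims
        self.v = v
        self.B = np.eye(ndims)          # shared precision matrix
        self.f = np.zeros(ndims)        # accumulated reward-weighted contexts
        self.mu_hat = np.zeros(ndims)   # posterior mean estimate

    def play(self, tround, context):
        # assumed layout: context flattens to one row of ndims features per arm
        x = np.reshape(context, (self.narms, self.ndims))
        # draw a parameter vector from the Gaussian posterior, scaled by v
        mu = np.random.multivariate_normal(self.mu_hat, self.v**2 * inv(self.B))
        scores = x @ mu
        best = np.flatnonzero(scores == scores.max())   # random tie-break
        return np.random.choice(best) + 1               # arm ids are 1-based

    def update(self, arm, reward, context):
        x = np.reshape(context, (self.narms, self.ndims))[arm - 1]
        self.B += np.outer(x, x)
        self.f += reward * x
        self.mu_hat = inv(self.B) @ self.f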

In [ ]:
mab = LinThompson(10, 10, 1.0)
results_LinThompson = offlineEvaluate(mab, arms, rewards, contexts, 800)
print('LinThompson average reward', np.mean(results_LinThompson))

6. Evaluation
6.A.
In [ ]:

6.B.
In [ ]: