from typing import Dict, Tuple, Optional
from MDP import Action, State, MDP, Policy
from algos import ExplicitStateValueFunction, StateValueFunction, value_iteration
class NDPolicy:
    """
    A non-deterministic policy: maps each state to the *set* of actions
    that the policy allows in that state.
    """

    def __init__(self, copy: Optional["NDPolicy"] = None):
        """
        Create an empty policy, or, if *copy* is given, an independent
        copy of the specified policy.
        """
        if copy is None:
            self._actions: Dict = {}
        else:
            # Copy each action set so that mutating this policy never
            # affects the policy it was copied from.
            self._actions = {
                s: set(acts) for s, acts in copy._actions.items()
            }

    def add(self, s, a):
        """Allow action *a* in state *s*."""
        # setdefault creates the action set on first use of state s.
        self._actions.setdefault(s, set()).add(a)

    def add_nondet_policy(self, ndpol):
        """
        Merge *ndpol* into this policy.

        *ndpol* is a mapping from states to iterables of actions
        (anything providing an ``items()`` method).
        """
        for s, acts in ndpol.items():
            self._actions.setdefault(s, set()).update(acts)

    def add_det_policy(self, mdp, pol: "Policy"):
        """
        Allow, for every state of *mdp*, the single action that the
        deterministic policy *pol* selects in that state.
        """
        for s in mdp.states():
            self.add(s, pol.action(s))

    def actions(self, s):
        """
        Return the set of actions allowed in state *s*.

        Raises KeyError if no action has been registered for *s*.
        """
        return self._actions[s]
def compute_policy_value(mdp: MDP, ndpol: NDPolicy, gamma: float, epsilon: float, max_iteration: int) -> StateValueFunction:
    """
    TODO (unimplemented stub): compute the value function of the
    non-deterministic policy *ndpol* on *mdp*.

    Presumably this should perform iterative policy evaluation with
    discount factor *gamma*, stopping when the value change falls below
    *epsilon* or after *max_iteration* sweeps -- confirm against the
    assignment statement.  Currently returns None.
    """
    pass
def compute_non_augmentable_policy(mdp: MDP, gamma: float, epsilon: float, subopt_epsilon: float, max_iteration: int) -> NDPolicy:
    """
    TODO (unimplemented stub): compute a non-augmentable non-deterministic
    policy for *mdp*.

    Presumably this should start from an (epsilon-)optimal deterministic
    policy (see ``value_iteration`` imported above) and keep adding actions
    whose value loss stays within *subopt_epsilon* until no further action
    can be added -- TODO confirm against Fard & Pineau's definition.
    Currently returns None.
    """
    pass
'''
TODO: Explain here why the non-deterministic policy
represented in the figure is not conservative epsilon-optimal
according to the definition of Fard and Pineau
(between 200 and 500 characters):
In order to be conservative epsilon-optimal,
the non-deterministic policy should be such that all policies
that can be derived from it have a value of 44.5 or more.
However, the policy that moves from 0 to 1 and from 1 to 0
has a negative value (it only includes costs).
'''
# eof