from typing import Dict, Tuple, Optional
from MDP import Action, State, MDP, Policy
from algos import ExplicitStateValueFunction, StateValueFunction, value_iteration
class NDPolicy:
    """
    A non-deterministic policy: maps each state to the *set* of actions
    that the policy allows in that state.
    """

    def __init__(self, copy: Optional["NDPolicy"] = None):
        """
        Create an empty policy, or, if *copy* is given, an independent
        copy of the specified policy.
        """
        if copy is None:
            self._actions: Dict = {}
        else:
            # Copy each action set so that mutating this policy never
            # affects the policy it was copied from.
            self._actions = {
                s: set(acts) for s, acts in copy._actions.items()
            }

    def add(self, s, a):
        """Allow action *a* in state *s*."""
        # setdefault creates the action set on first use of state s.
        self._actions.setdefault(s, set()).add(a)

    def add_nondet_policy(self, ndpol):
        """
        Merge *ndpol* into this policy.

        *ndpol* is a mapping from states to iterables of actions
        (anything providing an ``items()`` method).
        """
        for s, acts in ndpol.items():
            self._actions.setdefault(s, set()).update(acts)

    def add_det_policy(self, mdp, pol: "Policy"):
        """
        Allow, for every state of *mdp*, the single action that the
        deterministic policy *pol* selects in that state.
        """
        for s in mdp.states():
            self.add(s, pol.action(s))

    def actions(self, s):
        """
        Return the set of actions allowed in state *s*.

        Raises KeyError if no action has been registered for *s*.
        """
        return self._actions[s]
def compute_policy_value(mdp: MDP, ndpol: NDPolicy, gamma: float, epsilon: float, max_iteration: int) -> StateValueFunction:
    """
    TODO (unimplemented stub): compute the value function of the
    non-deterministic policy *ndpol* on *mdp*.

    Presumably this should perform iterative policy evaluation with
    discount factor *gamma*, stopping when the value change falls below
    *epsilon* or after *max_iteration* sweeps -- confirm against the
    assignment statement.  Currently returns None.
    """
    pass
def compute_non_augmentable_policy(mdp: MDP, gamma: float, epsilon: float, subopt_epsilon: float, max_iteration: int) -> NDPolicy:
    """
    TODO (unimplemented stub): compute a non-augmentable non-deterministic
    policy for *mdp*.

    Presumably this should start from an (epsilon-)optimal deterministic
    policy (see ``value_iteration`` imported above) and keep adding actions
    whose value loss stays within *subopt_epsilon* until no further action
    can be added -- TODO confirm against Fard & Pineau's definition.
    Currently returns None.
    """
    pass
'''
TODO: Explain here why the non-deterministic policy
represented in the figure is not conservative epsilon-optimal
according to the definition of Fard and Pineau
(between 200 and 500 characters):
In order to be conservative epsilon-optimal,
the non-deterministic policy should be such that all policies
that can be derived from it have a value of 44.5 or more.
However, the policy that moves from 0 to 1 and from 1 to 0
has a negative value (it only includes costs).
'''
# eof