'''
An example of an MDP modelled implicitly.
Compared to the state machine of example1.py,
the successor states are not computed when the MDP is constructed
but when the method next_states() is called.
'''

from typing import List, Tuple

from MDP import Action, State, MDP

class Example2(MDP):
    '''
    An MDP with four integer states (0 to 3) and two actions, hunt and nohunt.

    The transition model is implicit: successors, probabilities and rewards
    are computed by next_states() each time it is called rather than being
    stored when the object is built.
    '''

    # Action labels; these strings are the Action values returned by actions().
    hunt_ = 'hunt'
    nohunt_ = 'nohunt'

    def states(self) -> List[State]:
        '''Returns the list of all states: 0, 1, 2 and 3.'''
        return [0, 1, 2, 3]

    def actions(self) -> List[Action]:
        '''Returns the list of all actions: hunt and nohunt.'''
        return [self.hunt_, self.nohunt_]

    def applicable_actions(self, s: State) -> List[Action]:
        '''Returns the actions applicable in state s (every action, whatever s is).'''
        return self.actions()  # all actions are always applicable

    def next_states(self, s: State, a: Action) -> List[Tuple[State, float, float]]:
        '''
        Returns the outcome of performing action a in state s.
        Defined as a list of elements [s', prob, rew]
        where s' is the next state,
        prob is the probability of reaching this state,
        and rew is the reward.

        Any action other than nohunt is handled as hunt.
        '''
        # The reward depends only on s: it is the same for every action
        # and every successor.
        reward = (3 - s) + (10 if s > 0 else 0)

        if a == self.nohunt_:
            next_state = max(0, s - 1)  # consume one if possible
            proba = 1
            return [(next_state, proba, reward)]  # only one successor

        # a == self.hunt_
        # 2 to 3 successors; a map is used because the same state can be
        # reached in several ways (at the bounds 0 and 3 outcomes coincide).
        proba_successor = {}

        # first outcome: not gaining food (and consume 1)
        proba_successor[max(0, s - 1)] = .1

        # second outcome: gain 2 food (and consume 1)
        proba_successor[min(3, s + 1)] = .7

        # third outcome: the state is unchanged (presumably gain 1 food and
        # consume 1); its probability is added to any mass already on s.
        proba_successor[s] = proba_successor.get(s, 0) + .2

        return [
            (next_state, proba, reward)
            for next_state, proba in proba_successor.items()
        ]

    def initial_state(self) -> State:
        '''Returns the initial state, which is 1.'''
        return 1

def example_2() -> MDP:
    '''Builds a new Example2 instance and returns it.'''
    mdp = Example2()
    return mdp

# eof