Defining and solving a small POMDP using pomdp-py
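The script below defines the crying baby POMDP with tabular transition, observation, and reward models, writes it out in Cassandra's .POMDP file format, solves it with the pomdp-solve value iteration binary, and then simulates the resulting policy for a few steps. Running it requires pomdp-py with the tabular models and external-solver interface (see the linked documentation), plus a compiled pomdp-solve binary (Anthony Cassandra's solver, distributed via pomdp.org); adjust pomdp_solve_path below to point at your own installation.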
""" | |
Example of defining a small, tabular POMDP and solving | |
it using Cassandra's pomdp-solve value iteration solver. | |
Refer to documentation: | |
https://h2r.github.io/pomdp-py/html/examples.external_solvers.html | |
""" | |
import pomdp_py | |
def cryingbaby(): | |
"""This is a POMDP defined in the Algorithms for Decision Making book | |
by M. J. Kochenderfer et al. in section F.7""" | |
S = ['hungry', 'sated'] | |
A = ['feed', 'sing', 'ignore'] | |
Z = ['crying', 'quiet'] | |
T = pomdp_py.TabularTransitionModel({ | |
# state, action, next state | |
('hungry', 'feed', 'sated'): 1.0, | |
('hungry', 'feed', 'hungry'): 0.0, | |
('hungry', 'sing', 'hungry'): 1.0, | |
('hungry', 'sing', 'sated'): 0.0, | |
('hungry', 'ignore', 'hungry'): 1.0, | |
('hungry', 'ignore', 'sated'): 0.0, | |
('sated', 'feed', 'sated'): 1.0, | |
('sated', 'feed', 'hungry'): 0.0, | |
('sated', 'sing', 'hungry'): 0.1, | |
('sated', 'sing', 'sated'): 0.9, | |
('sated', 'ignore', 'hungry'): 0.1, | |
('sated', 'ignore', 'sated'): 0.9 | |
}) | |
    O = pomdp_py.TabularObservationModel({
        # next state, action, observation
        ('hungry', 'feed', 'crying'): 0.8,
        ('hungry', 'feed', 'quiet'): 0.2,
        ('hungry', 'sing', 'crying'): 0.9,
        ('hungry', 'sing', 'quiet'): 0.1,
        ('hungry', 'ignore', 'crying'): 0.8,
        ('hungry', 'ignore', 'quiet'): 0.2,
        ('sated', 'feed', 'crying'): 0.1,
        ('sated', 'feed', 'quiet'): 0.9,
        ('sated', 'sing', 'crying'): 0.1,
        ('sated', 'sing', 'quiet'): 0.9,
        ('sated', 'ignore', 'crying'): 0.1,
        ('sated', 'ignore', 'quiet'): 0.9,
    })
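    # R(s, a): a -10 penalty whenever the baby is hungry, plus an action
    # cost of -5 for feeding and -0.5 for singing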
    R = pomdp_py.TabularRewardModel({
        # state, action
        ('hungry', 'feed'): -10 - 5,
        ('hungry', 'sing'): -10 - 0.5,
        ('hungry', 'ignore'): -10,
        ('sated', 'feed'): -5,
        ('sated', 'sing'): -0.5,
        ('sated', 'ignore'): 0
    })
    gamma = 0.9
    return S, A, Z, T, O, R, gamma
if __name__ == "__main__":
    S, A, Z, T, O, R, gamma = cryingbaby()
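    # the Agent needs a policy model; a uniform random one suffices here
    # because planning is delegated to pomdp-solve, not an online planner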
    pi = pomdp_py.UniformPolicyModel(A)
    b0 = pomdp_py.Histogram({"hungry": 0.22,
                             "sated": 0.78})
    agent = pomdp_py.Agent(b0, pi, T, O, R)
    horizon = 5
    filename = "cryingbaby.POMDP"
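    # serialize the agent's models into Cassandra's .POMDP file format,
    # which is the input format pomdp-solve expects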
    pomdp_py.to_pomdp_file(agent, filename, discount_factor=gamma)
    # path to the pomdp-solve binary
    pomdp_solve_path = "/home/kaiyuzh/software/pomdp-solve-5.4/src/pomdp-solve"
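    # run value iteration through the pomdp-solve binary; with
    # return_policy_graph=False the solution comes back as an
    # alpha-vector policy, which maps the agent's belief to an action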
    policy = pomdp_py.vi_pruning(agent, pomdp_solve_path,
                                 discount_factor=gamma,
                                 options=["-horizon", horizon],
                                 remove_generated_files=False,
                                 return_policy_graph=False)
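    # sanity check: exactly evaluate the initial belief by expanding
    # the belief tree out to the given horizon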
    print(pomdp_py.value(agent.belief, S, A, Z, T, O, R, gamma, horizon=horizon))

    state = "hungry"  # true initial state
    for step in range(10):
        action = policy.plan(agent)
        next_state = T.sample(state, action)
        reward = R.sample(state, action, next_state)
        observation = O.sample(next_state, action)
        print(f"step = {step+1}"
              f"\t|\taction: {action}"
              f"\t|\tobservation: {observation}"
              f"\t|\tstate: {state}"
              f"\t|\treward: {reward}"
              f"\t|\tbelief: {agent.belief}")
        # update agent belief
        next_belief = pomdp_py.belief_update(agent.belief, action, observation, T, O)
        agent.set_belief(pomdp_py.Histogram(next_belief))
        # apply state transition to the environment
        state = next_state
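
For reference, the belief update in the loop above is a discrete Bayes filter over the two states. Below is a minimal sketch of the same computation written against plain dictionaries rather than pomdp-py's model classes; the function and argument names are illustrative, not part of the pomdp-py API.

def bayes_filter(b, a, o, S, T_probs, O_probs):
    """Discrete Bayes filter: b'(s') is proportional to
    O(o | s', a) * sum over s of T(s' | s, a) * b(s).

    b:        dict mapping state -> probability
    T_probs:  dict mapping (state, action, next state) -> probability
    O_probs:  dict mapping (next state, action, observation) -> probability
    """
    new_b = {}
    for sp in S:
        pred = sum(T_probs[(s, a, sp)] * b[s] for s in S)  # prediction step
        new_b[sp] = O_probs[(sp, a, o)] * pred             # correction step
    total = sum(new_b.values())                            # normalizing constant
    return {s: p / total for s, p in new_b.items()}

For example, starting from the initial belief (hungry: 0.22, sated: 0.78), taking 'ignore' and observing 'crying' gives predicted probabilities of 0.298 for hungry and 0.702 for sated, hence an unnormalized posterior of 0.8 * 0.298 = 0.2384 versus 0.1 * 0.702 = 0.0702; after normalizing, the belief that the baby is hungry jumps to about 0.77.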