maxpagels · November 5, 2021 13:50
diff --git a/explore-first-knapsack.py b/explore-first-knapsack.py
 # Explore-first bandit-with-knapsacks demo: given per-arm reward estimates and
 # per-play resource consumption, solve an integer program for the number of
 # plays per arm, then sample arms in proportion to those play counts.

 import numpy as np
 from ortools.linear_solver import pywraplp

 # -- Globals

 N_ARMS = 4
 N_RESOURCES = 3
 N_EXPLORATION_ROUNDS = 1

 # CONSUMPTION_PER_PLAY[i][j] is how much of resource j one play of arm i uses.
 CONSUMPTION_PER_PLAY = [
     [4, 0, 0],  # Example: first arm consumes 4 of resource 1 and 0 of resources 2 & 3
     [1, 5, 1],
     [6, 1, 4],
     [9, 1, 0],
 ]

 # BUDGETS[j] is the total amount of resource j that may be consumed.
 BUDGETS = [40, 71, 82]

 # -- Exploration-first

 # Assume you have explored for N rounds and recorded
 # empirical mean reward probabilities per arm

 REWARD_PROBS = [
     0.4,  # 1 reward with probability 0.4, 0 with probability 0.6
     0.2,
     0.1,
     0.5,
 ]

 # -- Exploitation phase

 # Idea: solve a MIP to maximise reward subject to budget constraints,
 # thereby getting an empirical pmf of action probabilities

 solver = pywraplp.Solver.CreateSolver('SCIP')
 if solver is None:
     # CreateSolver returns None when the requested backend is not available;
     # fail with a clear message instead of an AttributeError further down.
     raise SystemExit('SCIP solver backend is not available.')

 # Variables: one integer variable per arm, denoting the number of plays
 actions = [
     solver.IntVar(0, solver.infinity(), f"action_{i}") for i in range(N_ARMS)
 ]

 # Constraints: total consumption of each resource must not exceed its budget
 for j, b in enumerate(BUDGETS):
     usage = [a * CONSUMPTION_PER_PLAY[i][j] for i, a in enumerate(actions)]
     solver.Add(solver.Sum(usage) <= b)

 # Objective: maximise reward over actions subject to constraints
 solver.Maximize(solver.Sum([a * REWARD_PROBS[i] for i, a in enumerate(actions)]))
 status = solver.Solve()

 # plays stays None unless the solver proves optimality, so the sampling
 # step below can tell "no solution" apart from a real play vector.
 plays = None
 if status == pywraplp.Solver.OPTIMAL:
     print('MIP solution:')
     print('Simulated reward =', solver.Objective().Value())
     plays = [a.solution_value() for a in actions]
     print(plays)
 else:
     print('The problem does not have an optimal solution.')

 # -- Print example predictions

 if plays is not None:
     dist = np.array(plays)
     total = dist.sum()
     if total > 0:
         # Normalise play counts into a probability distribution over arms.
         pmf = dist / total

         print("Example predictions, 20 rounds:")
         for i in range(0, 20):
             print(f"round {i}, play arm {np.random.choice(np.arange(N_ARMS), p=pmf)}")
     else:
         # An all-zero solution cannot be normalised into a pmf.
         print('The solution plays no arm, so there is nothing to sample.')
	# Explore-first bandit-with-knapsacks demo: given per-arm reward estimates and
	# per-play resource consumption, solve an integer program for the number of
	# plays per arm, then sample arms in proportion to those play counts.

	import numpy as np
	from ortools.linear_solver import pywraplp

	# -- Globals

	N_ARMS = 4
	N_RESOURCES = 3
	N_EXPLORATION_ROUNDS = 1

	# CONSUMPTION_PER_PLAY[i][j] is how much of resource j one play of arm i uses.
	CONSUMPTION_PER_PLAY = [
	    [4, 0, 0],  # Example: first arm consumes 4 of resource 1 and 0 of resources 2 & 3
	    [1, 5, 1],
	    [6, 1, 4],
	    [9, 1, 0],
	]

	# BUDGETS[j] is the total amount of resource j that may be consumed.
	BUDGETS = [40, 71, 82]

	# -- Exploration-first

	# Assume you have explored for N rounds and recorded
	# empirical mean reward probabilities per arm

	REWARD_PROBS = [
	    0.4,  # 1 reward with probability 0.4, 0 with probability 0.6
	    0.2,
	    0.1,
	    0.5,
	]

	# -- Exploitation phase

	# Idea: solve a MIP to maximise reward subject to budget constraints,
	# thereby getting an empirical pmf of action probabilities

	solver = pywraplp.Solver.CreateSolver('SCIP')
	if solver is None:
	    # CreateSolver returns None when the requested backend is not available;
	    # fail with a clear message instead of an AttributeError further down.
	    raise SystemExit('SCIP solver backend is not available.')

	# Variables: one integer variable per arm, denoting the number of plays
	actions = [
	    solver.IntVar(0, solver.infinity(), f"action_{i}") for i in range(N_ARMS)
	]

	# Constraints: total consumption of each resource must not exceed its budget
	for j, b in enumerate(BUDGETS):
	    usage = [a * CONSUMPTION_PER_PLAY[i][j] for i, a in enumerate(actions)]
	    solver.Add(solver.Sum(usage) <= b)

	# Objective: maximise reward over actions subject to constraints
	solver.Maximize(solver.Sum([a * REWARD_PROBS[i] for i, a in enumerate(actions)]))
	status = solver.Solve()

	# plays stays None unless the solver proves optimality, so the sampling
	# step below can tell "no solution" apart from a real play vector.
	plays = None
	if status == pywraplp.Solver.OPTIMAL:
	    print('MIP solution:')
	    print('Simulated reward =', solver.Objective().Value())
	    plays = [a.solution_value() for a in actions]
	    print(plays)
	else:
	    print('The problem does not have an optimal solution.')

	# -- Print example predictions

	if plays is not None:
	    dist = np.array(plays)
	    total = dist.sum()
	    if total > 0:
	        # Normalise play counts into a probability distribution over arms.
	        pmf = dist / total

	        print("Example predictions, 20 rounds:")
	        for i in range(0, 20):
	            print(f"round {i}, play arm {np.random.choice(np.arange(N_ARMS), p=pmf)}")
	    else:
	        # An all-zero solution cannot be normalised into a pmf.
	        print('The solution plays no arm, so there is nothing to sample.')