/[!"#$%&()*+,\-.\/:;<=>?@\[\]^_`{|}~]/
Punctuation, including the Unicode ranges (\u2000-\u206F: General Punctuation, \u2E00-\u2E7F: Supplemental Punctuation):
/[\u2000-\u206F\u2E00-\u2E7F\\'!"#$%&()*+,\-.\/:;<=>?@\[\]^_`{|}~]/
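As a quick sanity check, here is a minimal Python sketch of the same character class (the re usage and the sample string are illustrative, not from the original):

import re

# Same character class as the pattern above, written as a Python raw string
PUNCT = re.compile(r"[\u2000-\u206F\u2E00-\u2E7F\\'!\"#$%&()*+,\-./:;<=>?@\[\]^_`{|}~]")

print(PUNCT.sub('', 'Hello, world… (test)'))  # -> 'Hello world test'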
If you're not using any branching (i.e. you only work on the master branch), you can follow the simple, basic Git flow below.
Install:
$ brew install git
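A minimal sketch of what that single-branch flow typically looks like (the commit message is just a placeholder):

$ git pull
$ git add .
$ git commit -m "your message"
$ git push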
...
next_state, reward, done, info = env.step(action)
# Modify the reward to speed up training
x, v, theta, omega = next_state
r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8  # the closer the cart is to the center, the better
r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5  # the more upright the pole, the better
reward = r1 + r2
dqn.store_transition(state, action, reward, next_state)
...
if not done:
    reward = 1.0
elif self.steps_beyond_done is None:
    # Pole just fell!
    self.steps_beyond_done = 0
    reward = 1.0
else:
    self.steps_beyond_done += 1
    reward = 0.0
import gym
import numpy as np
import torch
import torch.nn as nn

env = gym.make('CartPole-v0')

# Environment parameters
n_actions = env.action_space.n
n_states = env.observation_space.shape[0]

# Hyperparameters
n_hidden = 50
batch_size = 32
lr = 0.01  # learning rate
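The remaining hyperparameters and the training loop are not part of this excerpt; the sketch below shows roughly how the pieces fit together. The values of epsilon, gamma, target_replace_iter, memory_capacity, and n_episodes are placeholders rather than the article's settings, and the reward shaping from the earlier snippet is omitted for brevity.

# Remaining hyperparameters (placeholder values, assumed for illustration)
epsilon = 0.1
gamma = 0.9
target_replace_iter = 100
memory_capacity = 2000
n_episodes = 400

dqn = DQN(n_states, n_actions, n_hidden, batch_size, lr, epsilon, gamma, target_replace_iter, memory_capacity)

for i_episode in range(n_episodes):
    state = env.reset()
    while True:
        action = dqn.choose_action(state)
        next_state, reward, done, info = env.step(action)
        dqn.store_transition(state, action, reward, next_state)

        # Start learning once the replay memory has been filled
        if dqn.memory_counter > memory_capacity:
            dqn.learn()

        state = next_state
        if done:
            break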
def learn(self):
    # Randomly sample batch_size experiences from memory
    sample_index = np.random.choice(self.memory_capacity, self.batch_size)
    b_memory = self.memory[sample_index, :]
    b_state = torch.FloatTensor(b_memory[:, :self.n_states])
    b_action = torch.LongTensor(b_memory[:, self.n_states:self.n_states+1].astype(int))
    b_reward = torch.FloatTensor(b_memory[:, self.n_states+1:self.n_states+2])
    b_next_state = torch.FloatTensor(b_memory[:, -self.n_states:])

    # Compute the gap between the Q values produced by the current eval net and the target net
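The rest of learn() is cut off in the excerpt above; a standard DQN update that only uses the attributes set up in __init__ (eval_net, target_net, loss_func, optimizer, gamma, target_replace_iter) would continue roughly like this:

    # Q(s, a) the eval net currently predicts for the sampled actions
    q_eval = self.eval_net(b_state).gather(1, b_action)
    # Target: r + gamma * max_a' Q_target(s', a'); detach so no gradients flow into the target net
    q_next = self.target_net(b_next_state).detach()
    q_target = b_reward + self.gamma * q_next.max(1)[0].view(self.batch_size, 1)
    loss = self.loss_func(q_eval, q_target)

    # Backpropagate and update the eval net
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    # Every target_replace_iter steps, copy the eval net's weights into the target net
    self.learn_step_counter += 1
    if self.learn_step_counter % self.target_replace_iter == 0:
        self.target_net.load_state_dict(self.eval_net.state_dict())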
def store_transition(self, state, action, reward, next_state):
    # Pack the experience into a single row
    transition = np.hstack((state, [action, reward], next_state))

    # Store it in memory; older memories may be overwritten once the buffer is full
    index = self.memory_counter % self.memory_capacity
    self.memory[index, :] = transition
    self.memory_counter += 1
def choose_action(self, state):
    x = torch.unsqueeze(torch.FloatTensor(state), 0)

    # epsilon-greedy
    if np.random.uniform() < self.epsilon:  # explore: pick a random action
        action = np.random.randint(0, self.n_actions)
    else:  # exploit: pick the best action under the current policy
        actions_value = self.eval_net(x)  # score every action with the current eval net
        action = torch.max(actions_value, 1)[1].data.numpy()[0]  # take the highest-scoring action

    return action
class DQN(object):
    def __init__(self, n_states, n_actions, n_hidden, batch_size, lr, epsilon, gamma, target_replace_iter, memory_capacity):
        self.eval_net, self.target_net = Net(n_states, n_actions, n_hidden), Net(n_states, n_actions, n_hidden)

        self.memory = np.zeros((memory_capacity, n_states * 2 + 2))  # each experience row stores (state, action, reward, next state)
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=lr)
        self.loss_func = nn.MSELoss()
        self.memory_counter = 0
        self.learn_step_counter = 0  # counts learning steps so we know when to update the target network

        # Keep the remaining hyperparameters; the other methods (choose_action, store_transition, learn) read them
        self.n_states, self.n_actions = n_states, n_actions
        self.batch_size, self.epsilon, self.gamma = batch_size, epsilon, gamma
        self.target_replace_iter, self.memory_capacity = target_replace_iter, memory_capacity
class Net(nn.Module):
    def __init__(self, n_states, n_actions, n_hidden):
        super(Net, self).__init__()

        # Input layer (state) to hidden layer, then hidden layer to output layer (action)
        self.fc1 = nn.Linear(n_states, n_hidden)
        self.out = nn.Linear(n_hidden, n_actions)

    def forward(self, x):
        x = self.fc1(x)
        x = torch.relu(x)  # activation assumed to be ReLU; the excerpt cuts off after fc1
        actions_value = self.out(x)  # one Q value per action
        return actions_value