/[!"#$%&()*+,\-.\/:;<=>?@\[\]^_`{|}~]/
Punctuation, including the Unicode ranges (\u2000-\u206F: General Punctuation, \u2E00-\u2E7F: Supplemental Punctuation):
/[\u2000-\u206F\u2E00-\u2E7F\\'!"#$%&()*+,\-.\/:;<=>?@\[\]^_`{|}~]/
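As a quick sanity check, here is a minimal Python sketch of the same character class (the re usage and the sample string are illustrative, not from the original):

import re

# Same character class as the pattern above, written as a Python raw string
PUNCT = re.compile(r"[\u2000-\u206F\u2E00-\u2E7F\\'!\"#$%&()*+,\-./:;<=>?@\[\]^_`{|}~]")

print(PUNCT.sub('', 'Hello, world… (test)'))  # -> 'Hello world test'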
If you're not using any branching (i.e. you only work on the master branch), you can follow the simple, basic Git flow below.
Install:
$ brew install git
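A minimal sketch of what that single-branch flow typically looks like (the commit message is just a placeholder):

$ git pull
$ git add .
$ git commit -m "your message"
$ git push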
...
next_state, reward, done, info = env.step(action)
# Modify the reward to speed up training
x, v, theta, omega = next_state
r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8  # the closer the cart is to the center, the better
r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5  # the more upright the pole, the better
reward = r1 + r2
dqn.store_transition(state, action, reward, next_state)
...
if not done:
    reward = 1.0
elif self.steps_beyond_done is None:
    # Pole just fell!
    self.steps_beyond_done = 0
    reward = 1.0
else:
    self.steps_beyond_done += 1
    reward = 0.0
import gym
import numpy as np
import torch
import torch.nn as nn

env = gym.make('CartPole-v0')

# Environment parameters
n_actions = env.action_space.n
n_states = env.observation_space.shape[0]

# Hyperparameters
n_hidden = 50
batch_size = 32
lr = 0.01  # learning rate
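The remaining hyperparameters and the training loop are not part of this excerpt; the sketch below shows roughly how the pieces fit together. The values of epsilon, gamma, target_replace_iter, memory_capacity, and n_episodes are placeholders rather than the article's settings, and the reward shaping from the earlier snippet is omitted for brevity.

# Remaining hyperparameters (placeholder values, assumed for illustration)
epsilon = 0.1
gamma = 0.9
target_replace_iter = 100
memory_capacity = 2000
n_episodes = 400

dqn = DQN(n_states, n_actions, n_hidden, batch_size, lr, epsilon, gamma, target_replace_iter, memory_capacity)

for i_episode in range(n_episodes):
    state = env.reset()
    while True:
        action = dqn.choose_action(state)
        next_state, reward, done, info = env.step(action)
        dqn.store_transition(state, action, reward, next_state)

        # Start learning once the replay memory has been filled
        if dqn.memory_counter > memory_capacity:
            dqn.learn()

        state = next_state
        if done:
            break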
def learn(self):
    # Randomly sample batch_size experiences from memory
    sample_index = np.random.choice(self.memory_capacity, self.batch_size)
    b_memory = self.memory[sample_index, :]
    b_state = torch.FloatTensor(b_memory[:, :self.n_states])
    b_action = torch.LongTensor(b_memory[:, self.n_states:self.n_states+1].astype(int))
    b_reward = torch.FloatTensor(b_memory[:, self.n_states+1:self.n_states+2])
    b_next_state = torch.FloatTensor(b_memory[:, -self.n_states:])

    # Compute the gap between the Q values produced by the current eval net and the target net
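The rest of learn() is cut off in the excerpt above; a standard DQN update that only uses the attributes set up in __init__ (eval_net, target_net, loss_func, optimizer, gamma, target_replace_iter) would continue roughly like this:

    # Q(s, a) the eval net currently predicts for the sampled actions
    q_eval = self.eval_net(b_state).gather(1, b_action)
    # Target: r + gamma * max_a' Q_target(s', a'); detach so no gradients flow into the target net
    q_next = self.target_net(b_next_state).detach()
    q_target = b_reward + self.gamma * q_next.max(1)[0].view(self.batch_size, 1)
    loss = self.loss_func(q_eval, q_target)

    # Backpropagate and update the eval net
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    # Every target_replace_iter steps, copy the eval net's weights into the target net
    self.learn_step_counter += 1
    if self.learn_step_counter % self.target_replace_iter == 0:
        self.target_net.load_state_dict(self.eval_net.state_dict())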
def store_transition(self, state, action, reward, next_state):
    # Pack the experience into a single row
    transition = np.hstack((state, [action, reward], next_state))

    # Store it in memory; older memories may be overwritten once the buffer is full
    index = self.memory_counter % self.memory_capacity
    self.memory[index, :] = transition
    self.memory_counter += 1
def choose_action(self, state):
    x = torch.unsqueeze(torch.FloatTensor(state), 0)

    # epsilon-greedy
    if np.random.uniform() < self.epsilon:  # explore: pick a random action
        action = np.random.randint(0, self.n_actions)
    else:  # exploit: pick the best action under the current policy
        actions_value = self.eval_net(x)  # score every action with the current eval net
        action = torch.max(actions_value, 1)[1].data.numpy()[0]  # take the highest-scoring action

    return action
class DQN(object):
    def __init__(self, n_states, n_actions, n_hidden, batch_size, lr, epsilon, gamma, target_replace_iter, memory_capacity):
        self.eval_net, self.target_net = Net(n_states, n_actions, n_hidden), Net(n_states, n_actions, n_hidden)

        self.memory = np.zeros((memory_capacity, n_states * 2 + 2))  # each experience row stores (state, action, reward, next state)
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=lr)
        self.loss_func = nn.MSELoss()
        self.memory_counter = 0
        self.learn_step_counter = 0  # counts learning steps so we know when to update the target network

        # Keep the remaining hyperparameters; the other methods (choose_action, store_transition, learn) read them
        self.n_states, self.n_actions = n_states, n_actions
        self.batch_size, self.epsilon, self.gamma = batch_size, epsilon, gamma
        self.target_replace_iter, self.memory_capacity = target_replace_iter, memory_capacity
class Net(nn.Module):
    def __init__(self, n_states, n_actions, n_hidden):
        super(Net, self).__init__()

        # Input layer (state) to hidden layer, then hidden layer to output layer (action)
        self.fc1 = nn.Linear(n_states, n_hidden)
        self.out = nn.Linear(n_hidden, n_actions)

    def forward(self, x):
        x = self.fc1(x)
        x = torch.relu(x)  # activation assumed to be ReLU; the excerpt cuts off after fc1
        actions_value = self.out(x)  # one Q value per action
        return actions_value