Getting Started with Object Detection in PyTorch: YOLO9000 Better, Faster, Stronger ref: https://qiita.com/GushiSnow/items/470512e5c04fcdfe7c59
class Conv2d_BatchNorm(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, relu=True, same_padding=False):
        super(Conv2d_BatchNorm, self).__init__()
        padding = int((kernel_size - 1) / 2) if same_padding else 0
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding=padding, bias=False)
        self.bn = nn.BatchNorm2d(out_channels, momentum=0.01)
        self.relu = nn.LeakyReLU(0.1, inplace=True) if relu else None

    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        if self.relu is not None:
            x = self.relu(x)
        return x
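A quick usage sketch of this block (the batch tensor is made up; 416x416 matches the network input size used below):

import torch

block = Conv2d_BatchNorm(in_channels=3, out_channels=32, kernel_size=3, same_padding=True)
x = torch.randn(1, 3, 416, 416)  # dummy batch: (N, C, H, W)
y = block(x)                     # conv -> batch norm -> LeakyReLU
print(y.size())                  # torch.Size([1, 32, 416, 416]); same_padding keeps H, W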
inp_size = np.array([416, 416], dtype=np.int) # w, h
out_w, out_h, out_c = int(w / stride), int(h / stride), c * (stride * stride)
net.load_from_npz(cfg.pretrained_model, num_conv=18)
out_channels = cfg.num_anchors * (cfg.num_classes + 5)
self.conv5 = net_utils.Conv2d(c4, out_channels, 1, 1, relu=False)
Pr(\text{Norfolk terrier}) = Pr(\text{Norfolk terrier} \mid \text{terrier}) \times Pr(\text{terrier} \mid \text{hunting dog}) \times \dots \\
\quad \times Pr(\text{mammal} \mid \text{animal}) \times Pr(\text{animal} \mid \text{physical object})
softmax_word_tree = []
for wordnet_index in output_index:
    softmax_word_tree.append(F.softmax(output[5:wordnet_index]))
output_tensor = torch.cat(tuple(softmax_word_tree), 0)
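As a sanity check on the equation above, a small numeric sketch (the conditional probabilities are made up): an absolute leaf probability is the product of the per-level conditionals down the WordTree path.

# made-up conditionals along the path to "Norfolk terrier"
p_animal, p_mammal, p_dog, p_terrier, p_norfolk = 0.9, 0.8, 0.7, 0.6, 0.5
print(p_norfolk * p_terrier * p_dog * p_mammal * p_animal)  # 0.1512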
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)
        self.conv1_2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_2 = nn.Conv2d(20, 40, kernel_size=5)
        self.fc1_2 = nn.Linear(360, 50)
        self.over_size = 28

    def forward(self, x):
        _, _, h, w = x.size()
        if h > self.over_size:
            x = F.relu(F.max_pool2d(self.conv1(x), 2))
            x = F.relu(F.max_pool2d(self.conv1_2(x), 2))
            x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2_2(x)), 2))
            x = x.view(-1, 360)
            x = F.relu(self.fc1_2(x))
        else:
            x = F.relu(F.max_pool2d(self.conv1(x), 2))
            x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
            x = x.view(-1, 320)
            x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x)
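A hedged sanity check that both branches line up with the hard-coded view() sizes (28 is the usual MNIST size; 56 is my choice, any input around 52 to 59 px produces the 360-feature map):

net = Net()
small = torch.randn(1, 1, 28, 28)  # h <= over_size: conv1 -> conv2, 20 * 4 * 4 = 320 features
large = torch.randn(1, 1, 56, 56)  # h > over_size: conv1 -> conv1_2 -> conv2_2, 40 * 3 * 3 = 360 features
print(net(small).size(), net(large).size())  # both torch.Size([1, 10])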
imdb = VOCDataset(cfg.imdb_train, cfg.DATA_DIR, cfg.train_batch_size,
                  yolo_utils.preprocess_train, processes=2, shuffle=True, dst_size=cfg.inp_size)
def next_batch(self):
    batch = {'images': [], 'gt_boxes': [], 'gt_classes': [], 'dontcare': [], 'origin_im': []}
    i = 0
    while i < self.batch_size:
        try:
            images, gt_boxes, classes, dontcare, origin_im = self.gen.next()
            batch['images'].append(images)
            batch['gt_boxes'].append(gt_boxes)
            batch['gt_classes'].append(classes)
            batch['dontcare'].append(dontcare)
            batch['origin_im'].append(origin_im)
            i += 1
        except (StopIteration, AttributeError):
            indexes = np.arange(len(self.image_names), dtype=np.int)
            if self._shuffle:
                np.random.shuffle(indexes)
            self.gen = self.pool.imap(self._im_processor,
                                      ([self.image_names[i], self.get_annotation(i), self.dst_size]
                                       for i in indexes),
                                      chunksize=self.batch_size)
            self._epoch += 1
            print('epoch {} start...'.format(self._epoch))
    batch['images'] = np.asarray(batch['images'])
    return batch
net = Darknet19()
def _make_layers(in_channels, net_cfg):
    layers = []
    if len(net_cfg) > 0 and isinstance(net_cfg[0], list):
        for sub_cfg in net_cfg:
            layer, in_channels = _make_layers(in_channels, sub_cfg)
            layers.append(layer)
    else:
        for item in net_cfg:
            if item == 'M':
                layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
            else:
                out_channels, ksize = item
                layers.append(net_utils.Conv2d_BatchNorm(in_channels, out_channels, ksize, same_padding=True))
                # layers.append(net_utils.Conv2d(in_channels, out_channels, ksize, same_padding=True))
                in_channels = out_channels
    return nn.Sequential(*layers), in_channels
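A small sketch of how the config notation expands (a toy config, not the full net_cfgs shown later; assumes net_utils is importable as in the snippet above): tuples become Conv2d_BatchNorm blocks, 'M' becomes a 2x2 max-pool, and the returned channel count feeds the next stage.

toy_cfg = [(32, 3), 'M', (64, 3)]    # conv 3->32 (k3), max-pool, conv 32->64 (k3)
features, out_ch = _make_layers(3, toy_cfg)
print(out_ch)                        # 64: passed on as in_channels of the next stage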
anchors = np.asarray([(1.08, 1.19), (3.42, 4.41), (6.63, 11.38), (9.42, 5.11), (16.62, 10.52)], dtype=np.float)
num_anchors = len(anchors)
net.load_from_npz(cfg.pretrained_model, num_conv=18)
def load_from_npz(self, fname, num_conv=None):
    # map PyTorch parameter names to the names used in the npz file
    dest_src = {'conv.weight': 'kernel', 'conv.bias': 'biases',
                'bn.weight': 'gamma', 'bn.bias': 'biases',
                'bn.running_mean': 'moving_mean', 'bn.running_var': 'moving_variance'}
    # load the pretrained model
    params = np.load(fname)
    # state_dict() returns the module's full state as a dictionary
    # http://pytorch.org/docs/master/nn.html?highlight=state_dict#torch.nn.Module.state_dict
    own_dict = self.state_dict()
    # get the model's keys (conv.weight, etc.)
    keys = list(own_dict.keys())
    # step through the keys five at a time, so each step covers one conv layer
    for i, start in enumerate(range(0, len(keys), 5)):
        if num_conv is not None and i >= num_conv:
            break
        end = min(start + 5, len(keys))
        for key in keys[start:end]:
            list_key = key.split('.')
            ptype = dest_src['{}.{}'.format(list_key[-2], list_key[-1])]
            src_key = '{}-convolutional/{}:0'.format(i, ptype)
            print((src_key, own_dict[key].size(), params[src_key].shape))
            param = torch.from_numpy(params[src_key])
            # only the kernel needs its axes reordered: (h, w, in, out) -> (out, in, h, w)
            if ptype == 'kernel':
                param = param.permute(3, 2, 0, 1)
            own_dict[key].copy_(param)
('0-convolutional/kernel:0', torch.Size([32, 3, 3, 3]), (3, 3, 3, 32))
('0-convolutional/gamma:0', torch.Size([32]), (32,))
('0-convolutional/biases:0', torch.Size([32]), (32,))
('0-convolutional/moving_mean:0', torch.Size([32]), (32,))
('0-convolutional/moving_variance:0', torch.Size([32]), (32,))
('1-convolutional/kernel:0', torch.Size([64, 32, 3, 3]), (3, 3, 32, 64))
('1-convolutional/gamma:0', torch.Size([64]), (64,))
('1-convolutional/biases:0', torch.Size([64]), (64,))
('1-convolutional/moving_mean:0', torch.Size([64]), (64,))
('1-convolutional/moving_variance:0', torch.Size([64]), (64,))
:
net(im_data, gt_boxes, gt_classes, dontcare)
anchors = np.ascontiguousarray(cfg.anchors, dtype=np.float)
bbox_pred_np = np.expand_dims(bbox_pred_np, 0)
bbox_np = yolo_to_bbox(
    np.ascontiguousarray(bbox_pred_np, dtype=np.float),
    anchors,
    H, W)
bbox_np = bbox_np[0] # bbox_np = (hw, num_anchors, (x1, y1, x2, y2)) range: 0 ~ 1
bbox_np[:, :, 0::2] *= float(inp_size[0]) # rescale x
bbox_np[:, :, 1::2] *= float(inp_size[1]) # rescale y
for b in range(bsize):
    for row in range(H):
        for col in range(W):
            ind = row * W + col
            for a in range(num_anchors):
                # compute the box center so it matches the output image scale
                cx = (bbox_pred[b, ind, a, 0] + col) / W
                cy = (bbox_pred[b, ind, a, 1] + row) / H
                # compute width and height at the output image scale;
                # the 0.5 factor gives the half-extent from the center
                bw = bbox_pred[b, ind, a, 2] * anchors[a][0] / W * 0.5
                bh = bbox_pred[b, ind, a, 3] * anchors[a][1] / H * 0.5
                # compute the corner offsets (x_min, y_min, x_max, y_max)
                bbox_out[b, ind, a, 0] = cx - bw
                bbox_out[b, ind, a, 1] = cy - bh
                bbox_out[b, ind, a, 2] = cx + bw
                bbox_out[b, ind, a, 3] = cy + bh
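A worked example of one cell (made-up prediction values; W = H = 13 and the first anchor (1.08, 1.19) from the list above):

W = H = 13
anchor_w, anchor_h = 1.08, 1.19            # first anchor
tx, ty, tw, th = 0.5, 0.5, 1.0, 1.0        # made-up outputs for cell (col=6, row=6)
cx = (tx + 6) / W                          # 0.5: box center, normalized 0 ~ 1
cy = (ty + 6) / H                          # 0.5
bw = tw * anchor_w / W * 0.5               # ~0.0415: half-width
bh = th * anchor_h / H * 0.5               # ~0.0458: half-height
print(cx - bw, cy - bh, cx + bw, cy + bh)  # (x_min, y_min, x_max, y_max)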
bbox_np_b = np.reshape(bbox_np, [-1, 4])
ious = bbox_ious(
    np.ascontiguousarray(bbox_np_b, dtype=np.float),
    np.ascontiguousarray(gt_boxes_b, dtype=np.float)
)
best_ious = np.max(ious, axis=1).reshape(_iou_mask.shape)
iou_penalty = 0 - iou_pred_np[best_ious < cfg.iou_thresh]
_iou_mask[best_ious < cfg.iou_thresh] = cfg.noobject_scale * iou_penalty
for k in range(K):
    # query_boxes holds the ground-truth boxes; compute the area of box k
    qbox_area = (
        (query_boxes[k, 2] - query_boxes[k, 0] + 1) *
        (query_boxes[k, 3] - query_boxes[k, 1] + 1)
    )
    for n in range(N):
        # width of the overlap between the ground-truth box and the predicted box
        iw = (
            min(boxes[n, 2], query_boxes[k, 2]) -
            max(boxes[n, 0], query_boxes[k, 0]) + 1
        )
        if iw > 0:
            # height of the overlap between the ground-truth box and the predicted box
            ih = (
                min(boxes[n, 3], query_boxes[k, 3]) -
                max(boxes[n, 1], query_boxes[k, 1]) + 1
            )
            if ih > 0:
                # when both overlap width and height are positive,
                # compute the area of the predicted box
                box_area = (
                    (boxes[n, 2] - boxes[n, 0] + 1) *
                    (boxes[n, 3] - boxes[n, 1] + 1)
                )
                # area of the overlap
                inter_area = iw * ih
                # intersection over union
                intersec[n, k] = inter_area / (qbox_area + box_area - inter_area)
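A numeric check of the IoU computation with two made-up 10x10 boxes (same inclusive +1 pixel convention):

pred, gt = (0, 0, 9, 9), (5, 5, 14, 14)             # (x1, y1, x2, y2), both 100 px^2
iw = min(pred[2], gt[2]) - max(pred[0], gt[0]) + 1  # 5
ih = min(pred[3], gt[3]) - max(pred[1], gt[1]) + 1  # 5
inter = iw * ih                                     # 25
print(inter / float(100 + 100 - inter))             # 25 / 175 ~= 0.143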
cell_w = float(inp_size[0]) / W
cell_h = float(inp_size[1]) / H
cx = (gt_boxes_b[:, 0] + gt_boxes_b[:, 2]) * 0.5 / cell_w
cy = (gt_boxes_b[:, 1] + gt_boxes_b[:, 3]) * 0.5 / cell_h
cell_inds = np.floor(cy) * W + np.floor(cx)
cell_inds = cell_inds.astype(np.int)
target_boxes = np.empty(gt_boxes_b.shape, dtype=np.float)
target_boxes[:, 0] = cx - np.floor(cx) # cx
target_boxes[:, 1] = cy - np.floor(cy) # cy
target_boxes[:, 2] = (gt_boxes_b[:, 2] - gt_boxes_b[:, 0]) / inp_size[0] * out_size[0] # tw
target_boxes[:, 3] = (gt_boxes_b[:, 3] - gt_boxes_b[:, 1]) / inp_size[1] * out_size[1] # th
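A worked example of the cell assignment (416x416 input, 13x13 output, one made-up ground-truth box): the box center picks a grid cell, and the targets are the fractional offset inside that cell plus the size at feature-map scale.

import numpy as np

W = 13
cell_w = cell_h = 416. / W                   # 32 px per cell
gt = np.array([100., 150., 200., 250.])      # made-up (x1, y1, x2, y2)
cx = (gt[0] + gt[2]) * 0.5 / cell_w          # 4.6875
cy = (gt[1] + gt[3]) * 0.5 / cell_h          # 6.25
print(int(np.floor(cy) * W + np.floor(cx)))  # cell_ind = 6 * 13 + 4 = 82
print(cx - np.floor(cx), cy - np.floor(cy))  # (0.6875, 0.25): offset inside the cell
print((gt[2] - gt[0]) / 416. * W)            # tw = 3.125 cells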
gt_boxes_resize = np.copy(gt_boxes_b)
# rescale the ground-truth boxes to the output feature-map size
gt_boxes_resize[:, 0::2] *= (out_size[0] / float(inp_size[0]))
gt_boxes_resize[:, 1::2] *= (out_size[1] / float(inp_size[1]))
# compute the IoU against each anchor box
anchor_ious = anchor_intersections(
    anchors,
    np.ascontiguousarray(gt_boxes_resize, dtype=np.float)
)
# pick the anchor box with the highest IoU
anchor_inds = np.argmax(anchor_ious, axis=0)
hw = 4
_boxes = np.zeros([hw, num_anchors, 4], dtype=np.float)
for n in range(N):
    anchor_area = anchors[n, 0] * anchors[n, 1]
    for k in range(K):
        # width of the ground-truth box
        boxw = (query_boxes[k, 2] - query_boxes[k, 0] + 1)
        # height of the ground-truth box
        boxh = (query_boxes[k, 3] - query_boxes[k, 1] + 1)
        # overlap width (boxes are compared as if sharing a corner)
        iw = min(anchors[n, 0], boxw)
        # overlap height
        ih = min(anchors[n, 1], boxh)
        # overlap area
        inter_area = iw * ih
        # intersection over union
        intersec[n, k] = inter_area / (anchor_area + boxw * boxh - inter_area)
for i, cell_ind in enumerate(cell_inds):
    # skip boxes whose cell index falls outside the network's output grid
    if cell_ind >= hw or cell_ind < 0:
        print(cell_ind)
        continue
    # pick the best-matching anchor box
    a = anchor_inds[i]
    iou_pred_cell_anchor = iou_pred_np[cell_ind, a, :]  # 0 ~ 1, should be close to 1
    # the mask sets this term's weight in the loss; the higher the predicted IoU,
    # the smaller the remaining penalty should be
    _iou_mask[cell_ind, a, :] = cfg.object_scale * (1 - iou_pred_cell_anchor)
    # store the IoU for the best-matching anchor box
    _ious[cell_ind, a, :] = ious_reshaped[cell_ind, a, i]
    # the mask sets this term's weight in the loss
    _box_mask[cell_ind, a, :] = cfg.coord_scale
    # rescale the ground-truth box size to anchor units
    target_boxes[i, 2:4] /= anchors[a]
    # store the target box for the best-matching anchor box
    _boxes[cell_ind, a, :] = target_boxes[i]
    # the mask sets this term's weight in the loss
    _class_mask[cell_ind, a, :] = cfg.class_scale
    # one-hot class target for the best-matching anchor box
    _classes[cell_ind, a, gt_classes[i]] = 1.
self.bbox_loss = nn.MSELoss(size_average=False)(bbox_pred * box_mask, _boxes * box_mask) / num_boxes
self.iou_loss = nn.MSELoss(size_average=False)(iou_pred * iou_mask, _ious * iou_mask) / num_boxes
class_mask = class_mask.expand_as(prob_pred)
self.cls_loss = nn.MSELoss(size_average=False)(prob_pred * class_mask, _classes * class_mask) / num_boxes
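A minimal sketch of what the masked MSE does (made-up tensors; size_average=False sums the squared errors, and the division by the number of ground-truth boxes replaces the averaging):

pred = torch.rand(169, 5, 4)     # hypothetical (hw, num_anchors, 4) box predictions
target = torch.zeros_like(pred)
mask = torch.zeros_like(pred)
mask[82, 2, :] = 1.0             # only the matched cell/anchor pair contributes
num_boxes = 1
loss = nn.MSELoss(size_average=False)(pred * mask, target * mask) / num_boxes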
lr = cfg.init_learning_rate
optimizer = torch.optim.SGD(net.parameters(), lr=lr, momentum=cfg.momentum, weight_decay=cfg.weight_decay)
optimizer.zero_grad()
loss = net.loss
loss.backward()
optimizer.step()
if use_tensorboard and step % cfg.log_interval == 0:
    exp.add_scalar_value('loss_train', train_loss, step=step)
    exp.add_scalar_value('loss_bbox', bbox_loss, step=step)
    exp.add_scalar_value('loss_iou', iou_loss, step=step)
    exp.add_scalar_value('loss_cls', cls_loss, step=step)
    exp.add_scalar_value('learning_rate', lr, step=step)
if imdb.epoch in cfg.lr_decay_epochs:
    lr *= cfg.lr_decay
    optimizer = torch.optim.SGD(net.parameters(), lr=lr, momentum=cfg.momentum, weight_decay=cfg.weight_decay)
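A numeric sketch of the step decay (illustrative values, not the repo's actual cfg); recreating the optimizer is the simple way to apply the new rate, though mutating each param_group's 'lr' would also work:

lr, lr_decay, lr_decay_epochs = 1e-3, 0.1, {60, 90}
for epoch in (59, 60, 90):
    if epoch in lr_decay_epochs:
        lr *= lr_decay
    print(epoch, lr)  # 1e-3 at 59, 1e-4 from 60, 1e-5 from 90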
imdb = VOCDataset(cfg.imdb_train, cfg.DATA_DIR, cfg.train_batch_size,
                  yolo_utils.preprocess_train, processes=2, shuffle=True, dst_size=cfg.inp_size)
im, trans_param = imcv2_affine_trans(im)
array([
# one block per grid cell (hw = 4): rows are the anchor boxes, columns the box offset coordinates
[[ 0., 0., 0., 0.],
[ 0., 0., 0., 0.],
[ 0., 0., 0., 0.],
[ 0., 0., 0., 0.],
[ 0., 0., 0., 0.]],
[[ 0., 0., 0., 0.],
[ 0., 0., 0., 0.],
[ 0., 0., 0., 0.],
[ 0., 0., 0., 0.],
[ 0., 0., 0., 0.]],
[[ 0., 0., 0., 0.],
[ 0., 0., 0., 0.],
[ 0., 0., 0., 0.],
[ 0., 0., 0., 0.],
[ 0., 0., 0., 0.]],
[[ 0., 0., 0., 0.],
[ 0., 0., 0., 0.],
[ 0., 0., 0., 0.],
[ 0., 0., 0., 0.],
[ 0., 0., 0., 0.]]])
h, w, c = im.shape
scale = np.random.uniform() / 10. + 1.
max_offx = (scale - 1.) * w
max_offy = (scale - 1.) * h
offx = int(np.random.uniform() * max_offx)
offy = int(np.random.uniform() * max_offy)
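To make the ranges concrete (scale is drawn from [1.0, 1.1)): at scale = 1.05 the image is enlarged by 5%, and the random offsets leave just enough slack for a full-size crop inside the enlarged image.

scale, w, h = 1.05, 416, 416                   # example draw
max_offx = (scale - 1.) * w                    # 20.8 px of horizontal slack
new_w, new_h = int(w * scale), int(h * scale)  # resized to 436 x 436
# a w x h crop starting at (offx, offy) <= (20, 20) always fits in the resized image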
self.gen = self.pool.imap(self._im_processor,
                          ([self.image_names[i], self.get_annotation(i), self.dst_size] for i in indexes),
                          chunksize=self.batch_size)
p_w, p_h \text{ are the anchor box width and height} \\
b_x = \sigma(t_x) + c_x \\
b_y = \sigma(t_y) + c_y \\
b_w = p_w e^{t_w} \\
b_h = p_h e^{t_h} \\
Pr(\text{object}) \ast IOU(b, \text{object}) = \sigma(t_o)
xy_pred = F.sigmoid(conv5_reshaped[:, :, :, 0:2])
wh_pred = torch.exp(conv5_reshaped[:, :, :, 2:4])
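Tying the code to the equations with made-up raw outputs: the sigmoid keeps the center inside its cell, the exponential keeps width and height positive and multiplicative in the anchor size.

import math

tx, ty, tw, th = 0.0, 0.0, 0.5, -0.5           # made-up raw network outputs
p_w, p_h = 3.42, 4.41                          # second anchor from the list above
sigmoid = lambda v: 1. / (1. + math.exp(-v))
print(sigmoid(tx), sigmoid(ty))                # (0.5, 0.5): center of the current cell
print(p_w * math.exp(tw), p_h * math.exp(th))  # (~5.64, ~2.67) in cell units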
net_cfgs = [
    # conv1s
    [(32, 3)],
    ['M', (64, 3)],
    ['M', (128, 3), (64, 1), (128, 3)],
    ['M', (256, 3), (128, 1), (256, 3)],
    ['M', (512, 3), (256, 1), (512, 3), (256, 1), (512, 3)],
    # conv2
    ['M', (1024, 3), (512, 1), (1024, 3), (512, 1), (1024, 3)],
    # ------------
    # conv3
    [(1024, 3), (1024, 3)],
    # conv4
    [(1024, 3)]
]
self.conv4, c4 = _make_layers((c1*(stride*stride) + c3), net_cfgs[7])
conv1s_reorg = self.reorg(conv1s)
cat_1_3 = torch.cat([conv1s_reorg, conv3], 1)
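Shape bookkeeping for this passthrough step at a 416x416 input (conv1s ends at stride 16, conv3 at stride 32): the reorg layer repacks 26x26x512 into 13x13x2048, which, concatenated with the 13x13x1024 conv3 output, explains the c1*(stride*stride) + c3 = 3072 input channels of conv4.

c1, stride, c3 = 512, 2, 1024
h1 = w1 = 26                       # conv1s output resolution
print(h1 // stride, w1 // stride)  # 13 13 after reorg
print(c1 * stride * stride + c3)   # 2048 + 1024 = 3072 -> in_channels of conv4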