Created
January 19, 2022 21:33
-
-
Save trzy/8eb1452665248b91064d0efbfc35dde9 to your computer and use it in GitHub Desktop.
PyTorch Memory Leak and Redundant .detach()
This file has been truncated, but you can view the full file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from dataclasses import dataclass | |
| import numpy as np | |
| import random | |
| import torch as t | |
| from torch import nn | |
| from torchvision.ops import nms | |
| from torch.nn import functional as F | |
| from torchvision.ops import RoIPool | |
| from torchvision.models import vgg16 | |
| def create_optimizer(model): | |
| params = [] | |
| for key, value in dict(model.named_parameters()).items(): | |
| if not value.requires_grad: | |
| continue | |
| if "weight" in key: | |
| params += [{ "params": [value], "weight_decay": 0 }] | |
| return t.optim.SGD(params, lr = 1e-3, momentum = 0.9) | |
| def run(): | |
| model = FasterRCNNModel( | |
| num_classes = 21, | |
| allow_edge_proposals = True, | |
| dropout_probability = 0 | |
| ).cuda() | |
| optimizer = create_optimizer(model) | |
| # Train forever | |
| while True: | |
| loss = model.train_step( | |
| optimizer = optimizer, | |
| image_data = t.from_numpy(__image_data).unsqueeze(dim = 0).float().cuda(), | |
| anchor_map = __anchor_map.astype(float), | |
| anchor_valid_map = __anchor_valid_map.astype(float), | |
| gt_rpn_map = t.from_numpy(__gt_rpn_map).unsqueeze(dim = 0).float().cuda(), | |
| gt_rpn_object_indices = [ __gt_rpn_object_indices ], | |
| gt_rpn_background_indices = [ __gt_rpn_background_indices ], | |
| gt_boxes = [ __gt_boxes ] | |
| ) | |
| class FasterRCNNModel(nn.Module): | |
| @dataclass | |
| class Loss: | |
| rpn_class: float | |
| rpn_regression: float | |
| detector_class: float | |
| detector_regression: float | |
| total: float | |
| def __init__(self, num_classes, rpn_minibatch_size = 256, proposal_batch_size = 128, allow_edge_proposals = True, dropout_probability = 0): | |
| super().__init__() | |
| # Constants | |
| self._num_classes = num_classes | |
| self._rpn_minibatch_size = rpn_minibatch_size | |
| self._proposal_batch_size = proposal_batch_size | |
| self._detector_box_delta_means = [ 0, 0, 0, 0 ] | |
| self._detector_box_delta_stds = [ 0.1, 0.1, 0.2, 0.2 ] | |
| # Network stages | |
| self._stage1_feature_extractor = FeatureExtractor() | |
| self._stage2_region_proposal_network = RegionProposalNetwork(allow_edge_proposals = allow_edge_proposals) | |
| self._stage3_detector_network = DetectorNetwork(num_classes = num_classes, dropout_probability = dropout_probability) | |
| def train_step(self, optimizer, image_data, anchor_map, anchor_valid_map, gt_rpn_map, gt_rpn_object_indices, gt_rpn_background_indices, gt_boxes): | |
| """ | |
| Performs one training step on a sample of data. | |
| Parameters | |
| ---------- | |
| optimizer : torch.optim.Optimizer | |
| Optimizer. | |
| image_data : torch.Tensor | |
| A tensor of shape (batch_size, channels, height, width) representing | |
| images normalized using the VGG-16 convention (BGR, ImageNet channel-wise | |
| mean-centered). | |
| anchor_map : torch.Tensor | |
| Map of anchors, shaped (height, width, num_anchors * 4). The last | |
| dimension contains the anchor boxes specified as a 4-tuple of | |
| (center_y, center_x, height, width), repeated for all anchors at that | |
| coordinate of the feature map. If this or anchor_valid_map is not | |
| provided, both will be computed here. | |
| anchor_valid_map : torch.Tensor | |
| Map indicating which anchors are valid (do not intersect image bounds), | |
| shaped (height, width). If this or anchor_map is not provided, both will | |
| be computed here. | |
| gt_rpn_map : torch.Tensor | |
| Ground truth RPN map of shape | |
| (batch_size, height, width, num_anchors, 6), where height and width are | |
| the feature map dimensions, not the input image dimensions. The final | |
| dimension contains: | |
| - 0: Trainable anchor (1) or not (0). Only valid and non-neutral (that | |
| is, definitely positive or negative) anchors are trainable. This is | |
| the same as anchor_valid_map with additional invalid anchors caused | |
| by neutral samples | |
| - 1: For trainable anchors, whether the anchor is an object anchor (1) | |
| or background anchor (0). For non-trainable anchors, will be 0. | |
| - 2: Regression target for box center, ty. | |
| - 3: Regression target for box center, tx. | |
| - 4: Regression target for box size, th. | |
| - 5: Regression target for box size, tw. | |
| gt_rpn_object_indices : List[np.ndarray] | |
| For each image in the batch, a map of shape (N, 3) of indices (y, x, k) | |
| of all N object anchors in the RPN ground truth map. | |
| gt_rpn_background_indices : List[np.ndarray] | |
| For each image in the batch, a map of shape (M, 3) of indices of all M | |
| background anchors in the RPN ground truth map. | |
| gt_boxes : List[List[datasets.training_sample.Box]] | |
| For each image in the batch, a list of ground truth object boxes. | |
| Returns | |
| ------- | |
| Loss | |
| Loss (a dataclass with class and regression losses for both the RPN and | |
| detector states). | |
| """ | |
| self.train() | |
| # Clear accumulated gradient | |
| optimizer.zero_grad() | |
| # For now, we only support a batch size of 1 | |
| assert image_data.shape[0] == 1, "Batch size must be 1" | |
| assert len(gt_rpn_map.shape) == 5 and gt_rpn_map.shape[0] == 1, "Batch size must be 1" | |
| assert len(gt_rpn_object_indices) == 1, "Batch size must be 1" | |
| assert len(gt_rpn_background_indices) == 1, "Batch size must be 1" | |
| assert len(gt_boxes) == 1, "Batch size must be 1" | |
| image_shape = image_data.shape[1:] | |
| # Stage 1: Extract features | |
| feature_map = self._stage1_feature_extractor(image_data = image_data) | |
| # Stage 2: Generate object proposals using RPN | |
| rpn_score_map, rpn_box_deltas_map, proposals = self._stage2_region_proposal_network( | |
| feature_map = feature_map, | |
| image_shape = image_shape, # each image in batch has identical shape: (num_channels, height, width) | |
| anchor_map = anchor_map, | |
| anchor_valid_map = anchor_valid_map, | |
| max_proposals_pre_nms = 12000, | |
| max_proposals_post_nms = 2000 | |
| ) | |
| # Sample random mini-batch of anchors (for RPN training) | |
| gt_rpn_minibatch_map = self._sample_rpn_minibatch( | |
| rpn_map = gt_rpn_map, | |
| object_indices = gt_rpn_object_indices, | |
| background_indices = gt_rpn_background_indices | |
| ) | |
| # Assign labels to proposals and take random sample (for detector training) | |
| proposals, gt_classes, gt_box_deltas = self._label_proposals( | |
| proposals = proposals, | |
| gt_boxes = gt_boxes[0], # for now, batch size of 1 | |
| min_background_iou_threshold = 0.0, | |
| min_object_iou_threshold = 0.5 | |
| ) | |
| proposals, gt_classes, gt_box_deltas = self._sample_proposals( | |
| proposals = proposals, | |
| gt_classes = gt_classes, | |
| gt_box_deltas = gt_box_deltas, | |
| max_proposals = self._proposal_batch_size, | |
| positive_fraction = 0.25 | |
| ) | |
| # Make sure RoI proposals and ground truths are detached from computational | |
| # graph so that gradients are not propagated through them. They are treated | |
| # as constant inputs into the detector stage. | |
| proposals = proposals.detach() | |
| gt_classes = gt_classes.detach() | |
| gt_box_deltas = gt_box_deltas.detach() | |
| # Stage 3: Detector | |
| detector_classes, detector_box_deltas = self._stage3_detector_network( | |
| feature_map = feature_map, | |
| proposals = proposals | |
| ) | |
| # Compute losses | |
| rpn_class_loss = _rpn_class_loss(predicted_scores = rpn_score_map, y_true = gt_rpn_minibatch_map) | |
| rpn_regression_loss = _rpn_regression_loss(predicted_box_deltas = rpn_box_deltas_map, y_true = gt_rpn_minibatch_map) | |
| detector_class_loss = _detector_class_loss(predicted_classes = detector_classes, y_true = gt_classes) | |
| detector_regression_loss = _detector_regression_loss(predicted_box_deltas = detector_box_deltas, y_true = gt_box_deltas) | |
| total_loss = rpn_class_loss + rpn_regression_loss + detector_class_loss + detector_regression_loss | |
| loss = FasterRCNNModel.Loss( | |
| rpn_class = rpn_class_loss.detach().cpu().item(), | |
| rpn_regression = rpn_regression_loss.detach().cpu().item(), | |
| detector_class = detector_class_loss.detach().cpu().item(), | |
| detector_regression = detector_regression_loss.detach().cpu().item(), | |
| total = total_loss.detach().cpu().item() | |
| ) | |
| # Backprop | |
| total_loss.backward() | |
| # Optimizer step | |
| optimizer.step() | |
| # Return losses and data useful for computing statistics | |
| return loss | |
| def _sample_rpn_minibatch(self, rpn_map, object_indices, background_indices): | |
| """ | |
| Selects anchors for training and produces a copy of the RPN ground truth | |
| map with only those anchors marked as trainable. | |
| Parameters | |
| ---------- | |
| rpn_map : np.ndarray | |
| RPN ground truth map of shape | |
| (batch_size, height, width, num_anchors, 6). | |
| object_indices : List[np.ndarray] | |
| For each image in the batch, a map of shape (N, 3) of indices (y, x, k) | |
| of all N object anchors in the RPN ground truth map. | |
| background_indices : List[np.ndarray] | |
| For each image in the batch, a map of shape (M, 3) of indices of all M | |
| background anchors in the RPN ground truth map. | |
| Returns | |
| ------- | |
| np.ndarray | |
| A copy of the RPN ground truth map with index 0 of the last dimension | |
| recomputed to include only anchors in the minibatch. | |
| """ | |
| assert rpn_map.shape[0] == 1, "Batch size must be 1" | |
| assert len(object_indices) == 1, "Batch size must be 1" | |
| assert len(background_indices) == 1, "Batch size must be 1" | |
| positive_anchors = object_indices[0] | |
| negative_anchors = background_indices[0] | |
| assert len(positive_anchors) + len(negative_anchors) >= self._rpn_minibatch_size, "Image has insufficient anchors for RPN minibatch size of %d" % self._rpn_minibatch_size | |
| assert len(positive_anchors) > 0, "Image does not have any positive anchors" | |
| assert self._rpn_minibatch_size % 2 == 0, "RPN minibatch size must be evenly divisible" | |
| # Sample, producing indices into the index maps | |
| num_positive_anchors = len(positive_anchors) | |
| num_negative_anchors = len(negative_anchors) | |
| num_positive_samples = min(self._rpn_minibatch_size // 2, num_positive_anchors) # up to half the samples should be positive, if possible | |
| num_negative_samples = self._rpn_minibatch_size - num_positive_samples # the rest should be negative | |
| positive_anchor_idxs = random.sample(range(num_positive_anchors), num_positive_samples) | |
| negative_anchor_idxs = random.sample(range(num_negative_anchors), num_negative_samples) | |
| # Construct index expressions into RPN map | |
| positive_anchors = positive_anchors[positive_anchor_idxs] | |
| negative_anchors = negative_anchors[negative_anchor_idxs] | |
| trainable_anchors = np.concatenate([ positive_anchors, negative_anchors ]) | |
| batch_idxs = np.zeros(len(trainable_anchors)) | |
| trainable_idxs = (batch_idxs, trainable_anchors[:,0], trainable_anchors[:,1], trainable_anchors[:,2], 0) | |
| # Create a copy of the RPN map with samples set as trainable | |
| rpn_minibatch_map = rpn_map.clone() | |
| rpn_minibatch_map[:,:,:,:,0] = 0 | |
| rpn_minibatch_map[trainable_idxs] = 1 | |
| return rpn_minibatch_map | |
| def _label_proposals(self, proposals, gt_boxes, min_background_iou_threshold, min_object_iou_threshold): | |
| """ | |
| Determines which proposals generated by the RPN stage overlap with ground | |
| truth boxes and creates ground truth labels for the subsequent detector | |
| stage. | |
| Parameters | |
| ---------- | |
| proposals : torch.Tensor | |
| Proposal corners, shaped (N, 4). | |
| gt_boxes : List[datasets.training_sample.Box] | |
| Ground truth object boxes. | |
| min_background_iou_threshold : float | |
| Minimum IoU threshold with ground truth boxes below which proposals are | |
| ignored entirely. Proposals with an IoU threshold in the range | |
| [min_background_iou_threshold, min_object_iou_threshold) are labeled as | |
| background. This value can be greater than 0, which has the effect of | |
| selecting more difficult background examples that have some degree of | |
| overlap with ground truth boxes. | |
| min_object_iou_threshold : float | |
| Minimum IoU threshold for a proposal to be labeled as an object. | |
| Returns | |
| ------- | |
| torch.Tensor, torch.Tensor, torch.Tensor | |
| Proposals, (N, 4), labeled as either objects or background (depending on | |
| IoU thresholds, some proposals can end up as neither and are excluded | |
| here); one-hot encoded class labels, (N, num_classes), for each proposal; | |
| and box delta regression targets, (N, 2, (num_classes - 1) * 4), for each | |
| proposal. Box delta target values are present at locations [:,1,:] and | |
| consist of (ty, tx, th, tw) for the class that the box corresponds to. | |
| The entries for all other classes and the background classes should be | |
| ignored. A mask is written to locations [:,0,:]. For each proposal | |
| assigned a non-background class, there will be 4 consecutive elements | |
| marked with 1 indicating the corresponding box delta target values are to | |
| be used. There are no box delta regression targets for background | |
| proposals and the mask is entirely 0 for those proposals. | |
| """ | |
| assert min_background_iou_threshold < min_object_iou_threshold, "Object threshold must be greater than background threshold" | |
| # Convert ground truth box corners to (M,4) tensor and class indices to (M,) | |
| gt_box_corners = np.array([ box["corners"] for box in gt_boxes ], dtype = np.float32) | |
| gt_box_corners = t.from_numpy(gt_box_corners).cuda() | |
| gt_box_class_idxs = t.tensor([ box["class_index"] for box in gt_boxes ], dtype = t.long, device = "cuda") | |
| # Let's be crafty and create some fake proposals that match the ground | |
| # truth boxes exactly. This isn't strictly necessary and the model should | |
| # work without it but it will help training and will ensure that there are | |
| # always some positive examples to train on. | |
| proposals = t.vstack([ proposals, gt_box_corners ]) | |
| # Compute IoU between each proposal (N,4) and each ground truth box (M,4) | |
| # -> (N, M) | |
| ious = t_intersection_over_union(boxes1 = proposals, boxes2 = gt_box_corners) | |
| # Find the best IoU for each proposal, the class of the ground truth box | |
| # associated with it, and the box corners | |
| best_ious = t.max(ious, dim = 1).values # (N,) of maximum IoUs for each of the N proposals | |
| box_idxs = t.argmax(ious, dim = 1) # (N,) of ground truth box index for each proposal | |
| gt_box_class_idxs = gt_box_class_idxs[box_idxs] # (N,) of class indices of highest-IoU box for each proposal | |
| gt_box_corners = gt_box_corners[box_idxs] # (N,4) of box corners of highest-IoU box for each proposal | |
| # Remove all proposals whose best IoU is less than the minimum threshold | |
| # for a negative (background) sample. We also check for IoUs > 0 because | |
| # due to earlier clipping, we may get invalid 0-area proposals. | |
| idxs = t.where((best_ious >= min_background_iou_threshold))[0] # keep proposals w/ sufficiently high IoU | |
| proposals = proposals[idxs] | |
| best_ious = best_ious[idxs] | |
| gt_box_class_idxs = gt_box_class_idxs[idxs] | |
| gt_box_corners = gt_box_corners[idxs] | |
| # IoUs less than min_object_iou_threshold will be labeled as background | |
| gt_box_class_idxs[best_ious < min_object_iou_threshold] = 0 | |
| # One-hot encode class labels | |
| num_proposals = proposals.shape[0] | |
| gt_classes = t.zeros((num_proposals, self._num_classes), dtype = t.float32, device = "cuda") # (N,num_classes) | |
| gt_classes[ t.arange(num_proposals), gt_box_class_idxs ] = 1.0 | |
| # Convert proposals and ground truth boxes into "anchor" format (center | |
| # points and side lengths). For the detector stage, the proposals serve as | |
| # the anchors relative to which the final box predictions will be | |
| # regressed. | |
| proposal_centers = 0.5 * (proposals[:,0:2] + proposals[:,2:4]) # center_y, center_x | |
| proposal_sides = proposals[:,2:4] - proposals[:,0:2] # height, width | |
| gt_box_centers = 0.5 * (gt_box_corners[:,0:2] + gt_box_corners[:,2:4]) # center_y, center_x | |
| gt_box_sides = gt_box_corners[:,2:4] - gt_box_corners[:,0:2] # height, width | |
| # Compute box delta regression targets (ty, tx, th, tw) for each proposal | |
| # based on the best box selected | |
| box_delta_targets = t.empty((num_proposals, 4), dtype = t.float32, device = "cuda") # (N,4) | |
| box_delta_targets[:,0:2] = (gt_box_centers - proposal_centers) / proposal_sides # ty = (gt_center_y - proposal_center_y) / proposal_height, tx = (gt_center_x - proposal_center_x) / proposal_width | |
| box_delta_targets[:,2:4] = t.log(gt_box_sides / proposal_sides) # th = log(gt_height / proposal_height), tw = (gt_width / proposal_width) | |
| box_delta_means = t.tensor(self._detector_box_delta_means, dtype = t.float32, device = "cuda") | |
| box_delta_stds = t.tensor(self._detector_box_delta_stds, dtype = t.float32, device = "cuda") | |
| box_delta_targets[:,:] -= box_delta_means # mean adjustment | |
| box_delta_targets[:,:] /= box_delta_stds # standard deviation scaling | |
| # Convert regression targets into a map of shape (N,2,4*(C-1)) where C is | |
| # the number of classes and [:,0,:] specifies a mask for the corresponding | |
| # target components at [:,1,:]. Targets are ordered (ty, tx, th, tw). | |
| # Background class 0 is not present at all. | |
| gt_box_deltas = t.zeros((num_proposals, 2, 4 * (self._num_classes - 1)), dtype = t.float32, device = "cuda") | |
| gt_box_deltas[:,0,:] = t.repeat_interleave(gt_classes, repeats = 4, dim = 1)[:,4:] # create masks using interleaved repetition, remembering to ignore class 0 | |
| gt_box_deltas[:,1,:] = t.tile(box_delta_targets, dims = (1, self._num_classes - 1)) # populate regression targets with straightforward repetition (only those columns corresponding to class are masked on) | |
| return proposals, gt_classes, gt_box_deltas | |
| def _sample_proposals(self, proposals, gt_classes, gt_box_deltas, max_proposals, positive_fraction): | |
| if max_proposals <= 0: | |
| return proposals, gt_classes, gt_box_deltas | |
| # Get positive and negative (background) proposals | |
| class_indices = t.argmax(gt_classes, axis = 1) # (N,num_classes) -> (N,), where each element is the class index (highest score from its row) | |
| positive_indices = t.where(class_indices > 0)[0] | |
| negative_indices = t.where(class_indices <= 0)[0] | |
| num_positive_proposals = len(positive_indices) | |
| num_negative_proposals = len(negative_indices) | |
| # Select positive and negative samples, if there are enough. Note that the | |
| # number of positive samples can be either the positive fraction of the | |
| # *actual* number of proposals *or* the *desired* number (max_proposals). | |
| # In practice, these yield virtually identical results but the latter | |
| # method will yield slightly more positive samples in the rare cases when | |
| # the number of proposals is below the desired number. Here, we use the | |
| # former method but others, such as Yun Chen, use the latter. To implement | |
| # it, replace num_samples with max_proposals in the line that computes | |
| # num_positive_samples. I am not sure what the original Faster R-CNN | |
| # implementation does. | |
| num_samples = min(max_proposals, len(class_indices)) | |
| num_positive_samples = min(round(num_samples * positive_fraction), num_positive_proposals) | |
| num_negative_samples = min(num_samples - num_positive_samples, num_negative_proposals) | |
| # Do we have enough? | |
| if num_positive_samples <= 0 or num_negative_samples <= 0: | |
| return proposals[[]], gt_classes[[]], gt_box_deltas[[]] # return 0-length tensors | |
| # Sample randomly | |
| positive_sample_indices = positive_indices[ t.randperm(len(positive_indices))[0:num_positive_samples] ] | |
| negative_sample_indices = negative_indices[ t.randperm(len(negative_indices))[0:num_negative_samples] ] | |
| indices = t.cat([ positive_sample_indices, negative_sample_indices ]) | |
| # Return | |
| return proposals[indices], gt_classes[indices], gt_box_deltas[indices] | |
| class RegionProposalNetwork(nn.Module): | |
| def __init__(self, allow_edge_proposals = False): | |
| super().__init__() | |
| # Constants | |
| self._allow_edge_proposals = allow_edge_proposals | |
| # Layers | |
| num_anchors = 9 | |
| self._rpn_conv1 = nn.Conv2d(in_channels = 512, out_channels = 512, kernel_size = (3, 3), stride = 1, padding = "same") | |
| self._rpn_class = nn.Conv2d(in_channels = 512, out_channels = num_anchors, kernel_size = (1, 1), stride = 1, padding = "same") | |
| self._rpn_boxes = nn.Conv2d(in_channels = 512, out_channels = num_anchors * 4, kernel_size = (1, 1), stride = 1, padding = "same") | |
| # Initialize weights | |
| self._rpn_conv1.weight.data.normal_(mean = 0.0, std = 0.01) | |
| self._rpn_conv1.bias.data.zero_() | |
| self._rpn_class.weight.data.normal_(mean = 0.0, std = 0.01) | |
| self._rpn_class.bias.data.zero_() | |
| self._rpn_boxes.weight.data.normal_(mean = 0.0, std = 0.01) | |
| self._rpn_boxes.bias.data.zero_() | |
| def forward(self, feature_map, image_shape, anchor_map, anchor_valid_map, max_proposals_pre_nms, max_proposals_post_nms): | |
| """ | |
| Predict objectness scores and regress region-of-interest box proposals on | |
| an input feature map. | |
| Parameters | |
| ---------- | |
| feature_map : torch.Tensor | |
| Feature map of shape (batch_size, 512, height, width). | |
| image_shape : Tuple[int, int, int] | |
| Shapes of each image in pixels: (num_channels, height, width). | |
| anchor_map : np.ndarray | |
| Map of anchors, shaped (height, width, num_anchors * 4). The last | |
| dimension contains the anchor boxes specified as a 4-tuple of | |
| (center_y, center_x, height, width), repeated for all anchors at that | |
| coordinate of the feature map. | |
| anchor_valid_map : np.ndarray | |
| Map indicating which anchors are valid (do not intersect image bounds), | |
| shaped (height, width, num_anchors). | |
| max_proposals_pre_nms : int | |
| How many of the best proposals (sorted by objectness score) to extract | |
| before applying non-maximum suppression. | |
| max_proposals_post_nms : int | |
| How many of the best proposals (sorted by objectness score) to keep after | |
| non-maximum suppression. | |
| Returns | |
| ------- | |
| torch.Tensor, torch.Tensor, torch.Tensor | |
| - Objectness scores (batch_size, height, width, num_anchors) | |
| - Box regressions (batch_size, height, width, num_anchors * 4), as box | |
| deltas (that is, (ty, tx, th, tw) for each anchor) | |
| - Proposals (N, 4) -- all corresponding proposal box corners stored as | |
| (y1, x1, y2, x2). | |
| """ | |
| # Pass through the network | |
| y = F.relu(self._rpn_conv1(feature_map)) | |
| objectness_score_map = t.sigmoid(self._rpn_class(y)) | |
| box_deltas_map = self._rpn_boxes(y) | |
| # Transpose shapes to be more convenient: | |
| # objectness_score_map -> (batch_size, height, width, num_anchors) | |
| # box_deltas_map -> (batch_size, height, width, num_anchors * 4) | |
| objectness_score_map = objectness_score_map.permute(0, 2, 3, 1).contiguous() | |
| box_deltas_map = box_deltas_map.permute(0, 2, 3, 1).contiguous() | |
| # Returning to CPU land by extracting proposals as lists (NumPy arrays) | |
| anchors, objectness_scores, box_deltas = self._extract_valid( | |
| anchor_map = anchor_map, | |
| anchor_valid_map = anchor_valid_map, | |
| objectness_score_map = objectness_score_map, | |
| box_deltas_map = box_deltas_map | |
| ) | |
| # **** UNCOMMENT THE LINE BELOW TO "FIX" MEMORY LEAK **** | |
| # Detach from graph to avoid backprop. According to my understanding, this | |
| # should be redundant here because we later take care to detach the | |
| # proposals (in FasterRCNNModel). However, there is a memory leak involving | |
| # t_convert_deltas_to_boxes() if this is not done here. Ultimately, the | |
| # numerical results are not affected. Proposals returned from this function | |
| # are supposed to be constant and are fed into the detector stage. See any | |
| # commit prior to 209141c for an earlier version of the code here that | |
| # performed all operations on CPU using NumPy, which was slightly slower | |
| # but equivalent. | |
| #box_deltas = box_deltas.detach() | |
| # Convert regressions to box corners | |
| proposals = t_convert_deltas_to_boxes( | |
| box_deltas = box_deltas, | |
| anchors = t.from_numpy(anchors).cuda(), | |
| box_delta_means = t.tensor([0, 0, 0, 0], dtype = t.float32, device = "cuda"), | |
| box_delta_stds = t.tensor([1, 1, 1, 1], dtype = t.float32, device = "cuda") | |
| ) | |
| # Keep only the top-N scores. Note that we do not care whether the | |
| # proposals were labeled as objects (score > 0.5) and peform a simple | |
| # ranking among all of them. Restricting them has a strong adverse impact | |
| # on training performance. | |
| sorted_indices = t.argsort(objectness_scores) # sort in ascending order of objectness score | |
| sorted_indices = sorted_indices.flip(dims = (0,)) # descending order of score | |
| proposals = proposals[sorted_indices][0:max_proposals_pre_nms] # grab the top-N best proposals | |
| objectness_scores = objectness_scores[sorted_indices][0:max_proposals_pre_nms] # corresponding scores | |
| # Clip to image boundaries | |
| proposals[:,0:2] = t.clamp(proposals[:,0:2], min = 0) | |
| proposals[:,2] = t.clamp(proposals[:,2], max = image_shape[1]) | |
| proposals[:,3] = t.clamp(proposals[:,3], max = image_shape[2]) | |
| # Remove anything less than 16 pixels on a side | |
| height = proposals[:,2] - proposals[:,0] | |
| width = proposals[:,3] - proposals[:,1] | |
| idxs = t.where((height >= 16) & (width >= 16))[0] | |
| proposals = proposals[idxs] | |
| objectness_scores = objectness_scores[idxs] | |
| # Perform NMS | |
| idxs = nms( | |
| boxes = proposals, | |
| scores = objectness_scores, | |
| iou_threshold = 0.7 | |
| ) | |
| idxs = idxs[0:max_proposals_post_nms] | |
| proposals = proposals[idxs] | |
| # Return network outputs as PyTorch tensors and extracted object proposals | |
| # as NumPy arrays | |
| return objectness_score_map, box_deltas_map, proposals | |
| def _extract_valid(self, anchor_map, anchor_valid_map, objectness_score_map, box_deltas_map): | |
| assert objectness_score_map.shape[0] == 1 # only batch size of 1 supported for now | |
| height, width, num_anchors = anchor_valid_map.shape | |
| anchors = anchor_map.reshape((height * width * num_anchors, 4)) # [N,4] all anchors | |
| anchors_valid = anchor_valid_map.reshape((height * width * num_anchors)) # [N,] whether anchors are valid (i.e., do not cross image boundaries) | |
| scores = objectness_score_map.reshape((height * width * num_anchors)) # [N,] prediced objectness scores | |
| box_deltas = box_deltas_map.reshape((height * width * num_anchors, 4)) # [N,4] predicted box delta regression targets | |
| if self._allow_edge_proposals: | |
| # Use all proposals | |
| return anchors, scores, box_deltas | |
| else: | |
| # Filter out those proposals generated at invalid anchors | |
| idxs = anchors_valid > 0 | |
| return anchors[idxs], scores[idxs], box_deltas[idxs] | |
| def _rpn_class_loss(predicted_scores, y_true): | |
| """ | |
| Computes RPN class loss. | |
| Parameters | |
| ---------- | |
| predicted_scores : torch.Tensor | |
| A tensor of shape (batch_size, height, width, num_anchors) containing | |
| objectness scores (0 = background, 1 = object). | |
| y_true : torch.Tensor | |
| Ground truth tensor of shape (batch_size, height, width, num_anchors, 6). | |
| Returns | |
| ------- | |
| torch.Tensor | |
| Scalar loss. | |
| """ | |
| epsilon = 1e-7 | |
| # y_true_class: (batch_size, height, width, num_anchors), same as predicted_scores | |
| y_true_class = y_true[:,:,:,:,1].reshape(predicted_scores.shape) | |
| y_predicted_class = predicted_scores | |
| # y_mask: y_true[:,:,:,0] is 1.0 for anchors included in the mini-batch | |
| y_mask = y_true[:,:,:,:,0].reshape(predicted_scores.shape) | |
| # Compute how many anchors are actually used in the mini-batch (e.g., | |
| # typically 256) | |
| N_cls = t.count_nonzero(y_mask) + epsilon | |
| # Compute element-wise loss for all anchors | |
| loss_all_anchors = F.binary_cross_entropy(input = y_predicted_class, target = y_true_class, reduction = "none") | |
| # Zero out the ones which should not have been included | |
| relevant_loss_terms = y_mask * loss_all_anchors | |
| # Sum the total loss and normalize by the number of anchors used | |
| return t.sum(relevant_loss_terms) / N_cls | |
| def _rpn_regression_loss(predicted_box_deltas, y_true): | |
| """ | |
| Computes RPN box delta regression loss. | |
| Parameters | |
| ---------- | |
| predicted_box_deltas : torch.Tensor | |
| A tensor of shape (batch_size, height, width, num_anchors * 4) containing | |
| RoI box delta regressions for each anchor, stored as: ty, tx, th, tw. | |
| y_true : torch.Tensor | |
| Ground truth tensor of shape (batch_size, height, width, num_anchors, 6). | |
| Returns | |
| ------- | |
| torch.Tensor | |
| Scalar loss. | |
| """ | |
| epsilon = 1e-7 | |
| scale_factor = 1.0 # hyper-parameter that controls magnitude of regression loss and is chosen to make regression term comparable to class term | |
| sigma = 3.0 # see: https://github.com/rbgirshick/py-faster-rcnn/issues/89 | |
| sigma_squared = sigma * sigma | |
| y_predicted_regression = predicted_box_deltas | |
| y_true_regression = y_true[:,:,:,:,2:6].reshape(y_predicted_regression.shape) | |
| # Include only anchors that are used in the mini-batch and which correspond | |
| # to objects (positive samples) | |
| y_included = y_true[:,:,:,:,0].reshape(y_true.shape[0:4]) # trainable anchors map: (batch_size, height, width, num_anchors) | |
| y_positive = y_true[:,:,:,:,1].reshape(y_true.shape[0:4]) # positive anchors | |
| y_mask = y_included * y_positive | |
| # y_mask is of the wrong shape. We have one value per (y,x,k) position but in | |
| # fact need to have 4 values (one for each of the regression variables). For | |
| # example, y_predicted might be (1,37,50,36) and y_mask will be (1,37,50,9). | |
| # We need to repeat the last dimension 4 times. | |
| y_mask = y_mask.repeat_interleave(repeats = 4, dim = 3) | |
| # The paper normalizes by dividing by a quantity called N_reg, which is equal | |
| # to the total number of anchors (~2400) and then multiplying by lambda=10. | |
| # This does not make sense to me because we are summing over a mini-batch at | |
| # most, so we use N_cls here. I might be misunderstanding what is going on | |
| # but 10/2400 = 1/240 which is pretty close to 1/256 and the paper mentions | |
| # that training is relatively insensitve to choice of normalization. | |
| N_cls = t.count_nonzero(y_included) + epsilon | |
| # Compute element-wise loss using robust L1 function for all 4 regression | |
| # components | |
| x = y_true_regression - y_predicted_regression | |
| x_abs = t.abs(x) | |
| is_negative_branch = (x_abs < (1.0 / sigma_squared)).float() | |
| R_negative_branch = 0.5 * x * x * sigma_squared | |
| R_positive_branch = x_abs - 0.5 / sigma_squared | |
| loss_all_anchors = is_negative_branch * R_negative_branch + (1.0 - is_negative_branch) * R_positive_branch | |
| # Zero out the ones which should not have been included | |
| relevant_loss_terms = y_mask * loss_all_anchors | |
| return scale_factor * t.sum(relevant_loss_terms) / N_cls | |
| class DetectorNetwork(nn.Module): | |
| def __init__(self, num_classes, dropout_probability): | |
| super().__init__() | |
| # Define network | |
| self._roi_pool = RoIPool(output_size = (7, 7), spatial_scale = 1.0 / 16.0) | |
| self._fc1 = nn.Linear(in_features = 512*7*7, out_features = 4096) | |
| self._fc2 = nn.Linear(in_features = 4096, out_features = 4096) | |
| self._classifier = nn.Linear(in_features = 4096, out_features = num_classes) | |
| self._regressor = nn.Linear(in_features = 4096, out_features = (num_classes - 1) * 4) | |
| # Dropout layers | |
| self._dropout1 = nn.Dropout(p = dropout_probability) | |
| self._dropout2 = nn.Dropout(p = dropout_probability) | |
| # Initialize weights | |
| self._classifier.weight.data.normal_(mean = 0.0, std = 0.01) | |
| self._classifier.bias.data.zero_() | |
| self._regressor.weight.data.normal_(mean = 0.0, std = 0.001) | |
| self._regressor.bias.data.zero_() | |
| def forward(self, feature_map, proposals): | |
| """ | |
| Predict final class and box delta regressions for region-of-interest | |
| proposals. The proposals serve as "anchors" for the box deltas, which | |
| refine the proposals into final boxes. | |
| Parameters | |
| ---------- | |
| feature_map : torch.Tensor | |
| Feature map of shape (batch_size, 512, height, width). | |
| proposals : torch.Tensor | |
| Region-of-interest box proposals that are likely to contain objects. | |
| Has shape (N, 4), where N is the number of proposals, with each box given | |
| as (y1, x1, y2, x2) in pixel coordinates. | |
| Returns | |
| ------- | |
| torch.Tensor, torch.Tensor | |
| Predicted classes, (N, num_classes), encoded as a one-hot vector, and | |
| predicted box delta regressions, (N, 4*(num_classes-1)), where the deltas | |
| are expressed as (ty, tx, th, tw) and are relative to each corresponding | |
| proposal box. Because there is no box for the background class 0, it is | |
| excluded entirely and only (num_classes-1) sets of box delta targets are | |
| computed. | |
| """ | |
| # Batch size of one for now, so no need to associate proposals with batches | |
| assert feature_map.shape[0] == 1, "Batch size must be 1" | |
| batch_idxs = t.zeros((proposals.shape[0], 1)).cuda() | |
| # (N, 5) tensor of (batch_idx, x1, y1, x2, y2) | |
| indexed_proposals = t.cat([ batch_idxs, proposals ], dim = 1) | |
| indexed_proposals = indexed_proposals[:, [ 0, 2, 1, 4, 3 ]] # each row, (batch_idx, y1, x1, y2, x2) -> (batch_idx, x1, y1, x2, y2) | |
| # RoI pooling: (N, 512, 7, 7) | |
| rois = self._roi_pool(feature_map, indexed_proposals) | |
| rois = rois.reshape((rois.shape[0], 512*7*7)) # flatten each RoI: (N, 512*7*7) | |
| # Forward propagate | |
| y1o = F.relu(self._fc1(rois)) | |
| y1 = self._dropout1(y1o) | |
| y2o = F.relu(self._fc2(y1)) | |
| y2 = self._dropout2(y2o) | |
| classes_raw = self._classifier(y2) | |
| classes = F.softmax(classes_raw, dim = 1) | |
| box_deltas = self._regressor(y2) | |
| return classes, box_deltas | |
| def _detector_class_loss(predicted_classes, y_true): | |
| """ | |
| Computes detector class loss. | |
| Parameters | |
| ---------- | |
| predicted_classes : torch.Tensor | |
| RoI predicted classes as categorical vectors, (N, num_classes). | |
| y_true : torch.Tensor | |
| RoI class labels as categorical vectors, (N, num_classes). | |
| Returns | |
| ------- | |
| torch.Tensor | |
| Scalar loss. | |
| """ | |
| epsilon = 1e-7 | |
| scale_factor = 1.0 | |
| cross_entropy_per_row = -(y_true * t.log(predicted_classes + epsilon)).sum(dim = 1) | |
| N = cross_entropy_per_row.shape[0] + epsilon | |
| cross_entropy = t.sum(cross_entropy_per_row) / N | |
| return scale_factor * cross_entropy | |
| def _detector_regression_loss(predicted_box_deltas, y_true): | |
| """ | |
| Computes detector regression loss. | |
| Parameters | |
| ---------- | |
| predicted_box_deltas : torch.Tensor | |
| RoI predicted box delta regressions, (N, 4*(num_classes-1)). The background | |
| class is excluded and only the non-background classes are included. Each | |
| set of box deltas is stored in parameterized form as (ty, tx, th, tw). | |
| y_true : torch.Tensor | |
| RoI box delta regression ground truth labels, (N, 2, 4*(num_classes-1)). | |
| These are stored as mask values (1 or 0) in (:,0,:) and regression | |
| parameters in (:,1,:). Note that it is important to mask off the predicted | |
| and ground truth values because they may be set to invalid values. | |
| Returns | |
| ------- | |
| torch.Tensor | |
| Scalar loss. | |
| """ | |
| epsilon = 1e-7 | |
| scale_factor = 1.0 | |
| sigma = 1.0 | |
| sigma_squared = sigma * sigma | |
| # We want to unpack the regression targets and the mask of valid targets into | |
| # tensors each of the same shape as the predicted: | |
| # (num_proposals, 4*(num_classes-1)) | |
| # y_true has shape: | |
| # (num_proposals, 2, 4*(num_classes-1)) | |
| y_mask = y_true[:,0,:] | |
| y_true_targets = y_true[:,1,:] | |
| # Compute element-wise loss using robust L1 function for all 4 regression | |
| # targets | |
| x = y_true_targets - predicted_box_deltas | |
| x_abs = t.abs(x) | |
| is_negative_branch = (x < (1.0 / sigma_squared)).float() | |
| R_negative_branch = 0.5 * x * x * sigma_squared | |
| R_positive_branch = x_abs - 0.5 / sigma_squared | |
| losses = is_negative_branch * R_negative_branch + (1.0 - is_negative_branch) * R_positive_branch | |
| # Normalize to number of proposals (e.g., 128). Although this may not be | |
| # what the paper does, it seems to work. Other implemetnations do this. | |
| # Using e.g., the number of positive proposals will cause the loss to | |
| # behave erratically because sometimes N will become very small. | |
| N = y_true.shape[0] + epsilon | |
| relevant_loss_terms = y_mask * losses | |
| return scale_factor * t.sum(relevant_loss_terms) / N | |
| class FeatureExtractor(nn.Module): | |
| def __init__(self): | |
| super().__init__() | |
| self._block1_conv1 = nn.Conv2d(in_channels = 3, out_channels = 64, kernel_size = (3, 3), stride = 1, padding = "same") | |
| self._block1_conv2 = nn.Conv2d(in_channels = 64, out_channels = 64, kernel_size = (3, 3), stride = 1, padding = "same") | |
| self._block1_pool = nn.MaxPool2d(kernel_size = (2, 2), stride = 2) | |
| self._block2_conv1 = nn.Conv2d(in_channels = 64, out_channels = 128, kernel_size = (3, 3), stride = 1, padding = "same") | |
| self._block2_conv2 = nn.Conv2d(in_channels = 128, out_channels = 128, kernel_size = (3, 3), stride = 1, padding = "same") | |
| self._block2_pool = nn.MaxPool2d(kernel_size = (2, 2), stride = 2) | |
| self._block3_conv1 = nn.Conv2d(in_channels = 128, out_channels = 256, kernel_size = (3, 3), stride = 1, padding = "same") | |
| self._block3_conv2 = nn.Conv2d(in_channels = 256, out_channels = 256, kernel_size = (3, 3), stride = 1, padding = "same") | |
| self._block3_conv3 = nn.Conv2d(in_channels = 256, out_channels = 256, kernel_size = (3, 3), stride = 1, padding = "same") | |
| self._block3_pool = nn.MaxPool2d(kernel_size = (2, 2), stride = 2) | |
| self._block4_conv1 = nn.Conv2d(in_channels = 256, out_channels = 512, kernel_size = (3, 3), stride = 1, padding = "same") | |
| self._block4_conv2 = nn.Conv2d(in_channels = 512, out_channels = 512, kernel_size = (3, 3), stride = 1, padding = "same") | |
| self._block4_conv3 = nn.Conv2d(in_channels = 512, out_channels = 512, kernel_size = (3, 3), stride = 1, padding = "same") | |
| self._block4_pool = nn.MaxPool2d(kernel_size = (2, 2), stride = 2) | |
| self._block5_conv1 = nn.Conv2d(in_channels = 512, out_channels = 512, kernel_size = (3, 3), stride = 1, padding = "same") | |
| self._block5_conv2 = nn.Conv2d(in_channels = 512, out_channels = 512, kernel_size = (3, 3), stride = 1, padding = "same") | |
| self._block5_conv3 = nn.Conv2d(in_channels = 512, out_channels = 512, kernel_size = (3, 3), stride = 1, padding = "same") | |
| # Freeze first two convolutional blocks | |
| self._block1_conv1.weight.requires_grad = False | |
| self._block1_conv1.bias.requires_grad = False | |
| self._block1_conv2.weight.requires_grad = False | |
| self._block1_conv2.bias.requires_grad = False | |
| self._block2_conv1.weight.requires_grad = False | |
| self._block2_conv1.bias.requires_grad = False | |
| self._block2_conv2.weight.requires_grad = False | |
| self._block2_conv2.bias.requires_grad = False | |
| def forward(self, image_data): | |
| """ | |
| Converts input images into feature maps using VGG-16 convolutional layers. | |
| Parameters | |
| ---------- | |
| image_data : torch.Tensor | |
| A tensor of shape (batch_size, channels, height, width) representing | |
| images normalized using the VGG-16 convention (BGR, ImageNet channel-wise | |
| mean-centered). | |
| Returns | |
| ------- | |
| torch.Tensor | |
| Feature map of shape (batch_size, 512, height // 16, width // 16). | |
| """ | |
| y = F.relu(self._block1_conv1(image_data)) | |
| y = F.relu(self._block1_conv2(y)) | |
| y = self._block1_pool(y) | |
| y = F.relu(self._block2_conv1(y)) | |
| y = F.relu(self._block2_conv2(y)) | |
| y = self._block2_pool(y) | |
| y = F.relu(self._block3_conv1(y)) | |
| y = F.relu(self._block3_conv2(y)) | |
| y = F.relu(self._block3_conv3(y)) | |
| y = self._block3_pool(y) | |
| y = F.relu(self._block4_conv1(y)) | |
| y = F.relu(self._block4_conv2(y)) | |
| y = F.relu(self._block4_conv3(y)) | |
| y = self._block4_pool(y) | |
| y = F.relu(self._block5_conv1(y)) | |
| y = F.relu(self._block5_conv2(y)) | |
| y = F.relu(self._block5_conv3(y)) | |
| return y | |
| def t_intersection_over_union(boxes1, boxes2): | |
| """ | |
| Equivalent to intersection_over_union(), operating on PyTorch tensors. | |
| Parameters | |
| ---------- | |
| boxes1 : torch.Tensor | |
| Box corners, shaped (N, 4), with each box as (y1, x1, y2, x2). | |
| boxes2 : torch.Tensor | |
| Box corners, shaped (M, 4). | |
| Returns | |
| ------- | |
| torch.Tensor | |
| IoUs for each pair of boxes in boxes1 and boxes2, shaped (N, M). | |
| """ | |
| top_left_point = t.maximum(boxes1[:,None,0:2], boxes2[:,0:2]) # (N,1,2) and (M,2) -> (N,M,2) indicating top-left corners of box pairs | |
| bottom_right_point = t.minimum(boxes1[:,None,2:4], boxes2[:,2:4]) # "" bottom-right corners "" | |
| well_ordered_mask = t.all(top_left_point < bottom_right_point, axis = 2) # (N,M) indicating whether top_left_x < bottom_right_x and top_left_y < bottom_right_y (meaning boxes may intersect) | |
| intersection_areas = well_ordered_mask * t.prod(bottom_right_point - top_left_point, dim = 2) # (N,M) indicating intersection area (bottom_right_x - top_left_x) * (bottom_right_y - top_left_y) | |
| areas1 = t.prod(boxes1[:,2:4] - boxes1[:,0:2], dim = 1) # (N,) indicating areas of boxes1 | |
| areas2 = t.prod(boxes2[:,2:4] - boxes2[:,0:2], dim = 1) # (M,) indicating areas of boxes2 | |
| union_areas = areas1[:,None] + areas2 - intersection_areas # (N,1) + (M,) - (N,M) = (N,M), union areas of both boxes | |
| epsilon = 1e-7 | |
| return intersection_areas / (union_areas + epsilon) | |
| def t_convert_deltas_to_boxes(box_deltas, anchors, box_delta_means, box_delta_stds): | |
| """ | |
| Equivalent to convert_deltas_to_boxes(), operating on PyTorch tensors. | |
| Parameters | |
| ---------- | |
| box_deltas : torch.Tensor | |
| Box deltas with shape (N, 4). Each row is (ty, tx, th, tw). | |
| anchors : torch.Tensor | |
| Corresponding anchors that the box deltas are based upon, shaped (N, 4) | |
| with each row being (center_y, center_x, height, width). | |
| box_delta_means : torch.Tensor | |
| Mean ajustment to box deltas, (4,), to be added after standard deviation | |
| scaling and before conversion to actual box coordinates. | |
| box_delta_stds : torch.Tensor | |
| Standard deviation adjustment to box deltas, (4,). Box deltas are first | |
| multiplied by these values. | |
| Returns | |
| ------- | |
| torch.Tensor | |
| Box coordinates, (N, 4), with each row being (y1, x1, y2, x2). | |
| """ | |
| box_deltas = box_deltas * box_delta_stds + box_delta_means | |
| center = anchors[:,2:4] * box_deltas[:,0:2] + anchors[:,0:2] # center_x = anchor_width * tx + anchor_center_x, center_y = anchor_height * ty + anchor_center_y | |
| size = anchors[:,2:4] * t.exp(box_deltas[:,2:4]) # width = anchor_width * exp(tw), height = anchor_height * exp(th) | |
| boxes = t.empty(box_deltas.shape, dtype = t.float32, device = "cuda") | |
| boxes[:,0:2] = center - 0.5 * size # y1, x1 | |
| boxes[:,2:4] = center + 0.5 * size # y2, x2 | |
| return boxes | |
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Unfortunately github truncated my gist. The full file is here: https://www.dropbox.com/s/ldjnyloxq2vpi2s/pytorch_memory_leak.py
I know it's large but it spins up and executes quickly :)