# Build the positive (foreground) query mask for one feature level.
# The foreground queries must satisfy two requirements:
#   1. the queries are located inside bounding boxes
#   2. the distance from the queries to the box center matches the feature
#      map stride
# NOTE(review): the last dim of `border_distances` presumably holds the
# per-side distances from a query to a box's borders -- confirm against the
# code that builds it.

# Reduce over the last dim: smallest / largest border distance per pair.
min_border_distances = torch.min(border_distances, dim=-1)[0]  # [h*w, m]
max_border_distances = torch.max(border_distances, dim=-1)[0]

# Requirement 1: the smallest border distance must be strictly positive.
mask_in_gt_boxes = min_border_distances > 0

# Requirement 2: the largest border distance must fall inside this level's
# half-open range (min_limit, max_limit].
min_limit, max_limit = self.limit_range[level_idx]
mask_in_level = (max_border_distances > min_limit) & (max_border_distances <= max_limit)

# A query is positive only when both requirements hold.
mask_pos = mask_in_gt_boxes & mask_in_level
# Select the top-k salient tokens of every feature level, walking from the
# highest level index down to level 0 so that each level can be modulated by
# the score predicted for the level processed just before it.
batch_size = feat_flatten.shape[0]
selected_score = []  # per-level top-k scores, appended highest level first
selected_inds = []  # per-level top-k indices into the flattened sequence
salience_score = []  # per-level score maps reshaped to the level's grid
for level_idx in range(spatial_shapes.shape[0] - 1, -1, -1):
    # Token span of this level inside the flattened multi-level sequence.
    # The last level has no successor offset, so its slice runs to the end.
    start_index = level_start_index[level_idx]
    end_index = level_start_index[level_idx + 1] if level_idx < spatial_shapes.shape[0] - 1 else None
    level_memory = backbone_output_memory[:, start_index:end_index, :]
    mask = mask_flatten[:, start_index:end_index]

    # Update the memory using the higher-level score prediction. Skipped on
    # the first iteration, where no `score` has been computed yet.
    if level_idx != spatial_shapes.shape[0] - 1:
        # Resize the previous iteration's score map to this level's grid.
        upsample_score = torch.nn.functional.interpolate(
            score,
            size=spatial_shapes[level_idx].unbind(),
            mode="bilinear",
            align_corners=True,
        )
        # Flatten the grid and move the token axis to dim 1 so the score
        # broadcasts against `level_memory`.
        upsample_score = upsample_score.view(batch_size, -1, spatial_shapes[level_idx].prod())
        upsample_score = upsample_score.transpose(1, 2)
        # Residual re-weighting scaled by the per-level factor `self.alpha`.
        level_memory = level_memory + level_memory * upsample_score * self.alpha[level_idx]

    # Predict the foreground score of the current level.
    score = self.enc_mask_predictor(level_memory)
    # Masked positions receive the global minimum score, so they rank last
    # in the top-k below.
    # NOTE(review): assumes the predictor emits a single channel (hence
    # squeeze(-1)) and that `mask` is True on positions to ignore -- confirm.
    valid_score = score.squeeze(-1).masked_fill(mask, score.min())
    # Reshape to this level's grid; used by the next (lower) level's
    # interpolation and stored in `salience_score`.
    score = score.transpose(1, 2).view(batch_size, -1, *spatial_shapes[level_idx])

    # Get the top-k salience indices of the current feature map level and
    # shift them by the level offset so they index the flattened sequence.
    level_score, level_inds = valid_score.topk(level_token_nums[level_idx], dim=1)
    level_inds = level_inds + level_start_index[level_idx]
    salience_score.append(score)
    selected_inds.append(level_inds)
    selected_score.append(level_score)