本blog为github上版源码解析系列代码笔记
---------------个人学习笔记---------------
----------------本文作者吴疆--------------
------------
1.当不通过RPN获得roi需要做如下修改:
demo.py中调用demo()时需传入boxes参数
调用im_detect()时需传入boxes参数,否则boxes默认为None
cfg.TEST.HAS_RPN改为False
_get_blobs(im, boxes)中_get_rois_blob(rois,cfg.TEST.SCALES_BASE)的cfg.TEST.SCALES_BASE改为im_scale_factors
修改VGGnet_test.py网络文件:self.im_info改为self.rois,其占位符shape改为[None, 5],将self.layers字典中‘im_info’均改为‘rois’,删除网络与RPN相关的层。
import tensorflow as tffrom .network import Networkfrom ..fast_rcnn.config import cfgclass VGGnet_test(Network): def __init__(self, trainable=True): self.inputs = [] self.data = tf.placeholder(tf.float32, shape=[None, None, None, 3]) self.rois = tf.placeholder(tf.float32, shape=[None, 5]) self.keep_prob = tf.placeholder(tf.float32) self.layers = dict({ 'data': self.data, 'rois': self.rois}) self.trainable = trainable self.setup() def setup(self): # n_classes = 21 n_classes = cfg.NCLASSES # anchor_scales = [8, 16, 32] anchor_scales = cfg.ANCHOR_SCALES _feat_stride = [16, ] (self.feed('data') .conv(3, 3, 64, 1, 1, name='conv1_1', trainable=False) .conv(3, 3, 64, 1, 1, name='conv1_2', trainable=False) .max_pool(2, 2, 2, 2, padding='VALID', name='pool1') .conv(3, 3, 128, 1, 1, name='conv2_1', trainable=False) .conv(3, 3, 128, 1, 1, name='conv2_2', trainable=False) .max_pool(2, 2, 2, 2, padding='VALID', name='pool2') .conv(3, 3, 256, 1, 1, name='conv3_1') .conv(3, 3, 256, 1, 1, name='conv3_2') .conv(3, 3, 256, 1, 1, name='conv3_3') .max_pool(2, 2, 2, 2, padding='VALID', name='pool3') .conv(3, 3, 512, 1, 1, name='conv4_1') .conv(3, 3, 512, 1, 1, name='conv4_2') .conv(3, 3, 512, 1, 1, name='conv4_3') .max_pool(2, 2, 2, 2, padding='VALID', name='pool4') .conv(3, 3, 512, 1, 1, name='conv5_1') .conv(3, 3, 512, 1, 1, name='conv5_2') .conv(3, 3, 512, 1, 1, name='conv5_3')) # (self.feed('conv5_3') # .conv(3, 3, 512, 1, 1, name='rpn_conv/3x3') # .conv(1, 1, len(anchor_scales) * 3 * 2, 1, 1, padding='VALID', relu=False, name='rpn_cls_score')) # # (self.feed('rpn_conv/3x3') # .conv(1, 1, len(anchor_scales) * 3 * 4, 1, 1, padding='VALID', relu=False, name='rpn_bbox_pred')) # # # shape is (1, H, W, Ax2) -> (1, H, WxA, 2) # (self.feed('rpn_cls_score') # .spatial_reshape_layer(2, name='rpn_cls_score_reshape') # .spatial_softmax(name='rpn_cls_prob')) # # # shape is (1, H, WxA, 2) -> (1, H, W, Ax2) # (self.feed('rpn_cls_prob') # .spatial_reshape_layer(len(anchor_scales) * 3 * 2, name='rpn_cls_prob_reshape')) # # (self.feed('rpn_cls_prob_reshape', 'rpn_bbox_pred', 'im_info') # .proposal_layer(_feat_stride, anchor_scales, 'TEST', name='rois')) (self.feed('conv5_3', 'rois') .roi_pool(7, 7, 1.0 / 16, name='pool_5') .fc(4096, name='fc6') .fc(4096, name='fc7') .fc(n_classes, relu=False, name='cls_score') .softmax(name='cls_prob')) (self.feed('fc7') .fc(n_classes * 4, relu=False, name='bbox_pred'))
2.test.py中im_detect(sess, net, im, boxes=None)调用函数执行顺序(其中,形参im应为BGR顺序)
def im_detect(sess, net, im, boxes=None): """Detect object classes in an image given object proposals. Arguments: net (caffe.Net): Fast R-CNN network to use im (ndarray): color image to test (in BGR order) boxes (ndarray): R x 4 array of object proposals Returns: scores (ndarray): R x K array of object class scores (K includes background as object category 0) boxes (ndarray): R x (4*K) array of predicted bounding boxes """
_get_blobs(im, boxes)获取blobs数据字典(含data、rois字段,data与rois均经过缩放)与图像缩放因子数组im_scales(默认不使用图像金字塔,则数组仅含一个元素),其中,还对所有的blobs['rois']进行去重(判断映射到主干网络产生的feature map相同位置的),更新blobs['rois'](按缩放因子缩放了的,R*5含第1维全0 level)和boxes(原始传入的boxes,未被缩放,R*4)
--->forward pass前向传播,构造feed_dict字典(含net.data即blobs['data']、net.rois即blobs['rois']、net.keep_prob为1.0,与VGGnet_test.py的输入对应)并通过运行sess传入
--->通过调用(network.py中)net.get_output()得到cls_score(cfg.TEST.SVM为True时的得分,即不经过softmax)、cls_prob(默认cfg.TEST.SVM为False,经过softmax处理的最终得分)、bbox_pred坐标补偿量、rois(维度分别为R2*n_classes、R2*n_classes、R2*(n_classes*4)、R2*5(R2 <= R),对应于VGGnet_test.py相应层输出)
--->默认cfg.TEST.BBOX_REG为True调用bbox_transform_inv(boxes,box_deltas)(bbox_transform.py中)回归boxes得到pred_boxes
--->调用_clip_boxes(pred_boxes,im.shape)将越界的(回归后的boxes)pred_boxes改为图像边界得到更新的pred_boxes
--->更新scores和pred_boxes并返回,此时score维度由R2*n_classes变回R*n_classes,pred_boxes维度也由R2*(n_classes*4)变为R*(n_classes*4)(R为最初的boxes数量)即说明对于映射到feature map相同位置的原始boxes,其最终回归为拥有相同score的同一个box。说明返回的scores和pred_boxes中有重复的元素(且仍然为最初boxes的排序,具体可参考np.unique()函数)(未明白为何这样设计,只统计unique的感觉更好)
# When mapping from image ROIs to feature map ROIs, there's some aliasing # (some distinct image ROIs get mapped to the same feature ROI). # Here, we identify duplicate feature ROIs, so we only compute features # on the unique subset. if cfg.DEDUP_BOXES > 0 and not cfg.TEST.HAS_RPN: #1/16 v = np.array([1, 1e3, 1e6, 1e9, 1e12]) hashes = np.round(blobs['rois'] * cfg.DEDUP_BOXES).dot(v) _, index, inv_index = np.unique(hashes, return_index=True, return_inverse=True) blobs['rois'] = blobs['rois'][index, :] boxes = boxes[index, :]
cls_score, cls_prob, bbox_pred, rois = \ sess.run([net.get_output('cls_score'), net.get_output('cls_prob'), net.get_output('bbox_pred'),net.get_output('rois')],\ feed_dict=feed_dict)
if cfg.DEDUP_BOXES > 0 and not cfg.TEST.HAS_RPN: # Map scores and predictions back to the original set of boxes scores = scores[inv_index, :] pred_boxes = pred_boxes[inv_index, :]
# -*- coding:utf-8 -*-# Author: WUJiang# 测试功能import numpy as nphashes = np.array([ [4], [1], [6], [7], [1] ])_, index, inv_index = np.unique(hashes, return_index=True, return_inverse=True)print(inv_index)# _ [1 4 6 7]# index [1 0 2 3] # inv_index [1 0 2 3 0]
_为去重并排序后的数组、index为_中元素在原数组中第一次出现时的索引构成的数组、inv_index为原数组元素在_中的对应索引构成的数组。
3._get_blobs(im, rois)函数的执行逻辑(rois即为传入的boxes)
调用_get_image_blob(im)获取blobs['data']([None, None, None,3],与VGGnet_test.py中data占位符对应)和im_scale_factors缩放因子数组(即im_scales),调用_get_rois_blob(rois,im_scale_factors)得到blobs['rois'](R*5)返回blobs和im_scale_factors
def _get_image_blob(im): """Converts an image into a network input. Arguments: im (ndarray): a color image in BGR order Returns: blob (ndarray): a data blob holding an image pyramid im_scale_factors (list): list of image scales (relative to im) used in the image pyramid # 此处注释有误,应为ndarray而非list """
im--->减去像素均值操作--->判断以长边(1000)/短边(600)缩放--->双线性差值对图像进行缩放im = cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale,interpolation=cv2.INTER_LINEAR)--->
blob=im_list_to_blob(processed_ims)(blob.py中)将(图像金字塔处理后得到的)图像列表转化为blob数据(即blobs['data'])返回blob和np.array(im_scale_factors)缩放因子数组(不使用图像金字塔时processed_ims和im_scale_factors均只含一个元素)
def _get_rois_blob(im_rois, im_scale_factors): """Converts RoIs into network inputs. Arguments: im_rois (ndarray): R x 4 matrix of RoIs in original image coordinates im_scale_factors (list): scale factors as returned by _get_image_blob # 此处注释有误,应为ndarray而非list Returns: blob (ndarray): R x 5 matrix of RoIs in the image pyramid """
调用_project_im_rois(im_rois, im_scale_factors)得到rois和levels,拼接rois和levels构成rois_blob(即data['rois'],shape为R*5,5=1level+4坐标,默认不使用图像金字塔level均为0)
def _project_im_rois(im_rois, scales): """Project image RoIs into the image pyramid built by _get_image_blob. Arguments: im_rois (ndarray): R x 4 matrix of RoIs in original image coordinates scales (list): scale factors as returned by _get_image_blob # 此处注释有误,应为ndarray而非list Returns: rois (ndarray): R x 4 matrix of projected RoI coordinates levels (list): image pyramid levels used by each projected RoI # 此处注释有误,应为ndarray而非list """
rois = im_rois * scales[levels] rois按缩放因子进行缩放返回rois和levels
# -*- coding:utf-8 -*-# Author: WUJiang# 测试功能import numpy as npim_rois = np.array([ [1, 2, 4, 5], [2, 3, 7, 8]])scale = np.array([0.6])levels = np.array([ [0], [0]])"""[[0.6] [0.6]]"""print(scale[levels])rois = im_rois * scale[levels]"""[[0.6 1.2 2.4 3. ] [1.2 1.8 4.2 4.8]]"""print(rois)
4.test_net(sess, net, imdb, weight_filename, max_per_image = 300即top-300个proposal, thresh = 0.05即得分阈值, vis = False)的执行逻辑(被test_net.py调用)
def test_net(sess, net, imdb, weights_filename , max_per_image=300, thresh=0.05, vis=False): #被test_net.py调用 #weights_filename不包含后缀的模型文件名,类似于vggnet...70000 """Test a Fast R-CNN network on an image database."""
构造all_boxes列表,其中all_boxes[cls_ind类别索引][im_id图像索引]为该类的R*5的检测结果det,即R*(x1,y1,x2,y2,score) ( 其中图像共num_images张,num_images=len(imdb.image_index) )
--->调用get_output_dir(imdb,weights_filename)(config.py中)得到输出对应数据集测试结果pkl的部分绝对路径(需+detection.pkl)
--->构造计时器字典_t(含im_detect和misc字段,分别创建了一个Timer对象)
--->(不使用RPN时)roidb=imdb.roidb(维度和内容不明,imdb类)--->得到测试结果pkl文件全路径det_file(如output/faster_rcnn_voc_vgg/voc_2007_test/VGGnet_fast_rcnn_iter_6020/detections.pkl)
--->i循环遍历每一张图像 bbox_proposal = roidb[i]['boxes'][roidb[i]['gt_classes'] == 0](维度和内容不明,使得含义不明)剔除为gt的rois,只评估非roi的proposal的效果(因为roi有可能来源于training or val split)
--->读取图像im--->调用im_detect(sess, net, im, box_proposal)返回scores和boxes,前后时间戳相减得到耗时detect_time
--->对于每张图像,从1~imdb.num_classes j循环遍历各个类别(0为背景),取出该类score得分超过thresh阈值的boxes拼接成cls_det(None,5),类内执行NMS(nms是C编译的.so),更新cls_det,将cls_det填入 all_boxes对应位置all_boxes[j][i]
--->每张图像最多保留包含所有类别的max_per_image(默认为300,即top-300)个得分最高的boxes,更新all_boxes[j][i],im_detect()后的处理均纳入NMS耗时nms_time--->打印相关测试信息
--->两层循环结束后,利用cPickle向det_file中写入all_boxes检测结果--->调用imdb.evaluate_detection(all_boxes, output_dir)(lib.datasets.imdb.pascal_voc.py中)评估算法精度(如计算AP值)、按类存储相关检测结果
# -*- coding:utf-8 -*-# Author: WUJiang# 测试功能# all_boxes[cls][image] = N x 5 array of detections in (x1, y1, x2, y2, score)all_boxes = [[[] for _ in range(5)] # num_images for _ in range(4)] # num_classes"""[ [[], [], [], [], []], [[], [], [], [], []], [[], [], [], [], []], [[], [], [], [], []]]"""print(all_boxes)
5.其他函数
_clip_boxes(boxes, im_shape)
def _clip_boxes(boxes, im_shape): """Clip boxes to image boundaries.""" # x1 >= 0 boxes[:, 0::4] = np.maximum(boxes[:, 0::4], 0) # y1 >= 0 boxes[:, 1::4] = np.maximum(boxes[:, 1::4], 0) # x2 < im_shape[1] boxes[:, 2::4] = np.minimum(boxes[:, 2::4], im_shape[1] - 1) # y2 < im_shape[0] boxes[:, 3::4] = np.minimum(boxes[:, 3::4], im_shape[0] - 1) return boxes
# -*- coding:utf-8 -*-# Author: WUJiang# 测试功能import numpy as npboxes = np.array([ [1, 2, 5, 7], [2, 4, 8, 9] ])"""[[1] [2]]"""print(boxes[:, 0::4])# [1 2]print(boxes[:, 0])
# -*- coding:utf-8 -*-# Author: WUJiang# 测试功能import numpy as npa = np.array([ [0, 1, 2, 3, 4, 5, 6, 7], [1, 2, 3, 4, 5, 6, 7, 8], [2, 3, 4, 5, 6, 7, 8, 9]])"""[[0 4] [1 5] [2 6]]"""b = a[:, 0::4] # 第二维以4为步长print(b)
_rescale_boxes(boxes, inds, scales) 未见调用
def _rescale_boxes(boxes, inds, scales): """Rescale boxes according to image rescaling.""" for i in xrange(boxes.shape[0]): # 并不是生成序列,而是作为一个生成器,生成一个取出一个 boxes[i,:] = boxes[i,:] / scales[int(inds[i])] return boxes
vis_detection(im, class_name, dets, thresh=0.8) 可视化检测结果,绘制boxes矩形框 (demo.py中将其重构,不太理解的是这里为何要将10与dets.shape[0]进行比较)
被test_net(sess, net, imdb, weight_filename, max_per_image = 300, thresh = 0.05, vis = False)调用
def vis_detections(im, class_name, dets, thresh=0.8): """Visual debugging of detections.""" import matplotlib.pyplot as plt #im = im[:, :, (2, 1, 0)] for i in xrange(np.minimum(10, dets.shape[0])): #dets由bbox坐标和score拼成 bbox = dets[i, :4] score = dets[i, -1] if score > thresh: #plt.cla() #plt.imshow(im) plt.gca().add_patch( plt.Rectangle((bbox[0], bbox[1]), bbox[2] - bbox[0], bbox[3] - bbox[1], fill=False, edgecolor='g', linewidth=3) ) plt.gca().text(bbox[0], bbox[1] - 2, '{:s} {:.3f}'.format(class_name, score), bbox=dict(facecolor='blue', alpha=0.5), fontsize=14, color='white') plt.title('{} {:.3f}'.format(class_name, score)) #plt.show()
apply_nms(all_boxes, thresh) 未见调用
def apply_nms(all_boxes, thresh): """Apply non-maximum suppression to all predicted boxes output by the test_net method. """ num_classes = len(all_boxes) num_images = len(all_boxes[0]) nms_boxes = [[[] for _ in xrange(num_images)] for _ in xrange(num_classes)] for cls_ind in xrange(num_classes): for im_ind in xrange(num_images): dets = all_boxes[cls_ind][im_ind] if dets == []: continue x1 = dets[:, 0] y1 = dets[:, 1] x2 = dets[:, 2] y2 = dets[:, 3] scores = dets[:, 4] inds = np.where((x2 > x1) & (y2 > y1) & (scores > cfg.TEST.DET_THRESHOLD))[0] dets = dets[inds,:] if dets == []: continue keep = nms(dets, thresh) # 核心在这里 if len(keep) == 0: continue nms_boxes[cls_ind][im_ind] = dets[keep, :].copy() return nms_boxes
传入参数all_boxes可见test_net(sess, net, imdb, weights_filename, max_per_image. thresh, vis)函数