解读SPP / SPPF / SimSPPF / ASPP / RFB / SPPCSPC

本文主要是介绍解读SPP / SPPF / SimSPPF / ASPP / RFB / SPPCSPC，希望对大家解决编程问题提供一定的参考价值，需要的开发者们随着小编来一起学习吧！

SPP与SPPF
一、SPP的应用的背景
在卷积神经网络中我们经常看到固定输入的设计，但是如果我们输入的不能是固定尺寸的该怎么办呢？

通常来说，我们有以下几种方法：

（1）对输入进行resize操作，让他们统统变成你设计的层的输入规格那样。但是这样过于暴力直接，可能会丢失很多信息或者多出很多不该有的信息（图片变形等），影响最终的结果。

（2）替换网络中的全连接层，对最后的卷积层使用global average pooling，全局平均池化只和通道数有关，而与特征图大小没有关系

（3）最后一个当然是我们要讲的SPP结构

Note：
但是在yolov5中SPP/SPPF作用是：实现局部特征和全局特征的featherMap级别的融合。

二、SPP结构分析
SPP结构又被称为空间金字塔池化，能将任意大小的特征图转换成固定大小的特征向量。

接下来我们来详述一下SPP是怎么处理滴~

输入层：首先我们现在有一张任意大小的图片，其大小为w * h。

输出层：21个神经元 – 即我们待会希望提取到21个特征。

分析如下图所示：分别对1 * 1分块，2 * 2分块和4 * 4子图里分别取每一个框内的max值（即取蓝框框内的最大值），这一步就是作最大池化，这样最后提取出来的特征值（即取出来的最大值）一共有1 * 1 + 2 * 2 + 4 * 4 = 21个。得出的特征再concat在一起。

在这里插入图片描述
而在YOLOv5中SPP的结构图如下图所示：

其中，前后各多加一个CBL，中间的kernel size分别为1 * 1，5 * 5，9 * 9和13 * 13。

三、SPPF结构分析
CBL(conv+BN+Leaky relu)改成CBS(conv+BN+SiLU)哈，之前没注意它的名称变化。
在这里插入图片描述
四、YOLOv5中SPP/SPPF结构源码解析（内含注释分析）

代码注释与上图的SPP结构相对应。

class SPP(nn.Module):def __init__(self, c1, c2, k=(5, 9, 13)):#这里5，9，13，就是初始化的kernel sizesuper().__init__()c_ = c1 // 2  # hidden channelsself.cv1 = Conv(c1, c_, 1, 1)#这里对应第一个CBLself.cv2 = Conv(c_ * (len(k) + 1), c2, 1, 1)#这里对应SPP操作里的最后一个CBLself.m = nn.ModuleList([nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k])#这里对应SPP核心操作，对5 * 5分块，9 * 9分块和13 * 13子图分别取最大池化def forward(self, x):x = self.cv1(x)with warnings.catch_warnings():warnings.simplefilter('ignore')  # suppress torch 1.9.0 max_pool2d() warning忽略警告return self.cv2(torch.cat([x] + [m(x) for m in self.m], 1))#torch.cat对应concat
# SPPF结构
class SPPF(nn.Module):# Spatial Pyramid Pooling - Fast (SPPF) layer for YOLOv5 by Glenn Jocherdef __init__(self, c1, c2, k=5):  # equivalent to SPP(k=(5, 9, 13))super().__init__()c_ = c1 // 2  # hidden channelsself.cv1 = Conv(c1, c_, 1, 1)self.cv2 = Conv(c_ * 4, c2, 1, 1)self.m = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)def forward(self, x):x = self.cv1(x)#先通过CBL进行通道数的减半with warnings.catch_warnings():warnings.simplefilter('ignore')  # suppress torch 1.9.0 max_pool2d() warningy1 = self.m(x)y2 = self.m(y1)#上述两次最大池化return self.cv2(torch.cat([x, y1, y2, self.m(y2)], 1))#将原来的x,一次池化后的y1,两次池化后的y2,3次池化的self.m(y2)先进行拼接，然后再CBL

实验对比
下面做个简单的小实验，对比下SPP和SPPF的计算结果以及速度，代码如下（注意这里将SPPF中最开始和结尾处的1x1卷积层给去掉了，只对比含有MaxPool的部分）：

import time
import torch
import torch.nn as nnclass SPP(nn.Module):def __init__(self):super().__init__()self.maxpool1 = nn.MaxPool2d(5, 1, padding=2)self.maxpool2 = nn.MaxPool2d(9, 1, padding=4)self.maxpool3 = nn.MaxPool2d(13, 1, padding=6)def forward(self, x):o1 = self.maxpool1(x)o2 = self.maxpool2(x)o3 = self.maxpool3(x)return torch.cat([x, o1, o2, o3], dim=1)class SPPF(nn.Module):def __init__(self):super().__init__()self.maxpool = nn.MaxPool2d(5, 1, padding=2)def forward(self, x):o1 = self.maxpool(x)o2 = self.maxpool(o1)o3 = self.maxpool(o2)return torch.cat([x, o1, o2, o3], dim=1)def main():input_tensor = torch.rand(8, 32, 16, 16)spp = SPP()sppf = SPPF()output1 = spp(input_tensor)output2 = sppf(input_tensor)print(torch.equal(output1, output2))t_start = time.time()for _ in range(100):spp(input_tensor)print(f"spp time: {time.time() - t_start}")t_start = time.time()for _ in range(100):sppf(input_tensor)print(f"sppf time: {time.time() - t_start}")if __name__ == '__main__':main()"""运行结果"""
True
spp time: 0.5373051166534424
sppf time: 0.20780706405639648

更多类型的SPP
1.1 SPP（Spatial Pyramid Pooling）
SPP模块是何凯明大神在2015年的论文《Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition》中被提出。

SPP全程为空间金字塔池化结构，主要是为了解决两个问题：

有效避免了对图像区域裁剪、缩放操作导致的图像失真等问题；
解决了卷积神经网络对图相关重复特征提取的问题，大大提高了产生候选框的速度，且节省了计算成本。

在这里插入图片描述

class SPP(nn.Module):# Spatial Pyramid Pooling (SPP) layer https://arxiv.org/abs/1406.4729def __init__(self, c1, c2, k=(5, 9, 13)):super().__init__()c_ = c1 // 2  # hidden channelsself.cv1 = Conv(c1, c_, 1, 1)self.cv2 = Conv(c_ * (len(k) + 1), c2, 1, 1)self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k])def forward(self, x):x = self.cv1(x)with warnings.catch_warnings():warnings.simplefilter('ignore')  # suppress torch 1.9.0 max_pool2d() warningreturn self.cv2(torch.cat([x] + [m(x) for m in self.m], 1))

1.2 SPPF（Spatial Pyramid Pooling - Fast）
这个是YOLOv5作者Glenn Jocher基于SPP提出的，速度较SPP快很多，所以叫SPP-Fast

在这里插入图片描述

class SPPF(nn.Module):# Spatial Pyramid Pooling - Fast (SPPF) layer for YOLOv5 by Glenn Jocherdef __init__(self, c1, c2, k=5):  # equivalent to SPP(k=(5, 9, 13))super().__init__()c_ = c1 // 2  # hidden channelsself.cv1 = Conv(c1, c_, 1, 1)self.cv2 = Conv(c_ * 4, c2, 1, 1)self.m = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)def forward(self, x):x = self.cv1(x)with warnings.catch_warnings():warnings.simplefilter('ignore')  # suppress torch 1.9.0 max_pool2d() warningy1 = self.m(x)y2 = self.m(y1)return self.cv2(torch.cat((x, y1, y2, self.m(y2)), 1))

1.3 SimSPPF（Simplified SPPF）
美团YOLOv6提出的模块，感觉和SPPF只差了一个激活函数，简单测试了一下，单个ConvBNReLU速度要比ConvBNSiLU快18%

在这里插入图片描述

class SimConv(nn.Module):'''Normal Conv with ReLU activation'''def __init__(self, in_channels, out_channels, kernel_size, stride, groups=1, bias=False):super().__init__()padding = kernel_size // 2self.conv = nn.Conv2d(in_channels,out_channels,kernel_size=kernel_size,stride=stride,padding=padding,groups=groups,bias=bias,)self.bn = nn.BatchNorm2d(out_channels)self.act = nn.ReLU()def forward(self, x):return self.act(self.bn(self.conv(x)))def forward_fuse(self, x):return self.act(self.conv(x))class SimSPPF(nn.Module):'''Simplified SPPF with ReLU activation'''def __init__(self, in_channels, out_channels, kernel_size=5):super().__init__()c_ = in_channels // 2  # hidden channelsself.cv1 = SimConv(in_channels, c_, 1, 1)self.cv2 = SimConv(c_ * 4, out_channels, 1, 1)self.m = nn.MaxPool2d(kernel_size=kernel_size, stride=1, padding=kernel_size // 2)def forward(self, x):x = self.cv1(x)with warnings.catch_warnings():warnings.simplefilter('ignore')y1 = self.m(x)y2 = self.m(y1)return self.cv2(torch.cat([x, y1, y2, self.m(y2)], 1))

1.4 ASPP（Atrous Spatial Pyramid Pooling）
受到SPP的启发，语义分割模型DeepLabv2中提出了ASPP模块(空洞空间卷积池化金字塔)，该模块使用具有不同采样率的多个并行空洞卷积层。为每个采样率提取的特征在单独的分支中进一步处理，并融合以生成最终结果。该模块通过不同的空洞率构建不同感受野的卷积核，用来获取多尺度物体信息，具体结构比较简单如下图所示：
在这里插入图片描述
ASPP是在DeepLab中提出来的，在后续的DeepLab版本中对其做了改进，如加入BN层、加入深度可分离卷积等，但基本的思路还是没变。

# without BN version
class ASPP(nn.Module):def __init__(self, in_channel=512, out_channel=256):super(ASPP, self).__init__()self.mean = nn.AdaptiveAvgPool2d((1, 1))  # (1,1)means ouput_dimself.conv = nn.Conv2d(in_channel,out_channel, 1, 1)self.atrous_block1 = nn.Conv2d(in_channel, out_channel, 1, 1)self.atrous_block6 = nn.Conv2d(in_channel, out_channel, 3, 1, padding=6, dilation=6)self.atrous_block12 = nn.Conv2d(in_channel, out_channel, 3, 1, padding=12, dilation=12)self.atrous_block18 = nn.Conv2d(in_channel, out_channel, 3, 1, padding=18, dilation=18)self.conv_1x1_output = nn.Conv2d(out_channel * 5, out_channel, 1, 1)def forward(self, x):size = x.shape[2:]image_features = self.mean(x)image_features = self.conv(image_features)image_features = F.upsample(image_features, size=size, mode='bilinear')atrous_block1 = self.atrous_block1(x)atrous_block6 = self.atrous_block6(x)atrous_block12 = self.atrous_block12(x)atrous_block18 = self.atrous_block18(x)net = self.conv_1x1_output(torch.cat([image_features, atrous_block1, atrous_block6,atrous_block12, atrous_block18], dim=1))return net

1.5 RFB（Receptive Field Block）
RFB模块是在《ECCV2018:Receptive Field Block Net for Accurate and Fast Object Detection》一文中提出的，该文的出发点是模拟人类视觉的感受野从而加强网络的特征提取能力，在结构上RFB借鉴了Inception的思想，主要是在Inception的基础上加入了空洞卷积，从而有效增大了感受野

在这里插入图片描述

RFB和RFB-s的架构。RFB-s用于在浅层人类视网膜主题图中模拟较小的pRF，使用具有较小内核的更多分支。

class BasicConv(nn.Module):def __init__(self, in_planes, out_planes, kernel_size, stride=1, padding=0, dilation=1, groups=1, relu=True, bn=True):super(BasicConv, self).__init__()self.out_channels = out_planesif bn:self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, bias=False)self.bn = nn.BatchNorm2d(out_planes, eps=1e-5, momentum=0.01, affine=True)self.relu = nn.ReLU(inplace=True) if relu else Noneelse:self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, bias=True)self.bn = Noneself.relu = nn.ReLU(inplace=True) if relu else Nonedef forward(self, x):x = self.conv(x)if self.bn is not None:x = self.bn(x)if self.relu is not None:x = self.relu(x)return xclass BasicRFB(nn.Module):def __init__(self, in_planes, out_planes, stride=1, scale=0.1, map_reduce=8, vision=1, groups=1):super(BasicRFB, self).__init__()self.scale = scaleself.out_channels = out_planesinter_planes = in_planes // map_reduceself.branch0 = nn.Sequential(BasicConv(in_planes, inter_planes, kernel_size=1, stride=1, groups=groups, relu=False),BasicConv(inter_planes, 2 * inter_planes, kernel_size=(3, 3), stride=stride, padding=(1, 1), groups=groups),BasicConv(2 * inter_planes, 2 * inter_planes, kernel_size=3, stride=1, padding=vision, dilation=vision, relu=False, groups=groups))self.branch1 = nn.Sequential(BasicConv(in_planes, inter_planes, kernel_size=1, stride=1, groups=groups, relu=False),BasicConv(inter_planes, 2 * inter_planes, kernel_size=(3, 3), stride=stride, padding=(1, 1), groups=groups),BasicConv(2 * inter_planes, 2 * inter_planes, kernel_size=3, stride=1, padding=vision + 2, dilation=vision + 2, relu=False, groups=groups))self.branch2 = nn.Sequential(BasicConv(in_planes, inter_planes, kernel_size=1, stride=1, groups=groups, relu=False),BasicConv(inter_planes, (inter_planes // 2) * 3, kernel_size=3, stride=1, padding=1, groups=groups),BasicConv((inter_planes // 2) * 3, 2 * inter_planes, kernel_size=3, stride=stride, padding=1, groups=groups),BasicConv(2 * inter_planes, 2 * inter_planes, kernel_size=3, stride=1, padding=vision + 4, dilation=vision + 4, relu=False, groups=groups))self.ConvLinear = BasicConv(6 * inter_planes, out_planes, kernel_size=1, stride=1, relu=False)self.shortcut = BasicConv(in_planes, out_planes, kernel_size=1, stride=stride, relu=False)self.relu = nn.ReLU(inplace=False)def forward(self, x):x0 = self.branch0(x)x1 = self.branch1(x)x2 = self.branch2(x)out = torch.cat((x0, x1, x2), 1)out = self.ConvLinear(out)short = self.shortcut(x)out = out * self.scale + shortout = self.relu(out)return out

1.6 SPPCSPC
该模块是YOLOv7中使用的SPP结构，表现优于SPPF，但参数量和计算量提升了很多
在这里插入图片描述

class SPPCSPC(nn.Module):# CSP https://github.com/WongKinYiu/CrossStagePartialNetworksdef __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5, k=(5, 9, 13)):super(SPPCSPC, self).__init__()c_ = int(2 * c2 * e)  # hidden channelsself.cv1 = Conv(c1, c_, 1, 1)self.cv2 = Conv(c1, c_, 1, 1)self.cv3 = Conv(c_, c_, 3, 1)self.cv4 = Conv(c_, c_, 1, 1)self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k])self.cv5 = Conv(4 * c_, c_, 1, 1)self.cv6 = Conv(c_, c_, 3, 1)self.cv7 = Conv(2 * c_, c2, 1, 1)def forward(self, x):x1 = self.cv4(self.cv3(self.cv1(x)))y1 = self.cv6(self.cv5(torch.cat([x1] + [m(x1) for m in self.m], 1)))y2 = self.cv2(x)return self.cv7(torch.cat((y1, y2), dim=1))
#分组SPPCSPC 分组后参数量和计算量与原本差距不大，不知道效果怎么样
class SPPCSPC_group(nn.Module):def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5, k=(5, 9, 13)):super(SPPCSPC_group, self).__init__()c_ = int(2 * c2 * e)  # hidden channelsself.cv1 = Conv(c1, c_, 1, 1, g=4)self.cv2 = Conv(c1, c_, 1, 1, g=4)self.cv3 = Conv(c_, c_, 3, 1, g=4)self.cv4 = Conv(c_, c_, 1, 1, g=4)self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k])self.cv5 = Conv(4 * c_, c_, 1, 1, g=4)self.cv6 = Conv(c_, c_, 3, 1, g=4)self.cv7 = Conv(2 * c_, c2, 1, 1, g=4)def forward(self, x):x1 = self.cv4(self.cv3(self.cv1(x)))y1 = self.cv6(self.cv5(torch.cat([x1] + [m(x1) for m in self.m], 1)))y2 = self.cv2(x)return self.cv7(torch.cat((y1, y2), dim=1))

1.7 SPPFCSPC+
我借鉴了SPPF的思想将SPPCSPC优化了一下，得到了SPPFCSPC，在保持感受野不变的情况下获得速度提升；我把这个模块给v7作者看了，并没有得到否定，详细回答可以看4 Issue

目前这个结构被YOLOv6 3.0版本使用了，效果很不错，大家可以看一下YOLOv6 3.0的论文，里面有详细的实验结果。
在这里插入图片描述

class SPPFCSPC(nn.Module):def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5, k=5):super(SPPFCSPC, self).__init__()c_ = int(2 * c2 * e)  # hidden channelsself.cv1 = Conv(c1, c_, 1, 1)self.cv2 = Conv(c1, c_, 1, 1)self.cv3 = Conv(c_, c_, 3, 1)self.cv4 = Conv(c_, c_, 1, 1)self.m = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)self.cv5 = Conv(4 * c_, c_, 1, 1)self.cv6 = Conv(c_, c_, 3, 1)self.cv7 = Conv(2 * c_, c2, 1, 1)def forward(self, x):x1 = self.cv4(self.cv3(self.cv1(x)))x2 = self.m(x1)x3 = self.m(x2)y1 = self.cv6(self.cv5(torch.cat((x1,x2,x3, self.m(x3)),1)))y2 = self.cv2(x)return self.cv7(torch.cat((y1, y2), dim=1))