Unet

ResNet-50 serves as the backbone feature-extraction network.
The basic block of ResNet-50 is the Bottleneck:

  1. Its basic structure is a pointwise (1×1) convolution that reduces the channel count, a 3×3 convolution that extracts features, and a final pointwise convolution that raises the channel count to the block's output width.
  2. It is a residual structure: if the input and output feature maps have the same channel count and spatial size, the input is added directly via a residual connection; otherwise the input first passes through a convolution layer (the downsample branch) that reshapes it to match the output before the residual addition.
import math
import torch
import torch.nn as nn

# Standard torchvision-style helpers used by Bottleneck below.
def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=dilation, groups=groups, bias=False, dilation=dilation)

def conv1x1(in_planes, out_planes, stride=1):
    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)

class Bottleneck(nn.Module):
    expansion = 4
    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        super(Bottleneck, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        width = int(planes * (base_width / 64.)) * groups
        # 1x1 convolution to reduce the channel count
        self.conv1 = conv1x1(inplanes, width)
        self.bn1 = norm_layer(width)
        # 3x3 convolution for feature extraction
        self.conv2 = conv3x3(width, width, stride, groups, dilation)
        self.bn2 = norm_layer(width)
        # 1x1 convolution to raise the channel count
        self.conv3 = conv1x1(width, planes * self.expansion)
        self.bn3 = norm_layer(planes * self.expansion)

        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out
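A quick smoke test (my own sketch, not from the original repo), building a stride-2 Bottleneck together with the matching downsample branch, the way _make_layer constructs it below:

# Hypothetical shape check for the block above.
downsample = nn.Sequential(conv1x1(256, 512, stride=2), nn.BatchNorm2d(512))
block = Bottleneck(inplanes=256, planes=128, stride=2, downsample=downsample)
x = torch.randn(1, 256, 64, 64)
print(block(x).shape)  # torch.Size([1, 512, 32, 32])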
  3. Each ResNet layer contains several of these residual blocks, but only the first block in a layer is responsible for downsampling and changing the channel count (i.e., if the feature map comes out of a layer downsampled 2×, that downsampling happens in the 3×3 convolution of the layer's first residual block).
  4. Taking a 3×512×512 RGB image as an example, ResNet-50 downsamples in layers 2, 3, and 4:
    x = self.conv1(x)       # k=7, s=2, p=3: downsamples 2x
    x = self.bn1(x)
    feat1 = self.relu(x)    # feat1: (64, 256, 256)
    x = self.maxpool(feat1)
    feat2 = self.layer1(x)  # layer1 itself does not downsample; the 2x reduction comes from the max pooling above
    # feat2: (256, 128, 128)
    feat3 = self.layer2(feat2)
    # feat3: (512, 64, 64)
    feat4 = self.layer3(feat3)
    # feat4: (1024, 32, 32)
    feat5 = self.layer4(feat4)
    # feat5: (2048, 16, 16)
class ResNet(nn.Module):
    def __init__(self, block, layers, num_classes=1000):
        #-----------------------------------------------------------#
        #   Assume a 600,600,3 input image
        #   when using resnet50
        #-----------------------------------------------------------#
        self.inplanes = 64
        super(ResNet, self).__init__()
        # 600,600,3 -> 300,300,64
        self.conv1  = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1    = nn.BatchNorm2d(64)
        self.relu   = nn.ReLU(inplace=True)
        # 300,300,64 -> 150,150,64
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=0, ceil_mode=True) # changed: padding=0 with ceil_mode=True (torchvision uses padding=1)
        # 150,150,64 -> 150,150,256
        self.layer1 = self._make_layer(block, 64, layers[0])
        # 150,150,256 -> 75,75,512
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        # 75,75,512 -> 38,38,1024
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        # 38,38,1024 -> 19,19,2048
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        
        self.avgpool = nn.AvgPool2d(7)
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        # The original classification forward, kept for reference:
        # x = self.conv1(x)
        # x = self.bn1(x)
        # x = self.relu(x)
        # x = self.maxpool(x)

        # x = self.layer1(x)
        # x = self.layer2(x)
        # x = self.layer3(x)
        # x = self.layer4(x)

        # x = self.avgpool(x)
        # x = x.view(x.size(0), -1)
        # x = self.fc(x)

        x       = self.conv1(x)
        x       = self.bn1(x)
        feat1   = self.relu(x)

        x       = self.maxpool(feat1)
        feat2   = self.layer1(x)

        feat3   = self.layer2(feat2)
        feat4   = self.layer3(feat3)
        feat5   = self.layer4(feat4)
        return [feat1, feat2, feat3, feat4, feat5]
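As a sanity check (a minimal sketch, assuming the classes above are in scope), the five feature maps for a 3x512x512 input come out exactly as listed earlier:

model = ResNet(Bottleneck, [3, 4, 6, 3])  # the resnet50 layer configuration
feats = model(torch.randn(1, 3, 512, 512))
for f in feats:
    print(tuple(f.shape))
# (1, 64, 256, 256), (1, 256, 128, 128), (1, 512, 64, 64),
# (1, 1024, 32, 32), (1, 2048, 16, 16)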
  5. The five feature maps are then fused.
    Taking a 3,512,512 input as an example, the features are fused four times. Each fused pair passes through a unetUp layer, which first upsamples inputs2 to match the size of inputs1 and then concatenates the two feature maps along the channel dimension (concatenation here, in contrast to the element-wise addition of ResNet's residual structure). The concatenated features then go through two 3×3 convolutions that adjust the channel count without changing the spatial size.
class unetUp(nn.Module):  # upsamples inputs2 and concatenates it with inputs1 along the channel dim; conv1 changes the channel count, conv2 keeps it; the spatial size is unchanged throughout
    def __init__(self, in_size, out_size):
        super(unetUp, self).__init__()
        self.conv1  = nn.Conv2d(in_size, out_size, kernel_size = 3, padding = 1)
        self.conv2  = nn.Conv2d(out_size, out_size, kernel_size = 3, padding = 1)
        self.up     = nn.UpsamplingBilinear2d(scale_factor = 2)
        self.relu   = nn.ReLU(inplace = True)

    def forward(self, inputs1, inputs2):
        outputs = torch.cat([inputs1, self.up(inputs2)], 1)
        outputs = self.conv1(outputs)
        outputs = self.relu(outputs)
        outputs = self.conv2(outputs)
        outputs = self.relu(outputs)
        return outputs
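A small shape check for unetUp (my own sketch), feeding it the resnet50 feat4/feat5 pair from the walkthrough above, so in_size = 1024 + 2048 = 3072:

up = unetUp(in_size=3072, out_size=512)
feat4 = torch.randn(1, 1024, 32, 32)
feat5 = torch.randn(1, 2048, 16, 16)
print(up(feat4, feat5).shape)  # torch.Size([1, 512, 32, 32])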
  6. Some hyperparameters define the input/output channel counts of the four unetUp layers (the snippet after this list verifies the arithmetic):
    in_filters = [192, 512, 1024, 3072]
    out_filters = [64, 128, 256, 512]
    up4 = self.up_concat4(feat4, feat5)
    feat5 is upsampled 2x and concatenated with feat4, giving shape (3072, 32, 32),
    so up_concat4 has in_size 3072 and outputs (512, 32, 32)
    up3 = self.up_concat3(feat3, up4)
    up4 is upsampled 2x and concatenated with feat3, giving shape (1024, 64, 64),
    so up_concat3 has in_size 1024 and outputs (256, 64, 64)
    up2 = self.up_concat2(feat2, up3)
    up3 is upsampled 2x and concatenated with feat2, giving shape (512, 128, 128),
    so up_concat2 has in_size 512 and outputs (128, 128, 128)
    up1 = self.up_concat1(feat1, up2)
    up2 is upsampled 2x and concatenated with feat1, giving shape (192, 256, 256),
    so up_concat1 has in_size 192 and outputs (64, 256, 256)
    if self.up_conv is not None:
        up1 = self.up_conv(up1)
    Finally, one more 2x upsampling restores the input resolution: (64, 512, 512).
    At this point up1 has exactly the input image's spatial size; the 64 channels amount to a 64-dimensional feature vector for every pixel of the original image.
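The in_filters values are just this channel arithmetic written out; a minimal sketch (my own, not from the repo) verifying the numbers:

# Each unetUp input width is feat_i's channels plus the channels arriving from below.
feat_channels = [64, 256, 512, 1024, 2048]  # feat1..feat5 from resnet50
out_filters   = [64, 128, 256, 512]
in_filters = [
    feat_channels[0] + out_filters[1],    # 64 + 128    = 192  (up_concat1)
    feat_channels[1] + out_filters[2],    # 256 + 256   = 512  (up_concat2)
    feat_channels[2] + out_filters[3],    # 512 + 512   = 1024 (up_concat3)
    feat_channels[3] + feat_channels[4],  # 1024 + 2048 = 3072 (up_concat4)
]
print(in_filters)  # [192, 512, 1024, 3072]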

Each pixel can now be seen as a 64-dimensional feature vector, and classifying it just takes a pointwise convolution that adjusts the channel count:
self.final = nn.Conv2d(out_filters[0], num_classes, 1), which implements the final pixel-level classification.

class Unet(nn.Module):
    def __init__(self, num_classes = 21, pretrained = False, backbone = 'vgg'):
        super(Unet, self).__init__()
        if backbone == 'vgg':
            self.vgg    = VGG16(pretrained = pretrained)
            in_filters  = [192, 384, 768, 1024]
        elif backbone == "resnet50":
            self.resnet = resnet50(pretrained = pretrained)
            in_filters  = [192, 512, 1024, 3072]
        else:
            raise ValueError('Unsupported backbone - `{}`, Use vgg, resnet50.'.format(backbone))
        out_filters = [64, 128, 256, 512]

        # upsampling
        # 64,64,512
        self.up_concat4 = unetUp(in_filters[3], out_filters[3])
        # 128,128,256
        self.up_concat3 = unetUp(in_filters[2], out_filters[2])
        # 256,256,128
        self.up_concat2 = unetUp(in_filters[1], out_filters[1])
        # 512,512,64
        self.up_concat1 = unetUp(in_filters[0], out_filters[0])

        if backbone == 'resnet50':
            self.up_conv = nn.Sequential(
                nn.UpsamplingBilinear2d(scale_factor = 2), 
                nn.Conv2d(out_filters[0], out_filters[0], kernel_size = 3, padding = 1),
                nn.ReLU(),
                nn.Conv2d(out_filters[0], out_filters[0], kernel_size = 3, padding = 1),
                nn.ReLU(),
            )
        else:
            self.up_conv = None

        self.final = nn.Conv2d(out_filters[0], num_classes, 1)

        self.backbone = backbone

    def forward(self, inputs):
        if self.backbone == "vgg":
            [feat1, feat2, feat3, feat4, feat5] = self.vgg.forward(inputs)
        elif self.backbone == "resnet50":
            [feat1, feat2, feat3, feat4, feat5] = self.resnet.forward(inputs)
        # Taking a 3,512,512 input as an example (VGG backbone; M = max pooling):
        # after 64, 64:            feat1 = (64, 512, 512)
        # after M, 128, 128:       feat2 = (128, 256, 256)
        # after M, 256, 256, 256:  feat3 = (256, 128, 128)
        # after M, 512, 512, 512:  feat4 = (512, 64, 64)
        # after M, 512, 512, 512:  feat5 = (512, 32, 32)
        up4 = self.up_concat4(feat4, feat5)
        # feat5 is upsampled 2x and concatenated with feat4, giving (1024, 64, 64),
        # so up_concat4 has in_size 1024 and outputs (512, 64, 64)
        up3 = self.up_concat3(feat3, up4)
        # up4 is upsampled 2x and concatenated with feat3, giving (768, 128, 128),
        # so up_concat3 has in_size 768 and outputs (256, 128, 128)
        up2 = self.up_concat2(feat2, up3)
        # up3 is upsampled 2x and concatenated with feat2, giving (384, 256, 256),
        # so up_concat2 has in_size 384 and outputs (128, 256, 256)
        up1 = self.up_concat1(feat1, up2)
        # up2 is upsampled 2x and concatenated with feat1, giving (192, 512, 512),
        # so up_concat1 has in_size 192 and outputs (64, 512, 512)


        if self.up_conv is not None:
            up1 = self.up_conv(up1)

        # At this point up1 matches the input image size exactly; the 64 channels give every pixel of the original image a 64-dimensional feature vector.
        # A 1x1 convolution then maps these 64 features to a vector of length num_classes, i.e. a per-pixel classification.

        final = self.final(up1)
        
        return final
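End-to-end shape check (a sketch; it assumes the VGG16/resnet50 constructors from the repo linked at the end are importable):

model = Unet(num_classes=21, backbone='resnet50')
out = model(torch.randn(1, 3, 512, 512))
print(out.shape)  # torch.Size([1, 21, 512, 512])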

deeplabv3+

  1. This network is a semantic segmentation framework built on MobileNetV2. Its basic block is the InvertedResidual, which resembles the residual block: a pointwise convolution, a 3×3 convolution, then another pointwise convolution, with any downsampling again happening in the 3×3 layer. The differences are that the 3×3 layer is a depthwise convolution, which cuts the parameter count several-fold, and that the residual connection is used only when the input and output feature maps have exactly the same channel count and spatial size.
BatchNorm2d = nn.BatchNorm2d  # alias used by the original repo's mobilenet code

class InvertedResidual(nn.Module):
    def __init__(self, inp, oup, stride, expand_ratio):
        # inp is the input channel count
        # expand_ratio is the expansion factor for the hidden width: hidden_dim = round(inp * expand_ratio)
        super(InvertedResidual, self).__init__()
        self.stride = stride
        assert stride in [1, 2] 

        hidden_dim = round(inp * expand_ratio)
        # inp == oup means the channel count is unchanged and stride == 1 means the spatial size is unchanged;
        # only then is a residual connection used, adding the input to the output to form the final output
        self.use_res_connect = self.stride == 1 and inp == oup

        if expand_ratio == 1:  # no channel expansion: just a depthwise convolution followed by a pointwise convolution
            self.conv = nn.Sequential(
                #--------------------------------------------#
                #   3x3 depthwise convolution for cross-pixel feature extraction
                #--------------------------------------------#
                nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
                BatchNorm2d(hidden_dim),
                nn.ReLU6(inplace=True),
                #-----------------------------------#
                #   1x1 convolution to adjust the channel count
                #-----------------------------------#
                nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
                BatchNorm2d(oup),
            )
        else:  # with channel expansion: a pointwise convolution raises the channel count, a depthwise convolution extracts features, and a pointwise convolution lowers it again
            self.conv = nn.Sequential(
                #-----------------------------------#
                #   1x1 convolution to raise the channel count
                #-----------------------------------#
                nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
                BatchNorm2d(hidden_dim),
                nn.ReLU6(inplace=True),
                #--------------------------------------------#
                #   3x3 depthwise convolution for cross-pixel feature extraction
                #--------------------------------------------#
                nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
                BatchNorm2d(hidden_dim),
                nn.ReLU6(inplace=True),
                #-----------------------------------#
                #   1x1 convolution to lower the channel count
                #-----------------------------------#
                nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
                BatchNorm2d(oup),
            )
        # On grouped/depthwise convolution: it is a strategy for saving convolution parameters. For the same output size and channel count,
        # a grouped convolution uses n times fewer parameters than an ordinary convolution, where n is the number of groups.
        # Suppose the input feature map is (64, 512, 512) and we want a 3x3 conv, stride 1, padding 1, with 128 kernels producing (128, 512, 512);
        # an ordinary convolution needs 128 * 3 * 3 * 64 parameters.
        # With grouped convolution and 4 groups, each group sees (16, 512, 512) and uses 32 kernels to produce (32, 512, 512), costing 32 * 3 * 3 * 16 parameters;
        # with 4 groups in total this comes to 4 * 32 * 3 * 3 * 16 parameters.
        # In short: the number and size of the kernels stay the same, so the saving comes entirely from each kernel's channel count shrinking to 1/n.
        # (See the parameter-count sketch after this class.)

    def forward(self, x):
        if self.use_res_connect:
            return x + self.conv(x)
        else:
            return self.conv(x)
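The parameter-count claim is easy to check (my own sketch): the (64, 512, 512) -> (128, 512, 512) example as an ordinary, a 4-group, and a depthwise convolution:

normal    = nn.Conv2d(64, 128, 3, padding=1, bias=False)
grouped   = nn.Conv2d(64, 128, 3, padding=1, groups=4, bias=False)
depthwise = nn.Conv2d(64, 64, 3, padding=1, groups=64, bias=False)
print(normal.weight.numel())     # 128*3*3*64 = 73728
print(grouped.weight.numel())    # 73728 / 4  = 18432
print(depthwise.weight.numel())  # 64*3*3*1   = 576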
  2. The original MobileNetV2.
    Similar to ResNet. Taking a (512, 512, 3) input image as an example:
    512, 512, 3 -> 256, 256, 32: a stem convolution first produces the initial feature map
    self.features = [conv_bn(3, input_channel, 2)]
    Seven groups of blocks follow, each configured by (t, c, n, s):
    n is how many InvertedResidual blocks the group contains
    s is whether the group's first InvertedResidual downsamples the feature map 2x (only the first block of a group downsamples; the others keep the input and output sizes identical)
    c is the group's output channel count (only the first block of a group changes the channel count; the others keep it unchanged)
    (hence only the first InvertedResidual of a group lacks a residual connection; all the others have one)
    t is whether the blocks in the group expand the channel count internally, i.e. the expand_ratio parameter
    t, c, n, s
    [1, 16, 1, 1],  # 256, 256, 32 -> 256, 256, 16
    [6, 24, 2, 2],  # 256, 256, 16 -> 128, 128, 24   2
    [6, 32, 3, 2],  # 128, 128, 24 -> 64, 64, 32     4
    [6, 64, 4, 2],  # 64, 64, 32   -> 32, 32, 64     7
    [6, 96, 3, 1],  # 32, 32, 64   -> 32, 32, 96
    [6, 160, 3, 2], # 32, 32, 96   -> 16, 16, 160    14
    [6, 320, 1, 1], # 16, 16, 160  -> 16, 16, 320
    The trailing 2, 4, 7, 14 mark the InvertedResidual blocks with stride=2, where the feature map is downsampled (reproduced by the sketch below).
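A minimal sketch (my own) reproducing why the stride-2 blocks land exactly at features[2], features[4], features[7] and features[14] (features[0] is the stem convolution, so the InvertedResidual blocks start at index 1):

setting = [[1, 16, 1, 1], [6, 24, 2, 2], [6, 32, 3, 2], [6, 64, 4, 2],
           [6, 96, 3, 1], [6, 160, 3, 2], [6, 320, 1, 1]]
idx = 0
for t, c, n, s in setting:
    for i in range(n):
        idx += 1
        if (s if i == 0 else 1) == 2:  # only a group's first block uses s
            print(f"features[{idx}] downsamples")  # prints 2, 4, 7, 14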
class MobileNetV2(nn.Module):
    def __init__(self, n_class=1000, input_size=224, width_mult=1.):
        super(MobileNetV2, self).__init__()
        block = InvertedResidual
        input_channel = 32
        last_channel = 1280
        interverted_residual_setting = [
            # t, c, n, s  (see the explanation above; 2/4/7/14 mark the stride-2 blocks)
            [1, 16, 1, 1],   # 256, 256, 32 -> 256, 256, 16
            [6, 24, 2, 2],   # 256, 256, 16 -> 128, 128, 24   2
            [6, 32, 3, 2],   # 128, 128, 24 -> 64, 64, 32     4
            [6, 64, 4, 2],   # 64, 64, 32   -> 32, 32, 64     7
            [6, 96, 3, 1],   # 32, 32, 64   -> 32, 32, 96
            [6, 160, 3, 2],  # 32, 32, 96   -> 16, 16, 160    14
            [6, 320, 1, 1],  # 16, 16, 160  -> 16, 16, 320
        ]

        assert input_size % 32 == 0
        input_channel = int(input_channel * width_mult)
        self.last_channel = int(last_channel * width_mult) if width_mult > 1.0 else last_channel
        # 512, 512, 3 -> 256, 256, 32: the stem convolution produces the initial feature map
        self.features = [conv_bn(3, input_channel, 2)]

        for t, c, n, s in interverted_residual_setting:
            output_channel = int(c * width_mult)
            for i in range(n):
                if i == 0:
                    self.features.append(block(input_channel, output_channel, s, expand_ratio=t))
                else:
                    self.features.append(block(input_channel, output_channel, 1, expand_ratio=t))
                input_channel = output_channel

        self.features.append(conv_1x1_bn(input_channel, self.last_channel))
        self.features = nn.Sequential(*self.features)

        self.classifier = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(self.last_channel, n_class),
        )

        self._initialize_weights()

    def forward(self, x):
        x = self.features(x)
        x = x.mean(3).mean(2)
        x = self.classifier(x)
        return x

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                n = m.weight.size(1)
                m.weight.data.normal_(0, 0.01)
                m.bias.data.zero_()
  3. When actually used in deeplabv3+, MobileNetV2 is adjusted to the desired downsampling factor.
    The original MobileNetV2 downsamples once at the stem, and again at InvertedResidual blocks 2, 4, 7, and 14. For an overall downsampling factor of 8, only three downsamplings are needed, so every block from the 7th onward is adjusted to no longer downsample.
    Concretely, _nostride_dilate performs the adjustment. Afterwards:
    the second convolution of the first 7 blocks is an ordinary depthwise convolution, with blocks 2 and 4 downsampling at s=2;
    the second convolution of blocks 8 through 14 is a depthwise atrous convolution with dilation=2;
    the second convolution of block 15 onward is a depthwise atrous convolution with dilation=4.
    The output of features[:4] (the stem plus the first three InvertedResidual blocks) is taken as the shallow feature low_level_features; it is downsampled 4x relative to the input image. The output of the last InvertedResidual is the deep feature, denoted x.
class MobileNetV2(nn.Module):
    def __init__(self, downsample_factor=8, pretrained=True):
        super(MobileNetV2, self).__init__()
        from functools import partial
        
        model           = mobilenetv2(pretrained)
        self.features   = model.features[:-1]  # drop the final conv_1x1_bn; keep only the stem convolution and the InvertedResidual blocks

        self.total_idx  = len(self.features)
        self.down_idx   = [2, 4, 7, 14]

        if downsample_factor == 8:
        # For 8x downsampling only two stride-2 InvertedResidual blocks are kept (the stem convolution already contributes one 2x downsampling).
        # _nostride_dilate only touches stride-2 convolutions and 3x3 convolutions, i.e. only the second convolution of an InvertedResidual block.
        # After the adjustment:
        # the second convolution of the first 7 blocks is an ordinary depthwise convolution, with blocks 2 and 4 downsampling at s=2 (block 7's stride is reset to 1);
        # the second convolution of blocks 8 through 14 is a depthwise atrous convolution with dilation=2;
        # the second convolution of block 15 onward is a depthwise atrous convolution with dilation=4.
            for i in range(self.down_idx[-2], self.down_idx[-1]):
                self.features[i].apply(
                    partial(self._nostride_dilate, dilate=2)
                )
            for i in range(self.down_idx[-1], self.total_idx):
                self.features[i].apply(
                    partial(self._nostride_dilate, dilate=4)
                )
        elif downsample_factor == 16:
        # For 16x downsampling four downsamplings are kept, so:
        # the second convolution of the first 14 blocks is an ordinary depthwise convolution, with blocks 2, 4 and 7 downsampling at s=2 (block 14's stride is reset to 1);
        # the second convolution of block 15 onward is a depthwise atrous convolution with dilation=2.
            for i in range(self.down_idx[-1], self.total_idx):
                self.features[i].apply(
                    partial(self._nostride_dilate, dilate=2)
                )
        
    def _nostride_dilate(self, m, dilate):
        classname = m.__class__.__name__
        if classname.find('Conv') != -1:
            if m.stride == (2, 2):
                m.stride = (1, 1)
                if m.kernel_size == (3, 3):
                    m.dilation = (dilate//2, dilate//2)
                    m.padding = (dilate//2, dilate//2)
            else:
                if m.kernel_size == (3, 3):
                    # After dilation the effective kernel size is 3 + 2*(dilate-1) = 1 + 2*dilate.
                    # Output size: [(o + 2*dilate) - (1 + 2*dilate)] + 1 = o, i.e. the feature map size is unchanged.
                    m.dilation = (dilate, dilate)
                    m.padding = (dilate, dilate)

    def forward(self, x):
        # assume a 512, 512, 3 input
        low_level_features = self.features[:4](x)
        # one 2x downsampling at the stem and one at block 2, so 4x in total
        # low_level_features: (24, 128, 128)
        x = self.features[4:](low_level_features)
        # 320 channels; the spatial size depends on the number of downsamplings kept
        return low_level_features, x
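Shape check for the adjusted backbone (a sketch; assumes the repo's mobilenetv2 constructor is importable):

backbone = MobileNetV2(downsample_factor=8, pretrained=False)
low, deep = backbone(torch.randn(1, 3, 512, 512))
print(low.shape)   # torch.Size([1, 24, 128, 128])  -> 4x downsampled
print(deep.shape)  # torch.Size([1, 320, 64, 64])   -> 8x downsampled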
  4. The deep feature branch is processed in parallel by atrous convolutions with different dilation rates plus a pooling branch, yielding five feature maps with different receptive fields; these are concatenated and then fused by a pointwise convolution.
import torch.nn.functional as F  # needed for F.interpolate in the forward pass

class ASPP(nn.Module):
	def __init__(self, dim_in, dim_out, rate=1, bn_mom=0.1):
		super(ASPP, self).__init__()
		# branch1 is a 1x1 convolution
		# branches 2-4 are 3x3 atrous convolutions with different dilation rates
		# branch5 is global average pooling followed by a 1x1 convolution
		self.branch1 = nn.Sequential(
				nn.Conv2d(dim_in, dim_out, 1, 1, padding=0, dilation=rate,bias=True),
				nn.BatchNorm2d(dim_out, momentum=bn_mom),
				nn.ReLU(inplace=True),
		)
		self.branch2 = nn.Sequential(
				nn.Conv2d(dim_in, dim_out, 3, 1, padding=6*rate, dilation=6*rate, bias=True),
				nn.BatchNorm2d(dim_out, momentum=bn_mom),
				nn.ReLU(inplace=True),	
		)
		self.branch3 = nn.Sequential(
				nn.Conv2d(dim_in, dim_out, 3, 1, padding=12*rate, dilation=12*rate, bias=True),
				nn.BatchNorm2d(dim_out, momentum=bn_mom),
				nn.ReLU(inplace=True),	
		)
		self.branch4 = nn.Sequential(
				nn.Conv2d(dim_in, dim_out, 3, 1, padding=18*rate, dilation=18*rate, bias=True),
				nn.BatchNorm2d(dim_out, momentum=bn_mom),
				nn.ReLU(inplace=True),	
		)
		self.branch5_conv = nn.Conv2d(dim_in, dim_out, 1, 1, 0,bias=True)
		self.branch5_bn = nn.BatchNorm2d(dim_out, momentum=bn_mom)
		self.branch5_relu = nn.ReLU(inplace=True)

		self.conv_cat = nn.Sequential(
				nn.Conv2d(dim_out*5, dim_out, 1, 1, padding=0,bias=True),
				nn.BatchNorm2d(dim_out, momentum=bn_mom),
				nn.ReLU(inplace=True),		
		)

	def forward(self, x):
		[b, c, row, col] = x.size()
		#-----------------------------------------#
		#   five branches in total
		#-----------------------------------------#
		conv1x1 = self.branch1(x)
		conv3x3_1 = self.branch2(x)
		conv3x3_2 = self.branch3(x)
		conv3x3_3 = self.branch4(x)
		#-----------------------------------------#
		#   the fifth branch: global average pooling + convolution
		#-----------------------------------------#
		global_feature = torch.mean(x,2,True)
		global_feature = torch.mean(global_feature,3,True)
		global_feature = self.branch5_conv(global_feature)
		global_feature = self.branch5_bn(global_feature)
		global_feature = self.branch5_relu(global_feature)
		global_feature = F.interpolate(global_feature, (row, col), None, 'bilinear', True)
		
		#-----------------------------------------#
		#   concatenate the five branches,
		#   then fuse the features with a 1x1 convolution
		#-----------------------------------------#
		feature_cat = torch.cat([conv1x1, conv3x3_1, conv3x3_2, conv3x3_3, global_feature], dim=1)
		result = self.conv_cat(feature_cat)
		return result
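Shape check for ASPP (my own sketch), using the deep mobilenet features at 1/16 resolution for a 512x512 input, i.e. rate = 16 // downsample_factor = 1:

aspp = ASPP(dim_in=320, dim_out=256, rate=1)
x = torch.randn(1, 320, 32, 32)
print(aspp(x).shape)  # torch.Size([1, 256, 32, 32])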
  5. The deep features, after ASPP's atrous-convolution feature extraction, are fused with the shallow features, passed through the classification head, and finally upsampled to the input image size, which completes the per-pixel classification.
class DeepLab(nn.Module):
    def __init__(self, num_classes, backbone="mobilenet", pretrained=True, downsample_factor=16):
        super(DeepLab, self).__init__()
        if backbone=="xception":
            #----------------------------------#
            #   obtain two feature maps
            #   shallow features  [128,128,256]
            #   backbone output   [32,32,2048]
            #----------------------------------#
            self.backbone = xception(downsample_factor=downsample_factor, pretrained=pretrained)
            in_channels = 2048
            low_level_channels = 256
        elif backbone=="mobilenet":
            #----------------------------------#
            #   obtain two feature maps
            #   shallow features  [128,128,24]
            #   backbone output   [32,32,320]
            #----------------------------------#
            self.backbone = MobileNetV2(downsample_factor=downsample_factor, pretrained=pretrained)
            in_channels = 320
            low_level_channels = 24
        else:
            raise ValueError('Unsupported backbone - `{}`, Use mobilenet, xception.'.format(backbone))

        #-----------------------------------------#
        #   ASPP feature-extraction module:
        #   feature extraction with atrous convolutions of different dilation rates
        #-----------------------------------------#
        self.aspp = ASPP(dim_in=in_channels, dim_out=256, rate=16//downsample_factor)
        
        #----------------------------------#
        #   shallow-feature branch
        #----------------------------------#
        self.shortcut_conv = nn.Sequential(
            nn.Conv2d(low_level_channels, 48, 1),
            nn.BatchNorm2d(48),
            nn.ReLU(inplace=True)
        )		

        self.cat_conv = nn.Sequential(
            nn.Conv2d(48+256, 256, 3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),

            nn.Conv2d(256, 256, 3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),

            nn.Dropout(0.1),
        )
        self.cls_conv = nn.Conv2d(256, num_classes, 1, stride=1)

    def forward(self, x):
        H, W = x.size(2), x.size(3)
        #-----------------------------------------#
        #   obtain the two feature maps
        #   low_level_features: shallow features, refined by a 1x1 convolution
        #   x: backbone output, enhanced by the ASPP module
        #-----------------------------------------#
        low_level_features, x = self.backbone(x)
        x = self.aspp(x)
        low_level_features = self.shortcut_conv(low_level_features)
        
        #-----------------------------------------#
        #   upsample the ASPP features, concatenate them with the
        #   shallow features, then extract features with convolutions
        #-----------------------------------------#
        x = F.interpolate(x, size=(low_level_features.size(2), low_level_features.size(3)), mode='bilinear', align_corners=True)
        x = self.cat_conv(torch.cat((x, low_level_features), dim=1))
        x = self.cls_conv(x)
        x = F.interpolate(x, size=(H, W), mode='bilinear', align_corners=True)
        return x
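Finally, an end-to-end shape check (a sketch; assumes the repo's backbone constructors are importable):

model = DeepLab(num_classes=21, backbone="mobilenet",
                pretrained=False, downsample_factor=16)
out = model(torch.randn(1, 3, 512, 512))
print(out.shape)  # torch.Size([1, 21, 512, 512])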

Thanks to bubbliiiing ("b导") for the code and blog posts.
Source code: https://github.com/bubbliiiing/deeplabv3-plus-pytorch
Lastly: that tte PhD is just not happening.