✨写在前面：强烈推荐给大家一个优秀的人工智能学习网站，内容包括人工智能基础、机器学习、深度学习神经网络等，详细介绍各部分概念及实战教程，通俗易懂，非常适合人工智能领域初学者及研究者学习。➡️点击跳转到网站。

学习资料总结：

深度学习网络模型设计总结 - 知乎

CNN网络结构的发展（最全整理） - 专知

常见的网络结构 - 侯凯 - 博客园

卷积神经网络超详细介绍_呆呆的猫的博客

一、各类网络模型论文：

二、经典网络模型介绍：学习参考：

1.LeNet-5

代码实战 import time import torch from torch import nn, optim import matplotlib.pyplot as plt import sys sys.path.append("..") import d2lzh_pytorch as d2l device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') """经典神经网络LeNet模型"""

class LeNet(nn.Module):
    """LeNet-style CNN: two conv/sigmoid/max-pool stages followed by three linear layers.

    The first linear layer expects 16 * 4 * 4 flattened features, which is what
    the convolutional stack produces for a 1-channel 28x28 input. The final
    layer emits 10 scores per sample.
    """

    def __init__(self):
        super(LeNet, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(1, 6, 5),  # in_channels, out_channels, kernel_size
            nn.Sigmoid(),
            nn.MaxPool2d(2, 2),  # kernel_size, stride
            nn.Conv2d(6, 16, 5),  # second convolutional layer
            nn.Sigmoid(),
            nn.MaxPool2d(2, 2),
        )
        self.fc = nn.Sequential(
            # 16 channels * 4 * 4 spatial positions (the "*" signs had been lost,
            # leaving the invalid size 1644).
            nn.Linear(16 * 4 * 4, 120),
            nn.Sigmoid(),
            nn.Linear(120, 84),
            nn.Sigmoid(),
            nn.Linear(84, 10),
        )

    def forward(self, img):
        """Return class scores of shape [batch, 10] for ``img``."""
        feature = self.conv(img)
        # Flatten everything except the batch dimension before the linear layers.
        output = self.fc(feature.view(img.shape[0], -1))
        return output


# Inspect the layers of the network.
net = LeNet()
print(net)

"""获取数据和训练模型"""
# Use Fashion-MNIST as the training dataset.
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size=batch_size)

def evaluate_accuracy(data_iter, net,
                      device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')):
    """Return the fraction of samples in ``data_iter`` that ``net`` classifies correctly.

    :param data_iter: iterable yielding ``(X, y)`` batches.
    :param net: either a ``torch.nn.Module`` or a plain function. A plain
        function is called as ``net(X, is_training=False)`` when it declares an
        ``is_training`` variable, otherwise as ``net(X)``.
    :param device: device the batches are moved to when ``net`` is a
        ``torch.nn.Module``. The default is evaluated once, at definition time.
    :return: accuracy as a float in [0, 1].
    :raises ZeroDivisionError: if ``data_iter`` yields no samples.
    """
    acc_sum, n = 0.0, 0
    with torch.no_grad():
        for X, y in data_iter:
            if isinstance(net, torch.nn.Module):
                net.eval()  # evaluation mode, which disables dropout
                acc_sum += (net(X.to(device)).argmax(dim=1) == y.to(device)).float().sum().cpu().item()
                net.train()  # switch back to training mode
            else:
                # Hand-written model function; GPU is not considered here.
                if 'is_training' in net.__code__.co_varnames:
                    # The function takes is_training, so turn it off for evaluation.
                    acc_sum += (net(X, is_training=False).argmax(dim=1) == y).float().sum().item()
                else:
                    acc_sum += (net(X).argmax(dim=1) == y).float().sum().item()
            n += y.shape[0]
    return acc_sum / n

def train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs):
    """Train ``net`` with cross-entropy loss and print metrics after every epoch.

    :param net: model to train; it is moved to ``device`` first.
    :param train_iter: iterable yielding ``(X, y)`` training batches.
    :param test_iter: iterable yielding ``(X, y)`` batches for the test accuracy.
    :param batch_size: not used by this function; kept for interface compatibility.
    :param optimizer: optimizer already bound to ``net``'s parameters.
    :param device: device on which training and evaluation run.
    :param num_epochs: number of passes over ``train_iter``.
    """
    net = net.to(device)
    print("training on ", device)
    loss = torch.nn.CrossEntropyLoss()
    for epoch in range(num_epochs):
        # batch_count is reset together with train_l_sum so the printed loss is
        # the mean over this epoch's batches, not divided by the cumulative
        # number of batches of all epochs so far.
        train_l_sum, train_acc_sum, n, batch_count, start = 0.0, 0.0, 0, 0, time.time()
        for X, y in train_iter:
            X = X.to(device)
            y = y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            train_l_sum += l.cpu().item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().cpu().item()
            n += y.shape[0]
            batch_count += 1
        # Evaluate on the same device the model was trained on.
        test_acc = evaluate_accuracy(test_iter, net, device)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f,time % .1fsec'
              % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n,
                 test_acc, time.time() - start))


# Learning rate 0.001, Adam optimizer, cross-entropy loss.
lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)
plt.show()

代码实战：

2.AlexNet

Alexnet代码实现：

import time import matplotlib.pyplot as plt import torch from torch import nn, optim import torchvision import sys sys.path.append("..") import d2lzh_pytorch as d2l device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

"""实现稍微简化过的AlexNet """


class AlexNet(nn.Module):
    """Slightly simplified AlexNet taking 1-channel images and emitting 10 class scores.

    The first linear layer expects 256 * 5 * 5 flattened features, which is what
    the convolutional stack produces for a 224x224 input.
    """

    def __init__(self):
        super(AlexNet, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(1, 96, 11, 4),  # in_channels, out_channels, kernel_size, stride
            nn.ReLU(),
            nn.MaxPool2d(3, 2),  # kernel_size, stride
            # Smaller window with padding 2 so height and width are preserved,
            # and more output channels.
            nn.Conv2d(96, 256, 5, 1, 2),
            nn.ReLU(),
            nn.MaxPool2d(3, 2),
            # Three consecutive convolutions with an even smaller window. All but
            # the last one increase the channel count further; no pooling follows
            # the first two, so height and width stay unchanged there.
            nn.Conv2d(256, 384, 3, 1, 1),
            nn.ReLU(),
            nn.Conv2d(384, 384, 3, 1, 1),
            nn.ReLU(),
            nn.Conv2d(384, 256, 3, 1, 1),
            nn.ReLU(),
            nn.MaxPool2d(3, 2),
        )
        # The fully connected layers are several times wider than in LeNet;
        # dropout layers are used to reduce overfitting.
        self.fc = nn.Sequential(
            nn.Linear(256 * 5 * 5, 4096),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(4096, 4096),
            nn.ReLU(),
            nn.Dropout(0.5),
            # Output layer: 10 classes for Fashion-MNIST instead of the paper's 1000.
            nn.Linear(4096, 10),
        )

    def forward(self, img):
        """Return class scores of shape [batch, 10] for ``img``."""
        feature = self.conv(img)
        # Flatten everything except the batch dimension before the linear layers.
        output = self.fc(feature.view(img.shape[0], -1))
        return output

# Print the network structure.
net = AlexNet()
print(net)

"""读取数据"""


def load_data_fashion_mnist(batch_size, resize=None, root='~/Datasets/FashionMNIST'):
    """Build train and test ``DataLoader`` objects for Fashion-MNIST.

    The dataset is requested with ``download=True`` under ``root``.

    :param batch_size: batch size used by both loaders.
    :param resize: if truthy, images are resized to this size before being
        converted to tensors.
    :param root: dataset directory passed to ``torchvision``.
    :return: ``(train_iter, test_iter)``; only the training loader shuffles.
    """
    trans = []
    if resize:
        trans.append(torchvision.transforms.Resize(size=resize))
    # ToTensor must come after Resize, which operates on the loaded image.
    trans.append(torchvision.transforms.ToTensor())
    transform = torchvision.transforms.Compose(trans)

    mnist_train = torchvision.datasets.FashionMNIST(
        root=root, train=True, download=True, transform=transform)
    mnist_test = torchvision.datasets.FashionMNIST(
        root=root, train=False, download=True, transform=transform)

    train_iter = torch.utils.data.DataLoader(
        mnist_train, batch_size=batch_size, shuffle=True, num_workers=4)
    test_iter = torch.utils.data.DataLoader(
        mnist_test, batch_size=batch_size, shuffle=False, num_workers=4)

    return train_iter, test_iter

batch_size = 128
# If an "out of memory" error is raised, reduce batch_size or resize.
train_iter, test_iter = load_data_fashion_mnist(batch_size, resize=224)

"""训练"""
lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)
plt.show()

3.Inception网络

GoogleNet模型详细解读：

去除全连接层，使得模型训练更快并且减轻了过拟合。

Inception的核心思想是通过增加网络深度和宽度的同时减少参数的方法来解决问题。Inception v1有22层，比AlexNet的8层或VGGNet的19层更深。但其计算量只有15亿次浮点运算，同时只有500万的参数量，仅为AlexNet的1/12，却有着更高的准确率。

Inception的前身为MLP卷积层。

卷积层要提升表达能力，主要依靠增加输出通道数，每个输出通道对应一个滤波器，同一滤波器共享参数只能提取一类特征，因此一个输出通道只能做一种特征处理。所以在传统的CNN中会使用尽量多的滤波器，把原样本中尽可能多的潜在的特征提取出来，然后再通过池化和大量的线性变化在其中筛选出需要的特征。这样的代价就是参数太多，运算太慢，而且很容易引起过拟合。

MLP卷积层的思想是将CNN高维度特征转成低维度特征，将神经网络的思想融合在具体的卷积操作当中。直白的理解就是在网络中再放一个网络，即使得每个卷积的通道中包含一个微型的多层网络，用一个网络来代替原来具体的卷积运算过程（卷积核的每个值与样本对应的像素点相乘，再将相乘后的所有结果加在一起生成新的像素点的过程）。

全局均值池化是在平均池化层中使用同等大小的过滤器将其特征保存下来。这种结构用来代替深层网络结构最后的全连接输出层。具体用法是在卷积处理之后，对每个特征图的整张图片进行全局均值池化，生成一个值，即每张特征图相当于一个输出特征，这个特征就表示了我们输出类的特征。如图输出1000个特征图

Inception的原始模型相对于MLP卷积层更为稀疏，它采用了MLP卷积层的思想，将中间的全连接层换成了多通道卷积层。Inception与MLP卷积在网络中的作用一样，把封装好的Inception作为一个卷积单元，堆积起来形成了原始的GoogleNet网络。

其结构是将1x1、3x3、5x5的卷积核对应的卷积操作和3x3的滤波器对应的池化操作堆叠在一起，一方面增加了网络的宽度，另一方面增加了网络对不同尺度的适应性。

Inception v1模型在原有的Inception模型基础上做了一些改进，原因是由于Inception的原始模型是将所有的卷积核都在上一层的所有输出上来做，那么5x5的卷积核所需的计算量就比较大，造成了特征图厚度很大。为了避免这一现象，在3x3前、5x5前、最大池化层后分别加上了1x1的卷积核，起到了降低特征图厚度的作用（其中1x1卷积主要用来降维）

Inception v2模型在v1模型基础上应用当时的主流技术，在卷积后加入BN层，使每一层的输出都归一化处理，减少了内部协变量偏移（Internal Covariate Shift）问题；同时还使用了梯度截断技术，增加了训练的稳定性。另外，Inception学习了VGG，用2个3x3的conv替代5x5，这既降低了参数数量，也提升了计算速度。

Inception v3没有再加入其他的技术，只是将原来的结构进行了调整，其最重要的一个改进是分解。

Inception v4结合残差连接技术的特点进行结构的优化调整。

4.残差网络（ResNet） (1)Resnet概念

残差网络详细解读：

该框架能够大大缩短模型网络的训练时间，使得在可接受时间内，模型能更深。所谓的残差连接就是在标准的前馈卷积网络上加一个跳跃，从而绕过一些层的连接方式，用来解决梯度消失的问题。

在ResNet中，输入层与Addition之间存在着两个连接，左侧的连接是输入层通过若干神经层之后连接到Addition，右侧的连接是输入层直接传到Addition，在反向传播的过程中误差传到Input时会得到两个误差的相加和，一个是左侧一堆网络的误差，一个右侧直接的原始误差。左侧的误差会随着层数变深而梯度越来越小，右侧则是由Addition直接连接到Input，所以还会保留着Addition的梯度。这样Input得到的相加和后的梯度就没有那么小了，可以保证接着将误差往下传。

下图Resnet网络，输入大小[256,224,244]，经多层卷积块最终得到输出也是[256,224,244]，但参数量变得很小了。

在ResNet论文中，提到了一个名词叫“Shortcut Connection”，实际上它指的就是identity mapping，这里先解释一下，免得大家后面感到困惑。针对不同深度的ResNet，其中残差一般包括2-3层，分别是以下两种Residual Block：

对上图做如下说明：

左图为基本的residual block，residual mapping为两个64通道的3x3卷积，输入输出均为64通道，可直接相加。该block主要使用在相对浅层网络，比如ResNet-34；
右图为针对深层网络提出的block，称为“bottleneck” block，主要目的就是为了降维。首先通过一个1x1卷积将256维通道（channel）降到64通道，再经过一个64通道的3x3卷积，最后通过一个256通道的1x1卷积恢复。

通过上面的介绍我们知道，residual mapping和identity mapping是沿通道维度相加的，那么如果通道维度不相同怎么办？

作者提出在identity mapping部分使用1x1卷积进行处理，表示如下：

$$y = F(x, \{W_i\}) + W_s x$$

其中，$W_s$ 指的是1x1卷积操作。

下图为VGG-19，Plain-34(没有使用residual结构)和ResNet-34网络结构对比：

对上图进行如下说明：

相比于VGG-19，ResNet没有使用全连接层，而使用了全局平均池化层，可以减少大量参数。VGG-19大量参数集中在全连接层；
ResNet-34中跳跃连接“实线”为identity mapping和residual mapping通道数相同，“虚线”部分指的是两者通道数不同，需要使用1x1卷积调整通道维度，使其可以相加。

论文一共提出5种ResNet网络，网络参数统计表如下：

代码实现

import torch from torch import nn from torch.nn import functional as F from torch.utils.data import DataLoader from torchvision import datasets from torchvision import transforms from torch import nn, optim

from torchvision.models import resnet18

class ResBlk(nn.Module):
    """Residual block: two 3x3 conv + batch-norm layers whose output is added to a shortcut.

    The shortcut is the identity when ``ch_in == ch_out``; otherwise it is a
    1x1 convolution followed by batch norm that maps ``ch_in`` to ``ch_out``
    channels so the two tensors can be added.
    """

    def __init__(self, ch_in, ch_out):
        """
        :param ch_in: number of input channels.
        :param ch_out: number of output channels.
        """
        super(ResBlk, self).__init__()

        self.conv1 = nn.Conv2d(ch_in, ch_out, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(ch_out)
        self.conv2 = nn.Conv2d(ch_out, ch_out, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(ch_out)

        # An empty Sequential passes its input through unchanged.
        self.extra = nn.Sequential()
        if ch_out != ch_in:
            # [b, ch_in, h, w] => [b, ch_out, h, w]
            self.extra = nn.Sequential(
                nn.Conv2d(ch_in, ch_out, kernel_size=1, stride=1),
                nn.BatchNorm2d(ch_out),
            )

    def forward(self, x):
        """
        :param x: [b, ch_in, h, w]
        :return: [b, ch_out, h, w]
        """
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        # Shortcut: bring x to ch_out channels if needed, then add element-wise.
        out = self.extra(x) + out

        return out

class ResNet18(nn.Module):
    """Small residual network: a stem convolution, two ``ResBlk`` blocks and a linear classifier.

    No layer changes the spatial size, and the classifier expects
    32 * 32 * 32 flattened features, i.e. 32 channels on a 32x32 input.
    The output has 10 scores per sample.
    """

    def __init__(self):
        super(ResNet18, self).__init__()

        self.conv1 = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(16),
        )
        # Followed by two residual blocks.
        # [b, 16, h, w] => [b, 16, h, w]
        self.blk1 = ResBlk(16, 16)
        # [b, 16, h, w] => [b, 32, h, w]
        self.blk2 = ResBlk(16, 32)

        # 32 channels * 32 * 32 spatial positions.
        self.outlayer = nn.Linear(32 * 32 * 32, 10)

    def forward(self, x):
        """
        :param x: [b, 3, 32, 32]
        :return: [b, 10] class scores
        """
        x = F.relu(self.conv1(x))

        # [b, 16, h, w] => [b, 32, h, w]
        x = self.blk1(x)
        x = self.blk2(x)

        # Flatten everything except the batch dimension for the classifier.
        x = x.view(x.size(0), -1)
        x = self.outlayer(x)

        return x

def main():
    """Train ``ResNet18`` on CIFAR-10 and print the test accuracy after every epoch.

    The dataset is requested with ``download=True`` in the ``cifar`` directory.
    Training runs on CUDA when available and on the CPU otherwise.
    """
    batchsz = 32

    cifar_train = datasets.CIFAR10('cifar', True, transform=transforms.Compose([
        transforms.Resize((32, 32)),
        transforms.ToTensor()
    ]), download=True)
    cifar_train = DataLoader(cifar_train, batch_size=batchsz, shuffle=True)

    cifar_test = datasets.CIFAR10('cifar', False, transform=transforms.Compose([
        transforms.Resize((32, 32)),
        transforms.ToTensor()
    ]), download=True)
    cifar_test = DataLoader(cifar_test, batch_size=batchsz, shuffle=True)

    # Python 3 iterators are advanced with the built-in next(), not a .next() method.
    x, label = next(iter(cifar_train))
    print('x:', x.shape, 'label:', label.shape)

    # Fall back to the CPU instead of failing on machines without CUDA.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = ResNet18().to(device)

    criteon = nn.CrossEntropyLoss().to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    print(model)

    for epoch in range(1000):

        model.train()
        for batchidx, (x, label) in enumerate(cifar_train):
            # x: [b, 3, 32, 32], label: [b]
            x, label = x.to(device), label.to(device)

            # logits: [b, 10]; loss: scalar tensor
            logits = model(x)
            loss = criteon(logits, label)

            # backprop
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Loss of the last training batch of this epoch.
        print(epoch, 'loss:', loss.item())

        model.eval()
        with torch.no_grad():
            # test
            total_correct = 0
            total_num = 0
            for x, label in cifar_test:
                # x: [b, 3, 32, 32], label: [b]
                x, label = x.to(device), label.to(device)

                # [b, 10]
                logits = model(x)
                # [b]
                pred = logits.argmax(dim=1)
                # [b] vs [b] => number of correct predictions
                correct = torch.eq(pred, label).float().sum().item()
                total_correct += correct
                total_num += x.size(0)

            acc = total_correct / total_num
            print(epoch, 'acc:', acc)

if __name__ == '__main__':
    main()

使用keras实现ResNet-18

from keras.layers import Input
from keras.layers import Conv2D, MaxPool2D, Dense, BatchNormalization, Activation, add, GlobalAvgPool2D
from keras.models import Model
from keras import regularizers
from keras.utils import plot_model
from keras import backend as K

def conv2d_bn(x, nb_filter, kernel_size, strides=(1, 1), padding='same'):
    """Apply Conv2D -> BatchNormalization -> ReLU to ``x`` and return the result.

    The convolution kernel uses an L2 regularizer with factor 0.0001.

    :param x: input tensor.
    :param nb_filter: number of convolution filters.
    :param kernel_size: convolution kernel size.
    :param strides: convolution strides.
    :param padding: padding mode passed to ``Conv2D``.
    """
    x = Conv2D(nb_filter,
               kernel_size=kernel_size,
               strides=strides,
               padding=padding,
               kernel_regularizer=regularizers.l2(0.0001))(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    return x

def shortcut(input, residual):
    """Add ``input`` to ``residual``, projecting ``input`` first when their shapes differ.

    This is the shortcut (identity mapping) part of a residual block. When the
    spatial size or channel count differs, ``input`` goes through a 1x1
    convolution whose strides and filter count make it match ``residual``.

    NOTE(review): the shape indexing (1 and 2 spatial, 3 channels) assumes
    channels-last tensors; confirm against the backend configuration.
    """
    input_shape = K.int_shape(input)
    residual_shape = K.int_shape(residual)
    stride_height = int(round(input_shape[1] / residual_shape[1]))
    stride_width = int(round(input_shape[2] / residual_shape[2]))
    equal_channels = input_shape[3] == residual_shape[3]

    identity = input
    # Use a 1x1 convolution to adjust the shape when the dimensions differ.
    if stride_width > 1 or stride_height > 1 or not equal_channels:
        identity = Conv2D(filters=residual_shape[3],
                          kernel_size=(1, 1),
                          # Conv2D takes strides as (height, width).
                          strides=(stride_height, stride_width),
                          padding="valid",
                          kernel_regularizer=regularizers.l2(0.0001))(input)

    return add([identity, residual])

def basic_block(nb_filter, strides=(1, 1)):
    """Return a function applying the basic ResNet building block (ResNet-18 / ResNet-34).

    The returned function runs two 3x3 ``conv2d_bn`` layers, with ``strides``
    applied to the first one only, and joins the result with its input
    through ``shortcut``.
    """
    def f(input):
        conv1 = conv2d_bn(input, nb_filter, kernel_size=(3, 3), strides=strides)
        residual = conv2d_bn(conv1, nb_filter, kernel_size=(3, 3))

        return shortcut(input, residual)

    return f

def residual_block(nb_filter, repetitions, is_first_layer=False):
    """Return a function applying ``repetitions`` basic blocks in sequence.

    Corresponds to one of the stages conv2_x .. conv5_x in the paper's
    architecture table. The first block of the stage uses strides (2, 2)
    unless ``is_first_layer`` is true; all other blocks use strides (1, 1).
    """
    def f(input):
        for i in range(repetitions):
            strides = (1, 1)
            if i == 0 and not is_first_layer:
                strides = (2, 2)
            input = basic_block(nb_filter, strides)(input)
        return input

    return f

def resnet_18(input_shape=(224, 224, 3), nclass=1000):
    """Build a ResNet-18 model using Keras and print its summary.

    :param input_shape: input shape of the network, default (224, 224, 3).
    :param nclass: number of classes (size of the softmax output), default 1000.
    :return: the ResNet-18 ``Model``.
    """
    input_ = Input(shape=input_shape)

    conv1 = conv2d_bn(input_, 64, kernel_size=(7, 7), strides=(2, 2))
    pool1 = MaxPool2D(pool_size=(3, 3), strides=(2, 2), padding='same')(conv1)

    # Only conv2_x keeps the resolution, because the max-pool above has already
    # downsampled. conv3_x .. conv5_x must start with a stride-2 block, so they
    # are not flagged as the first layer.
    conv2 = residual_block(64, 2, is_first_layer=True)(pool1)
    conv3 = residual_block(128, 2)(conv2)
    conv4 = residual_block(256, 2)(conv3)
    conv5 = residual_block(512, 2)(conv4)

    pool2 = GlobalAvgPool2D()(conv5)
    output_ = Dense(nclass, activation='softmax')(pool2)

    model = Model(inputs=input_, outputs=output_)
    model.summary()

    return model

if __name__ == '__main__':
    model = resnet_18()
    plot_model(model, 'ResNet-18.png')  # 保存模型图

四、其他领域Paper

1.物体检测领域的相关模型：

RCNN（regions with CNN）模型增加特征的穷举范围，然后在其中发现有价值的特征。大概步骤如下：

（1）对于一幅输入的图片，通过选择性搜索，找出2000个候选窗口。

（2）利用CNN对它们提取特征向量，即将这2000个子图统一缩放到227x227，然后进行卷积操作。

（3）利用SVM算法对特征向量进行分类识别。

RCNN中对每一类都进行SVM训练，根据输出的特征类为每一个区打分，最终决定保留或拒绝该区域特征。

SPP-Net：基于空间金字塔池化的优化RCNN方法。空间金字塔池化（Spatial Pyramid Pooling, SPP）最大的特点是，不再关心输入图片的尺寸，而是根据最后的输出类别个数，通过算法来生成多个不同范围的池化层，由它们对输入进行并行池化处理，使最终的输出特征个数与生成类别个数相等，接着再进行类别的比较和判定。

Fast-R-CNN在SPP-Net基础上进行了改进，并将它嫁接到VGG16上所形成的网络，实现了整个网络端到端的训练。

YOLO：能够一次性预测多个位置和类别的模型。先将图片分为SxS个网格，每个网格相当于一个任务，负责检测内部是否有物体的中心点落入该区域，一旦有的话，则启动该任务来检测n个bounding boxes对象。

SSD：比YOLO更快更准的模型，融合了RPN的思想

相关论文：

2.OCR

3.Mobile

4.Image Segmentation

5.GAN

6.文本分类(TC, text-classification)