导盲赛事第二弹: 数据集与训练策略_AI热点日报

本文围绕智能导盲机器狗比赛的数据集与训练策略展开。数据集方面，介绍了train与val数据融合的方法及代码，还提及伪标签训练（半监督学习）的入门、进阶和创新版方式。训练策略上，针对

本文围绕智能导盲机器狗比赛的数据集与训练策略展开。数据集方面，介绍了train与val数据融合的方法及代码，还提及伪标签训练（半监督学习）的入门、进阶和创新版方式。训练策略上，针对PaddleDetection，给出了显卡数量、batchsize、优化器选择等方面的建议，并提供了含droppath的ConvNeXt代码实例。

导盲赛事第二弹: 数据集与训练策略 - 游乐网

0、简介

一只导盲犬能够给盲人带来许多生活上的便利，但是导盲犬的培训周期长，费用高昂，因此，不是所有盲人能够拥有导盲犬，如果有机器狗代替导盲犬，将极大的造福盲人，此项比赛为智能导盲机器狗比赛，通过比赛来考评智能导盲机器狗的智能感知能力及综合运动性能，要求智能四足仿生机器人沿布置好的城市人行道场景走完全程并完成指定任务。

其实说起来高大上，但当我们看到具体的任务时就会发现，这其实就是一个非常简单的目标检测任务。在初赛中，赛事组提供五种不同的目标让你去进行识别，但是为了服务于现实场景，其在模型大小以及检测速度等方面均提出了要求：模型大小限制在200兆以内，检测速度要求不低于20FPS。

目前赛事已经接近尾声了，相信大家应该已经确定自己的路线了，而且网络的改进应该也差不多了，接下来应该就是在数据集与训练策略方面各显神通了，那么现在就让我们一起在数据集与训练策略方面下下功夫吧。

上集指路：导盲赛道思路分享

1、数据集

1.1 train val 数据融合

有些比赛是直接把一堆数据给你然后让你自己划分train与val，去进行训练，那么通常情况下选手会先划分train与val然后去进行训练然后一点点改进网络，当val达到最高值的时候，也就是确定好了网络架构，那么一般选手就会将val与train再融合到一起再去跑一遍最后最好的网络然后去提交，而本赛事中赛事组已经帮你划好了train与val然后用coco数据集给你确定下来了，这样你就免去了第一步，并且保证了train与val的分布均匀，使用coco格式对于你先期确定网络架构有很大帮助，但是如果你对数据处理比较薄弱，那么在将train与val合并的时候就会出问题，但是别担心我这里已经帮你做好了这个工作。

你只需要运行下面的代码就可以得到一个包含train与val所有数据的文件夹了

In [ ]

## 解压文件夹!tar -zxvf data/data137625/WisdomGuide.tar.gz## 安装所需第三方库!pip install lxml!pip install pycocotools#coco2vocfrom pycocotools.coco import COCO import  os, cv2, shutilfrom lxml import etree, objectifyfrom tqdm import tqdmfrom PIL import Imageimport numpy as npimport timeimport jsondef cover_copy(src,dst):    '''    src和dst都必须是文件，该函数是执行覆盖操作    '''    if os.path.exists(dst):        os.remove(dst)        shutil.copy(src,dst)    else:        shutil.copy(src,dst)def coco2voc(basedir='VOCdevkit/COCO_VOC',sourcedir='WisdomGuide'):    """    basedir:用来存放转换后数据和标注文件    sourcedir:用来指定原始COCO数据集的存放位置    """    img_savepath= os.path.join(basedir,'JPEGImages')    ann_savepath=os.path.join(basedir,'Annotations')    main_path = os.path.join(basedir,"annotations")    for p in [basedir,img_savepath,ann_savepath,main_path]:        if os.path.exists(p):            shutil.rmtree(p)            os.makedirs(p)        else:            os.makedirs(p)        datasets = ['train','val']    # datasets = ['val2017']    for dataset in datasets:        start = time.time()        print(f"start {dataset}")        no_ann=[] #用来存放没有标注数据的图片的id,并将这些图片复制到results文件夹中        not_rgb=[] #是灰度图，同样将其保存        annfile = 'instance_{}.json'.format(dataset)        annpath=os.path.join(sourcedir,'annotations',annfile)                print('loading annotations into memory...')        tic = time.time()        with open(annpath, 'r') as f:            dataset_ann = json.load(f)        assert type(            dataset_ann        ) == dict, 'annotation file format {} not supported'.format(            type(dataset))        print('Done (t={:0.2f}s)'.format(time.time() - tic))                coco = COCO(annpath)        classes = dict()        for cat in coco.dataset['categories']:            classes[cat['id']] = cat['name']        imgIds = coco.getImgIds()        # imgIds=imgIds[0:1000]#测试用，抽取10张图片，看下存储效果        for imgId in tqdm(imgIds):            img = coco.loadImgs(imgId)[0]               filename = img['file_name']   
         filepath=os.path.join(sourcedir,dataset,filename)            annIds = coco.getAnnIds(imgIds=img['id'],  iscrowd=None)            anns = coco.loadAnns(annIds)                        if not len(anns):                # print(f"{dataset}:{imgId}该文件没有标注信息，将其复制到{dataset}_noann_result中，以使查看")                no_ann.append(imgId)                result_path = os.path.join(sourcedir,dataset+"_noann_result")                dest_path = os.path.join(result_path,filename)                if not os.path.exists(result_path):                    os.makedirs(result_path)                cover_copy(filepath,dest_path)                continue #如果没有标注信息，则把没有标注信息的图片移动到相关结果文件 noann_result中,来进行查看 ，然后返回做下一张图            #有标注信息，接着往下走，获取标注信息            objs = []            for ann in anns:                name = classes[ann['category_id']]                if 'bbox' in ann:                    # print('bbox in ann',imgId)                    bbox = ann['bbox']                    xmin = (int)(bbox[0])                    ymin = (int)(bbox[1])                    xmax = (int)(bbox[2] + bbox[0])                    ymax = (int)(bbox[3] + bbox[1])                    obj = [name, 1.0, xmin, ymin, xmax, ymax]                    #标错框在这里                    if not(xmin-xmax==0 or ymin-ymax==0):                        objs.append(obj)                 else:                    print(f"{dataset}:{imgId}bbox在标注文件中不存在")# 单张图有多个标注框，某个类别没有框                               annopath = os.path.join(ann_savepath,filename[:-3] + "xml") #生成的xml文件保存路径            dst_path = os.path.join(img_savepath,filename)                       im = Image.open(filepath)            image = np.array(im).astype(np.uint8)            if im.mode != "RGB":             # if img.shape[-1] != 3:                                                # print(f"{dataset}:{imgId}该文件非rgb图，其复制到{dataset}_notrgb_result中，以使查看")                # print(f"img.shape{image.shape} and img.mode{im.mode}")                not_rgb.append(imgId)                result_path 
= os.path.join(sourcedir,dataset+"_notrgb_result")                dest_path = os.path.join(result_path,filename)                if not os.path.exists(result_path):                    os.makedirs(result_path)                cover_copy(filepath,dest_path) #复制到notrgb_result来方便查看                                im=im.convert('RGB')                image = np.array(im).astype(np.uint8)                im.save(dst_path,quality=95)#图片经过转换后，放到我们需要的位置片                im.close()            else:                                cover_copy(filepath, dst_path)#把原始图像复制到目标文件夹            E = objectify.ElementMaker(annotate=False)            anno_tree = E.annotation(                E.folder('VOC'),                E.filename(filename),                E.source(                    E.database('COCO'),                    E.annotation('VOC'),                    E.image('COCO')                ),                E.size(                    E.width(image.shape[1]),                    E.height(image.shape[0]),                    E.depth(image.shape[2])                ),                E.segmented(0)            )            for obj in objs:                E2 = objectify.ElementMaker(annotate=False)                anno_tree2 = E2.object(                    E.name(obj[0]),                    E.pose(),                    E.truncated("0"),                    E.difficult(0),                    E.bndbox(                        E.xmin(obj[2]),                        E.ymin(obj[3]),                        E.xmax(obj[4]),                        E.ymax(obj[5])                    )                )                anno_tree.append(anno_tree2)            etree.ElementTree(anno_tree).write(annopath, pretty_print=True)        print(f"{dataset}该数据集有{len(no_ann)}/{len(imgIds)}张图片没有instance标注信息，已经这些图片复制到{dataset}_noann_result中以使进行查看")        print(f"{dataset}该数据集有{len(not_rgb)}/{len(imgIds)}张图片是非RGB图像，已经这些图片复制到{dataset}_notrgb_result中以使进行查看")        duriation = time.time()-start        
print(f"数据集{dataset}处理完成用时{round(duriation/60,2)}分")#run coco2voccoco2voc()#voc2coco!cd voc2coco && python voc2coco.py ../VOCdevkit/COCO_VOC/Annotations ../VOCdevkit/COCO_VOC/COCO.json!mkdir COCO_all!mv VOCdevkit/COCO_VOC/JPEGImages COCO_all/!mv VOCdevkit/COCO_VOC/COCO.json COCO_all/

登录后复制

1.2 伪标签训练

有些比赛不像导盲赛事这样不提供test数据集、只让你提交网络，而是会把test数据集的照片给你，让你预测完之后把预测结果交上去，这个时候伪标签就有用了。简单来说，就是把预测结果当成标注信息，再和赛事提供的原数据一起训练，一般来说精度会上升零点几个点，但是有的赛事不允许使用伪标签。我看了一眼咱们这个比赛的规则，并没有限制这一点。但是很遗憾，这个赛事并没有给你提供test照片。

但是我相信有些财大气粗，底蕴雄厚的学校肯定积累过这些数据，但是其中可能很多都没有标注，一个个标注也不太现实，那么你就可以尝试使用伪标签训练的形式进行训练，而这种训练方式也有一个比较高大上的名字半监督学习

这里引用知乎的文章简单介绍一下半监督学习的方式

来源伪标签（Pseudo-Labelling）——锋利的匕首

入门版
1. 使用标记数据训练有监督模型M
2. 使用有监督模型M对无标签数据进行预测，得出预测概率P
3. 通过预测概率P筛选高置信度样本
4. 使用有标记数据以及伪标签数据训练新模型M’

登录后复制

导盲赛事第二弹: 数据集与训练策略 - 游乐网

进阶版
1. 使用标记数据训练有监督模型M
2. 使用有监督模型M对无标签数据进行预测，得出预测概率P
3. 通过预测概率P筛选高置信度样本
4. 使用有标记数据以及伪标签数据训练新模型M’
5. 将M替换为M’，重复以上步骤直至模型效果不出现提升

登录后复制

导盲赛事第二弹: 数据集与训练策略 - 游乐网

创新版
1. 使用标记数据训练有监督模型M
2. 使用有监督模型M对无标签数据进行预测，得出预测概率P
3. 将模型损失函数改为 Loss = loss(labeled_data) + alpha * loss(unlabeled_data)
4. 使用有标记数据以及伪标签数据训练新模型M’

登录后复制

导盲赛事第二弹: 数据集与训练策略 - 游乐网

2、训练策略

相信大部分人使用的都是PaddleDetection，那么在训练策略方面有些事情你可能需要注意一下

- 看看你有几个卡：PaddleDetection的配置文件是以八卡为基准进行配置的，因此如果你只有一个显卡的话，最好将学习率也变为原来的1/8；并且如果你的batchsize又缩小了的话，那么建议你也把学习率同步缩小。
- 卡不是越多越好：可以看到，如果是训练COCO数据集的话一般是使用8卡v100起步，但是那是因为coco数据集太大了！像咱们这个数据集，我用两张1080ti，每张卡batchsize设置为8，每个epoch也才333个iter！目前我的实验下来，如果是使用4卡v100，效果是不如两张1080ti的。
- batchsize不是越大越好：目前ai studio已经上线了a100，然后我就震惊了，因为我的网络只有100m大小，分辨率是416x416，我在a100上最后直接把batchsize改到了48，太离谱了！但是当我花了7个小时去训练后，我平静了：a100确实可以大幅缩短训练时间，但是最终精度相比两张1080ti反而低了0.5个点。因此我建议你最高也就把batchsize放到32吧，别一味地放大batchsize了。
- 优化器的选择：PaddleDetection中目前主流的两个优化器分别为Momentum以及AdamW，其中Momentum适合于普通的卷积网络，而AdamW适合一些比较难以训练的网络。或者说，如果你相信奇迹，并且你是炼丹大师，相信能抖出一个奇迹，那么Momentum就比较适合你；但是如果你比较求稳，想找一个比较合适的点就行，那么AdamW就比较适合你。
- 惩戒方法选择：为了防止网络过拟合，一般会在训练策略中加一些措施，比如droppath等，但是droppath需要在网络中加相关代码，一会我再给大家放一个实例。这里聊的是PaddleDetection在配置文件中一般的做法：如果你用Momentum，那么一般会使用L2范数正则进行规范；如果你用的是AdamW，则会使用clip_grad_by_norm进行规范。

下面我就把我的一个训练策略开放给大家做一个参考

# Total number of training epochs.
epoch: 400

LearningRate:
  base_lr: 0.00025
  schedulers:
    # NOTE(review): max_epochs (500) is larger than epoch (400), so training
    # stops before the cosine schedule reaches its end — presumably
    # intentional; confirm.
    - !CosineDecay
      max_epochs: 500
    # Linear warm-up from 0 over the first 5 epochs.
    - !LinearWarmup
      start_factor: 0.
      epochs: 5

OptimizerBuilder:
  # With AdamW, gradient clipping is used instead of an L2 regularizer.
  clip_grad_by_norm: 0.1
  regularizer: false
  optimizer:
    type: AdamW
    weight_decay: 0.0001

登录后复制

2.1 droppath实例使用ConvNeXt进行droppath操作

In [ ]

# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Code was based on https://github.com/facebookresearch/ConvNeXt

# import math
# from numbers import Integral
# import paddle
# import paddle.nn as nn
# import paddle.nn.functional as F
# from ppdet.core.workspace import register, serializable
# from paddle.regularizer import L2Decay
# from paddle.nn.initializer import Uniform
# from paddle import ParamAttr
# from paddle.nn.initializer import Constant
# from paddle.vision.ops import DeformConv2D
# from .name_adapter import NameAdapter
# from ppdet.modeling.shape_spec import ShapeSpec

import paddle
import paddle.nn as nn
import paddle.nn.functional as F

# __all__ = ['ConvNeXt']

trunc_normal_ = nn.initializer.TruncatedNormal(std=0.02)
zeros_ = nn.initializer.Constant(value=0.0)
ones_ = nn.initializer.Constant(value=1.0)


class Identity(nn.Layer):
    """Pass-through layer used where drop-path is disabled."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        return x


def drop_path(x, drop_prob=0.0, training=False):
    """Apply stochastic depth: randomly zero whole samples of ``x``.

    Each sample along the first axis is kept with probability
    ``1 - drop_prob`` and rescaled by ``1 / (1 - drop_prob)``.

    Args:
        x: Input tensor; the first axis is treated as the sample axis.
        drop_prob: Probability of dropping a sample. ``None`` or ``0``
            disables dropping.
        training: Dropping is only applied when this is true.

    Returns:
        ``x`` unchanged when dropping is disabled, otherwise the masked and
        rescaled tensor.
    """
    # ``not drop_prob`` also covers None, which is DropPath's default.
    if not drop_prob or not training:
        return x
    # Build keep_prob in x's dtype so the division below does not mix dtypes
    # when x is not float32.
    keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype)
    # One random value per sample, broadcast over all remaining axes.
    shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1)
    random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype)
    random_tensor = paddle.floor(random_tensor)  # binarize
    output = x.divide(keep_prob) * random_tensor
    return output


class DropPath(nn.Layer):
    """Layer wrapper around :func:`drop_path` that follows ``self.training``."""

    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training)


class Block(nn.Layer):
    """ ConvNeXt Block. There are two equivalent implementations:
    (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
    (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
    This class uses implementation (2).

    Args:
        dim (int): Number of input channels.
        drop_path (float): Stochastic depth rate. Default: 0.0
        layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
    """

    def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6):
        super().__init__()
        self.dwconv = nn.Conv2D(dim, dim, kernel_size=7, padding=3,
                                groups=dim)  # depthwise conv
        self.norm = LayerNorm(dim, epsilon=1e-6)
        self.pwconv1 = nn.Linear(
            dim, 4 * dim)  # pointwise/1x1 convs, implemented with linear layers
        self.act = nn.GELU()
        self.pwconv2 = nn.Linear(4 * dim, dim)
        # Learnable per-channel scale; disabled when the init value is <= 0.
        self.gamma = paddle.create_parameter(
            shape=[dim],
            dtype='float32',
            default_initializer=nn.initializer.Constant(
                value=layer_scale_init_value)
        ) if layer_scale_init_value > 0 else None
        self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()

    def forward(self, x):
        shortcut = x
        x = self.dwconv(x)
        x = x.transpose([0, 2, 3, 1])  # (N, C, H, W) -> (N, H, W, C)
        x = self.norm(x)
        x = self.pwconv1(x)
        x = self.act(x)
        x = self.pwconv2(x)
        if self.gamma is not None:
            x = self.gamma * x
        x = x.transpose([0, 3, 1, 2])  # (N, H, W, C) -> (N, C, H, W)
        # Residual connection; the branch may be dropped during training.
        x = shortcut + self.drop_path(x)
        return x


class LayerNorm(nn.Layer):
    """ LayerNorm that supports two data formats: channels_last (default) or channels_first.
    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with
    shape (batch_size, height, width, channels) while channels_first corresponds to inputs
    with shape (batch_size, channels, height, width).

    Raises:
        NotImplementedError: If ``data_format`` is neither ``"channels_last"``
            nor ``"channels_first"``.
    """

    def __init__(self,
                 normalized_shape,
                 epsilon=1e-6,
                 data_format="channels_last"):
        super().__init__()
        self.weight = paddle.create_parameter(shape=[normalized_shape],
                                              dtype='float32',
                                              default_initializer=ones_)
        self.bias = paddle.create_parameter(shape=[normalized_shape],
                                            dtype='float32',
                                            default_initializer=zeros_)
        self.epsilon = epsilon
        self.data_format = data_format
        if self.data_format not in ["channels_last", "channels_first"]:
            raise NotImplementedError
        self.normalized_shape = (normalized_shape, )

    def forward(self, x):
        if self.data_format == "channels_last":
            return F.layer_norm(x, self.normalized_shape, self.weight,
                                self.bias, self.epsilon)
        elif self.data_format == "channels_first":
            # Normalize over the channel axis (axis 1) by hand.
            u = x.mean(1, keepdim=True)
            s = (x - u).pow(2).mean(1, keepdim=True)
            x = (x - u) / paddle.sqrt(s + self.epsilon)
            x = self.weight[:, None, None] * x + self.bias[:, None, None]
            return x


# @register
# @serializable
class ConvNeXt(nn.Layer):
    """ ConvNeXt backbone.

    A Paddle impl of : `A ConvNet for the 2020s`  -
      https://arxiv.org/pdf/2201.03545.pdf

    ``forward`` returns a list with the feature map of every stage whose
    index is listed in ``out_channals``.

    Args:
        in_chans (int): Number of input image channels. Default: 3
        out_channals (list(int)): Indices (0-3) of the stages whose outputs
            are returned. Default: [1, 2, 3]
        depths (list(int)): Number of blocks at each stage. Default: [3, 3, 9, 3]
        dims (list(int)): Feature dimension at each stage. Default: [96, 192, 384, 768]
        drop_path_rate (float): Stochastic depth rate. Default: 0.
        layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
    """

    def __init__(
        self,
        in_chans=3,
        out_channals=[1, 2, 3],
        depths=[3, 3, 9, 3],
        dims=[96, 192, 384, 768],
        drop_path_rate=0.,
        layer_scale_init_value=1e-6,
    ):
        super().__init__()
        self._out_strides = [4, 8, 16, 32]
        self.dims = dims
        self.out_channals = out_channals

        self.downsample_layers = nn.LayerList(
        )  # stem and 3 intermediate downsampling conv layers
        stem = nn.Sequential(
            nn.Conv2D(in_chans, dims[0], kernel_size=4, stride=4),
            LayerNorm(dims[0], epsilon=1e-6, data_format="channels_first"))
        self.downsample_layers.append(stem)
        for i in range(3):
            downsample_layer = nn.Sequential(
                LayerNorm(dims[i], epsilon=1e-6, data_format="channels_first"),
                nn.Conv2D(dims[i], dims[i + 1], kernel_size=2, stride=2),
            )
            self.downsample_layers.append(downsample_layer)

        self.stages = nn.LayerList(
        )  # 4 feature resolution stages, each consisting of multiple residual blocks
        # Drop-path rate grows linearly from 0 to drop_path_rate over all blocks.
        dp_rates = [
            rate.item()
            for rate in paddle.linspace(0, drop_path_rate, sum(depths))
        ]
        cur = 0
        for i in range(4):
            stage = nn.Sequential(*[
                Block(dim=dims[i],
                      drop_path=dp_rates[cur + j],
                      layer_scale_init_value=layer_scale_init_value)
                for j in range(depths[i])
            ])
            self.stages.append(stage)
            cur += depths[i]

        # NOTE(review): this final norm layer is not used in forward(); it is
        # kept so the set of parameter names stays unchanged.
        self.norm = nn.LayerNorm(dims[-1], epsilon=1e-6)  # final norm layer

        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Truncated-normal init for conv/linear weights, zeros for biases."""
        if isinstance(m, (nn.Conv2D, nn.Linear)):
            trunc_normal_(m.weight)
            if m.bias is not None:
                zeros_(m.bias)

    # @property
    def out_shape(self):
        """Return a ShapeSpec (channels, stride) for every returned stage.

        NOTE(review): ``ShapeSpec`` is not imported in this standalone
        version, so calling this raises NameError. Re-enable the commented
        ``from ppdet.modeling.shape_spec import ShapeSpec`` import (and the
        ``@property`` / ``@register`` decorators) when using it inside
        PaddleDetection.
        """
        return [
            ShapeSpec(
                channels=self.dims[i], stride=self._out_strides[i])
            for i in self.out_channals
        ]

    def forward(self, x):
        outs = []
        for i in range(4):
            x = self.downsample_layers[i](x)
            x = self.stages[i](x)
            if i in self.out_channals:
                outs.append(x)
        return outs


if __name__ == "__main__":
    model = ConvNeXt()
    paddle.summary(model, (1, 3, 640, 640))

登录后复制