深度学习从数据集收集到测试1

原创

位沁CSDN 2022-08-18 17:39:31 博主文章分类：深度学习 ©著作权

©著作权归作者所有：来自51CTO博客作者位沁CSDN的原创作品，请联系作者获取转载授权，否则将追究法律责任

1、数据集获取
郑重声明：虽然爬虫爬取数据不好，但是本人是正规使用，一切为了学习

import requests
import os

def getManyPages(keyword, pages, timeout=10):
    """Query Baidu image search and collect the result list of every page.

    Args:
        keyword: Search term; sent as both ``queryWord`` and ``word``.
        pages: Number of result pages to request (30 results per page).
            Zero or a negative value performs no request at all.
        timeout: Seconds to wait for each HTTP response. Without a timeout
            ``requests`` may block forever on a stalled connection.

    Returns:
        A list with one entry per requested page. Each entry is the ``data``
        list of that page's JSON response, or an empty list when the
        response has no ``data`` field, so callers can always iterate it.
    """
    url = 'https://image.baidu.com/search/acjson'
    results = []
    # NOTE(review): offsets start at 30, so results 0-29 are never requested
    # -- confirm this is intended.
    for offset in range(30, 30 * pages + 30, 30):
        query = {
            'tn': 'resultjson_com',
            'ipn': 'rj',
            'ct': 201326592,
            'is': '',
            'fp': 'result',
            'queryWord': keyword,
            'cl': 2,
            'lm': -1,
            'ie': 'utf-8',
            'oe': 'utf-8',
            'adpicid': '',
            'st': -1,
            'z': '',
            'ic': 0,
            'word': keyword,
            's': '',
            'se': '',
            'tab': '',
            'width': '',
            'height': '',
            'face': 0,
            'istype': 2,
            'qc': '',
            'nc': 1,
            'fr': '',
            'pn': offset,
            'rn': 30,
            'gsm': '1e',
            '1488942260214': '',
        }
        response = requests.get(url, params=query, timeout=timeout)
        # A missing 'data' field would otherwise put None into the result
        # and break any caller that loops over the pages.
        results.append(response.json().get('data') or [])

    return results


def getImg(dataList, localPath):
    """Download the thumbnail of every search result into ``localPath``.

    Args:
        dataList: Iterable of pages; each page is an iterable of dicts that
            may carry a ``thumbURL`` key. ``None`` or empty pages are skipped.
        localPath: Directory the images are written to. It is created
            (including missing parent folders) when it does not exist.

    Files are named ``0.jpg``, ``1.jpg``, ... in download order. Entries
    without a ``thumbURL`` are reported and do not consume a number.
    """
    # makedirs also creates missing parents, which os.mkdir cannot do.
    os.makedirs(localPath, exist_ok=True)

    count = 0
    for page in dataList:
        for item in page or []:
            thumb_url = item.get('thumbURL')
            if thumb_url is None:
                print('图片链接不存在')
                continue
            print('正在下载：%s' % thumb_url)
            ir = requests.get(thumb_url, timeout=10)
            # os.path.join keeps the file inside localPath even when the
            # caller omits the trailing slash; 'with' closes the handle.
            target = os.path.join(localPath, '%d.jpg' % count)
            with open(target, 'wb') as out_file:
                out_file.write(ir.content)
            count += 1

if __name__ == '__main__':
    # First argument: search keyword; second argument: number of pages.
    search_results = getManyPages('郑爽', 14)
    # Second argument: directory the downloaded images are saved into.
    getImg(search_results, 'D:/opencv_image/zhongyao/Bai_fu/')

以上是获取图片数据的方法，可以利用爬虫爬取自己想要的数据
弊端：有时网页爬取的图片不能使用，因为网页的图像不一定是图片格式，批量使用时有可能出现问题，因此要自己手动处理一遍（说的不是resize，而是图片格式问题，要自己处理一遍，尤其是当提示你输入有问题的时候）
深度学习中图片数据集命名避免空格和括号
2、数据预处理

#代码功能：将原始图片转换成训练需要的大小shape，并将其保存

"""首先，我们需要准备训练的原始数据，本次训练为图像分类识别，
从网上随机的下载了Dog的四种类别：
在训练之前，需要做的就是进行图像的预处理，即将这些大小不一的原始图片转换成我们训练需要的shape。
下载的原始图片分别放到同一文件的不同文件夹下 
"""

import os
import tensorflow as tf
from PIL import Image

# This part covers: writing the TFRecords file, reading the tfrecord data back
# into image/label pairs, then printing and saving the decoded images.
# Location of the original images.
origin_images = 'D:/opencv_image/zhongyao/'
# Location where the generated images are saved.
gen_images = 'D:/opencv_image/2/'
# Classes to recognise; each name must match the name of that class's folder.
# Kept as a list (not a set) so that enumerate() gives every class the same
# label index on every run: the iteration order of a set of strings changes
# between interpreter runs because of hash randomisation.
classes = ['Bai_fu', 'Bai_xian', 'Bai_zhi', 'Dan_pi']


# Total number of samples.
num_samples = 400


#=====================================================================================================
    
# 本质上是将每一张图片转化成tfrecord数据

# 制作TFRecords数据
def creat_record():
    """Write every class image and its label into ``Our_data_set.tfrecords``.

    For each entry of ``classes`` the folder ``origin_images/<name>`` is
    listed; every readable image is converted to RGB, resized to 64x64 and
    stored as raw bytes (``img_raw``) together with the integer index of its
    class (``label``). Files that cannot be opened as images are reported
    and skipped.
    """
    # A set has no stable iteration order across runs, so sort it to keep the
    # class -> label mapping reproducible; ordered containers are used as is.
    if isinstance(classes, (set, frozenset)):
        class_names = sorted(classes)
    else:
        class_names = list(classes)

    # The context manager closes the writer even when an error occurs midway.
    with tf.python_io.TFRecordWriter("Our_data_set.tfrecords") as writer:
        for index, name in enumerate(class_names):
            class_path = os.path.join(origin_images, name)
            for img_name in os.listdir(class_path):
                img_path = os.path.join(class_path, img_name)
                try:
                    with Image.open(img_path) as img:
                        # read_and_decord reshapes to [64, 64, 3], so force
                        # three channels; RGBA, palette or greyscale images
                        # would otherwise yield a different byte count.
                        img_raw = img.convert('RGB').resize((64, 64)).tobytes()
                except OSError as err:
                    # Covers sub-folders and files that are not valid images.
                    print('skipping unreadable image %s: %s' % (img_path, err))
                    continue

                example = tf.train.Example(
                    features=tf.train.Features(feature={
                        "label": tf.train.Feature(
                            int64_list=tf.train.Int64List(value=[index])),
                        'img_raw': tf.train.Feature(
                            bytes_list=tf.train.BytesList(value=[img_raw])),
                    }))
                # The Example is serialized to a string before it is written.
                writer.write(example.SerializeToString())
    
#=================================================================================================


# 文件的读取
def read_and_decord(filename):
    """Build the ops that read one (image, label) pair from a TFRecords file.

    Args:
        filename: Path of the TFRecords file to read.

    Returns:
        A tuple ``(img, label)`` of tensors: the image decoded as uint8 and
        reshaped to [64, 64, 3], and the label cast to int32.
    """
    # Queue of file names to read from; no limit on the number of reads is set.
    file_queue = tf.train.string_input_producer([filename])

    # The reader yields a (key, value) pair; only the serialized value is used.
    record_reader = tf.TFRecordReader()
    serialized_example = record_reader.read(file_queue)[1]

    # Description of the two fields stored in every serialized example.
    feature_spec = {
        'label': tf.FixedLenFeature([], tf.int64),
        'img_raw': tf.FixedLenFeature([], tf.string),
    }
    parsed = tf.parse_single_example(
        serialized=serialized_example,
        features=feature_spec)

    # Raw bytes -> uint8 values -> 64x64 image with 3 channels.
    pixels = tf.decode_raw(parsed['img_raw'], tf.uint8)
    image = tf.reshape(pixels, [64, 64, 3])

    class_id = tf.cast(parsed['label'], tf.int32)

    return image, class_id
    
    
    
#========================================================================
    
if __name__=='__main__':
    # Write the TFRecords file, then read it back and save each decoded image.
    creat_record()
    batch = read_and_decord('Our_data_set.tfrecords')
    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())

    # img.save does not create missing folders, so make sure the target exists.
    os.makedirs(gen_images, exist_ok=True)

    # Leaving the with-block closes the session; no explicit close is needed.
    with tf.Session() as sess:
        sess.run(init_op)

        coord = tf.train.Coordinator()           # coordinator for the queue threads
        threads = tf.train.start_queue_runners(coord=coord)   # start filling the queue

        try:
            for i in range(num_samples):
                example, lab = sess.run(batch)    # fetch one image and its label
                img = Image.fromarray(example, 'RGB')   # array -> image
                img.save(os.path.join(gen_images, str(i) + 'samples' + str(lab) + '.jpg'))
                print(example, lab)
        finally:
            # Stop and wait for the queue threads even when the loop fails.
            coord.request_stop()
            coord.join(threads)

数据预处理目的是将输入数据进行格式统一，将其大小进行统一，方便下一步操作。
（1）首先找到原始图片的位置，origin_images = 'D:/opencv_image/zhongyao/'，这个位置 应该是各个分类的图片的文件夹 。
（2）定义resize后图片的 位置gen_images = 'D:/opencv_image/2/'
（3）定义需要识别的数据集分类：
classes = {'Bai_fu', 'Bai_xian', 'Bai_zhi', 'Dan_pi'}      # 这些识别的类型必须与该类的文件夹名称一致
（4）定义样本总数：num_samples = 400，样本总数是resize后所有图片的总数，即各个分类图片数的总和。
（5）制作TFRecords数据，
   注意：在定义写入器时

writer = tf.python_io.TFRecordWriter("Our_data_set.tfrecords") # 定义一个写入器 写入器名字任意但必须是.tfrecords后缀
（6）enumerate函数

>>> seasons = ['Spring', 'Summer', 'Fall', 'Winter']
>>> list(enumerate(seasons))
[(0, 'Spring'), (1, 'Summer'), (2, 'Fall'), (3, 'Winter')]
>>> list(enumerate(seasons, start=1))       # 下标从 1 开始
[(1, 'Spring'), (2, 'Summer'), (3, 'Fall'), (4, 'Winter')]

# 制作TFRecords数据

def creat_record():
    """Write every class image and its label into ``Our_data_set.tfrecords``.

    For each entry of ``classes`` the folder ``origin_images/<name>`` is
    listed; every readable image is converted to RGB, resized to 64x64 and
    stored as raw bytes (``img_raw``) together with the integer index of its
    class (``label``). Files that cannot be opened as images are reported
    and skipped.
    """
    # A set has no stable iteration order across runs, so sort it to keep the
    # class -> label mapping reproducible; ordered containers are used as is.
    if isinstance(classes, (set, frozenset)):
        class_names = sorted(classes)
    else:
        class_names = list(classes)

    # The context manager closes the writer even when an error occurs midway.
    with tf.python_io.TFRecordWriter("Our_data_set.tfrecords") as writer:
        for index, name in enumerate(class_names):
            class_path = os.path.join(origin_images, name)
            for img_name in os.listdir(class_path):
                img_path = os.path.join(class_path, img_name)
                try:
                    with Image.open(img_path) as img:
                        # The reader reshapes to [64, 64, 3], so force three
                        # channels; RGBA, palette or greyscale images would
                        # otherwise yield a different byte count.
                        img_raw = img.convert('RGB').resize((64, 64)).tobytes()
                except OSError as err:
                    # Covers sub-folders and files that are not valid images.
                    print('skipping unreadable image %s: %s' % (img_path, err))
                    continue

                example = tf.train.Example(
                    features=tf.train.Features(feature={
                        "label": tf.train.Feature(
                            int64_list=tf.train.Int64List(value=[index])),
                        'img_raw': tf.train.Feature(
                            bytes_list=tf.train.BytesList(value=[img_raw])),
                    }))
                # The Example is serialized to a string before it is written.
                writer.write(example.SerializeToString())

（7）定位到每一张图之后，打开图片 img = Image.open(img_path)，重新设定大小 img.resize((size, size))，再转化成原生bytes：img.tobytes()
（8）按照example协议转化：example = tf.train.Example():
（9）写入tfrecords文件：writer.write(example.SerializeToString())
（10）关闭writer.close()
具体来说：Tfrecord文件在写入时采用byte格式。上面的代码中，

example = tf.train.Example(
        features=tf.train.Features(feature={
             "label":tf.train.Feature(int64_list = tf.train.Int64List(value = [index])),
             'img_raw':tf.train.Feature(bytes_list = tf.train.BytesList(value = [img_raw]))
                    }))

tf.train.Example 协议内存块包含了Features字段，通过feature将图片的二进制数据和label进行统一封装，然后将example协议内存块转化为字符串，

3、读取和decode数据集

1)函数中创建文件队列：file_queue = tf.train.string_input_producer([filename])
2)创建阅读器： reader = tf.TFRecordReader()
3)阅读器从队列中一个一个地读出：
_, example = reader.read(file_queue)。
4)文件解析器将example解析。都是二进制文件。需要解码decode。
5)将image从二进制中解码：img = tf.decode_raw(img, tf.uint8)，还原：img = tf.reshape(img, [64,64,3])
    
   # 文件的读取
def read_and_decord(filename):
    """Build the ops that read one (image, label) pair from a TFRecords file.

    Args:
        filename: Path of the TFRecords file to read.

    Returns:
        A tuple ``(img, label)`` of tensors: the image decoded as uint8 and
        reshaped to [64, 64, 3], and the label cast to int32.
    """
    # Create the file-name queue; no limit on the number of reads is set.
    filename_queue = tf.train.string_input_producer([filename])
    # Create a reader for the file queue.
    reader = tf.TFRecordReader()
    # read() returns a (key, value) pair; only the serialized value is needed.
    _, serialized_examples = reader.read(filename_queue)
    # Parse the serialized example back into its two stored features.
    features = tf.parse_single_example(
            serialized=serialized_examples,
            features={
                    'label': tf.FixedLenFeature([], tf.int64),
                    'img_raw': tf.FixedLenFeature([], tf.string)
                    })
    label = features['label']
    img = features['img_raw']
    img = tf.decode_raw(img, tf.uint8)
    img = tf.reshape(img, [64, 64, 3])

    label = tf.cast(label, tf.int32)

    # Hand both tensors back; a bare ``return`` would give callers None.
    return img, label

然后队列取值
1)Tfrecords在sess中运行时，需要按队列来取。因此，需要进行队列取值
2)队列取值一般有协调器和多线程：coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord)
在sess中写入这两句话之后，可以一次读取上述2函数中的数据了。但是这个返回值img是array类型，需要转换
3)Array转换成image：img = Image.fromarray(img, 'RGB')
4)然后就可以保存、显示等操作了
5)数据读取完之后，需要对队列和线程进行操作：
coord.request_stop() 关闭协调器
coord.join(threads) 等待所有队列线程结束
sess.close() 关闭sess。注意：如果使用 with tf.Session() as sess: 的写法，退出with代码块时会自动关闭sess，无需再显式调用；不使用with时则必须手动关闭。
将现有的图片集合构建成自己的数据集，步骤：
1、生成tfrecord文件
2、定义record reader解析tfrecord文件
3、构造一个批生成器（batcher）
4、构建其他的操作
5、初始化所有的操作
6、启动QueueRunner

制作自己的数据集，用tensorflow自带的TFRecords格式。它能自动的为你打上标签。

if __name__=='__main__':
    # Write the TFRecords file, then read it back and save each decoded image.
    creat_record()
    batch = read_and_decord('Our_data_set.tfrecords')
    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())

    # img.save does not create missing folders, so make sure the target exists.
    os.makedirs(gen_images, exist_ok=True)

    # Leaving the with-block closes the session; no explicit close is needed.
    with tf.Session() as sess:
        sess.run(init_op)

        coord = tf.train.Coordinator()           # coordinator for the queue threads
        threads = tf.train.start_queue_runners(coord=coord)   # start filling the queue

        try:
            for i in range(num_samples):
                example, lab = sess.run(batch)    # fetch one image and its label
                img = Image.fromarray(example, 'RGB')   # array -> image
                img.save(os.path.join(gen_images, str(i) + 'samples' + str(lab) + '.jpg'))
                print(example, lab)
        finally:
            # Stop and wait for the queue threads even when the loop fails.
            coord.request_stop()
            coord.join(threads)