Rumor has it that the following is all it takes to train on multiple GPUs together? Not a chance... CUDA_VISIBLE_DEVICES only controls which GPUs the process can see; in TensorFlow 1.x the whole graph is still placed on a single GPU unless the ops are explicitly assigned to each device.
# In the code
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'  # train on GPUs 0,1,2,3 together? NO!
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'       # use the CPU only
# On the command line
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py  # NO!
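If you want to see what a given CUDA_VISIBLE_DEVICES setting actually exposes to TensorFlow, a quick check like the following works in TF 1.x (a minimal sketch; device_lib is TensorFlow's device-listing helper, the value '0,1' is just an example, and the exact device names printed can differ between versions):

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'  # make only GPU 0 and GPU 1 visible to this process

from tensorflow.python.client import device_lib

# Lists every device TensorFlow can see: the CPU plus the visible GPUs,
# which are renumbered from 0 regardless of their physical IDs.
for device in device_lib.list_local_devices():
    print(device.device_type, device.name)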
The right way to do it (modeled on tensorflow-models/tutorials/image/cifar10/cifar10_multi_gpu_train.py):
create_tfrecord.py
import tensorflow as tf
import numpy as np
from tensorflow.examples.tutorials.mnist import input_data

def int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

mnist = input_data.read_data_sets('./MNIST_data', dtype=tf.uint8, one_hot=True)
images = mnist.train.images
labels = mnist.train.labels
size = images.shape[1]  # number of pixels per image (784)
num_examples = mnist.train.num_examples

# Path of the output TFRecord file
filename = './output.tfrecord'
# Create a writer for the TFRecord file
writer = tf.python_io.TFRecordWriter(filename)
for i in range(num_examples):
    # Convert the image matrix to a byte string
    image_raw = images[i].tostring()
    # Wrap one sample in an Example protocol buffer holding all of its information
    example = tf.train.Example(features=tf.train.Features(feature={
        'size': int64_feature(size),
        'label': int64_feature(np.argmax(labels[i])),
        'image_raw': bytes_feature(image_raw)
    }))
    # Write the Example to the TFRecord file
    writer.write(example.SerializeToString())
writer.close()
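To sanity-check what was just written, the records can be read straight back with tf.python_io.tf_record_iterator; a minimal sketch using the same feature names as above:

import numpy as np
import tensorflow as tf

# Read the first serialized Example back and parse it without building a graph.
serialized = next(tf.python_io.tf_record_iterator('./output.tfrecord'))
example = tf.train.Example()
example.ParseFromString(serialized)

feature = example.features.feature
print('size :', feature['size'].int64_list.value[0])    # 784
print('label:', feature['label'].int64_list.value[0])
image = np.frombuffer(feature['image_raw'].bytes_list.value[0], dtype=np.uint8)
print('image:', image.shape)                             # (784,)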
multi_gpu_train.py
import tensorflow as tf
import time
from datetime import datetime

INPUT_NODE = 28*28
NODE1 = 500
OUTPUT_NODE = 10
BASE_LEARNING_RATE = 0.001
DECAY_RATE = 0.99
BATCH_SIZE = 32
MOVING_AVG_DECAY = 0.99
TRAIN_STEPS = 30000
REGULAR_RATIO = 0.0001
N_GPU = 1  # number of GPUs to place towers on; set this to however many GPUs you have
def get_weight_variables(shape, regularizer):
    w = tf.get_variable('w', shape, dtype=tf.float32,
                        initializer=tf.truncated_normal_initializer(stddev=0.1))
    # Add this variable's regularization loss to the LOSSES collection
    tf.add_to_collection(tf.GraphKeys.LOSSES, regularizer(w))
    return w

def inference(input_tensor, regularizer):
    with tf.variable_scope('layer1'):
        weight = get_weight_variables([INPUT_NODE, NODE1], regularizer)
        bias = tf.get_variable('bias', [NODE1], dtype=tf.float32)
        layer1 = tf.nn.relu(tf.matmul(input_tensor, weight) + bias)
    with tf.variable_scope('layer2'):
        weight = get_weight_variables([NODE1, OUTPUT_NODE], regularizer)
        bias = tf.get_variable('bias', [OUTPUT_NODE], dtype=tf.float32)
        layer2 = tf.matmul(layer1, weight) + bias
    return layer2
def _parse_image_function(example_proto):
    # Dictionary describing the features stored in each Example.
    image_feature_description = {
        'size': tf.FixedLenFeature([], tf.int64),
        'label': tf.FixedLenFeature([], tf.int64),
        'image_raw': tf.FixedLenFeature([], tf.string),
    }
    # Parse the input tf.Example proto using the dictionary above; the result is a dict.
    feat_dict = tf.parse_single_example(example_proto, image_feature_description)
    decoded_img = tf.decode_raw(feat_dict['image_raw'], tf.uint8)
    reshaped_img = tf.cast(tf.reshape(decoded_img, [784]), tf.float32)
    label = tf.cast(feat_dict['label'], tf.int32)
    return reshaped_img, label
def get_input():
    # Read the TFRecord file(s); more than one file can be listed here
    input_files = ['output.tfrecord']
    dataset = tf.data.TFRecordDataset(input_files).map(_parse_image_function)
    dataset = dataset.shuffle(buffer_size=10000).repeat(100).batch(BATCH_SIZE)
    iterator = dataset.make_one_shot_iterator()
    img, label = iterator.get_next()
    return img, label
# Loss function: for the given training data, regularizer and name scope, compute the total
# loss inside that name scope. The scope is needed because every GPU adds its regularization
# losses to the same LOSSES collection; without filtering by scope, each GPU's total loss
# would also include the regularization losses created on every other GPU (so every tower's
# "regularization loss" would really be the sum over all towers).
# See the illustration right after this function.
def get_loss(x, y_gt, regularizer, scope, reuse_variable=None):
    with tf.variable_scope(tf.get_variable_scope(), reuse=reuse_variable):
        y_pred = inference(x, regularizer)
    # Cross-entropy loss
    cross_entropy = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_gt, logits=y_pred))
    # Regularization loss created on the current GPU only -- the scope argument matters!
    regularization_loss = tf.add_n(tf.get_collection(tf.GraphKeys.LOSSES, scope))
    # Total loss
    loss = cross_entropy + regularization_loss
    return loss
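# Illustration of the scope filtering above (tower names as created in train() below):
#   tf.get_collection(tf.GraphKeys.LOSSES)            -> regularization losses added by every
#                                                         tower, e.g. 'GPU_0/...', 'GPU_1/...'
#   tf.get_collection(tf.GraphKeys.LOSSES, 'GPU_0/')  -> only the losses whose op names start
#                                                         with the tower's name scope 'GPU_0/'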
# Average, for every variable, the gradients computed on the different GPUs; the result is
# used to update the variables. tower_gradients has the form
# [gradients from GPU 0, gradients from GPU 1, ..., gradients from GPU N-1],
# where each entry is the (gradient, variable) list returned by compute_gradients.
# See the illustration right after this function.
def average_gradients(tower_gradients):
    average_grads = []
    # Enumerate every variable together with the gradients computed for it on each GPU
    for grad_and_vars in zip(*tower_gradients):  # gradients of the same variable on different GPUs
        # Average the gradient over all GPUs
        grads = []
        for g, _ in grad_and_vars:
            expanded_g = tf.expand_dims(g, 0)
            grads.append(expanded_g)
        grad = tf.concat(grads, 0)
        grad = tf.reduce_mean(grad, 0)
        v = grad_and_vars[0][1]
        grad_and_var = (grad, v)
        # Pair the variable with its averaged gradient
        average_grads.append(grad_and_var)
    # Return the averaged gradient of every variable, to be used for the update
    return average_grads
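# Illustration of what zip(*tower_gradients) does (placeholder names, two GPUs assumed):
#   tower_gradients = [[(g0_w, w), (g0_b, b)],   # gradients computed on GPU 0
#                      [(g1_w, w), (g1_b, b)]]   # gradients computed on GPU 1
#   zip(*tower_gradients) yields ((g0_w, w), (g1_w, w)) and then ((g0_b, b), (g1_b, b)),
#   i.e. the gradients of the same variable on different GPUs are grouped before averaging.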
def train():
    # Keep the simple operations on the CPU; only the training of the network runs on the GPUs
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        x, y_gt = get_input()
        regularizer = tf.contrib.layers.l2_regularizer(REGULAR_RATIO)
        global_step = tf.Variable(0, trainable=False)
        learning_rate = tf.train.exponential_decay(
            BASE_LEARNING_RATE, global_step, 55000/BATCH_SIZE, DECAY_RATE)
        opt = tf.train.GradientDescentOptimizer(learning_rate)
        tower_grads = []
        reuse_variables = False
        # Run the optimization of the network on the different GPUs
        for i in range(N_GPU):
            # Pin this tower's computation to one GPU
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('GPU_%d' % i) as scope:
                    cur_loss = get_loss(x, y_gt, regularizer, scope, reuse_variables)
                    # After the variables have been created once, switch reuse to True so that
                    # inference(x) on the remaining GPUs uses the same set of parameters
                    reuse_variables = True
                    grads = opt.compute_gradients(cur_loss)
                    tower_grads.append(grads)
        # Average the gradients of every variable over the GPUs
        grads = average_gradients(tower_grads)
        for grad, var in grads:
            if grad is not None:
                tf.summary.histogram('gradients_on_average/%s' % var.op.name, grad)
        # Apply the averaged gradients to update the parameters
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
        for var in tf.trainable_variables():
            tf.summary.histogram(var.op.name, var)
        ema = tf.train.ExponentialMovingAverage(MOVING_AVG_DECAY, global_step)
        variables_to_average = (tf.trainable_variables() + tf.moving_average_variables())
        avg_op = ema.apply(variables_to_average)
        with tf.control_dependencies([apply_gradient_op, avg_op]):
            train_op = tf.no_op('train')
        saver = tf.train.Saver()
        summary_op = tf.summary.merge_all()
        writer = tf.summary.FileWriter('./log', graph=tf.get_default_graph())
        with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                              log_device_placement=True)) as sess:
            sess.run(tf.global_variables_initializer())
            for i in range(TRAIN_STEPS):
                # Run one training step and record how long it takes
                start_time = time.time()
                step, loss_, _ = sess.run([global_step, cur_loss, train_op])
                duration = time.time() - start_time
                # Every so often report the progress and measure the training speed
                if step != 0 and step % 1000 == 0:
                    # Number of training examples consumed in this step: each GPU processes
                    # one batch per training op, so it is BATCH_SIZE * N_GPU
                    num_examples_per_step = BATCH_SIZE * N_GPU
                    # duration is the time the training op took, so the number of examples
                    # processed per second is:
                    examples_per_sec = num_examples_per_step / duration
                    # Each GPU works through one batch during the step, so the time spent
                    # on a single batch is:
                    sec_per_batch = duration / N_GPU
                    # Print the training progress
                    format_str = '%s: step %d, loss=%.2f (%.1f examples/sec; %.3f sec/batch)'
                    print(format_str % (datetime.now(), step, loss_, examples_per_sec, sec_per_batch))
                    # Write summaries so the training can be visualized in TensorBoard
                    summary = sess.run(summary_op)
                    writer.add_summary(summary, step)
                if i % 1000 == 0 or (step + 1) == TRAIN_STEPS:
                    print('step: %d, loss: %f' % (step, loss_))
                    saver.save(sess, './model/model.ckpt')
        writer.close()
def main():
    train()

if __name__ == '__main__':
    main()
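After training, the checkpoint saved to ./model/model.ckpt also holds the shadow variables maintained by the ExponentialMovingAverage. Below is a minimal evaluation sketch, assuming it sits next to the code above so it can reuse inference(), get_input() and the constants; variables_to_restore() maps each weight to its shadow name so the averaged values are the ones loaded:

def evaluate():
    with tf.Graph().as_default():
        # For illustration this reuses the training input pipeline; a real evaluation
        # would read a separate validation/test TFRecord file.
        x, y_gt = get_input()
        y_pred = inference(x, tf.contrib.layers.l2_regularizer(REGULAR_RATIO))
        correct = tf.equal(tf.argmax(y_pred, 1), tf.cast(y_gt, tf.int64))
        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

        # Restore the moving-average (shadow) values into the weights and biases.
        ema = tf.train.ExponentialMovingAverage(MOVING_AVG_DECAY)
        saver = tf.train.Saver(ema.variables_to_restore())

        with tf.Session() as sess:
            saver.restore(sess, './model/model.ckpt')
            print('accuracy on one batch: %f' % sess.run(accuracy))

Evaluating with the moving-average weights rather than the raw ones is the usual reason for keeping the EMA in the training graph; the averaged parameters often generalize slightly better.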