Building on the fundamentals covered in the previous posts, let's skip the preamble and get straight to practice.
1 Project practice
Following the GitHub project referenced earlier: the previous post used Docker to prepare 5 related development environments, so this is a good opportunity to try them out.
1.1 Preparing the code
The project source code is as follows. worker and ps.ipynb:
import os
import json
import tensorflow as tf
tf.get_logger().setLevel('ERROR')  # suppress TF log messages below ERROR
# Cluster setup
tf_config = {
'cluster': {
'worker': ['192.168.1.1:12345', '192.168.1.2:12345'],
'ps': ['192.168.1.3:12345', '192.168.1.4:12345'],
'chief': ['192.168.1.5:12345']
},
'task': {'type': 'worker', 'index': 0},
# 'task': {'type': 'ps', 'index': 0}
}
os.environ.pop('TF_CONFIG', None)
os.environ['TF_CONFIG'] = json.dumps(tf_config)
cluster_resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver()  # cluster resolver
if cluster_resolver.task_type == 'ps':
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # parameter servers run on CPU only, so hide all GPUs
    print('Parameter server detected')
elif cluster_resolver.task_type == 'worker':
    gpu_devices = tf.config.list_physical_devices('GPU')
    if len(gpu_devices) == 0: raise SystemError('GPU device not found')
    for gpu in gpu_devices:
        tf.config.experimental.set_memory_growth(gpu, True)  # workers: enable memory growth on every GPU
    print('Worker detected with GPU(s):', gpu_devices)
else: raise SystemError('Machine in wrong role')
# Allow reporting worker and ps failure to the coordinator
# (setting this environment variable may not be needed in future versions)
os.environ['GRPC_FAIL_FAST'] = 'use_caller'
# Start a TensorFlow server and wait.
server = tf.distribute.Server(
    cluster_resolver.cluster_spec(),
    job_name = cluster_resolver.task_type,
    task_index = cluster_resolver.task_id,
    protocol = cluster_resolver.rpc_layer or 'grpc',  # RPC protocol, defaulting to gRPC
    start = True
)
server.join()
coordinator.ipynb
import os
import json
import h5py
import tensorflow as tf
tf.get_logger().setLevel('ERROR')
%load_ext tensorboard
# Cluster setup
tf_config = {
'cluster': {
'worker': ['192.168.1.1:12345', '192.168.1.2:12345'],
'ps': ['192.168.1.3:12345', '192.168.1.4:12345'],
'chief': ['192.168.1.5:12345']
},
'task': {'type': 'chief', 'index': 0}
}
os.environ.pop('TF_CONFIG', None)
os.environ['TF_CONFIG'] = json.dumps(tf_config)
# Allow reporting worker and ps failure to the coordinator
os.environ['GRPC_FAIL_FAST'] = 'use_caller'
# Instantiate a ParameterServerStrategy
# Specify how variables are partitioned across the parameter servers
variable_partitioner = (
    tf.distribute.experimental.partitioners.MinSizePartitioner(
        min_shard_bytes = (256 << 10),                # each shard is at least 256 KiB
        max_shards = len(tf_config['cluster']['ps'])  # at most one shard per parameter server
    )
)
# Instantiate the ParameterServerStrategy with the resolver and the partitioner
strategy = tf.distribute.experimental.ParameterServerStrategy(
    tf.distribute.cluster_resolver.TFConfigClusterResolver(),  # cluster resolver
    variable_partitioner = variable_partitioner                # variable partitioner defined above
)
strategy
# Path setup
TRAIN_PATH = 'Dataset/Train'
VALIDATE_PATH = 'Dataset/Validate'
TEST_PATH = 'Dataset/Test'
MODEL_PATH = 'Model'
MODEL_CKPT = os.path.join(MODEL_PATH, 'ckpt-{epoch}')
MODEL_TRAINED = os.path.join(MODEL_PATH, 'model.hdf5')
MODEL_BACKUP = os.path.join(MODEL_PATH, 'backup')
# Preparing data
CLASSES = 30
IMAGE_SIZE = (224, 224)
PER_WORKER_BATCH_SIZE = 32
NUM_WORKERS = len(tf_config['cluster']['worker'])
GLOBAL_BATCH_SIZE = PER_WORKER_BATCH_SIZE * NUM_WORKERS
EPOCHS = 3
from tensorflow.keras.preprocessing.image import ImageDataGenerator  # data augmentation to enlarge the dataset and improve generalization
train_generator = ImageDataGenerator(
    rescale = 1./255,          # multiply all pixel values by this factor, scaling them into [0, 1]
    rotation_range = 40,       # random rotation range in degrees
    width_shift_range = 0.2,   # random horizontal shift fraction
    height_shift_range = 0.2,  # random vertical shift fraction
    shear_range = 0.2,         # shear: shift x (or y) coordinates proportionally while the other axis stays fixed
    zoom_range = 0.2,          # random zoom range [1-n, 1+n]
    horizontal_flip = True     # random horizontal flips
)
# Input data
def train_dataset_fn(input_context):
batch_size = input_context.get_per_replica_batch_size(GLOBAL_BATCH_SIZE)
train_dataset = tf.data.Dataset.from_generator(
lambda: train_generator.flow_from_directory(
TRAIN_PATH,
target_size = IMAGE_SIZE,
batch_size = batch_size
),
output_types = (tf.float32, tf.float32),
output_shapes = ([batch_size, *IMAGE_SIZE, 3], [batch_size, CLASSES])
).shard(
input_context.num_input_pipelines,
input_context.input_pipeline_id
).cache()
return train_dataset.prefetch(buffer_size=tf.data.AUTOTUNE)
from tensorflow.keras.utils.experimental import DatasetCreator
train_dataset = DatasetCreator(train_dataset_fn)
num_train = !find {TRAIN_PATH} -type f | wc -l
num_train = int(num_train[0])
print(f'Found {num_train} files')
# Model implement
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Dropout
from tensorflow.keras.models import Model
# Define the model
def build_and_compile_model():
base_model = MobileNetV2(
input_shape = IMAGE_SIZE + (3,),
include_top = False,
weights = None
)
x = preprocess_input(base_model.output)
x = GlobalAveragePooling2D()(x)
x = Dense(512, activation='relu')(x)
x = Dropout(0.2)(x)
outputs = Dense(CLASSES, activation='softmax')(x)
model = Model(inputs=base_model.input, outputs=outputs)
model.compile(
optimizer = 'adam',
loss = 'categorical_crossentropy',
metrics = ['accuracy']
)
return model
# Callbacks
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint
from tensorflow.keras.callbacks import Callback, LearningRateScheduler
from tensorflow.keras.callbacks.experimental import BackupAndRestore
def decay(epoch):
if epoch < 3: return 1e-3
elif epoch >= 3 and epoch < 7: return 1e-4
return 1e-5
# Define a callback for printing the learning rate at the end of each epoch.
class PrintLR(Callback):
    def on_epoch_end(self, epoch, logs=None):
        # use self.model so the callback does not depend on the global `model`
        print(f'\nLearning rate for epoch {epoch + 1} is {self.model.optimizer.lr.numpy()}')
callbacks = [
TensorBoard(log_dir='./logs'),
BackupAndRestore(backup_dir=MODEL_BACKUP),
ModelCheckpoint(filepath=MODEL_CKPT, save_weights_only=True, verbose=1),
LearningRateScheduler(decay),
PrintLR()
]
!rm -rf logs
# Training
with strategy.scope():
model = build_and_compile_model()
history = model.fit(
train_dataset,
epochs = EPOCHS,
steps_per_epoch = num_train // (GLOBAL_BATCH_SIZE * NUM_WORKERS),
# steps_per_epoch = num_train // GLOBAL_BATCH_SIZE,
# callbacks = callbacks,
# verbose = 1, # not allowed with ParameterServerStrategy
)
model.save(MODEL_TRAINED)
%tensorboard --logdir=logs
evaluator.ipynb
import os
import json
import tensorflow as tf
tf.get_logger().setLevel('ERROR')
%load_ext tensorboard
# Cluster setup
tf_config = {
'cluster': {'evaluator': ['192.168.1.6:12345']},
'task': {'type': 'evaluator', 'index': 0}
}
os.environ.pop('TF_CONFIG', None)
os.environ['TF_CONFIG'] = json.dumps(tf_config)
tf.distribute.cluster_resolver.TFConfigClusterResolver()
# Path setup
TRAIN_PATH = 'Dataset/Train'
VALIDATE_PATH = 'Dataset/Validate'
TEST_PATH = 'Dataset/Test'
MODEL_PATH = 'Model'
MODEL_CKPT = os.path.join(MODEL_PATH, 'ckpt-{epoch}')
# Preparing data
CLASSES = 30
IMAGE_SIZE = (224, 224)
GLOBAL_BATCH_SIZE = 64
from tensorflow.keras.preprocessing.image import ImageDataGenerator
validate_generator = ImageDataGenerator(rescale=1./255)
generated_validate_data = validate_generator.flow_from_directory(
VALIDATE_PATH,
target_size = IMAGE_SIZE,
batch_size = GLOBAL_BATCH_SIZE
)
validate_dataset = tf.data.Dataset.from_generator(
lambda: generated_validate_data,
output_types = (tf.float32, tf.float32),
output_shapes = (
[GLOBAL_BATCH_SIZE, *IMAGE_SIZE, 3],
[GLOBAL_BATCH_SIZE, CLASSES]
)
).cache().prefetch(buffer_size=tf.data.AUTOTUNE)
# Define the model
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Dropout
from tensorflow.keras.models import Model
def build_and_compile_model():
base_model = MobileNetV2(
input_shape = IMAGE_SIZE + (3,),
include_top = False,
weights = None
)
x = preprocess_input(base_model.output)
x = GlobalAveragePooling2D()(x)
x = Dense(512, activation='relu')(x)
x = Dropout(0.2)(x)
outputs = Dense(CLASSES, activation='softmax')(x)
model = Model(inputs=base_model.input, outputs=outputs)
model.compile(
optimizer = 'adam',
loss = 'categorical_crossentropy',
metrics = ['accuracy']
)
return model
model = build_and_compile_model()
model.summary()
# Side-car evaluation
from tensorflow.keras.callbacks import TensorBoard  # needed for the TensorBoard callback below
tf.keras.experimental.SidecarEvaluator(
    model = model,
    data = validate_dataset,
    checkpoint_dir = MODEL_PATH,   # directory where training writes its checkpoints
    steps = None,                  # evaluate until the dataset is exhausted
    max_evaluations = None,        # evaluation has to be stopped manually
    callbacks = [TensorBoard(log_dir='./logs')]
).start()
%tensorboard --logdir=logs
1.2 Converting the code
To run the code locally, it has to be converted into .py files.
The main conversions are the following:
- Loading the TensorBoard extension:
%load_ext tensorboard
-> get_ipython().run_line_magic('load_ext', 'tensorboard')
- Deleting the logs:
!rm -rf logs
-> get_ipython().system('rm -rf logs')
- Running TensorBoard:
%tensorboard --logdir=logs
-> get_ipython().run_line_magic('tensorboard', '--logdir=logs')
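These rewrites are exactly what jupyter nbconvert produces; a minimal sketch (assuming jupyter/nbconvert is installed) that converts all three notebooks in one go instead of editing them by hand:
import subprocess

# Convert each notebook to a plain .py script; nbconvert rewrites the
# %/! magics into the get_ipython() calls listed above.
for nb in ['worker and ps.ipynb', 'coordinator.ipynb', 'evaluator.ipynb']:
    subprocess.run(['jupyter', 'nbconvert', '--to', 'script', nb], check=True)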
1.3 Modifying the single-machine code
1.3.1 Comment out the converted magic functions in the notebook-derived scripts
The magic functions converted from the Jupyter notebooks need to be commented out, otherwise running the script directly raises the following error:
Traceback (most recent call last):
File "/home/hqc/container_share/ml-distributed-training-main/single-training/flowers_single.py", line 11, in <module>
get_ipython().run_line_magic('load_ext', 'tensorboard')
NameError: name 'get_ipython' is not defined
1.3.2 get_ipython is still used later in the code
Solution: add from IPython import get_ipython to the code. This will usually fail because IPython cannot be found; just install it with pip install ipython.
That takes care of this problem for now.
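A minimal sketch of how those converted calls can be guarded so the same script also runs under plain python, where get_ipython() returns None (the magics shown are the ones from section 1.2):
try:
    from IPython import get_ipython
except ImportError:              # IPython not installed at all
    get_ipython = lambda: None

ipython = get_ipython()
if ipython is not None:          # only true when running inside IPython/Jupyter
    ipython.run_line_magic('load_ext', 'tensorboard')
    ipython.system('rm -rf logs')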
1.3.3 Rescaling cannot be imported
Error: ImportError: cannot import name 'Rescaling' from 'tensorflow.keras.layers' (/home/hqc/anaconda3/envs/tf/lib/python3.9/site-packages/tensorflow/keras/layers/__init__.py)
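In older TensorFlow releases (roughly 2.3 to 2.5) the Rescaling layer lives under the experimental preprocessing namespace rather than directly in tensorflow.keras.layers, so a version-tolerant import along these lines may work (assuming one of the two paths exists in the installed version):
try:
    from tensorflow.keras.layers import Rescaling  # TF >= 2.6
except ImportError:
    # older TF versions keep it in the experimental preprocessing module
    from tensorflow.keras.layers.experimental.preprocessing import Rescaling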
1.3.4 getoutput error
Traceback (most recent call last):
File "/home/hqc/container_share/ml-distributed-training-main/single-training/flowers_single.py", line 68, in <module>
data_length = get_ipython().getoutput('find {data_root} -name *.jpg | wc -l')
AttributeError: 'NoneType' object has no attribute 'getoutput'
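The root cause is that get_ipython() returns None outside an IPython session, so the shell capture is never available. A plain-Python replacement that counts the files directly (data_root below is a placeholder path, not the value used in the original repo):
import pathlib

data_root = 'Dataset/Train'  # placeholder path for illustration
data_length = len(list(pathlib.Path(data_root).rglob('*.jpg')))
print(f'Found {data_length} files')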
1.3.5 Workaround
There are too many similar import and missing-package problems, which are most likely caused by version mismatches.
Reconfiguring the development environment from scratch would be too much trouble, so I gave up on verifying the single-machine source code and will just focus on understanding the ParameterServerStrategy approach.
1.4 Modifying the ParameterServerStrategy code
1.4.1 Changing the cluster IPs
Set the cluster IPs to match the development environment prepared earlier:
tf_config = {
'cluster': {
'worker': ['172.72.0.4:12345', '172.72.0.5:12345'],
'ps': ['172.72.0.2:12345', '172.72.0.3:12345'],
'chief': ['172.72.0.6:12345']
},
'task': {'type': 'worker', 'index': 0},
# 'task': {'type': 'ps', 'index': 0}
}
os.environ.pop('TF_CONFIG', None)
os.environ['TF_CONFIG'] = json.dumps(tf_config)
Note: the type and index values must also be changed accordingly in the code running on each node (a parameterized sketch follows).
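One way to avoid editing the source on every node is to read the role from environment variables. A minimal sketch using my own TASK_TYPE/TASK_INDEX convention (not part of the original project):
import os
import json

cluster = {
    'worker': ['172.72.0.4:12345', '172.72.0.5:12345'],
    'ps': ['172.72.0.2:12345', '172.72.0.3:12345'],
    'chief': ['172.72.0.6:12345']
}
# Start each node with e.g.: TASK_TYPE=ps TASK_INDEX=1 python worker_and_ps.py
task_type = os.environ.get('TASK_TYPE', 'worker')
task_index = int(os.environ.get('TASK_INDEX', '0'))
os.environ['TF_CONFIG'] = json.dumps({
    'cluster': cluster,
    'task': {'type': task_type, 'index': task_index}
})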
1.4.2 Changing the GPU settings
The code on every node should make only one GPU visible, because the PS strategy requires all nodes to have the same number of GPUs.
Otherwise the coordinator raises: NotImplementedError: Multi-gpu is not supported yet.
The directive to expose a single GPU is os.environ['CUDA_VISIBLE_DEVICES'] = '0'; just add it to each node's code (a sketch follows).
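A minimal sketch of that setting; the GPU index comes from a hypothetical GPU_INDEX environment variable so each node can pin a different card, and it has to run before TensorFlow is imported:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = os.environ.get('GPU_INDEX', '0')  # expose exactly one GPU

import tensorflow as tf
print(tf.config.list_physical_devices('GPU'))  # should now list a single GPU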
However, this machine only has 2 GPUs, which is not enough for 5 nodes, so in practice the cluster code cannot actually run here.
The way forward is to rent GPUs from a cloud provider for the experiments.
2 How to adapt code to ParameterServerStrategy
2.1 worker and ps
The code running on the worker and ps nodes is largely independent of the training logic; these nodes essentially just contribute compute to the training job. Their code is mostly configuration:
- Part 1: set the cluster IPs so the nodes can discover and join the cluster
- Part 2: the cluster resolver, which tells each node whether it is a worker or a ps
- Part 3: configure GPU usage differently depending on the resolved role
- Part 4: create a server and wait for instructions from the coordinator
2.2 coordinator
The coordinator is what actually coordinates the cluster and dispatches the work, and the main body of its code implements the model logic.
The main differences from MultiWorkerMirroredStrategy are:
- It does not seem to configure any GPU. Why? I don't fully understand this yet.
- The cluster contains more nodes, and every node's IP has to be listed; the coordinator's task type must be set to chief.
- When initializing MultiWorkerMirroredStrategy for multi-machine multi-GPU training, the arguments include a communication option (RING) and data sharding is automatic; when initializing ParameterServerStrategy, a variable partitioner must be defined first for manual sharding, and the arguments are the cluster resolver plus that partitioner (see the sketch after this list).
- On the data-preparation side, only the training set gets data augmentation.
- Datasets must be created with DatasetCreator, and the training-set pipeline has to be wrapped in a function.
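A rough side-by-side sketch of the two initializations compared above (the parameter values are illustrative rather than taken from the original repo, and both assume TF_CONFIG has already been exported):
import tensorflow as tf

# MultiWorkerMirroredStrategy: takes a communication option (RING here);
# data sharding across workers is automatic.
mwms = tf.distribute.MultiWorkerMirroredStrategy(
    communication_options=tf.distribute.experimental.CommunicationOptions(
        implementation=tf.distribute.experimental.CommunicationImplementation.RING
    )
)

# ParameterServerStrategy: needs a cluster resolver plus an explicitly
# defined variable partitioner for manual variable sharding.
resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver()
partitioner = tf.distribute.experimental.partitioners.MinSizePartitioner(
    min_shard_bytes=256 << 10,  # at least 256 KiB per shard
    max_shards=2                # at most one shard per parameter server (2 ps here)
)
pss = tf.distribute.experimental.ParameterServerStrategy(
    resolver,
    variable_partitioner=partitioner
)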