基于前面的基础知识积累,废话不多说,直接实战~

1 项目实践

参考GitHub上的开源项目。前文使用Docker准备了5个相关开发环境,刚好来试试。

1.1 代码准备

项目源代码如下:
worker and ps.ipynb

import os
import json
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # 记录日志

# Cluster setup: list every node of the cluster, then declare this node's role.
tf_config = {
    'cluster': {
        'worker': ['192.168.1.1:12345', '192.168.1.2:12345'],
        'ps': ['192.168.1.3:12345', '192.168.1.4:12345'],
        'chief': ['192.168.1.5:12345'],
    },
}
# On a parameter-server node use {'type': 'ps', 'index': 0} instead.
tf_config['task'] = {'type': 'worker', 'index': 0}
# Plain assignment overwrites any TF_CONFIG value already in the environment.
os.environ['TF_CONFIG'] = json.dumps(tf_config)

# Resolve this node's role from TF_CONFIG and set up its devices accordingly.
cluster_resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver()

# Only 'ps' and 'worker' nodes are expected to run this script.
if cluster_resolver.task_type not in ('ps', 'worker'):
    raise SystemError('Machine in wrong role')

if cluster_resolver.task_type == 'ps':
    # '-1' hides every CUDA device from this process, so a parameter server
    # does not use any GPU.
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
    print('Parameter server detected')
else:
    gpu_devices = tf.config.list_physical_devices('GPU')
    if not gpu_devices:
        raise SystemError('GPU device not found')
    # Workers enable on-demand memory growth on every GPU they can see.
    for gpu in gpu_devices:
        tf.config.experimental.set_memory_growth(gpu, True)
    print('Worker detected with GPU(s):', gpu_devices)

# Allow reporting worker and ps failure to the coordinator
# by setting this environment variable before the server is created.
# (Per the original note, this setting may become unnecessary in the future.)
os.environ['GRPC_FAIL_FAST'] = 'use_caller'

# Start a TensorFlow server and wait.
# The server is built from the resolved cluster spec and this node's own
# job name / task index, started immediately, then joined.
server = tf.distribute.Server(
    cluster_resolver.cluster_spec(),
    job_name = cluster_resolver.task_type,
    task_index = cluster_resolver.task_id,
    protocol = cluster_resolver.rpc_layer or 'grpc', # falls back to 'grpc' when rpc_layer is empty
    start = True
)
server.join()

coordinator.ipynb

import os
import json
import h5py
import tensorflow as tf
tf.get_logger().setLevel('ERROR')
%load_ext tensorboard

# Cluster setup: same cluster layout as the worker/ps nodes, with this node
# declared as the chief (the coordinator).
tf_config = {
    'cluster': {
        'worker': ['192.168.1.1:12345', '192.168.1.2:12345'],
        'ps': ['192.168.1.3:12345', '192.168.1.4:12345'],
        'chief': ['192.168.1.5:12345'],
    },
}
tf_config['task'] = {'type': 'chief', 'index': 0}
# Plain assignment overwrites any TF_CONFIG value already in the environment.
os.environ['TF_CONFIG'] = json.dumps(tf_config)

# Allow reporting worker and ps failure to the coordinator
os.environ['GRPC_FAIL_FAST'] = 'use_caller'

# Instantiate a ParameterServerStrategy
# The partitioner below controls how variables are sharded.
variable_partitioner = (
    tf.distribute.experimental.partitioners.MinSizePartitioner(
        min_shard_bytes = (256 << 10), # at least 256 KiB (262144 bytes) per shard
        max_shards = len(tf_config['cluster']['ps']) # no more shards than there are ps tasks
    )
)
# Create the strategy from a TF_CONFIG-based resolver and the partitioner.
strategy = tf.distribute.experimental.ParameterServerStrategy(
    tf.distribute.cluster_resolver.TFConfigClusterResolver(), # cluster resolver
    variable_partitioner = variable_partitioner # variable partitioner
)
# Bare expression: displays the strategy object as notebook cell output.
strategy

# Path setup
# Dataset directories (relative to the working directory).
TRAIN_PATH = 'Dataset/Train'
VALIDATE_PATH = 'Dataset/Validate'
TEST_PATH = 'Dataset/Test'

# Model artefacts all live under MODEL_PATH.
MODEL_PATH = 'Model'
MODEL_CKPT = os.path.join(MODEL_PATH, 'ckpt-{epoch}')  # checkpoint path template with an '{epoch}' placeholder
MODEL_TRAINED = os.path.join(MODEL_PATH, 'model.hdf5')  # final saved model
MODEL_BACKUP = os.path.join(MODEL_PATH, 'backup')  # directory handed to BackupAndRestore

# Preparing data
CLASSES = 30  # size of the softmax output / one-hot label vector
IMAGE_SIZE = (224, 224)  # images are resized to this (target_size)
PER_WORKER_BATCH_SIZE = 32
NUM_WORKERS = len(tf_config['cluster']['worker'])  # worker count taken from the cluster spec
GLOBAL_BATCH_SIZE = PER_WORKER_BATCH_SIZE * NUM_WORKERS
EPOCHS = 3

from tensorflow.keras.preprocessing.image import ImageDataGenerator # ImageDataGenerator applies random augmentation to the training images to improve generalisation
train_generator = ImageDataGenerator(
    rescale = 1./255, # every pixel value is multiplied by this factor
    rotation_range = 40,  # range (in degrees) for random rotations
    width_shift_range = 0.2,  # random horizontal shift
    height_shift_range = 0.2,  # random vertical shift
    shear_range = 0.2, # random shear intensity
    zoom_range = 0.2, # random zoom within [1-n, 1+n]
    horizontal_flip = True # randomly flip images horizontally
)

# Input data
def train_dataset_fn(input_context):
    """Build the augmented training dataset for one input pipeline.

    Intended to be wrapped in a ``DatasetCreator``, which calls it with the
    input context of the pipeline being built.

    Args:
        input_context: object providing ``get_per_replica_batch_size``,
            ``num_input_pipelines`` and ``input_pipeline_id`` (expected to be
            a ``tf.distribute.InputContext``).

    Returns:
        A ``tf.data.Dataset`` of ``(images, one_hot_labels)`` float32 batches,
        sharded for this pipeline and prefetched.
    """
    batch_size = input_context.get_per_replica_batch_size(GLOBAL_BATCH_SIZE)
    train_dataset = tf.data.Dataset.from_generator(
        lambda: train_generator.flow_from_directory(
            TRAIN_PATH,
            target_size = IMAGE_SIZE,
            batch_size = batch_size
        ),
        output_types = (tf.float32, tf.float32),
        # The batch axis is left unspecified: the directory iterator emits a
        # shorter batch at the end of each pass whenever the number of images
        # is not a multiple of batch_size, which a fixed shape would reject.
        output_shapes = ([None, *IMAGE_SIZE, 3], [None, CLASSES])
    ).shard(
        input_context.num_input_pipelines,
        input_context.input_pipeline_id
    )
    # Deliberately no .cache(): the generator loops over the directory
    # forever, so an in-memory cache would grow without bound, and cached
    # batches would no longer be freshly augmented.
    return train_dataset.prefetch(buffer_size=tf.data.AUTOTUNE)

from tensorflow.keras.utils.experimental import DatasetCreator
# Keras calls train_dataset_fn through this creator when fit() is given it.
train_dataset = DatasetCreator(train_dataset_fn)
# Count the files under TRAIN_PATH recursively in pure Python, so the cell
# also works outside IPython and with paths that contain spaces.
num_train = sum(len(filenames) for _, _, filenames in os.walk(TRAIN_PATH))
print(f'Found {num_train} files')

# Model implement
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Dropout
from tensorflow.keras.models import Model

# Define the model
def build_and_compile_model():
    """Create and compile the MobileNetV2-based image classifier.

    The network is a MobileNetV2 backbone without its top and without loaded
    weights (``weights=None``), followed by global average pooling, a
    512-unit ReLU layer, 20% dropout and a ``CLASSES``-way softmax. It is
    compiled with the 'adam' optimizer, categorical cross-entropy loss and
    the accuracy metric.

    Returns:
        The compiled Keras ``Model``.
    """
    backbone = MobileNetV2(
        weights=None,
        include_top=False,
        input_shape=IMAGE_SIZE + (3,),
    )

    # NOTE(review): preprocess_input is applied to the backbone's output
    # feature map, not to the input images -- confirm this is intended.
    features = preprocess_input(backbone.output)
    pooled = GlobalAveragePooling2D()(features)
    hidden = Dense(512, activation='relu')(pooled)
    regularised = Dropout(0.2)(hidden)
    predictions = Dense(CLASSES, activation='softmax')(regularised)

    classifier = Model(inputs=backbone.input, outputs=predictions)
    classifier.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return classifier

# Callbacks
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint 
from tensorflow.keras.callbacks import Callback, LearningRateScheduler
from tensorflow.keras.callbacks.experimental import BackupAndRestore

def decay(epoch):
    """Return the step-wise learning rate for the given epoch index.

    Epochs below 3 use 1e-3, epochs from 3 up to (but excluding) 7 use 1e-4,
    and every later epoch uses 1e-5.
    """
    # (exclusive upper bound, learning rate) pairs, checked in ascending order.
    schedule = ((3, 1e-3), (7, 1e-4))
    for upper_bound, rate in schedule:
        if epoch < upper_bound:
            return rate
    return 1e-5

# Define a callback for printing the learning rate at the end of each epoch.
class PrintLR(Callback):
    """Keras callback that prints the optimizer's learning rate after each epoch."""

    def on_epoch_end(self, epoch, logs=None):
        """Print the learning rate at the end of an epoch.

        Args:
            epoch: index of the epoch that just ended; printed as ``epoch + 1``.
            logs: unused; accepted to match the callback signature.
        """
        # Read the optimizer from the model this callback is attached to
        # (self.model) instead of relying on a module-level `model` variable
        # that may not exist or may refer to a different model.
        print(f'\nLearning rate for epoch {epoch + 1} is {self.model.optimizer.lr.numpy()}')

# Callback list prepared for model.fit.
callbacks = [
    TensorBoard(log_dir='./logs'),  # writes logs under ./logs
    BackupAndRestore(backup_dir=MODEL_BACKUP),  # keeps its backup state in MODEL_BACKUP
    ModelCheckpoint(filepath=MODEL_CKPT, save_weights_only=True, verbose=1),  # weights-only checkpoints at the MODEL_CKPT template path
    LearningRateScheduler(decay),  # learning rate supplied by decay()
    PrintLR()  # prints the learning rate at the end of each epoch
]
# IPython shell escape: delete the logs directory left by earlier runs.
!rm -rf logs

# Training
with strategy.scope(): 
    model = build_and_compile_model()

history = model.fit(
    train_dataset,
    epochs = EPOCHS,
    steps_per_epoch = num_train // (GLOBAL_BATCH_SIZE * NUM_WORKERS),
    # steps_per_epoch = num_train // GLOBAL_BATCH_SIZE,
    # callbacks = callbacks,
    # verbose = 1, # not allowed with ParameterServerStrategy
)
model.save(MODEL_TRAINED)

model.save(MODEL_TRAINED)
%tensorboard --logdir=logs

evaluator.ipynb

import os
import json
import tensorflow as tf
tf.get_logger().setLevel('ERROR')
%load_ext tensorboard

# Cluster setup: the evaluator is described as a one-node cluster of its own.
tf_config = {
    'cluster': {'evaluator': ['192.168.1.6:12345']},
}
tf_config['task'] = {'type': 'evaluator', 'index': 0}
# Plain assignment overwrites any TF_CONFIG value already in the environment.
os.environ['TF_CONFIG'] = json.dumps(tf_config)
# Bare expression: builds a resolver from TF_CONFIG; the object is not stored,
# so it only appears as notebook cell output.
tf.distribute.cluster_resolver.TFConfigClusterResolver()

# Path setup
# Dataset directories (relative to the working directory).
TRAIN_PATH = 'Dataset/Train'
VALIDATE_PATH = 'Dataset/Validate'
TEST_PATH = 'Dataset/Test'

MODEL_PATH = 'Model'
MODEL_CKPT = os.path.join(MODEL_PATH, 'ckpt-{epoch}')  # checkpoint path template with an '{epoch}' placeholder

# Preparing data
CLASSES = 30  # size of the softmax output / one-hot label vector
IMAGE_SIZE = (224, 224)  # images are resized to this (target_size)
GLOBAL_BATCH_SIZE = 64  # batch size of the validation iterator

from tensorflow.keras.preprocessing.image import ImageDataGenerator
# Validation images are only rescaled (multiplied by 1/255); no augmentation.
validate_generator = ImageDataGenerator(rescale=1./255)
generated_validate_data = validate_generator.flow_from_directory(
    VALIDATE_PATH,
    target_size = IMAGE_SIZE,
    batch_size = GLOBAL_BATCH_SIZE
)


def _validate_batches():
    """Yield every validation batch exactly once.

    Iterating the directory iterator directly never stops (it loops over the
    data forever), which would make an "evaluate until the dataset is
    exhausted" run endless. Indexing it over its length gives one finite pass.
    """
    for batch_index in range(len(generated_validate_data)):
        yield generated_validate_data[batch_index]


validate_dataset = tf.data.Dataset.from_generator(
    _validate_batches,
    output_types = (tf.float32, tf.float32),
    # The batch axis is left unspecified because the last batch of the pass
    # is shorter when the image count is not a multiple of GLOBAL_BATCH_SIZE.
    output_shapes = (
        [None, *IMAGE_SIZE, 3],
        [None, CLASSES]
    )
).cache().prefetch(buffer_size=tf.data.AUTOTUNE)

# Define the model
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Dropout
from tensorflow.keras.models import Model

def build_and_compile_model():
    """Create and compile the MobileNetV2-based image classifier.

    The network is a MobileNetV2 backbone without its top and without loaded
    weights (``weights=None``), followed by global average pooling, a
    512-unit ReLU layer, 20% dropout and a ``CLASSES``-way softmax. It is
    compiled with the 'adam' optimizer, categorical cross-entropy loss and
    the accuracy metric.

    Returns:
        The compiled Keras ``Model``.
    """
    backbone = MobileNetV2(
        weights=None,
        include_top=False,
        input_shape=IMAGE_SIZE + (3,),
    )

    # NOTE(review): preprocess_input is applied to the backbone's output
    # feature map, not to the input images -- confirm this is intended.
    features = preprocess_input(backbone.output)
    pooled = GlobalAveragePooling2D()(features)
    hidden = Dense(512, activation='relu')(pooled)
    regularised = Dropout(0.2)(hidden)
    predictions = Dense(CLASSES, activation='softmax')(regularised)

    classifier = Model(inputs=backbone.input, outputs=predictions)
    classifier.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return classifier

model = build_and_compile_model()
model.summary()

# Side-car evaluation
tf.keras.experimental.SidecarEvaluator(
    model = model,
    data = validate_dataset,
    checkpoint_dir = MODEL_CKPT, # Dir for training-saved checkpoint
    steps = None, # Evaluate until dataset is exhausted
    max_evaluations = None, # The evaluation needs to be stopped manually
    callbacks = [TensorBoard(log_dir='./logs')]
).start()
%tensorboard --logdir=logs

1.2 转换代码

在本地运行代码需要转换为.py文件

主要是以下转换:

  1. 导入tensorboard:%load_ext tensorboard → get_ipython().run_line_magic('load_ext', 'tensorboard')
  2. 删除日志:!rm -rf logs → get_ipython().system('rm -rf logs')
  3. 运行tensorboard:%tensorboard --logdir=logs → get_ipython().run_line_magic('tensorboard', '--logdir=logs')

1.3 修改单机代码

1.3.1 注释掉jupyter文件中转换后的魔法函数

需要注释掉jupyter文件中转换后的魔法函数,否则直接运行会出现以下错误

Traceback (most recent call last):
  File "/home/hqc/container_share/ml-distributed-training-main/single-training/flowers_single.py", line 11, in <module>
    get_ipython().run_line_magic('load_ext', 'tensorboard')
NameError: name 'get_ipython' is not defined

1.3.2 但后续代码中仍然会用到get_ipython

解决:
在代码中加入 from IPython import get_ipython。一般会报错找不到IPython,直接安装即可:pip install ipython

这个问题暂时解决

1.3.3 Rescaling无法导入

报错:
ImportError: cannot import name 'Rescaling' from 'tensorflow.keras.layers' (/home/hqc/anaconda3/envs/tf/lib/python3.9/site-packages/tensorflow/keras/layers/__init__.py)

1.3.4 getoutput报错

Traceback (most recent call last):
  File "/home/hqc/container_share/ml-distributed-training-main/single-training/flowers_single.py", line 68, in <module>
    data_length = get_ipython().getoutput('find {data_root} -name *.jpg | wc -l')
AttributeError: 'NoneType' object has no attribute 'getoutput'

1.3.5 解决方法

太多类似的import问题和找不到包的问题,很可能是版本的问题。
但不可能重新配置开发环境了,太过于麻烦了,因此放弃运行单机的源码验证,学习ParameterServerStrategy的思路即可。

1.4 修改ParameterServerStrategy代码

1.4.1 集群IP修改

针对提供的开发环境,设置集群IP

# Cluster addresses adapted to the prepared development environment.
tf_config = {
    'cluster': {
        'worker': ['172.72.0.4:12345', '172.72.0.5:12345'],
        'ps': ['172.72.0.2:12345', '172.72.0.3:12345'],
        'chief': ['172.72.0.6:12345'],
    },
}
# On a parameter-server node use {'type': 'ps', 'index': 0} instead.
tf_config['task'] = {'type': 'worker', 'index': 0}
# Plain assignment overwrites any TF_CONFIG value already in the environment.
os.environ['TF_CONFIG'] = json.dumps(tf_config)

注意:不同节点上运行的代码,task中type和index的值也应相应地修改。

1.4.2 GPU设置修改

每个节点上的代码都应设置只有一个GPU可见,因为ps策略要求所有节点上的GPU数目相同。
否则coordinator将会报以下错误:NotImplementedError: Multi-gpu is not supported yet.

设置单个GPU可见的指令:os.environ['CUDA_VISIBLE_DEVICES']='0',加到代码中即可
但由于本机上只有2块GPU,不够5台节点分,因此实际上这些集群代码无法运行起来。

解决方法是:转向云服务器租用GPU进行实验。

2 学习如何修改为ParameterServerStrategy

2.1 worker and ps

worker和ps节点运行的代码和整体需要实现的逻辑没什么关联,感觉就是提供了一个有强大算力的机器以供整个模型训练使用,代码里也主要就是一些设置,包括:

  1. 第一部分:设置集群IP,让集群可以发现并使用
  2. 第二部分:集群IP解析器,让集群区分是worker还是ps
  3. 第三部分:区分之后对应节点功能不同设置GPU的调用
  4. 第四部分:创建一个server,并等待接收coordinator的指令

2.2 coordinator

主要就是依靠coordinator来进行协调联系,发送指令,代码主体需要实现模型逻辑。
与MultiWorkerMirroredStrategy的区别主要包括:

  1. 好像没设置GPU,为什么呢?暂时还不明白。
  2. 集群内的节点增多,对应IP都得加上;coordinator的task中的类型得指定为chief
  3. 多机多卡初始化MultiWorkerMirroredStrategy时,参数包括一个通信选项(RING),并且采用的是自动分片策略;而ps策略初始化ParameterServerStrategy时,需要先定义一个变量分片器手动分片,并且参数包括集群解析器和前面定义的变量分片器。
  4. 数据准备方面只做了训练集的数据增强
  5. 创建数据集得使用DatasetCreator,训练集的操作须被封装在一个函数里边。