Introduction to Using the Deep Learning Frameworks
The Python runtime environment for each deep learning framework is loaded as follows:
// Load the TensorFlow runtime environment
source /public/DeepLearning/TensorFlow/tensorflow-env.sh
// Load the PyTorch runtime environment
source /public/DeepLearning/PyTorch/pytorch-env.sh
// Load the MxNet runtime environment
source /public/DeepLearning/MxNet/mxnet-env.sh
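After sourcing one of these scripts, it is worth confirming that the intended framework is actually importable before submitting jobs. The following check is a minimal sketch (a hypothetical helper, not a cluster-provided tool); save it as, say, check_env.py and run it with python3:

# check_env.py - hypothetical helper: reports which frameworks the
# currently sourced environment provides, and their versions.
import importlib

for name in ("tensorflow", "torch", "mxnet"):
    try:
        mod = importlib.import_module(name)
        print(name, mod.__version__)
    except ImportError:
        print(name, "is not available in this environment")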
Single-node and distributed run examples for each of the three frameworks are given below:
TensorFlow:
// Download the benchmark script:
// https://github.com/horovod/horovod/blob/master/examples/tensorflow_synthetic_benchmark.py

// Single node, single card
python3 tensorflow_synthetic_benchmark.py --model=resnet50 --batch-size=128 --num-iters=500

// Single node, multiple cards: wrap the program in a per-rank launcher script
cat single_process.sh
#!/bin/bash
# Bind each MPI rank to its own NUMA node and InfiniBand device.
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
APP="python3 tensorflow_synthetic_benchmark.py --model=resnet50 --batch-size=128 --num-iters=500"
case ${lrank} in
[0])
  export HIP_VISIBLE_DEVICES=0,1,2,3
  export UCX_NET_DEVICES=mlx5_0:1
  export UCX_IB_PCI_BW=mlx5_0:50Gbs
  numactl --cpunodebind=0 --membind=0 ${APP}
  ;;
[1])
  export HIP_VISIBLE_DEVICES=0,1,2,3
  export UCX_NET_DEVICES=mlx5_1:1
  export UCX_IB_PCI_BW=mlx5_1:50Gbs
  numactl --cpunodebind=1 --membind=1 ${APP}
  ;;
[2])
  export HIP_VISIBLE_DEVICES=0,1,2,3
  export UCX_NET_DEVICES=mlx5_2:1
  export UCX_IB_PCI_BW=mlx5_2:50Gbs
  numactl --cpunodebind=2 --membind=2 ${APP}
  ;;
[3])
  export HIP_VISIBLE_DEVICES=0,1,2,3
  export UCX_NET_DEVICES=mlx5_3:1
  export UCX_IB_PCI_BW=mlx5_3:50Gbs
  numactl --cpunodebind=3 --membind=3 ${APP}
  ;;
esac

mpirun -np 4 ./single_process.sh

// Multiple nodes, multiple cards, submitted through the scheduler
#!/bin/bash
#SBATCH -p debug
#SBATCH -N 16
#SBATCH -J xuan-tf
#SBATCH -n 512
#SBATCH --gres=加速卡:4

hostfile=./$SLURM_JOB_ID
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
num_node=0
for i in `cat ./${hostfile}`
do
  echo ${i} slots=4 >> ./hostfile-dl-$SLURM_JOB_ID
  ((num_node=${num_node}+1))
done
echo "resnet50 node is " ${num_node}
((num_加速卡=${num_node}*4))
mpirun -np ${num_加速卡} -hostfile ./hostfile-dl-$SLURM_JOB_ID ./single_process.sh
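The benchmark is a Horovod program: mpirun starts one process per card, and Horovod handles rank setup and gradient averaging. The sketch below shows the core horovod.tensorflow pattern the benchmark relies on (TF1-style API; the optimizer is a placeholder, not the benchmark's actual model code):

# Sketch of the Horovod pattern behind tensorflow_synthetic_benchmark.py:
# one process per card; the local rank selects the device.
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()                                      # one Horovod rank per mpirun process
config = tf.ConfigProto()
# With HIP_VISIBLE_DEVICES=0,1,2,3 exported for every rank, the local rank
# picks this process's own card.
config.gpu_options.visible_device_list = str(hvd.local_rank())

opt = tf.train.GradientDescentOptimizer(0.01)   # placeholder optimizer
opt = hvd.DistributedOptimizer(opt)             # all-reduce gradients across ranks
hooks = [hvd.BroadcastGlobalVariablesHook(0)]   # rank 0 broadcasts initial weights

This is why the same single_process.sh works unchanged for the multi-node job: each rank only needs its local rank and the MPI world that mpirun sets up.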
PyTorch:
// Download the example program
// https://github.com/pytorch/examples/blob/master/imagenet/main.py
/* usage: main.py [-h] [-a ARCH] [-j N] [--epochs N] [--start-epoch N] [-b N]
          [--lr LR] [--momentum M] [--wd W] [-p N] [--resume PATH] [-e]
          [--pretrained] [--world-size WORLD_SIZE] [--rank RANK]
          [--dist-url DIST_URL] [--dist-backend DIST_BACKEND] [--seed SEED]
          [--gpu GPU] [--multiprocessing-distributed]
          DIR */

// Single node, single card
python3 main.py \
    --batch-size=32 \
    --arch=resnet50 \
    --workers 6 \
    --epochs=1 \
    --gpu=0 \
    /imagenet/

// Single node, multiple cards (1)
python3 main.py \
    --batch-size=128 \
    --arch=resnet50 \
    --workers 24 \
    --epochs=1 \
    /imagenet/

// Single node, multiple cards (2): one process per card via MPI
cat single_process.sh
#!/bin/bash
export GLOO_SOCKET_IFNAME=ib0,ib1,ib2,ib3
export MIOPEN_USER_DB_PATH=/tmp/pytorch-miopen-2.8
export HSA_USERPTR_FOR_PAGED_MEM=0
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
APP="python3 main.py --batch-size=32 --arch=resnet50 -j 6 --epochs=1 --dist-url tcp://${1}:34567 --dist-backend gloo --world-size=${comm_size} --rank=${comm_rank} /imagenet/"
case ${lrank} in
[0])
  export HIP_VISIBLE_DEVICES=0
  export UCX_NET_DEVICES=mlx5_0:1
  export UCX_IB_PCI_BW=mlx5_0:50Gbs
  GLOO_SOCKET_IFNAME=ib0,ib1,ib2,ib3 numactl --cpunodebind=0 --membind=0 ${APP}
  ;;
[1])
  export HIP_VISIBLE_DEVICES=1
  export UCX_NET_DEVICES=mlx5_1:1
  export UCX_IB_PCI_BW=mlx5_1:50Gbs
  GLOO_SOCKET_IFNAME=ib0,ib1,ib2,ib3 numactl --cpunodebind=1 --membind=1 ${APP}
  ;;
[2])
  export HIP_VISIBLE_DEVICES=2
  export UCX_NET_DEVICES=mlx5_2:1
  export UCX_IB_PCI_BW=mlx5_2:50Gbs
  GLOO_SOCKET_IFNAME=ib0,ib1,ib2,ib3 numactl --cpunodebind=2 --membind=2 ${APP}
  ;;
[3])
  export HIP_VISIBLE_DEVICES=3
  export UCX_NET_DEVICES=mlx5_3:1
  export UCX_IB_PCI_BW=mlx5_3:50Gbs
  GLOO_SOCKET_IFNAME=ib0,ib1,ib2,ib3 numactl --cpunodebind=3 --membind=3 ${APP}
  ;;
esac

// Single node, four cards ($dist_url is the IP address of the current node)
mpirun -np 4 `pwd`/single_process.sh $dist_url

// Multiple nodes, multiple cards
#!/bin/bash
#SBATCH -p debug
#SBATCH -N 2
#SBATCH -J xuan-pytorch
#SBATCH -n 64

which mpirun
hostfile=./$SLURM_JOB_ID
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
num_node=$(cat $hostfile | sort | uniq | wc -l)
num_加速卡=$(($num_node*4))
# Use the first node in the allocation as the rendezvous address.
nodename=$(cat $hostfile | sed -n "1p")
dist_url=`echo $nodename | awk '{print $1}'`
rm -f `pwd`/hostfile-xuan
cat $hostfile | sort | uniq > `pwd`/tmp
for i in `cat ./tmp`
do
  echo ${i} slots=4 >> `pwd`/hostfile-xuan
done
mpirun -np $num_加速卡 --allow-run-as-root -hostfile `pwd`/hostfile-xuan `pwd`/single_process.sh $dist_url
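Behind the command-line flags, main.py joins the ranks together through torch.distributed. The following is a minimal sketch of that initialization, using the gloo backend and the tcp:// rendezvous address that single_process.sh passes in (the address and the environment-variable plumbing here are illustrative, not main.py's exact code):

# Sketch of the torch.distributed setup that main.py performs with
# --dist-url / --dist-backend / --world-size / --rank.
import os
import torch
import torch.distributed as dist
import torchvision.models as models

rank = int(os.environ["OMPI_COMM_WORLD_RANK"])        # provided by mpirun
world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"])

dist.init_process_group(backend="gloo",               # as in --dist-backend gloo
                        init_method="tcp://10.11.7.51:34567",  # placeholder master address
                        world_size=world_size,
                        rank=rank)

model = models.resnet50()
# HIP_VISIBLE_DEVICES pins one card per process, so device 0 is always
# this process's own card.
model = model.cuda(0)
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[0])

Every process must reach the same tcp:// address, which is why the batch script extracts the first node's name into dist_url and forwards it to single_process.sh.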
MxNet:
// Download the example programs
// https://github.com/apache/incubator-mxnet/blob/v1.4.x/example/image-classification

// Single node, single card
python3 train_imagenet.py \
    --benchmark 1 \
    --gpus 0 \
    --network inception-v3 \
    --batch-size 64 \
    --image-shape 3,299,299 \
    --num-epochs 10 \
    --kv-store device

// Single node, multiple cards
python3 train_imagenet.py \
    --benchmark 1 \
    --gpus 0,1,2,3 \
    --network inception-v3 \
    --batch-size 64 \
    --image-shape 3,299,299 \
    --num-epochs 10 \
    --kv-store device

// Multiple nodes, multiple cards (ps-worker mode)
#!/bin/bash
#SBATCH -J mxnet
#SBATCH -p dl
#SBATCH -N 2
#SBATCH -n 64
#SBATCH --gres=加速卡:4

hostfile=./$SLURM_JOB_ID
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
num_node=0
for i in `cat ./${hostfile}`
do
  # launch.py takes a host list of IP addresses, one per line
  gethostip ${i} | awk '{print $2}' >> ./mxnet-$SLURM_JOB_ID
  ((num_node=${num_node}+1))
done

source /public/DeepLearning/MxNet/mxnet-env.sh
which python3
python3 ../tools/launch.py \
    -n ${num_node} -s 2 -H mxnet-$SLURM_JOB_ID \
    --sync-dst-dir ../example/distributed_training/ \
    --launcher ssh \
    "source /public/DeepLearning/MxNet/mxnet-env.sh; python3 cifar10_dist.py \
    --network resnet \
    --num-layers 110 \
    --batch-size 128 \
    --kv-store dist_device_sync"

/* The generated host file contains one IP address per node, e.g.:
cat hosts
10.11.7.51
10.11.7.53 */
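Unlike the Horovod and torch.distributed examples, MxNet distributes through a parameter-server kvstore: launch.py starts the -s server processes plus one worker per host in the list, and the training script only has to create the matching store. A minimal sketch using the public mxnet API (the print is a toy illustration; cifar10_dist.py's actual training loop is more involved):

# Sketch of the ps-worker pattern behind cifar10_dist.py. This only works
# when started through launch.py, which sets the DMLC_* server/worker
# environment variables; run standalone it would block waiting for servers.
import mxnet as mx

kv = mx.kvstore.create("dist_device_sync")  # synchronous updates, aggregated on device
print("worker", kv.rank, "of", kv.num_workers)
# Each worker pushes gradients to the servers and pulls back aggregated
# weights every step; Module/Trainer does this automatically when given kv.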