Introduction to Using the Deep Learning Frameworks
The Python runtime environment for each deep learning framework is loaded as follows:
// Load the TensorFlow runtime environment
source /public/DeepLearning/TensorFlow/tensorflow-env.sh
// Load the PyTorch runtime environment
source /public/DeepLearning/PyTorch/pytorch-env.sh
// Load the MxNet runtime environment
source /public/DeepLearning/MxNet/mxnet-env.sh
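After sourcing one of these scripts, it is worth confirming that the intended framework is actually importable before submitting jobs. The following check is a minimal sketch (a hypothetical helper, not a cluster-provided tool); save it as, say, check_env.py and run it with python3:

# check_env.py - hypothetical helper: reports which frameworks the
# currently sourced environment provides, and their versions.
import importlib

for name in ("tensorflow", "torch", "mxnet"):
    try:
        mod = importlib.import_module(name)
        print(name, mod.__version__)
    except ImportError:
        print(name, "is not available in this environment")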
Single-node and distributed run examples for each of the three frameworks are given below:
TensorFlow:
// Download the benchmark script:
// https://github.com/horovod/horovod/blob/master/examples/tensorflow_synthetic_benchmark.py

// Single node, single card
python3 tensorflow_synthetic_benchmark.py --model=resnet50 --batch-size=128 --num-iters=500

// Single node, multiple cards: wrap the program in a per-rank launcher script
cat single_process.sh
#!/bin/bash
# Bind each MPI rank to its own NUMA node and InfiniBand device.
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
APP="python3 tensorflow_synthetic_benchmark.py --model=resnet50 --batch-size=128 --num-iters=500"
case ${lrank} in
[0])
  export HIP_VISIBLE_DEVICES=0,1,2,3
  export UCX_NET_DEVICES=mlx5_0:1
  export UCX_IB_PCI_BW=mlx5_0:50Gbs
  numactl --cpunodebind=0 --membind=0 ${APP}
  ;;
[1])
  export HIP_VISIBLE_DEVICES=0,1,2,3
  export UCX_NET_DEVICES=mlx5_1:1
  export UCX_IB_PCI_BW=mlx5_1:50Gbs
  numactl --cpunodebind=1 --membind=1 ${APP}
  ;;
[2])
  export HIP_VISIBLE_DEVICES=0,1,2,3
  export UCX_NET_DEVICES=mlx5_2:1
  export UCX_IB_PCI_BW=mlx5_2:50Gbs
  numactl --cpunodebind=2 --membind=2 ${APP}
  ;;
[3])
  export HIP_VISIBLE_DEVICES=0,1,2,3
  export UCX_NET_DEVICES=mlx5_3:1
  export UCX_IB_PCI_BW=mlx5_3:50Gbs
  numactl --cpunodebind=3 --membind=3 ${APP}
  ;;
esac

mpirun -np 4 ./single_process.sh

// Multiple nodes, multiple cards, submitted through the scheduler
#!/bin/bash
#SBATCH -p debug
#SBATCH -N 16
#SBATCH -J xuan-tf
#SBATCH -n 512
#SBATCH --gres=加速卡:4

hostfile=./$SLURM_JOB_ID
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
num_node=0
for i in `cat ./${hostfile}`
do
  echo ${i} slots=4 >> ./hostfile-dl-$SLURM_JOB_ID
  ((num_node=${num_node}+1))
done
echo "resnet50 node is " ${num_node}
((num_加速卡=${num_node}*4))
mpirun -np ${num_加速卡} -hostfile ./hostfile-dl-$SLURM_JOB_ID ./single_process.sh
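The benchmark is a Horovod program: mpirun starts one process per card, and Horovod handles rank setup and gradient averaging. The sketch below shows the core horovod.tensorflow pattern the benchmark relies on (TF1-style API; the optimizer is a placeholder, not the benchmark's actual model code):

# Sketch of the Horovod pattern behind tensorflow_synthetic_benchmark.py:
# one process per card; the local rank selects the device.
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()                                      # one Horovod rank per mpirun process
config = tf.ConfigProto()
# With HIP_VISIBLE_DEVICES=0,1,2,3 exported for every rank, the local rank
# picks this process's own card.
config.gpu_options.visible_device_list = str(hvd.local_rank())

opt = tf.train.GradientDescentOptimizer(0.01)   # placeholder optimizer
opt = hvd.DistributedOptimizer(opt)             # all-reduce gradients across ranks
hooks = [hvd.BroadcastGlobalVariablesHook(0)]   # rank 0 broadcasts initial weights

This is why the same single_process.sh works unchanged for the multi-node job: each rank only needs its local rank and the MPI world that mpirun sets up.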
PyTorch:
// Download the example program
// https://github.com/pytorch/examples/blob/master/imagenet/main.py
/* usage: main.py [-h] [-a ARCH] [-j N] [--epochs N] [--start-epoch N] [-b N]
          [--lr LR] [--momentum M] [--wd W] [-p N] [--resume PATH] [-e]
          [--pretrained] [--world-size WORLD_SIZE] [--rank RANK]
          [--dist-url DIST_URL] [--dist-backend DIST_BACKEND] [--seed SEED]
          [--gpu GPU] [--multiprocessing-distributed]
          DIR */

// Single node, single card
python3 main.py \
    --batch-size=32 \
    --arch=resnet50 \
    --workers 6 \
    --epochs=1 \
    --gpu=0 \
    /imagenet/

// Single node, multiple cards (1)
python3 main.py \
    --batch-size=128 \
    --arch=resnet50 \
    --workers 24 \
    --epochs=1 \
    /imagenet/

// Single node, multiple cards (2): one process per card via MPI
cat single_process.sh
#!/bin/bash
export GLOO_SOCKET_IFNAME=ib0,ib1,ib2,ib3
export MIOPEN_USER_DB_PATH=/tmp/pytorch-miopen-2.8
export HSA_USERPTR_FOR_PAGED_MEM=0
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
APP="python3 main.py --batch-size=32 --arch=resnet50 -j 6 --epochs=1 --dist-url tcp://${1}:34567 --dist-backend gloo --world-size=${comm_size} --rank=${comm_rank} /imagenet/"
case ${lrank} in
[0])
  export HIP_VISIBLE_DEVICES=0
  export UCX_NET_DEVICES=mlx5_0:1
  export UCX_IB_PCI_BW=mlx5_0:50Gbs
  GLOO_SOCKET_IFNAME=ib0,ib1,ib2,ib3 numactl --cpunodebind=0 --membind=0 ${APP}
  ;;
[1])
  export HIP_VISIBLE_DEVICES=1
  export UCX_NET_DEVICES=mlx5_1:1
  export UCX_IB_PCI_BW=mlx5_1:50Gbs
  GLOO_SOCKET_IFNAME=ib0,ib1,ib2,ib3 numactl --cpunodebind=1 --membind=1 ${APP}
  ;;
[2])
  export HIP_VISIBLE_DEVICES=2
  export UCX_NET_DEVICES=mlx5_2:1
  export UCX_IB_PCI_BW=mlx5_2:50Gbs
  GLOO_SOCKET_IFNAME=ib0,ib1,ib2,ib3 numactl --cpunodebind=2 --membind=2 ${APP}
  ;;
[3])
  export HIP_VISIBLE_DEVICES=3
  export UCX_NET_DEVICES=mlx5_3:1
  export UCX_IB_PCI_BW=mlx5_3:50Gbs
  GLOO_SOCKET_IFNAME=ib0,ib1,ib2,ib3 numactl --cpunodebind=3 --membind=3 ${APP}
  ;;
esac

// Single node, four cards ($dist_url is the IP address of the current node)
mpirun -np 4 `pwd`/single_process.sh $dist_url

// Multiple nodes, multiple cards
#!/bin/bash
#SBATCH -p debug
#SBATCH -N 2
#SBATCH -J xuan-pytorch
#SBATCH -n 64

which mpirun
hostfile=./$SLURM_JOB_ID
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
num_node=$(cat $hostfile | sort | uniq | wc -l)
num_加速卡=$(($num_node*4))
# Use the first node in the allocation as the rendezvous address.
nodename=$(cat $hostfile | sed -n "1p")
dist_url=`echo $nodename | awk '{print $1}'`
rm -f `pwd`/hostfile-xuan
cat $hostfile | sort | uniq > `pwd`/tmp
for i in `cat ./tmp`
do
  echo ${i} slots=4 >> `pwd`/hostfile-xuan
done
mpirun -np $num_加速卡 --allow-run-as-root -hostfile `pwd`/hostfile-xuan `pwd`/single_process.sh $dist_url
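Behind the command-line flags, main.py joins the ranks together through torch.distributed. The following is a minimal sketch of that initialization, using the gloo backend and the tcp:// rendezvous address that single_process.sh passes in (the address and the environment-variable plumbing here are illustrative, not main.py's exact code):

# Sketch of the torch.distributed setup that main.py performs with
# --dist-url / --dist-backend / --world-size / --rank.
import os
import torch
import torch.distributed as dist
import torchvision.models as models

rank = int(os.environ["OMPI_COMM_WORLD_RANK"])        # provided by mpirun
world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"])

dist.init_process_group(backend="gloo",               # as in --dist-backend gloo
                        init_method="tcp://10.11.7.51:34567",  # placeholder master address
                        world_size=world_size,
                        rank=rank)

model = models.resnet50()
# HIP_VISIBLE_DEVICES pins one card per process, so device 0 is always
# this process's own card.
model = model.cuda(0)
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[0])

Every process must reach the same tcp:// address, which is why the batch script extracts the first node's name into dist_url and forwards it to single_process.sh.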
MxNet:
// Download the example programs
// https://github.com/apache/incubator-mxnet/blob/v1.4.x/example/image-classification

// Single node, single card
python3 train_imagenet.py \
    --benchmark 1 \
    --gpus 0 \
    --network inception-v3 \
    --batch-size 64 \
    --image-shape 3,299,299 \
    --num-epochs 10 \
    --kv-store device

// Single node, multiple cards
python3 train_imagenet.py \
    --benchmark 1 \
    --gpus 0,1,2,3 \
    --network inception-v3 \
    --batch-size 64 \
    --image-shape 3,299,299 \
    --num-epochs 10 \
    --kv-store device

// Multiple nodes, multiple cards (ps-worker mode)
#!/bin/bash
#SBATCH -J mxnet
#SBATCH -p dl
#SBATCH -N 2
#SBATCH -n 64
#SBATCH --gres=加速卡:4

hostfile=./$SLURM_JOB_ID
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
num_node=0
for i in `cat ./${hostfile}`
do
  # launch.py takes a host list of IP addresses, one per line
  gethostip ${i} | awk '{print $2}' >> ./mxnet-$SLURM_JOB_ID
  ((num_node=${num_node}+1))
done

source /public/DeepLearning/MxNet/mxnet-env.sh
which python3
python3 ../tools/launch.py \
    -n ${num_node} -s 2 -H mxnet-$SLURM_JOB_ID \
    --sync-dst-dir ../example/distributed_training/ \
    --launcher ssh \
    "source /public/DeepLearning/MxNet/mxnet-env.sh; python3 cifar10_dist.py \
    --network resnet \
    --num-layers 110 \
    --batch-size 128 \
    --kv-store dist_device_sync"

/* The generated host file contains one IP address per node, e.g.:
cat hosts
10.11.7.51
10.11.7.53 */
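Unlike the Horovod and torch.distributed examples, MxNet distributes through a parameter-server kvstore: launch.py starts the -s server processes plus one worker per host in the list, and the training script only has to create the matching store. A minimal sketch using the public mxnet API (the print is a toy illustration; cifar10_dist.py's actual training loop is more involved):

# Sketch of the ps-worker pattern behind cifar10_dist.py. This only works
# when started through launch.py, which sets the DMLC_* server/worker
# environment variables; run standalone it would block waiting for servers.
import mxnet as mx

kv = mx.kvstore.create("dist_device_sync")  # synchronous updates, aggregated on device
print("worker", kv.rank, "of", kv.num_workers)
# Each worker pushes gradients to the servers and pulls back aggregated
# weights every step; Module/Trainer does this automatically when given kv.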