| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| |
|
# ---------------------------------------------------------------------------
# Launch parameters taken from the command line:
#   $1 -> NNODES
#   $2 -> NODE_RANK       (non-negative integer)
#   $3 -> NPROC_PER_NODE  (positive integer)
#   $4 -> MASTER_ADDR
#   $5 -> prompt          (text handed to generate.py via --prompt)
# ---------------------------------------------------------------------------
export NNODES=${1-}
export NODE_RANK=${2-}
export NPROC_PER_NODE=${3-}
export MASTER_ADDR=${4-}
export prompt=${5-}
export MASTER_PORT=6038
export WORLD_SIZE=32
export YAML=runner_config/tp32.yaml

# NODE_RANK and NPROC_PER_NODE feed integer arithmetic, so reject anything
# that is not a plain decimal number instead of silently computing garbage.
if [[ ! "${NODE_RANK}" =~ ^[0-9]+$ ]]; then
  printf 'error: NODE_RANK ($2) must be a non-negative integer, got "%s"\n' \
    "${NODE_RANK}" >&2
  exit 1
fi
if [[ ! "${NPROC_PER_NODE}" =~ ^[1-9][0-9]*$ ]]; then
  printf 'error: NPROC_PER_NODE ($3) must be a positive integer, got "%s"\n' \
    "${NPROC_PER_NODE}" >&2
  exit 1
fi

# Global rank of the first process on this node. The 10# prefix forces
# base 10 so a zero-padded NODE_RANK such as "08" is not read as octal.
export RANK_OFFSET=$(( 10#${NODE_RANK} * NPROC_PER_NODE ))
# ---------------------------------------------------------------------------
# NPU / HCCL runtime environment exported for the processes started below.
# ---------------------------------------------------------------------------
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export HCCL_SOCKET_IFNAME=enp

# First address printed by `hostname -I`. Assignment and export are kept
# separate so the pipeline's exit status is not hidden by `export`.
HCCL_IF_IP=$(hostname -I | awk '{print $1}')
export HCCL_IF_IP

export HCCL_IF_BASE_PORT=23456
export HCCL_OP_EXPANSION_MODE=AIV
export HCCL_CONNECT_TIMEOUT=1200
export HCCL_EXEC_TIMEOUT=1200

# Use the ascend-toolkit install directory when it exists, otherwise fall
# back to /usr/local/Ascend/latest.
if [[ -d /usr/local/Ascend/ascend-toolkit/latest ]]; then
  export ASCEND_HOME_PATH=/usr/local/Ascend/ascend-toolkit/latest
else
  export ASCEND_HOME_PATH=/usr/local/Ascend/latest
fi

# Append the toolkit's site-packages directory. ${PYTHONPATH:+...} only adds
# the ':' separator when PYTHONPATH already has content, so an unset
# PYTHONPATH no longer yields a leading ':' (an empty path entry).
export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}${ASCEND_HOME_PATH}/python/site-packages/"
# ---------------------------------------------------------------------------
# Result locations for this run: res/<YYYYMMDD>/<MODEL_NAME>_<WORLD_SIZE>p
# under the current working directory, plus a dump_data subdirectory.
# ---------------------------------------------------------------------------
DATE=$(date +%Y%m%d)
export MODEL_NAME="pangu_ultra_moe"
NAME="${MODEL_NAME}_${WORLD_SIZE}p"
export TASK_QUEUE_ENABLE=2
export RES_PATH="res/${DATE}/${NAME}"
WORK_DIR=$(pwd)
DUMP_PRECISION_PATH="${WORK_DIR}/${RES_PATH}/dump_data"

# dump_data lives inside the result directory, so one `mkdir -p` creates both.
# Paths are quoted so a working directory containing spaces still works.
if ! mkdir -p -- "${DUMP_PRECISION_PATH}"; then
  printf 'error: cannot create result directory "%s"\n' \
    "${DUMP_PRECISION_PATH}" >&2
  exit 1
fi
# ---------------------------------------------------------------------------
# Start NPROC_PER_NODE background copies of generate.py, each pinned with
# taskset to its own contiguous range of CPU cores, then wait for all of them.
# Reads NPROC_PER_NODE, RANK_OFFSET, prompt, YAML, WORK_DIR and RES_PATH set
# earlier in this script.
# ---------------------------------------------------------------------------

# NPROC_PER_NODE is the divisor and the loop bound below; refuse to continue
# with a missing, zero or non-numeric value.
if [[ ! "${NPROC_PER_NODE:-}" =~ ^[1-9][0-9]*$ ]]; then
  printf 'error: NPROC_PER_NODE must be a positive integer, got "%s"\n' \
    "${NPROC_PER_NODE:-}" >&2
  exit 1
fi

# Number of lines in /proc/cpuinfo that contain "processor".
cores=$(grep -c "processor" /proc/cpuinfo)
avg_core_per_rank=$(( cores / NPROC_PER_NODE ))
if (( avg_core_per_rank < 1 )); then
  # Fewer cores than ranks would produce an invalid CPU list such as "0--1".
  printf 'error: %d CPU cores cannot be divided among %d processes\n' \
    "${cores}" "${NPROC_PER_NODE}" >&2
  exit 1
fi

pids=()
for (( i = 0; i < NPROC_PER_NODE; i++ )); do
  echo "$i"

  # Inclusive core range [start, end] owned by local rank i.
  start=$(( i * avg_core_per_rank ))
  end=$(( start + avg_core_per_rank - 1 ))
  cmdopt="${start}-${end}"

  export LOCAL_RANK=$i
  export RANK=$(( i + RANK_OFFSET ))
  export RANK_ID=$RANK

  log_file="${WORK_DIR}/${RES_PATH}/log_${LOCAL_RANK}.log"

  if (( i == 0 )); then
    # Local rank 0: output goes to the terminal and to its log file via tee.
    # pipefail inside the subshell makes the job's status reflect generate.py
    # rather than only tee.
    (
      set -o pipefail
      taskset -c "${cmdopt}" python3 generate.py \
        --prompt "${prompt}" \
        --yaml_file_path="${YAML}" 2>&1 | tee "${log_file}"
    ) &
  else
    # Other local ranks: stdout and stderr go to the log file only.
    taskset -c "${cmdopt}" python3 generate.py \
      --prompt "${prompt}" \
      --yaml_file_path="${YAML}" &> "${log_file}" &
  fi
  pids+=("$!")
done

# Wait for every rank individually; a bare `wait` always returns 0 and would
# hide failed ranks from whoever invoked this script.
failed=0
for idx in "${!pids[@]}"; do
  if ! wait "${pids[idx]}"; then
    printf 'error: local rank %d exited with a non-zero status\n' "${idx}" >&2
    failed=1
  fi
done
if (( failed )); then
  exit 1
fi