#!/bin/bash
# TestTime RLVR Batch Evaluation Script
# Runs the evaluation across an entire benchmark (MBPP+, HumanEval+, or both).
#
# Fixes over the previous revision:
#   - `shift 4` no longer fails (leaving stale args) when fewer than 4
#     positional parameters are supplied
#   - duration display no longer assigns to bash's special auto-incrementing
#     SECONDS variable
#   - the "N/A" fallbacks in summary extraction actually trigger (the old
#     `grep | sed || echo N/A` never fired because sed exits 0 on empty input)
#   - the script exits non-zero when an evaluation failed, instead of
#     unconditionally `exit 0`

# Print CLI usage. (Korean user-facing text kept verbatim.)
print_usage() {
    echo "사용법: $0 [MODEL] [BENCHMARK] [MAX_PROBLEMS] [GPU_ID] [RESUME_OPTIONS]"
    echo ""
    echo "매개변수:"
    echo " MODEL 모델 이름 (기본값: Qwen/Qwen2.5-7B)"
    echo " BENCHMARK 벤치마크 (mbpp|humaneval|all, 기본값: mbpp)"
    echo " MAX_PROBLEMS 최대 문제 수 (0=전체, 기본값: 10)"
    echo " GPU_ID GPU 번호 (기본값: 6)"
    echo ""
    echo "Resume 옵션:"
    echo " --resume 이전에 완료된 문제들을 제외하고 이어서 실행"
    echo " --start-from PROBLEM 특정 문제 ID부터 시작 (예: Mbpp/100)"
    echo ""
    echo "예시:"
    echo " $0 # 기본 설정 (MBPP 10문제)"
    echo " $0 \"Qwen/Qwen2.5-7B\" mbpp 0 6 # MBPP 전체"
    echo " $0 \"Qwen/Qwen2.5-7B\" all 0 6 --resume # 모든 벤치마크 전체 (이어서)"
    echo " $0 \"Qwen/Qwen2.5-7B\" mbpp 0 6 --start-from Mbpp/100 # Mbpp/100부터 시작"
}

# Help flag check.
if [[ "$1" == "--help" ]] || [[ "$1" == "-h" ]]; then
    print_usage
    exit 0
fi

# GPU selection is forwarded to the Python script via --gpu; CUDA_VISIBLE_DEVICES
# is intentionally not forced here.
# export CUDA_VISIBLE_DEVICES=6

# EvalPlus path
export PYTHONPATH="/home/ubuntu/RLVR/TestTime-RLVR-v2/evaluation/code_eval/coding:$PYTHONPATH"

# HuggingFace cache location (fast model loading)
export HF_HOME="/data/.cache/huggingface"
export TRANSFORMERS_CACHE="/data/.cache/huggingface"

# conda env site-packages so EvalPlus can be imported
export PYTHONPATH="/data/miniforge3/envs/azr/lib/python3.10/site-packages:$PYTHONPATH"

# Silence tokenizer parallelism warnings
export TOKENIZERS_PARALLELISM=false

echo "🚀 TestTime RLVR Batch Evaluation"
echo "=================================="

# Positional defaults
MODEL=${1:-"Qwen/Qwen2.5-7B"}
BENCHMARK=${2:-"mbpp"}
MAX_PROBLEMS=${3:-10}
GPU_ID=${4:-6}

# Resume option state
RESUME_OPTIONS=""
START_FROM=""

# Drop the (up to) four positional parameters before option parsing.
# A bare `shift 4` fails when fewer than 4 args were given, leaving the
# remaining args to be misreported as unknown options.
shift $(( $# < 4 ? $# : 4 ))

while [[ $# -gt 0 ]]; do
    case $1 in
        --resume)
            RESUME_OPTIONS="--resume"
            shift
            ;;
        --start-from)
            if [[ -n $2 ]]; then
                START_FROM="--start_from $2"
                shift 2
            else
                echo "❌ Error: --start-from requires a problem ID"
                print_usage
                exit 1
            fi
            ;;
        *)
            echo "❌ Error: Unknown option $1"
            print_usage
            exit 1
            ;;
    esac
done

echo "📋 Configuration:"
echo " Model: $MODEL"
echo " Benchmark: $BENCHMARK"
echo " Max Problems: $MAX_PROBLEMS (0 = 전체)"
echo " GPU: $GPU_ID"
if [[ -n "$RESUME_OPTIONS" ]]; then
    echo " Resume: Enabled"
fi
if [[ -n "$START_FROM" ]]; then
    echo " Start From: $(echo $START_FROM | cut -d' ' -f2)"
fi
echo ""

# Record start time
START_TIME=$(date +%s)
TIMESTAMP=$(date +"%Y%m%d_%H%M%S")

# Log directory (absolute path, resolved relative to this script)
BASE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
LOG_DIR="${BASE_DIR}/../tmp/logs"
mkdir -p "$LOG_DIR"

LOG_FILE="${LOG_DIR}/batch_evaluation_${TIMESTAMP}.log"
echo "📝 Full log will be saved to: $LOG_FILE"
echo ""

# Write the log header in one grouped redirection.
{
    echo "=========================================================="
    echo "TestTime RLVR Batch Evaluation Log"
    echo "Started at: $(date)"
    echo "Model: $MODEL"
    echo "Benchmark: $BENCHMARK"
    echo "Max Problems: $MAX_PROBLEMS"
    echo "GPU: $GPU_ID"
    echo "Log File: $LOG_FILE"
    echo "=========================================================="
    echo ""
} > "$LOG_FILE"

# Echo a message to the console and append it to the log file.
log_and_display() {
    echo "$1" | tee -a "$LOG_FILE"
}

# Run a command, teeing combined stdout/stderr into the log file while
# preserving the command's exit status (tee would otherwise mask it).
run_with_log() {
    local cmd="$1"
    echo "Executing: $cmd" >> "$LOG_FILE"
    echo "----------------------------------------" >> "$LOG_FILE"

    # A temp file carries the exit code out of the pipeline subshell.
    # Declarations are split from assignments so `local` cannot mask
    # a failing command substitution (SC2155).
    local temp_exit_file
    temp_exit_file=$(mktemp)
    (eval "$cmd"; echo $? > "$temp_exit_file") 2>&1 | tee -a "$LOG_FILE"
    local exit_code
    exit_code=$(cat "$temp_exit_file")
    rm -f "$temp_exit_file"

    echo "----------------------------------------" >> "$LOG_FILE"
    echo "Exit code: $exit_code" >> "$LOG_FILE"
    echo "" >> "$LOG_FILE"
    return "$exit_code"
}

# Extract the value of "<label>: <value>" from a summary file; prints N/A
# when the label is absent. (A plain `grep | sed || echo N/A` never falls
# back, because sed exits 0 even when grep matched nothing.)
extract_stat() {
    local label="$1" file="$2" value
    value=$(grep "$label" "$file" | head -1 | sed 's/.*: //')
    echo "${value:-N/A}"
}

# Print the per-benchmark summary stored in $1/evaluation_summary.md
# (shared by the MBPP+ and HumanEval+ sections, which were duplicated before).
show_benchmark_summary() {
    local result_dir="$1"
    local summary="$result_dir/evaluation_summary.md"

    log_and_display "📁 Results saved to: $result_dir"
    log_and_display "📈 Initial Solution Accuracy: $(extract_stat "Initial Solution Accuracy" "$summary")"
    log_and_display "📊 Total Problems: $(extract_stat "Total Problems" "$summary")"
    log_and_display "⏱️ Execution Time: $(extract_stat "Total Execution Time" "$summary")"

    # Per-task reasoning accuracy, if present in the summary markdown.
    if grep -q "Problem-based Accuracy" "$summary"; then
        log_and_display "🧠 Reasoning Task Performance:"
        grep "Problem-based Accuracy" "$summary" | while read line; do
            task_name=$(echo "$line" | sed 's/.*\*\*\([^*]*\)\*\*.*/\1/')
            accuracy=$(echo "$line" | sed 's/.*: \([0-9.]*\).*/\1/')
            log_and_display " - $task_name: $accuracy"
        done
    fi
}

if [[ "$BENCHMARK" == "all" ]]; then
    # Run both benchmarks into a timestamped results directory.
    log_and_display "🎯 Running evaluation on ALL benchmarks (MBPP+ and HumanEval+)"
    log_and_display ""

    OUTPUT_DIR="${BASE_DIR}/../tmp/batch_results_${TIMESTAMP}"

    # ---------- MBPP+ ----------
    log_and_display "==============================================="
    log_and_display "📊 MBPP+ Evaluation"
    log_and_display "==============================================="

    run_with_log "cd '$BASE_DIR' && python batch_evaluate_testtime.py --model '$MODEL' --benchmark 'mbpp' --max_problems $MAX_PROBLEMS --gpu $GPU_ID --output_dir '${OUTPUT_DIR}' $RESUME_OPTIONS $START_FROM"
    MBPP_EXIT_CODE=$?

    if [ $MBPP_EXIT_CODE -eq 0 ]; then
        log_and_display "✅ MBPP+ evaluation completed"
        log_and_display ""
        log_and_display "📊 MBPP+ Results Summary:"
        log_and_display "==============================================="
        MBPP_RESULT_DIR=$(find "$OUTPUT_DIR" -name "*mbpp*" -type d | head -1)
        if [ -n "$MBPP_RESULT_DIR" ] && [ -f "$MBPP_RESULT_DIR/evaluation_summary.md" ]; then
            show_benchmark_summary "$MBPP_RESULT_DIR"
        else
            log_and_display "📁 Results directory: $MBPP_RESULT_DIR (summary not yet available)"
        fi
    else
        log_and_display "❌ MBPP+ evaluation failed"
    fi

    log_and_display ""
    log_and_display "🔄 Proceeding to HumanEval+ evaluation..."
    log_and_display ""

    # ---------- HumanEval+ ----------
    log_and_display "==============================================="
    log_and_display "📊 HumanEval+ Evaluation"
    log_and_display "==============================================="

    run_with_log "cd '$BASE_DIR' && python batch_evaluate_testtime.py --model '$MODEL' --benchmark 'humaneval' --max_problems $MAX_PROBLEMS --gpu $GPU_ID --output_dir '${OUTPUT_DIR}' $RESUME_OPTIONS $START_FROM"
    HUMANEVAL_EXIT_CODE=$?

    if [ $HUMANEVAL_EXIT_CODE -eq 0 ]; then
        log_and_display "✅ HumanEval+ evaluation completed"
        log_and_display ""
        log_and_display "📊 HumanEval+ Results Summary:"
        log_and_display "==============================================="
        HUMANEVAL_RESULT_DIR=$(find "$OUTPUT_DIR" -name "*humaneval*" -type d | head -1)
        if [ -n "$HUMANEVAL_RESULT_DIR" ] && [ -f "$HUMANEVAL_RESULT_DIR/evaluation_summary.md" ]; then
            show_benchmark_summary "$HUMANEVAL_RESULT_DIR"
        else
            log_and_display "📁 Results directory: $HUMANEVAL_RESULT_DIR (summary not yet available)"
        fi
    else
        log_and_display "❌ HumanEval+ evaluation failed"
    fi

    # ---------- Final combined summary ----------
    log_and_display ""
    log_and_display "🎉 All Benchmarks Completed!"
    log_and_display "==============================================="
    log_and_display "📊 Final Summary:"
    log_and_display " MBPP+: $([ $MBPP_EXIT_CODE -eq 0 ] && echo "✅ Success" || echo "❌ Failed")"
    log_and_display " HumanEval+: $([ $HUMANEVAL_EXIT_CODE -eq 0 ] && echo "✅ Success" || echo "❌ Failed")"

    log_and_display ""
    log_and_display "📁 All results saved in: $OUTPUT_DIR"

    # Per-benchmark detail directories
    if [ $MBPP_EXIT_CODE -eq 0 ]; then
        MBPP_DIR=$(find "$OUTPUT_DIR" -name "*mbpp*" -type d | head -1)
        [ -n "$MBPP_DIR" ] && log_and_display " 📂 MBPP+ detailed results: $MBPP_DIR"
    fi
    if [ $HUMANEVAL_EXIT_CODE -eq 0 ]; then
        HUMANEVAL_DIR=$(find "$OUTPUT_DIR" -name "*humaneval*" -type d | head -1)
        [ -n "$HUMANEVAL_DIR" ] && log_and_display " 📂 HumanEval+ detailed results: $HUMANEVAL_DIR"
    fi

    # Overall status: non-zero if either benchmark failed.
    if [ $MBPP_EXIT_CODE -eq 0 ] && [ $HUMANEVAL_EXIT_CODE -eq 0 ]; then
        FINAL_EXIT_CODE=0
    else
        FINAL_EXIT_CODE=1
    fi
else
    # Single-benchmark run
    log_and_display "🎯 Running evaluation on $BENCHMARK benchmark"
    log_and_display ""

    run_with_log "cd '$BASE_DIR' && python batch_evaluate_testtime.py --model '$MODEL' --benchmark '$BENCHMARK' --max_problems $MAX_PROBLEMS --gpu $GPU_ID --output_dir '${BASE_DIR}/../tmp/batch_results' $RESUME_OPTIONS $START_FROM"
    FINAL_EXIT_CODE=$?
    OUTPUT_DIR="${BASE_DIR}/../tmp/batch_results"
fi

# Total wall-clock duration. NOTE: deliberately not using the name SECONDS —
# that is a special bash variable that keeps auto-incrementing after
# assignment, so the displayed value could drift.
END_TIME=$(date +%s)
DURATION=$((END_TIME - START_TIME))
HOURS=$((DURATION / 3600))
MINUTES=$(((DURATION % 3600) / 60))
SECS=$((DURATION % 60))

log_and_display ""
log_and_display "🎉 Batch evaluation completed!"
log_and_display "⏱️ Total duration: ${HOURS}h ${MINUTES}m ${SECS}s"
log_and_display "📁 Check results in: $OUTPUT_DIR"
log_and_display "📝 Full log saved to: $LOG_FILE"

echo ""
echo "📋 Summary:"
echo " Results: $OUTPUT_DIR"
echo " Full Log: $LOG_FILE"
echo " Duration: ${HOURS}h ${MINUTES}m ${SECS}s"

# Propagate the evaluation status (previously a hard-coded `exit 0`,
# which masked failures from CI and calling scripts).
exit "$FINAL_EXIT_CODE"