📦 agentscope-ai / OpenJudge

📄 run_grader_evals.sh · 112 lines
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112#!/bin/bash
set -e

# Display help information
show_help() {
    cat << EOF
Usage: $0 [OPTIONS]

This script runs the OpenJudge evaluation task.

Optional arguments:
  --agent-model MODEL       Specify the language model used by the agent (Optional - Default: qwen3-32b)
  --text-model MODEL        Specify the text evaluation model (Optional - Default: qwen3-32b)
  --multimodal-model MODEL  Specify the multimodal evaluation model (Optional - Default: qwen-vl-max)
  --workers N               Specify the number of parallel worker processes (Optional - Default: 5)
  --category CAT            Specify the evaluation category (Optional: mutually exclusive with --grader. Default: all categories)
  --grader GRADER           Specify a custom grader name (Optional: mutually exclusive with --category. Default: all graders)
  --help, -h                Show this help message and exit

Notes:
  - --category and --grader cannot be used together.
  - If no arguments are provided, the script runs with default settings.
EOF
    exit 0
}


agent_model=""
text_model=""
multimodal_model=""
workers=""
category=""
grader=""

# Parse command-line arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        --agent-model)
            agent_model="$2"
            shift 2
            ;;
        --text-model)
            text_model="$2"
            shift 2
            ;;
        --multimodal-model)
            multimodal_model="$2"
            shift 2
            ;;
        --workers)
            workers="$2"
            shift 2
            ;;
        --category)
            category="$2"
            shift 2
            ;;
        --grader)
            grader="$2"
            shift 2
            ;;
        --help|-h)
            show_help
            ;;
        *)
            echo "Unknown argument: $1" >&2
            echo "Run '$0 --help' for usage." >&2
            exit 1
            ;;
    esac
done

# Check if both --category and --grader are specified (they are mutually exclusive)
if [[ -n "$category" && -n "$grader" ]]; then
    echo "Error: --category and --grader cannot be used at the same time." >&2
    exit 1
fi

# Install dependencies (only needed on first run; comment out for subsequent runs)
pip install huggingface_hub
hf download agentscope-ai/OpenJudge --repo-type dataset --local-dir agentscope-ai/OpenJudge
pip install py-openjudge datasets

# Set environment variables (replace 'your_dashscope_api_key' with your actual key if needed)
# export OPENAI_API_KEY=your_dashscope_api_key
# export OPENAI_BASE_URL=https://dashscope.aliyuncs.com/compatible-mode/v1

# Copy the evaluation script into the dataset directory
cp -f run_grader_evaluations.py agentscope-ai/OpenJudge/
cd agentscope-ai/OpenJudge

# Build the command
cmd_array=("python" "run_grader_evaluations.py")

# Add either --category or --grader (mutually exclusive)
if [[ -n "$category" ]]; then
    cmd_array+=("--category" "$category")
elif [[ -n "$grader" ]]; then
    cmd_array+=("--grader" "$grader")
fi

# Append other optional arguments if they are non-empty
[[ -n "$agent_model" ]] && cmd_array+=("--agent-model" "$agent_model")
[[ -n "$text_model" ]] && cmd_array+=("--text-model" "$text_model")
[[ -n "$multimodal_model" ]] && cmd_array+=("--multimodal-model" "$multimodal_model")
[[ -n "$workers" ]] && cmd_array+=("--workers" "$workers")

echo "Executing command: ${cmd_array[*]}"

# Execute the command
"${cmd_array[@]}"