File size: 2,793 Bytes
f49345e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
      
{
    "llama-3.2-3b-instruct": {
        "success_rate": 0.18239999999999998,
        "relative_action_count": 0.8956800000000001
    },
    "qwen2.5-3b-instruct": {
        "success_rate": 0.184,
        "relative_action_count": 0.8255466666666665
    },
    "gpt-3.5-turbo": {
        "success_rate": 0.272,
        "relative_action_count": 0.8743314285714285
    },
    "qwen2.5-7b-instruct": {
        "success_rate": 0.36639999999999995,
        "relative_action_count": 0.8682133333333335
    },
    "gemma-2-9b-it": {
        "success_rate": 0.392,
        "relative_action_count": 0.8522190476190475
    },
    "llama-3.1-8b-instruct": {
        "success_rate": 0.4424,
        "relative_action_count": 0.8441104761904763
    },
    "gemma-2-27b-it": {
        "success_rate": 0.548,
        "relative_action_count": 0.6583142857142856
    },
    "yi-lightning": {
        "success_rate": 0.6728,
        "relative_action_count": 0.5962819047619048
    },
    "llama-3.1-70b-instruct": {
        "success_rate": 0.696,
        "relative_action_count": 0.5514495238095238
    },
    "llama-3.3-70b-instruct": {
        "success_rate": 0.712,
        "relative_action_count": 0.5916438095238095
    },
    "gpt-4o-mini": {
        "success_rate": 0.7239999999999999,
        "relative_action_count": 0.5270952380952381
    },
    "gemini-1.5-pro": {
        "success_rate": 0.7256,
        "relative_action_count": 0.5686514285714285
    },
    "claude-3.5-haiku": {
        "success_rate": 0.7343999999999999,
        "relative_action_count": 0.757095238095238
    },
    "qwen2.5-14b-instruct": {
        "success_rate": 0.756,
        "relative_action_count": 0.5723257142857143
    },
    "qwen2.5-72b-instruct": {
        "success_rate": 0.7584,
        "relative_action_count": 0.5753561904761904
    },
    "gpt-4o": {
        "success_rate": 0.7856000000000002,
        "relative_action_count": 0.506207619047619
    },
    "qwen2.5-32b-instruct": {
        "success_rate": 0.7879999999999999,
        "relative_action_count": 0.5955619047619047
    },
    "mistral-large-instruct-2411": {
        "success_rate": 0.7879999999999999,
        "relative_action_count": 0.5365238095238094
    },
    "claude-3.5-sonnet": {
        "success_rate": 0.8263999999999999,
        "relative_action_count": 0.46185714285714285
    },
    "deepseek-r1": {
        "success_rate": 0.8712,
        "relative_action_count": 0.51432
    },
    "o1-mini": {
        "success_rate": 0.8784000000000001,
        "relative_action_count": 0.46449523809523807
    },
    "deepseek-v3": {
        "success_rate": 0.8928,
        "relative_action_count": 0.5308400000000001
    },
    "qwq-32b": {
        "success_rate": 0.9032,
        "relative_action_count": 0.5338533333333333
    }
}