import re


def filter_thinking_parts(text):
    """
    Filter thinking parts from text (for models like Qwen3 that support thinking mode).

    Supported thinking tag formats:
    - <think>...</think>
    """
    if not isinstance(text, str):
        return text
    # Define regex patterns for thinking parts
    thinking_patterns = [r"<think>.*?</think>"]
    # Apply all patterns sequentially for filtering
    filtered_text = text
    for pattern in thinking_patterns:
        filtered_text = re.sub(pattern, "", filtered_text, flags=re.DOTALL | re.IGNORECASE)
    # Clean up extra whitespace
    filtered_text = re.sub(r"\n\s*\n", "\n\n", filtered_text)  # Merge runs of blank lines
    filtered_text = filtered_text.strip()
    return filtered_text
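
# Usage sketch for filter_thinking_parts (the sample text below is made up
# for illustration):
#   filter_thinking_parts("<think>reasoning steps...</think>\nFinal answer.")
#   -> "Final answer."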


def extract_helpfulness_score(response_text):
    """
    Extract the helpfulness score from a model response.

    The score is read from a <score>...</score> tag.
    """
    # Handle the case where response_text might not be a string
    if not isinstance(response_text, str):
        response_text = str(response_text)
    # Extract the score from the <score> tag
    score_pattern = r"<score>(.*?)</score>"
    match = re.search(score_pattern, response_text, re.DOTALL)
    if match:
        score_content = match.group(1).strip()
        # Extract numbers from the tag content
        numbers = re.findall(r"\d+", score_content)
        if numbers:
            try:
                score = int(numbers[0])  # Take the first number as the score
                if 0 <= score <= 4:  # Scores are assumed to lie in the 0-4 range
                    return score
            except ValueError:
                pass
    return 0  # Default to 0 if extraction fails
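
# Usage sketch for extract_helpfulness_score (hypothetical responses):
#   extract_helpfulness_score("<score>3</score>")     -> 3
#   extract_helpfulness_score("<score>9</score>")     -> 0  (outside the 0-4 range)
#   extract_helpfulness_score("no score tag at all")  -> 0  (default on failure)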


def calculate_helpfulness_reward(predicted_score, true_score):
    """
    Calculate reward based on the difference between the predicted and true
    helpfulness scores. A smaller difference results in a higher reward.

    For binary classification scenarios (true_score is 0 or 1):
    - Correct prediction (exact match) -> reward 1.0
    - Wrong prediction -> reward 0.0
    """
    if true_score is None:
        return 0.0
    # Calculate the difference
    diff = abs(predicted_score - true_score)
    # For binary classification (0 or 1), use exact match
    if true_score in [0, 1]:
        return 1.0 if diff == 0 else 0.0
    # For multi-class scenarios (0-4), convert the difference to a reward
    # (smaller difference = higher reward)
    max_possible_diff = 4
    normalized_diff = min(diff / max_possible_diff, 1.0)
    # Reward = 1 - normalized difference
    reward = 1.0 - normalized_diff
    return reward
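
# Worked example for the reward formula (values chosen for illustration):
#   calculate_helpfulness_reward(2, 4) -> 1.0 - |2 - 4| / 4 = 0.5   (multi-class branch)
#   calculate_helpfulness_reward(0, 1) -> 0.0                       (binary label, exact match required)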


def compute_score(data_source, solution_str, ground_truth, extra_info=None, **kwargs):
    """
    compute_score function compatible with naive.py.

    Args:
        data_source: Data source type
        solution_str: Model-generated response
        ground_truth: Ground-truth label (obtained from the reward_model field)
        extra_info: Additional information
    """
    try:
        # First filter out thinking parts (supports thinking mode for models like Qwen3)
        filtered_solution = filter_thinking_parts(solution_str)
        # Extract the helpfulness score from the filtered solution_str
        predicted_helpfulness = extract_helpfulness_score(filtered_solution)
        # Handle ground_truth - it could be a number or a dict
        if isinstance(ground_truth, dict):
            true_helpfulness = ground_truth.get("helpfulness", 0)
        elif isinstance(ground_truth, (int, float)):
            true_helpfulness = int(ground_truth)
        elif isinstance(ground_truth, str) and ground_truth.isdigit():
            true_helpfulness = int(ground_truth)
        else:
            # If ground_truth is unavailable, try to get the label from extra_info
            if extra_info and isinstance(extra_info, dict):
                output_data = extra_info.get("output", [])
                if output_data and len(output_data) > 0:
                    label_data = output_data[0].get("label", {})
                    true_helpfulness = label_data.get("helpfulness", 0)
                else:
                    true_helpfulness = 0
            else:
                true_helpfulness = 0
        # Calculate the reward
        reward = calculate_helpfulness_reward(predicted_helpfulness, true_helpfulness)
        # Return detailed information
        return {
            "score": reward,
            "predicted_helpfulness": predicted_helpfulness,
            "true_helpfulness": true_helpfulness,
            "data_source": data_source,
        }
    except Exception as e:
        print(f"Error in compute_score: {e}")
        # Return default values on failure
        return {"score": 0.0, "error": str(e), "data_source": data_source}


if __name__ == "__main__":
    # Test cases
    test_response = """<think>Let me analyze this answer step by step:
1. First, I'll check if the answer is well-structured...
4. Finally, I'll look at the overall helpfulness...
</think>
<score>2</score>"""
    ground_truth = {"helpfulness": 3, "task_type": "pointwise"}
    # Test the compute_score function
    result = compute_score(data_source="test", solution_str=test_response, ground_truth=ground_truth)
    print("Test Result:")
    print(f"  Predicted Score: {result.get('predicted_helpfulness')}")
    print(f"  True Score: {result.get('true_helpfulness')}")
    print(f"  Reward: {result.get('score')}")