1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65# -*- coding: utf-8 -*-
"""
Base module for grader validation functionality.
This module provides the foundational classes for validating graders
by running evaluations and analyzing the results.
"""
from abc import ABC
from typing import Dict, List
from openjudge.analyzer.base_analyzer import AnalysisResult, BaseAnalyzer
from openjudge.graders.base_grader import BaseGrader
from openjudge.runner.grading_runner import GradingRunner
class GraderValidator(ABC):
"""
Base validator for graders.
This class provides the basic functionality to validate graders by running
evaluations on provided data and analyzing the results using a specified analyzer.
Attributes:
runner (GradingRunner): Runner responsible for executing the grader on data.
analyzer (BaseAnalyzer): Analyzer used to process and evaluate the results.
"""
def __init__(self, runner: GradingRunner, analyzer: BaseAnalyzer) -> None:
self.runner = runner
self.analyzer = analyzer
async def validate(
self,
dataset: List[dict],
grader: BaseGrader,
mapping: Dict[str, str],
) -> AnalysisResult:
"""
Validate a grader by running it on test data and analyzing the results.
This method executes the grader on the provided data samples using the
specified field mappings, then analyzes the results using the configured analyzer.
Args:
dataset (List[Dict]): List of data samples to evaluate. Each dictionary
represents a sample with input parameters and expected outputs.
grader (BaseGrader): The grader instance to validate.
mapping (Dict[str, str]): Field mapping dictionary that maps grader
input parameter names to corresponding keys in the data samples.
Returns:
AnalysisResult: Analysis result containing metrics and evaluation information
computed by the analyzer based on the grader's performance.
Example:
>>> validator = GraderValidator(runner=my_runner, analyser=my_analyzer)
>>> dataset = [{"query": "What is 2+2?", "expected": 4}]
>>> my_grader = MyGrader()
>>> mapping = {"question": "query"}
>>> result = await validator.validate(dataset, my_grader, mapping)
"""
result = await self.runner.arun(dataset, grader, mapping)
return self.analyzer.analyze(dataset, result["over_all"])