# -*- coding: utf-8 -*-
"""Base class for LLM-based grader generators.
This module provides the foundation for grader generators that create
LLM-based graders by generating rubrics or other evaluation criteria
using language models.
"""
from abc import abstractmethod
from dataclasses import dataclass
from typing import List
from openjudge.generator.base_generator import (
BaseGraderGenerator,
GraderGeneratorConfig,
)
from openjudge.graders.llm_grader import LLMGrader
from openjudge.graders.schema import GraderMode
from openjudge.models.base_chat_model import BaseChatModel
from openjudge.models.schema.prompt_template import PromptTemplate
@dataclass
class LLMGraderGeneratorConfig(GraderGeneratorConfig):
    """Configuration for LLM-based grader generator.

    Extends the base grader generator configuration with parameters specific
    to LLM-based rubric generation.

    Attributes:
        model (BaseChatModel | None): Language model to use for generation.
            If None, a default model may be used.
        grader_mode (GraderMode): Mode for the generated grader (POINTWISE or LISTWISE).
            Defaults to POINTWISE.
        custom_evaluation_prompt (PromptTemplate | None): Custom template for evaluation.
            If None, a default template is used.
    """

    model: BaseChatModel | None = None
    grader_mode: GraderMode = GraderMode.POINTWISE
    custom_evaluation_prompt: PromptTemplate | None = None
class LLMGraderGenerator(BaseGraderGenerator):
    """Abstract base class for LLM-based grader generators.

    This class extends BaseGraderGenerator to provide specific functionality
    for generating LLM-based graders. It handles the common pattern of
    generating rubrics from data and then creating an LLMGrader that uses
    those rubrics for evaluation.

    Subclasses must implement the ``_generate_rubrics`` method to define how
    rubrics are created from the provided data.
    """

    def __init__(self, config: LLMGraderGeneratorConfig) -> None:
        """Initialize the LLM grader generator with the provided configuration.

        Args:
            config (LLMGraderGeneratorConfig): Configuration object containing
                parameters for LLM-based grader generation. The configuration
                includes:
                - grader_name (str): Human-readable name for the generated grader.
                - model (BaseChatModel | None): Language model to use for generation.
                - grader_mode (GraderMode): Mode for the generated grader
                  (POINTWISE or LISTWISE).
                - custom_evaluation_prompt (PromptTemplate | None): Custom
                  template for evaluation.
        """
        super().__init__(config)

    async def generate(self, dataset: List[dict], **kwargs) -> LLMGrader:
        """Generate an LLMGrader based on rubrics created from provided data.

        This method orchestrates the grader generation process by first creating
        appropriate rubrics based on the input data, then constructing an
        LLMGrader that uses those rubrics for evaluation.

        Args:
            dataset: List of data dictionaries to generate rubrics from.
                Each dictionary typically contains query, response, and score
                information.
            **kwargs: Additional arguments for the generation process.
                Passed through to the ``_generate_rubrics`` method.

        Returns:
            LLMGrader: Generated LLMGrader with rubrics for evaluation.
                Ready to evaluate new query-response pairs.

        Example:
            >>> config = LLMGraderGeneratorConfig(
            ...     grader_name="My LLM Grader",
            ...     model=my_model,
            ...     grader_mode=GraderMode.POINTWISE
            ... )
            >>> dataset = [
            ...     {"query": "What is 2+2?", "response": "4", "label_score": 5},
            ...     {"query": "What is 3+3?", "response": "6", "label_score": 5}
            ... ]
            >>> generator = MyLLMGraderGenerator(config)
            >>> grader = await generator.generate(dataset)
        """
        # Delegate rubric creation to the subclass-defined strategy.
        rubrics = await self._generate_rubrics(dataset, **kwargs)
        # Assemble the grader from the generated rubrics plus configuration.
        return LLMGrader(
            model=self.config.model,  # type: ignore
            name=self.config.grader_name,
            mode=self.config.grader_mode,  # type: ignore
            template=self.config.custom_evaluation_prompt,  # type: ignore
            rubrics=rubrics,
        )

    @abstractmethod
    async def _generate_rubrics(self, dataset: List[dict], **kwargs) -> str:
        """Generate rubrics from data.

        This abstract method must be implemented by subclasses to define
        how rubrics are generated from the provided data. The generated
        rubrics will be used to configure the final LLMGrader.

        Args:
            dataset: List of data dictionaries containing examples for rubric
                generation. Each dictionary should contain query, response, and
                potentially score information.
            **kwargs: Additional arguments for the rubric generation process.
                May include things like language preferences or specific
                guidelines.

        Returns:
            str: Generated rubrics in a format suitable for LLMGrader.
                Typically a formatted string that can be inserted into
                evaluation prompts.

        Example:
            >>> dataset = [{"query": "What is 2+2?", "response": "4", "label_score": 5}]
            >>> rubrics = await generator._generate_rubrics(dataset)
            >>> print(rubrics)
            "1. Mathematical accuracy: The response should contain correct mathematical calculations..."
        """