ultrajudge

Area

Bases: TypedDict

A TypedDict representing an area of evaluation.

Source code in src/distilabel/tasks/preference/ultrajudge.py
class Area(TypedDict):
    """A `TypedDict` representing an area of evaluation."""

    rating: float
    rationale: str
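
Since `Area` is a plain `TypedDict`, an instance is just a dictionary with the expected keys; a minimal sketch:

>>> area: Area = {"rating": 8.5, "rationale": "Accurate and clearly explained."}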

UltraJudgeOutput

Bases: TypedDict

A TypedDict representing the output of the UltraJudge task.

Source code in src/distilabel/tasks/preference/ultrajudge.py
class UltraJudgeOutput(TypedDict):
    """A `TypedDict` representing the output of the UltraJudge task."""

    rating: float
    areas: Dict[str, Area]
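
An `UltraJudgeOutput` nests one `Area` per evaluation area under the overall rating; a minimal sketch of the expected shape:

>>> output: UltraJudgeOutput = {
...     "rating": 8.0,
...     "areas": {
...         "Practical Accuracy": {"rating": 8.5, "rationale": "Accurate and clearly explained."},
...     },
... }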

UltraJudgeTask dataclass

Bases: PreferenceTask

A PreferenceTask for the UltraJudge task. UltraJudge was defined at Argilla specifically to improve evaluation with AI Feedback. It builds on both UltraFeedback and JudgeLM, with several improvements and modifications.

Parameters:

system_prompt (str)

    the system prompt to be used for generation. Defaults to the following evaluator prompt:

    "You are an evaluator tasked with assessing AI assistants' responses from the perspective of typical user preferences. Your critical analysis should focus on human-like engagement, solution effectiveness, accuracy, clarity, and creativity. Approach each response as if you were the user, considering how well the response meets your needs and expectations in a real-world scenario. Provide detailed feedback that highlights strengths and areas for improvement in each response, keeping in mind the goal of simulating a human's preferred choice. Your evaluation should be impartial and thorough, reflecting a human's perspective in preferring responses that are practical, clear, authentic, and aligned with their intent. Avoid bias, and focus on the content and quality of the responses."

task_description (Union[str, None])

    the description of the task. Defaults to the following template:

    "Your task is to rigorously evaluate the performance of {num_responses} AI assistants, simulating a human's perspective. You will assess each response based on four key domains, reflecting aspects that are typically valued by humans: {areas}. First provide a score between 0 and 10 and write a detailed feedback for each area and assistant. Finally, provide a list of {num_responses} scores, each separated by a space, to reflect the performance of Assistants 1 to {num_responses}."

areas (List[str])

    the areas to be used for the task. Defaults to a list of four areas: "Practical Accuracy", "Clarity & Transparency", "Authenticity & Reliability", and "Compliance with Intent".
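
For instance, the task can be instantiated with its defaults, or with a custom subset of areas (a minimal sketch; the reduced area list below is illustrative):

>>> from distilabel.tasks.preference import UltraJudgeTask
>>> task = UltraJudgeTask()  # default system prompt, task description, and areas
>>> custom_task = UltraJudgeTask(areas=["Practical Accuracy", "Clarity & Transparency"])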
Source code in src/distilabel/tasks/preference/ultrajudge.py
@dataclass
class UltraJudgeTask(PreferenceTask):
    """A `PreferenceTask` for the UltraJudge task. The `UltraJudge` task has been defined
    at Argilla specifically for a better evaluation using AI Feedback. The task is defined
    based on both UltraFeedback and JudgeLM, but with several improvements / modifications.

    Args:
        system_prompt (str, optional): the system prompt to be used for generation.
            Defaults to the UltraJudge evaluator prompt defined below.
        task_description (Union[str, None], optional): the description of the task.
            Defaults to the UltraJudge task description template defined below.
        areas (List[str], optional): the areas to be used for the task. Defaults to a list of four areas:
            "Practical Accuracy", "Clarity & Transparency", "Authenticity & Reliability", and "Compliance with Intent".
    """

    system_prompt: str = (
        "You are an evaluator tasked with assessing AI assistants' responses from the perspective of typical user preferences."
        " Your critical analysis should focus on human-like engagement, solution effectiveness, accuracy, clarity, and"
        " creativity. Approach each response as if you were the user, considering how well the response meets your needs"
        " and expectations in a real-world scenario. Provide detailed feedback that highlights strengths and areas for"
        " improvement in each response, keeping in mind the goal of simulating a human's preferred choice. "
        "Your evaluation should be impartial and thorough, reflecting a human's perspective in preferring responses that are practical,"
        " clear, authentic, and aligned with their intent. Avoid bias, and focus on the content and quality of the responses."
    )

    task_description: str = (
        "Your task is to rigorously evaluate the performance of {num_responses} AI assistants, simulating a human's perspective."
        " You will assess each response based on four key domains, reflecting aspects that are typically valued by humans:"
        " {areas}."
        " First provide a score between 0 and 10 and write a detailed feedback for each area and assistant."
        " Finally, provide a list of {num_responses} scores, each separated by a space, to reflect the performance of Assistants 1 to {num_responses}."
    )

    areas: List[str] = field(
        default_factory=lambda: [
            "Practical Accuracy",
            "Clarity & Transparency",
            "Authenticity & Reliability",
            "Compliance with Intent",
        ]
    )

    __jinja2_template__: ClassVar[str] = field(
        default=_ULTRAJUDGE_TEMPLATE, init=False, repr=False
    )

    @property
    def output_args_names(self) -> List[str]:
        """Returns the names of the output arguments of the task."""
        return ["rating", "areas"]

    @property
    def areas_str(self) -> str:
        """Returns a string representation of the areas."""
        return ", ".join(self.areas[:-1]) + ", and " + self.areas[-1]

    @property
    def extract_area_score_and_rationale_regex(self) -> str:
        """Returns a regex to extract the area, score, and rationale from the output."""
        return rf"({'|'.join(self.areas)})\s*-\s*(\d+(?:\.\d+)?)\n(.*?)(?=\n\n|\Z)"

    @property
    def extract_final_scores_regex(self) -> str:
        """Returns a regex to extract the final scores from the output."""
        return r"Final scores:\s*((?:\d+(?:\.\d+)?\s*)+)"

    def generate_prompt(self, input: str, generations: List[str], **_: Any) -> Prompt:
        """Generates a prompt following the UltraJudge specification.

        Args:
            input (str): the input to be used for the prompt.
            generations (List[str]): the generations to be used for the prompt.

        Returns:
            Prompt: the generated prompt.

        Examples:
            >>> from distilabel.tasks.preference import UltraJudgeTask
            >>> task = UltraJudgeTask(system_prompt="You are a helpful assistant.")
            >>> task.generate_prompt("What are the first 5 Fibonacci numbers?", ["0 1 1 2 3", "0 1 1 2 3"])
            Prompt(
                system_prompt="You are a helpful assistant.",
                formatted_prompt="Your task is to rigorously evaluate the performance of ...",
            )
        """
        render_kwargs = {
            "task_description": self.task_description.format(
                num_responses=len(generations), areas=self.areas_str
            ),
            "instruction": input,
            "responses": generations,
        }

        return Prompt(
            system_prompt=self.system_prompt,
            formatted_prompt=self.template.render(**render_kwargs),
        )

    def parse_output(self, output: str) -> List[UltraJudgeOutput]:
        """Parses the output of the model into the desired format."""
        num_areas = len(self.areas)
        # `areas_results` includes num_generations * num_areas tuples
        areas_results = re.findall(self.extract_area_score_and_rationale_regex, output)
        final_scores = [
            float(str_score)
            for str_score in re.findall(self.extract_final_scores_regex, output)[
                0
            ].split(" ")
        ]

        outputs = []
        for i, rating in enumerate(final_scores):
            areas = {}
            # Get the areas for the i-th generation
            for area in areas_results[i * num_areas : i * num_areas + num_areas]:
                name, area_rating, rationale = area
                # `area_rating` is captured as a string by the regex, so cast it to
                # `float` to match the `Area.rating` annotation
                areas[name] = Area(rating=float(area_rating), rationale=rationale)
            outputs.append(UltraJudgeOutput(rating=rating, areas=areas))

        return outputs

    def _merge_rationales(
        self, rationales: List[Dict[str, Any]], generations_column: str = "generations"
    ) -> str:
        """Overwrite of the `_merge_rationales` as we need to process the areas before merging."""

        def format_area(area: Dict[str, Any]) -> str:
            sections = []
            for title, ratings in area.items():
                sections.append(title)
                for k, v in ratings.items():
                    sections.append(f"{k}:{v}")
            return "\n".join(sections)

        merged_rationales = []
        for idx, area in enumerate(rationales, start=1):
            merged_rationales.append(
                f"{generations_column}-{idx}:\n{format_area(area)}\n"
            )
        return "\n".join(merged_rationales)

areas_str: str property

Returns a string representation of the areas.
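
With the default areas, this yields:

>>> UltraJudgeTask().areas_str
'Practical Accuracy, Clarity & Transparency, Authenticity & Reliability, and Compliance with Intent'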

extract_area_score_and_rationale_regex: str property

Returns a regex to extract the area, score, and rationale from the output.
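
A minimal sketch of how this regex is applied (the sample text is illustrative):

>>> import re
>>> sample = "Practical Accuracy - 8.5\nAccurate and clearly explained.\n\n"
>>> re.findall(UltraJudgeTask().extract_area_score_and_rationale_regex, sample)
[('Practical Accuracy', '8.5', 'Accurate and clearly explained.')]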

extract_final_scores_regex: str property

Returns a regex to extract the final scores from the output.
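
The regex captures the whole run of scores as a single group, which `parse_output` then splits on spaces. A minimal sketch:

>>> import re
>>> re.findall(UltraJudgeTask().extract_final_scores_regex, "Final scores: 9 7")
['9 7']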

output_args_names: List[str] property

Returns the names of the output arguments of the task.

generate_prompt(input, generations, **_)

Generates a prompt following the UltraJudge specification.

Parameters:

input (str)

    the input to be used for the prompt. Required.

generations (List[str])

    the generations to be used for the prompt. Required.

Returns:

Prompt

    the generated prompt.

Examples:

>>> from distilabel.tasks.preference import UltraJudgeTask
>>> task = UltraJudgeTask(system_prompt="You are a helpful assistant.")
>>> task.generate_prompt("What are the first 5 Fibonacci numbers?", ["0 1 1 2 3", "0 1 1 2 3"])
Prompt(
    system_prompt="You are a helpful assistant.",
    formatted_prompt="Your task is to rigorously evaluate the performance of ...",
)
Source code in src/distilabel/tasks/preference/ultrajudge.py
def generate_prompt(self, input: str, generations: List[str], **_: Any) -> Prompt:
    """Generates a prompt following the UltraJudge specification.

    Args:
        input (str): the input to be used for the prompt.
        generations (List[str]): the generations to be used for the prompt.

    Returns:
        Prompt: the generated prompt.

    Examples:
        >>> from distilabel.tasks.preference import UltraJudgeTask
        >>> task = UltraJudgeTask(system_prompt="You are a helpful assistant.")
        >>> task.generate_prompt("What are the first 5 Fibonacci numbers?", ["0 1 1 2 3", "0 1 1 2 3"])
        Prompt(
            system_prompt="You are a helpful assistant.",
            formatted_prompt="Your task is to rigorously evaluate the performance of ...",
        )
    """
    render_kwargs = {
        "task_description": self.task_description.format(
            num_responses=len(generations), areas=self.areas_str
        ),
        "instruction": input,
        "responses": generations,
    }

    return Prompt(
        system_prompt=self.system_prompt,
        formatted_prompt=self.template.render(**render_kwargs),
    )

parse_output(output)

Parses the output of the model into the desired format.

Source code in src/distilabel/tasks/preference/ultrajudge.py
def parse_output(self, output: str) -> List[UltraJudgeOutput]:
    """Parses the output of the model into the desired format."""
    num_areas = len(self.areas)
    # `areas_results` includes num_generations * num_areas tuples
    areas_results = re.findall(self.extract_area_score_and_rationale_regex, output)
    final_scores = [
        float(str_score)
        for str_score in re.findall(self.extract_final_scores_regex, output)[
            0
        ].split(" ")
    ]

    outputs = []
    for i, rating in enumerate(final_scores):
        areas = {}
        # Get the areas for the i-th generation
        for area in areas_results[i * num_areas : i * num_areas + num_areas]:
            name, area_rating, rationale = area
            # `area_rating` is captured as a string by the regex, so cast it to
            # `float` to match the `Area.rating` annotation
            areas[name] = Area(rating=float(area_rating), rationale=rationale)
        outputs.append(UltraJudgeOutput(rating=rating, areas=areas))

    return outputs
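
A minimal end-to-end sketch of parsing, assuming a single evaluation area so the sample stays short (the raw text below is illustrative, not real model output):

>>> task = UltraJudgeTask(areas=["Practical Accuracy"])
>>> raw = (
...     "Practical Accuracy - 9\n"
...     "Accurate and well explained.\n"
...     "\n"
...     "Practical Accuracy - 7\n"
...     "Mostly correct but terse.\n"
...     "\n"
...     "Final scores: 9 7"
... )
>>> task.parse_output(raw)  # returns one `UltraJudgeOutput` per assistant
[{'rating': 9.0, 'areas': {'Practical Accuracy': {'rating': 9.0, 'rationale': 'Accurate and well explained.'}}}, {'rating': 7.0, 'areas': {'Practical Accuracy': {'rating': 7.0, 'rationale': 'Mostly correct but terse.'}}}]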