Skip to content

judgelm

JudgeLMOutput

Bases: TypedDict

A TypedDict matching the output format of JudgeLM.

Source code in src/distilabel/tasks/preference/judgelm.py
class JudgeLMOutput(TypedDict):
    """A `TypedDict` matching the output format of JudgeLM.

    Produced by `JudgeLMTask.parse_output`: the first line of the model
    completion is parsed into `rating` and the remaining lines into
    `rationale`.
    """

    # One float score per judged assistant response, in presentation order.
    rating: List[float]
    # Free-text explanation that follows the scores line.
    rationale: str

JudgeLMTask dataclass

Bases: PreferenceTask

A PreferenceTask following the prompt template used by JudgeLM.

Parameters:

Name Type Description Default
system_prompt str

the system prompt to be used for generation. Defaults to the value shown below.

'You are a helpful and precise assistant for checking the quality of the answer.'
task_description Union[str, None]

the description of the task. Defaults to the value shown below.

'We would like to request your feedback on the performance of {num_responses} AI assistants in response to the user question displayed above.\nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only {num_responses} values indicating the scores for Assistants 1 to {num_responses}, respectively. The {num_responses} scores are separated by a space. In the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment.'
Source code in src/distilabel/tasks/preference/judgelm.py
@dataclass
class JudgeLMTask(PreferenceTask):
    """A `PreferenceTask` following the prompt template used by JudgeLM.

    Args:
        system_prompt (str, optional): the system prompt to be used for generation.
            Defaults to a prompt asking for a helpful and precise quality check.
        task_description (Union[str, None], optional): the description of the task.
            Defaults to the JudgeLM rating instructions; the `{num_responses}`
            placeholders are filled in at prompt-generation time.
    """

    task_description: str = (
        "We would like to request your feedback on the performance of {num_responses} AI assistants in response to the"
        " user question displayed above.\nPlease rate the helpfulness, relevance, accuracy, level of details"
        " of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher"
        " score indicates better overall performance.\nPlease first output a single line containing only {num_responses}"
        " values indicating the scores for Assistants 1 to {num_responses}, respectively. The {num_responses} scores are separated by"
        " a space. In the subsequent line, please provide a comprehensive explanation of your evaluation,"
        " avoiding any potential bias and ensuring that the order in which the responses were presented does"
        " not affect your judgment."
    )
    system_prompt: str = "You are a helpful and precise assistant for checking the quality of the answer."

    __jinja2_template__: ClassVar[str] = _JUDGELM_TEMPLATE
    __type__: Literal["labelling"] = "labelling"

    def generate_prompt(self, input: str, generations: List[str]) -> Prompt:
        """Generates a prompt following the JudgeLM specification.

        Args:
            input (str): the input to be used for the prompt.
            generations (List[str]): the generations to be used for the prompt.

        Returns:
            Prompt: the generated prompt.

        Examples:
            >>> from distilabel.tasks.preference import JudgeLMTask
            >>> task = JudgeLMTask(system_prompt="You are a helpful assistant.")
            >>> task.generate_prompt("What are the first 5 Fibonacci numbers?", ["0 1 1 2 3", "0 1 1 2 3"])
            Prompt(
                system_prompt="You are a helpful assistant.",
                formatted_prompt="[Question]\nWhat are the first 5 Fibonacci numbers?\n...",
            )
        """
        render_kwargs = {
            "input": input,
            "responses": generations,
            # The task description is parameterized on how many generations
            # are being compared in this prompt.
            "task_description": self.task_description.format(
                num_responses=len(generations)
            ),
        }
        return Prompt(
            system_prompt=self.system_prompt,
            formatted_prompt=self.template.render(**render_kwargs),
        )

    def parse_output(self, output: str) -> JudgeLMOutput:
        """Parses the output of the model into the desired format.

        Args:
            output (str): the raw model completion; the first line holds the
                space-separated scores and the remaining lines the rationale.

        Returns:
            JudgeLMOutput: the parsed ratings and rationale.
        """
        scores_line, _, rationale = output.partition("\n")
        # `str.split()` with no separator tolerates repeated or trailing
        # whitespace, which `split(" ")` would turn into empty strings and
        # make `float("")` raise a ValueError.
        rating = [float(score) for score in scores_line.split()]
        return JudgeLMOutput(rating=rating, rationale=rationale)

generate_prompt(input, generations)

Generates a prompt following the JudgeLM specification.

    Args:
        input (str): the input to be used for the prompt.
        generations (List[str]): the generations to be used for the prompt.

    Returns:
        Prompt: the generated prompt.

    Examples:
        >>> from distilabel.tasks.preference import JudgeLMTask
        >>> task = JudgeLMTask(system_prompt="You are a helpful assistant.")
        >>> task.generate_prompt("What are the first 5 Fibonacci numbers?", ["0 1 1 2 3", "0 1 1 2 3"])
        Prompt(
            system_prompt="You are a helpful assistant.",
            formatted_prompt="[Question]

What are the first 5 Fibonacci numbers? ...", )

Source code in src/distilabel/tasks/preference/judgelm.py
def generate_prompt(self, input: str, generations: List[str]) -> Prompt:
    """Builds a JudgeLM-style prompt comparing several generations.

    Args:
        input (str): the question/instruction the generations answer.
        generations (List[str]): the candidate responses to be judged.

    Returns:
        Prompt: the system prompt plus the rendered JudgeLM template.

    Examples:
        >>> from distilabel.tasks.preference import JudgeLMTask
        >>> task = JudgeLMTask(system_prompt="You are a helpful assistant.")
        >>> task.generate_prompt("What are the first 5 Fibonacci numbers?", ["0 1 1 2 3", "0 1 1 2 3"])
        Prompt(
            system_prompt="You are a helpful assistant.",
            formatted_prompt="[Question]\nWhat are the first 5 Fibonacci numbers?\n...",
        )
    """
    # The task description template is parameterized on the number of
    # responses being compared.
    task_description = self.task_description.format(num_responses=len(generations))
    formatted = self.template.render(
        input=input,
        responses=generations,
        task_description=task_description,
    )
    return Prompt(system_prompt=self.system_prompt, formatted_prompt=formatted)

parse_output(output)

Parses the output of the model into the desired format.

Source code in src/distilabel/tasks/preference/judgelm.py
def parse_output(self, output: str) -> JudgeLMOutput:
    """Parses the output of the model into the desired format."""
    # First line carries the space-separated scores; everything after it
    # is the rationale.
    lines = output.split("\n")
    scores_line, rationale_lines = lines[0], lines[1:]
    return JudgeLMOutput(
        rating=[float(score) for score in scores_line.split(" ")],
        rationale="\n".join(rationale_lines),
    )