tasks

JudgeLMTask dataclass

Bases: PreferenceTask

A PreferenceTask following the prompt template used by JudgeLM.

Parameters:

Name Type Description Default
system_prompt str

the system prompt to be used for generation. Defaults to None.

'You are a helpful and precise assistant for checking the quality of the answer.'
task_description Union[str, None]

the description of the task. Defaults to None.

'We would like to request your feedback on the performance of {num_responses} AI assistants in response to the user question displayed above.\nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only {num_responses} values indicating the scores for Assistants 1 to {num_responses}, respectively. The {num_responses} scores are separated by a space. In the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment.'
Source code in src/distilabel/tasks/preference/judgelm.py
@dataclass
class JudgeLMTask(PreferenceTask):
    """A `PreferenceTask` following the prompt templated used by JudgeLM.

    Args:
        system_prompt (str, optional): the system prompt to be used for generation. Defaults to `None`.
        task_description (Union[str, None], optional): the description of the task. Defaults to `None`.
    """

    __jinja2_template__: ClassVar[str] = _JUDGELM_TEMPLATE

    task_description: str = (
        "We would like to request your feedback on the performance of {num_responses} AI assistants in response to the"
        " user question displayed above.\nPlease rate the helpfulness, relevance, accuracy, level of details"
        " of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher"
        " score indicates better overall performance.\nPlease first output a single line containing only {num_responses}"
        " values indicating the scores for Assistants 1 to {num_responses}, respectively. The {num_responses} scores are separated by"
        " a space. In the subsequent line, please provide a comprehensive explanation of your evaluation,"
        " avoiding any potential bias and ensuring that the order in which the responses were presented does"
        " not affect your judgment."
    )
    system_prompt: str = "You are a helpful and precise assistant for checking the quality of the answer."

    def generate_prompt(self, input: str, generations: List[str]) -> Prompt:
        """Generates a prompt following the JudgeLM specification.

        Args:
            input (str): the input to be used for the prompt.
            generations (List[str]): the generations to be used for the prompt.

        Returns:
            Prompt: the generated prompt.

        Examples:
            >>> from distilabel.tasks.preference import JudgeLMTask
            >>> task = JudgeLMTask(system_prompt="You are a helpful assistant.")
            >>> task.generate_prompt("What are the first 5 Fibonacci numbers?", ["0 1 1 2 3", "0 1 1 2 3"])
            Prompt(
                system_prompt="You are a helpful assistant.",
                formatted_prompt="[Question]\nWhat are the first 5 Fibonacci numbers?\n...",
            )
        """
        render_kwargs = {
            "input": input,
            "responses": generations,
            "task_description": self.task_description.format(
                num_responses=len(generations)
            ),
        }
        return Prompt(
            system_prompt=self.system_prompt,
            formatted_prompt=self.template.render(**render_kwargs),
        )

    def parse_output(self, output: str) -> JudgeLMOutput:
        """Parses the output of the model into the desired format."""
        split_output = output.split("\n")
        rating = [float(rating) for rating in split_output[0].split(" ")]
        rationale = "\n".join(split_output[1:])
        return JudgeLMOutput(rating=rating, rationale=rationale)

generate_prompt(input, generations)

Generates a prompt following the JudgeLM specification.

    Args:
        input (str): the input to be used for the prompt.
        generations (List[str]): the generations to be used for the prompt.

    Returns:
        Prompt: the generated prompt.

    Examples:
        >>> from distilabel.tasks.preference import JudgeLMTask
        >>> task = JudgeLMTask(system_prompt="You are a helpful assistant.")
        >>> task.generate_prompt("What are the first 5 Fibonacci numbers?", ["0 1 1 2 3", "0 1 1 2 3"])
        Prompt(
            system_prompt="You are a helpful assistant.",
            formatted_prompt="[Question]

What are the first 5 Fibonacci numbers? ...", )

Source code in src/distilabel/tasks/preference/judgelm.py
def generate_prompt(self, input: str, generations: List[str]) -> Prompt:
    """Generates a prompt following the JudgeLM specification.

    Args:
        input (str): the input to be used for the prompt.
        generations (List[str]): the generations to be used for the prompt.

    Returns:
        Prompt: the generated prompt.

    Examples:
        >>> from distilabel.tasks.preference import JudgeLMTask
        >>> task = JudgeLMTask(system_prompt="You are a helpful assistant.")
        >>> task.generate_prompt("What are the first 5 Fibonacci numbers?", ["0 1 1 2 3", "0 1 1 2 3"])
        Prompt(
            system_prompt="You are a helpful assistant.",
            formatted_prompt="[Question]\nWhat are the first 5 Fibonacci numbers?\n...",
        )
    """
    render_kwargs = {
        "input": input,
        "responses": generations,
        "task_description": self.task_description.format(
            num_responses=len(generations)
        ),
    }
    return Prompt(
        system_prompt=self.system_prompt,
        formatted_prompt=self.template.render(**render_kwargs),
    )

parse_output(output)

Parses the output of the model into the desired format.

Source code in src/distilabel/tasks/preference/judgelm.py
def parse_output(self, output: str) -> JudgeLMOutput:
    """Parses the output of the model into the desired format."""
    split_output = output.split("\n")
    rating = [float(rating) for rating in split_output[0].split(" ")]
    rationale = "\n".join(split_output[1:])
    return JudgeLMOutput(rating=rating, rationale=rationale)
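
For illustration, a minimal usage sketch of parse_output. The model completion below is hypothetical, following the JudgeLM convention of scores on the first line and the rationale on the remaining lines; the parsed result is the JudgeLMOutput shown above.

from distilabel.tasks.preference import JudgeLMTask

task = JudgeLMTask()

# Hypothetical completion: one score per assistant on the first line,
# followed by the rationale on the remaining lines.
output = "9 7\nAssistant 1 listed all five numbers, while Assistant 2 omitted one."
parsed = task.parse_output(output)
# parsed holds rating=[9.0, 7.0] and the rationale text shown above.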

Llama2TextGenerationTask

Bases: TextGenerationTask

A TextGenerationTask for the Llama2 model.

Parameters:

Name Type Description Default
system_prompt str

the system prompt to be used. Defaults to None.

required
principles Dict[str, List[str]]

the principles to be used for the system prompt. Defaults to None.

required
principles_distribution Union[Dict[str, float], Literal[balanced], None]

the distribution of principles to be used for the system prompt. Defaults to None.

required
Source code in src/distilabel/tasks/text_generation/llama.py
class Llama2TextGenerationTask(TextGenerationTask):
    """A `TextGenerationTask` for the Llama2 model.

    Args:
        system_prompt (str, optional): the system prompt to be used. Defaults to `None`.
        principles (Dict[str, List[str]], optional): the principles to be used for the system prompt.
            Defaults to `None`.
        principles_distribution (Union[Dict[str, float], Literal["balanced"], None], optional): the
            distribution of principles to be used for the system prompt. Defaults to `None`.
    """

    def generate_prompt(self, input: str) -> str:
        """Generates a prompt for the Llama2 model.

        Args:
            input (str): the input to be used for the prompt.

        Returns:
            str: the generated prompt.

        Examples:
            >>> from distilabel.tasks.text_generation import Llama2TextGenerationTask
            >>> task = Llama2TextGenerationTask(system_prompt="You are a helpful assistant.")
            >>> task.generate_prompt("What are the first 5 Fibonacci numbers?")
            '<s>[INST] <<SYS>>\nYou are a helpful assistant.<</SYS>>\n\nWhat are the first 5 Fibonacci numbers? [/INST]'
        """
        return Prompt(
            system_prompt=self.system_prompt,
            formatted_prompt=input,
        ).format_as("llama2")  # type: ignore

generate_prompt(input)

Generates a prompt for the Llama2 model.

    Args:
        input (str): the input to be used for the prompt.

    Returns:
        str: the generated prompt.

    Examples:
        >>> from distilabel.tasks.text_generation import Llama2TextGenerationTask
        >>> task = Llama2TextGenerationTask(system_prompt="You are a helpful assistant.")
        >>> task.generate_prompt("What are the first 5 Fibonacci numbers?")
        '<s>[INST] <<SYS>>\nYou are a helpful assistant.<</SYS>>\n\nWhat are the first 5 Fibonacci numbers? [/INST]'

Source code in src/distilabel/tasks/text_generation/llama.py
def generate_prompt(self, input: str) -> str:
    """Generates a prompt for the Llama2 model.

    Args:
        input (str): the input to be used for the prompt.

    Returns:
        str: the generated prompt.

    Examples:
        >>> from distilabel.tasks.text_generation import Llama2TextGenerationTask
        >>> task = Llama2TextGenerationTask(system_prompt="You are a helpful assistant.")
        >>> task.generate_prompt("What are the first 5 Fibonacci numbers?")
        '<s>[INST] <<SYS>>\nYou are a helpful assistant.<</SYS>>\n\nWhat are the first 5 Fibonacci numbers? [/INST]'
    """
    return Prompt(
        system_prompt=self.system_prompt,
        formatted_prompt=input,
    ).format_as("llama2")  # type: ignore

OpenAITextGenerationTask

Bases: TextGenerationTask

A TextGenerationTask for any chat-completion OpenAI model.

Parameters:

Name Type Description Default
system_prompt str

the system prompt to be used. Defaults to None.

required
principles Dict[str, List[str]]

the principles to be used for the system prompt. Defaults to None.

required
principles_distribution Union[Dict[str, float], Literal[balanced], None]

the distribution of principles to be used for the system prompt. Defaults to None.

required
Source code in src/distilabel/tasks/text_generation/openai.py
class OpenAITextGenerationTask(TextGenerationTask):
    """A `TextGenerationTask` for any chat-completion OpenAI model.

    Args:
        system_prompt (str, optional): the system prompt to be used. Defaults to `None`.
        principles (Dict[str, List[str]], optional): the principles to be used for the system prompt.
            Defaults to `None`.
        principles_distribution (Union[Dict[str, float], Literal["balanced"], None], optional): the
            distribution of principles to be used for the system prompt. Defaults to `None`.
    """

    def generate_prompt(self, input: str) -> List["ChatCompletion"]:
        """Generates a prompt for any chat-completion OpenAI model.

        Args:
            input (str): the input to be used for the prompt.

        Returns:
            List[ChatCompletion]: the generated prompt.

        Examples:
            >>> from distilabel.tasks.text_generation import OpenAITextGenerationTask
            >>> task = OpenAITextGenerationTask(system_prompt="You are a helpful assistant.")
            >>> task.generate_prompt("What are the first 5 Fibonacci numbers?")
            [
                {'role': 'system', 'content': 'You are a helpful assistant.'},
                {'role': 'user', 'content': 'What are the first 5 Fibonacci numbers?'},
            ]
        """
        return Prompt(
            system_prompt=self.system_prompt,
            formatted_prompt=input,
        ).format_as("openai")  # type: ignore

generate_prompt(input)

Generates a prompt for any chat-completion OpenAI model.

Parameters:

Name Type Description Default
input str

the input to be used for the prompt.

required

Returns:

Type Description
List[ChatCompletion]

List[ChatCompletion]: the generated prompt.

Examples:

>>> from distilabel.tasks.text_generation import OpenAITextGenerationTask
>>> task = OpenAITextGenerationTask(system_prompt="You are a helpful assistant.")
>>> task.generate_prompt("What are the first 5 Fibonacci numbers?")
[
    {'role': 'system', 'content': 'You are a helpful assistant.'},
    {'role': 'user', 'content': 'What are the first 5 Fibonacci numbers?'},
]
Source code in src/distilabel/tasks/text_generation/openai.py
def generate_prompt(self, input: str) -> List["ChatCompletion"]:
    """Generates a prompt for any chat-completion OpenAI model.

    Args:
        input (str): the input to be used for the prompt.

    Returns:
        List[ChatCompletion]: the generated prompt.

    Examples:
        >>> from distilabel.tasks.text_generation import OpenAITextGenerationTask
        >>> task = OpenAITextGenerationTask(system_prompt="You are a helpful assistant.")
        >>> task.generate_prompt("What are the first 5 Fibonacci numbers?")
        [
            {'role': 'system', 'content': 'You are a helpful assistant.'},
            {'role': 'user', 'content': 'What are the first 5 Fibonacci numbers?'},
        ]
    """
    return Prompt(
        system_prompt=self.system_prompt,
        formatted_prompt=input,
    ).format_as("openai")  # type: ignore

Prompt dataclass

A dataclass representing a Prompt.

Parameters:

Name Type Description Default
system_prompt str

the system prompt.

required
formatted_prompt str

the formatted prompt.

required

Examples:

>>> from distilabel.tasks.prompt import Prompt
>>> prompt = Prompt(
...     system_prompt="You are a helpful assistant.",
...     formatted_prompt="What are the first 5 Fibonacci numbers?",
... )
Source code in src/distilabel/tasks/prompt.py
@dataclass
class Prompt:
    """A `dataclass` representing a `Prompt`.

    Args:
        system_prompt (str): the system prompt.
        formatted_prompt (str): the formatted prompt.

    Examples:
        >>> from distilabel.tasks.prompt import Prompt
        >>> prompt = Prompt(
        ...     system_prompt="You are a helpful assistant.",
        ...     formatted_prompt="What are the first 5 Fibonacci numbers?",
        ... )
    """

    system_prompt: str
    formatted_prompt: str

    def format_as(self, format: SupportedFormats) -> Union[str, List[ChatCompletion]]:
        """Formats the prompt as the specified format.

        Args:
            format (SupportedFormats): the format to be used for the prompt. Available formats are
                `default`, `openai`, `llama2`, `chatml`, and `zephyr`.

        Returns:
            Union[str, List[ChatCompletion]]: the formatted prompt.

        Raises:
            ValueError: if the specified format is not supported.

        Examples:
            >>> from distilabel.tasks.prompt import Prompt
            >>> prompt = Prompt(
            ...     system_prompt="You are a helpful assistant.",
            ...     formatted_prompt="What are the first 5 Fibonacci numbers?",
            ... )
            >>> prompt.format_as("default")
            'You are a helpful assistant.\nWhat are the first 5 Fibonacci numbers?'
        """
        if format == "default":
            return f"{self.system_prompt}\n{self.formatted_prompt}"
        elif format == "openai":
            return [
                ChatCompletion(
                    role="system",
                    content=self.system_prompt,
                ),
                ChatCompletion(role="user", content=self.formatted_prompt),
            ]
        elif format == "llama2":
            return f"<s>[INST] <<SYS>>\n{self.system_prompt}<</SYS>>\n\n{self.formatted_prompt} [/INST]"
        elif format == "chatml":
            return f"<|im_start|>system\n{self.system_prompt}<|im_end|>\n<|im_start|>user\n{self.formatted_prompt}<|im_end|>\n<|im_start|>assistant\n"
        elif format == "zephyr":
            return f"<|system|>\n{self.system_prompt}</s>\n<|user|>\n{self.formatted_prompt}</s>\n<|assistant|>\n"
        else:
            raise ValueError(
                f"Format {format} not supported, please provide a custom `prompt_formatting_fn`"
                " or use any of the available formats: openai, llama2, chatml, zephyr"
            )

format_as(format)

Formats the prompt as the specified format.

    Args:
        format (SupportedFormats): the format to be used for the prompt. Available formats are
            `default`, `openai`, `llama2`, `chatml`, and `zephyr`.

    Returns:
        Union[str, List[ChatCompletion]]: the formatted prompt.

    Raises:
        ValueError: if the specified format is not supported.

    Examples:
        >>> from distilabel.tasks.prompt import Prompt
        >>> prompt = Prompt(
        ...     system_prompt="You are a helpful assistant.",
        ...     formatted_prompt="What are the first 5 Fibonacci numbers?",
        ... )
        >>> prompt.format_as("default")
        'You are a helpful assistant.\nWhat are the first 5 Fibonacci numbers?'

Source code in src/distilabel/tasks/prompt.py
def format_as(self, format: SupportedFormats) -> Union[str, List[ChatCompletion]]:
    """Formats the prompt as the specified format.

    Args:
        format (SupportedFormats): the format to be used for the prompt. Available formats are
            `default`, `openai`, `llama2`, `chatml`, and `zephyr`.

    Returns:
        Union[str, List[ChatCompletion]]: the formatted prompt.

    Raises:
        ValueError: if the specified format is not supported.

    Examples:
        >>> from distilabel.tasks.prompt import Prompt
        >>> prompt = Prompt(
        ...     system_prompt="You are a helpful assistant.",
        ...     formatted_prompt="What are the first 5 Fibonacci numbers?",
        ... )
        >>> prompt.format_as("default")
        'You are a helpful assistant.\nWhat are the first 5 Fibonacci numbers?'
    """
    if format == "default":
        return f"{self.system_prompt}\n{self.formatted_prompt}"
    elif format == "openai":
        return [
            ChatCompletion(
                role="system",
                content=self.system_prompt,
            ),
            ChatCompletion(role="user", content=self.formatted_prompt),
        ]
    elif format == "llama2":
        return f"<s>[INST] <<SYS>>\n{self.system_prompt}<</SYS>>\n\n{self.formatted_prompt} [/INST]"
    elif format == "chatml":
        return f"<|im_start|>system\n{self.system_prompt}<|im_end|>\n<|im_start|>user\n{self.formatted_prompt}<|im_end|>\n<|im_start|>assistant\n"
    elif format == "zephyr":
        return f"<|system|>\n{self.system_prompt}</s>\n<|user|>\n{self.formatted_prompt}</s>\n<|assistant|>\n"
    else:
        raise ValueError(
            f"Format {format} not supported, please provide a custom `prompt_formatting_fn`"
            " or use any of the available formats: openai, llama2, chatml, zephyr"
        )
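
As a quick illustration of the other supported formats (the outputs follow directly from the branches in format_as above):

from distilabel.tasks.prompt import Prompt

prompt = Prompt(
    system_prompt="You are a helpful assistant.",
    formatted_prompt="What are the first 5 Fibonacci numbers?",
)

# ChatML wraps each turn in <|im_start|>/<|im_end|> markers and leaves the
# assistant turn open for generation.
chatml = prompt.format_as("chatml")

# The "openai" format returns a list of chat messages rather than a single string.
messages = prompt.format_as("openai")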

SelfInstructTask dataclass

Bases: TextGenerationTask

A TextGenerationTask following the Self-Instruct specification for building the prompts.

Reference: https://github.com/yizhongw/self-instruct

Parameters:

Name Type Description Default
system_prompt str

the system prompt to be used. Defaults to None.

'You are an expert prompt writer, writing the best and most diverse prompts for a variety of tasks.You are given a task description and a set of instructions for how to write the prompts for a specific AI application.'
principles Dict[str, List[str]]

the principles to be used for the system prompt. Defaults to None.

field(default_factory=lambda : {'harmlessness': harmlessness, 'helpfulness': helpfulness, 'truthfulness': truthfulness, 'honesty': honesty, 'verbalized_calibration': verbalized_calibration}, repr=False)
principles_distribution Union[Dict[str, float], Literal[balanced], None]

the distribution of principles to be used for the system prompt. Defaults to None.

None
application_description str

the description of the AI application. Defaults to "AI assistant".

'AI assistant'
num_instructions int

the number of instructions to be used for the prompt. Defaults to 5.

5
Source code in src/distilabel/tasks/text_generation/self_instruct.py
@dataclass
class SelfInstructTask(TextGenerationTask):
    """A `TextGenerationTask` following the Self-Instruct specification for building
    the prompts.

    Reference: https://github.com/yizhongw/self-instruct

    Args:
        system_prompt (str, optional): the system prompt to be used. Defaults to `None`.
        principles (Dict[str, List[str]], optional): the principles to be used for the system prompt.
            Defaults to `None`.
        principles_distribution (Union[Dict[str, float], Literal["balanced"], None], optional): the
            distribution of principles to be used for the system prompt. Defaults to `None`.
        application_description (str, optional): the description of the AI application. Defaults to
            "AI assistant".
        num_instructions (int, optional): the number of instructions to be used for the prompt.
            Defaults to 5.
    """

    system_prompt: str = (
        "You are an expert prompt writer, writing the best and most diverse prompts for a variety of tasks."
        "You are given a task description and a set of instructions for how to write the prompts for a specific AI application."
    )
    application_description: str = "AI assistant"
    num_instructions: int = 5

    __jinja2_template__: str = _SELF_INSTRUCT_TEMPLATE

    def generate_prompt(self, input: str) -> Prompt:
        """Generates a prompt following the Self-Instruct specification.

        Args:
            input (str): the input to be used for the prompt.

        Returns:
            Prompt: the generated prompt.

        Examples:
            >>> from distilabel.tasks.text_generation import SelfInstructTask
            >>> task = SelfInstructTask(system_prompt="You are a helpful assistant.", num_instructions=2)
            >>> task.generate_prompt("What are the first 5 Fibonacci numbers?")
            Prompt(
                system_prompt="You are a helpful assistant.",
                formatted_prompt="# Task Description\nDevelop 2 user queries that ...",
            )
        """
        render_kwargs = {
            "application_description": self.application_description,
            "num_instructions": self.num_instructions,
            "input": input,
        }
        return Prompt(
            system_prompt=self.system_prompt,
            formatted_prompt=self.template.render(**render_kwargs),
        )

    def parse_output(self, output: str) -> Dict[str, List[str]]:
        """Parses the output of the model into the desired format."""
        return {"generations": output.split("\n")}

generate_prompt(input)

Generates a prompt following the Self-Instruct specification.

    Args:
        input (str): the input to be used for the prompt.

    Returns:
        Prompt: the generated prompt.

    Examples:
        >>> from distilabel.tasks.text_generation import SelfInstructTask
        >>> task = SelfInstructTask(system_prompt="You are a helpful assistant.", num_instructions=2)
        >>> task.generate_prompt("What are the first 5 Fibonacci numbers?")
        Prompt(
            system_prompt="You are a helpful assistant.",
            formatted_prompt="# Task Description

Develop 2 user queries that ...", )

Source code in src/distilabel/tasks/text_generation/self_instruct.py
def generate_prompt(self, input: str) -> Prompt:
    """Generates a prompt following the Self-Instruct specification.

    Args:
        input (str): the input to be used for the prompt.

    Returns:
        Prompt: the generated prompt.

    Examples:
        >>> from distilabel.tasks.text_generation import SelfInstructTask
        >>> task = SelfInstructTask(system_prompt="You are a helpful assistant.", num_instructions=2)
        >>> task.generate_prompt("What are the first 5 Fibonacci numbers?")
        Prompt(
            system_prompt="You are a helpful assistant.",
            formatted_prompt="# Task Description\nDevelop 2 user queries that ...",
        )
    """
    render_kwargs = {
        "application_description": self.application_description,
        "num_instructions": self.num_instructions,
        "input": input,
    }
    return Prompt(
        system_prompt=self.system_prompt,
        formatted_prompt=self.template.render(**render_kwargs),
    )

parse_output(output)

Parses the output of the model into the desired format.

Source code in src/distilabel/tasks/text_generation/self_instruct.py
def parse_output(self, output: str) -> Dict[str, List[str]]:
    """Parses the output of the model into the desired format."""
    return {"generations": output.split("\n")}

Task

Bases: ABC

Abstract class used to define the methods required to create a Task, to be used within an LLM.

Parameters:

Name Type Description Default
system_prompt str

the system prompt to be used for generation.

required
task_description Union[str, None]

the description of the task. Defaults to None.

required

Raises:

Type Description
ValueError

if the __jinja2_template__ attribute is not provided.

Source code in src/distilabel/tasks/base.py
class Task(ABC):
    """Abstract class used to define the methods required to create a `Task`, to be used
    within an `LLM`.

    Args:
        system_prompt (str): the system prompt to be used for generation.
        task_description (Union[str, None], optional): the description of the task. Defaults to `None`.

    Raises:
        ValueError: if the `__jinja2_template__` attribute is not provided.
    """

    system_prompt: str
    task_description: Union[str, None] = None

    __jinja2_template__: Union[str, None] = None

    def __rich_repr__(self) -> Generator[Any, None, None]:
        yield "system_prompt", self.system_prompt
        yield "task_description", self.task_description
        yield "input_args_names", self.input_args_names
        yield "output_args_names", self.output_args_names

    @property
    def template(self) -> "Template":
        if self.__jinja2_template__ is None:
            raise ValueError(
                "You must provide a `__jinja2_template__` attribute to your Task subclass."
            )

        return Template(open(self.__jinja2_template__).read())

    @abstractmethod
    def generate_prompt(self, **kwargs: Any) -> Union[Prompt, Any]:
        pass

    @abstractmethod
    def parse_output(self, output: str) -> Any:
        pass

    @property
    @abstractmethod
    def input_args_names(self) -> List[str]:
        pass

    @property
    @abstractmethod
    def output_args_names(self) -> List[str]:
        pass

    def validate_dataset(self, columns_in_dataset: List[str]) -> None:
        """Validates that the dataset contains the required columns for the task.

        Args:
            columns_in_dataset (List[str]): the columns in the dataset.

        Raises:
            KeyError: if the dataset does not contain the required columns.
        """
        for input_arg_name in self.input_args_names:
            if input_arg_name not in columns_in_dataset:
                raise KeyError(
                    f"LLM expects a column named '{input_arg_name}' in the provided"
                    " dataset, but it was not found."
                )

    def to_argilla_dataset(
        self, dataset_row: Dict[str, Any], *args: Any, **kwargs: Any
    ) -> "FeedbackDataset":
        raise NotImplementedError(
            "`to_argilla_dataset` is not implemented, if you want to export your dataset as an Argilla"
            " `FeedbackDataset` you will need to implement this method first."
        )

    def to_argilla_record(
        self, dataset_row: Dict[str, Any], *args: Any, **kwargs: Any
    ) -> "FeedbackRecord":
        raise NotImplementedError(
            "`to_argilla_record` is not implemented, if you want to export your dataset as an Argilla"
            " `FeedbackDataset` you will need to implement this method first."
        )

validate_dataset(columns_in_dataset)

Validates that the dataset contains the required columns for the task.

Parameters:

Name Type Description Default
columns_in_dataset List[str]

the columns in the dataset.

required

Raises:

Type Description
KeyError

if the dataset does not contain the required columns.

Source code in src/distilabel/tasks/base.py
def validate_dataset(self, columns_in_dataset: List[str]) -> None:
    """Validates that the dataset contains the required columns for the task.

    Args:
        columns_in_dataset (List[str]): the columns in the dataset.

    Raises:
        KeyError: if the dataset does not contain the required columns.
    """
    for input_arg_name in self.input_args_names:
        if input_arg_name not in columns_in_dataset:
            raise KeyError(
                f"LLM expects a column named '{input_arg_name}' in the provided"
                " dataset, but it was not found."
            )
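
To define a custom task, subclass Task and implement the abstract methods. The sketch below is a hypothetical minimal subclass (not part of distilabel), assuming the import paths implied by the source locations shown above.

from dataclasses import dataclass
from typing import Dict, List

from distilabel.tasks.base import Task
from distilabel.tasks.prompt import Prompt


@dataclass
class EchoTask(Task):
    """Hypothetical minimal task: wraps the input in a `Prompt` and returns the raw output."""

    system_prompt: str = "You are a helpful assistant."

    def generate_prompt(self, input: str) -> Prompt:
        return Prompt(system_prompt=self.system_prompt, formatted_prompt=input)

    def parse_output(self, output: str) -> Dict[str, str]:
        return {"output": output}

    @property
    def input_args_names(self) -> List[str]:
        return ["input"]

    @property
    def output_args_names(self) -> List[str]:
        return ["output"]

No __jinja2_template__ is needed here because generate_prompt never accesses self.template.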

TextGenerationTask dataclass

Bases: Task

A base Task definition for text generation using LLMs.

Parameters:

Name Type Description Default
system_prompt str

the system prompt to be used. Defaults to None.

"You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."
principles Dict[str, List[str]]

the principles to be used for the system prompt. Defaults to None.

field(default_factory=lambda : {'harmlessness': harmlessness, 'helpfulness': helpfulness, 'truthfulness': truthfulness, 'honesty': honesty, 'verbalized_calibration': verbalized_calibration}, repr=False)
principles_distribution Union[Dict[str, float], Literal['balanced'], None]

the distribution of principles to be used for the system prompt. Defaults to None.

None

Examples:

>>> from distilabel.tasks.text_generation import TextGenerationTask
>>> task = TextGenerationTask()
Source code in src/distilabel/tasks/text_generation/base.py
@dataclass
class TextGenerationTask(Task):
    """A base `Task` definition for text generation using LLMs.

    Args:
        system_prompt (str, optional): the system prompt to be used. Defaults to `None`.
        principles (Dict[str, List[str]], optional): the principles to be used for the system prompt.
            Defaults to `None`.
        principles_distribution (Union[Dict[str, float], Literal["balanced"], None], optional): the
            distribution of principles to be used for the system prompt. Defaults to `None`.

    Examples:
        >>> from distilabel.tasks.text_generation import TextGenerationTask
        >>> task = TextGenerationTask()
    """

    system_prompt: str = (
        "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible,"
        " while being safe. Your answers should not include any harmful, unethical, racist, sexist,"
        " toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased"
        " and positive in nature.\nIf a question does not make any sense, or is not factually coherent,"
        " explain why instead of answering something not correct. If you don't know the answer to a"
        " question, please don't share false information."
    )
    principles: Dict[str, List[str]] = field(
        default_factory=lambda: {
            "harmlessness": UltraFeedbackPrinciples.harmlessness,
            "helpfulness": UltraFeedbackPrinciples.helpfulness,
            "truthfulness": UltraFeedbackPrinciples.truthfulness,
            "honesty": UltraFeedbackPrinciples.honesty,
            "verbalized_calibration": UltraFeedbackPrinciples.verbalized_calibration,
        },
        repr=False,
    )
    principles_distribution: Union[Dict[str, float], Literal["balanced"], None] = None

    def __post_init__(self) -> None:
        """Validates the `principles_distribution` if it is a dict.

        Raises:
            ValueError: if the `principles_distribution` is a dict and it does not sum to 1.0.
            ValueError: if the `principles` are not included in the `principles_distribution`.
        """
        if isinstance(self.principles_distribution, dict):
            not_included_principles = [
                principle
                for principle in self.principles
                if principle not in self.principles_distribution
            ]
            if not_included_principles:
                principles_str = ", ".join(
                    [f"'{principle}'" for principle in not_included_principles]
                )
                raise ValueError(
                    f"Principles {principles_str} included in `principles` is not in"
                    " `principles_distribution`"
                )

            if sum(self.principles_distribution.values()) != 1.0:
                raise ValueError(
                    "`principles_distribution` must sum to 1.0 if it is a dict containing"
                    " the distribution of principles to use."
                )

    def _get_principle(self) -> str:
        """Gets a principle from the `principles` dict respecting the `principal_distribution`.

        Returns:
            str: the principle to be used.
        """
        if isinstance(self.principles_distribution, dict):
            principle_group = random.choices(
                list(self.principles_distribution.keys()),
                weights=list(self.principles_distribution.values()),
                k=1,
            )[0]
        else:
            principle_group = random.choice(list(self.principles.keys()))
        return random.choice(self.principles[principle_group])

    def generate_prompt(self, input: str) -> Prompt:
        """Generates the prompt to be used for generation.

        Args:
            input (str): the input to be used for generation.

        Returns:
            Prompt: the generated prompt.

        Examples:
            >>> from distilabel.tasks.text_generation import TextGenerationTask
            >>> task = TextGenerationTask(system_prompt="You are a helpful assistant.")
            >>> task.generate_prompt("What are the first 5 Fibonacci numbers?")
            Prompt(system_prompt='You are a helpful assistant.', formatted_prompt='What are the first 5 Fibonacci numbers?')
        """
        system_prompt = self.system_prompt
        if self.principles_distribution is not None:
            principle = self._get_principle()
            system_prompt += " " + principle
        return Prompt(system_prompt=system_prompt, formatted_prompt=input)

    def parse_output(self, output: str) -> dict[str, str]:
        """Parses the output of the LLM into the desired format."""
        return {"generations": output}

    @property
    def input_args_names(self) -> list[str]:
        """Returns the input args names for the task."""
        return ["input"]

    @property
    def output_args_names(self) -> list[str]:
        """Returns the output args names for the task."""
        return ["generations"]

input_args_names: list[str] property

Returns the input args names for the task.

output_args_names: list[str] property

Returns the output args names for the task.

__post_init__()

Validates the principles_distribution if it is a dict.

Raises:

Type Description
ValueError

if the principles_distribution is a dict and it does not sum to 1.0.

ValueError

if the principles are not included in the principles_distribution.

Source code in src/distilabel/tasks/text_generation/base.py
def __post_init__(self) -> None:
    """Validates the `principles_distribution` if it is a dict.

    Raises:
        ValueError: if the `principles_distribution` is a dict and it does not sum to 1.0.
        ValueError: if the `principles` are not included in the `principles_distribution`.
    """
    if isinstance(self.principles_distribution, dict):
        not_included_principles = [
            principle
            for principle in self.principles
            if principle not in self.principles_distribution
        ]
        if not_included_principles:
            principles_str = ", ".join(
                [f"'{principle}'" for principle in not_included_principles]
            )
            raise ValueError(
                f"Principles {principles_str} included in `principles` is not in"
                " `principles_distribution`"
            )

        if sum(self.principles_distribution.values()) != 1.0:
            raise ValueError(
                "`principles_distribution` must sum to 1.0 if it is a dict containing"
                " the distribution of principles to use."
            )
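
For example, a dict distribution must include every key in principles and sum to 1.0, otherwise __post_init__ raises a ValueError. A minimal sketch:

from distilabel.tasks.text_generation import TextGenerationTask

# Valid: covers all five default principle groups and sums to 1.0.
task = TextGenerationTask(
    principles_distribution={
        "harmlessness": 0.2,
        "helpfulness": 0.2,
        "truthfulness": 0.2,
        "honesty": 0.2,
        "verbalized_calibration": 0.2,
    }
)

# Passing "balanced" instead samples a principle group uniformly at random.
balanced_task = TextGenerationTask(principles_distribution="balanced")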

generate_prompt(input)

Generates the prompt to be used for generation.

Parameters:

Name Type Description Default
input str

the input to be used for generation.

required

Returns:

Name Type Description
Prompt Prompt

the generated prompt.

Examples:

>>> from distilabel.tasks.text_generation import TextGenerationTask
>>> task = TextGenerationTask(system_prompt="You are a helpful assistant.")
>>> task.generate_prompt("What are the first 5 Fibonacci numbers?")
Prompt(system_prompt='You are a helpful assistant.', formatted_prompt='What are the first 5 Fibonacci numbers?')
Source code in src/distilabel/tasks/text_generation/base.py
def generate_prompt(self, input: str) -> Prompt:
    """Generates the prompt to be used for generation.

    Args:
        input (str): the input to be used for generation.

    Returns:
        Prompt: the generated prompt.

    Examples:
        >>> from distilabel.tasks.text_generation import TextGenerationTask
        >>> task = TextGenerationTask(system_prompt="You are a helpful assistant.")
        >>> task.generate_prompt("What are the first 5 Fibonacci numbers?")
        Prompt(system_prompt='You are a helpful assistant.', formatted_prompt='What are the first 5 Fibonacci numbers?')
    """
    system_prompt = self.system_prompt
    if self.principles_distribution is not None:
        principle = self._get_principle()
        system_prompt += " " + principle
    return Prompt(system_prompt=system_prompt, formatted_prompt=input)

parse_output(output)

Parses the output of the LLM into the desired format.

Source code in src/distilabel/tasks/text_generation/base.py
def parse_output(self, output: str) -> dict[str, str]:
    """Parses the output of the LLM into the desired format."""
    return {"generations": output}

UltraFeedbackTask dataclass

Bases: PreferenceTask

A PreferenceTask following the prompt template used by ULTRAFEEDBACK.

Parameters:

Name Type Description Default
system_prompt str

the system prompt to be used for generation. Defaults to None.

'Your role is to evaluate text quality based on given criteria.'
task_description Union[str, None]

the description of the task. Defaults to None.

required
ratings Union[List[Rating], None]

the ratings to be used for the task. Defaults to None.

required
Source code in src/distilabel/tasks/preference/ultrafeedback.py
@dataclass
class UltraFeedbackTask(PreferenceTask):
    """A `PreferenceTask` following the prompt template used by ULTRAFEEDBACK.

    Args:
        system_prompt (str, optional): the system prompt to be used for generation. Defaults to `None`.
        task_description (Union[str, None], optional): the description of the task. Defaults to `None`.
        ratings (Union[List[Rating], None], optional): the ratings to be used for the task. Defaults to `None`.
    """

    ratings: List[Rating]
    task_description: str

    __jinja2_template__: ClassVar[str] = field(
        default=_ULTRAFEEDBACK_TEMPLATE, init=False, repr=False
    )
    __subtasks__: ClassVar[List[str]] = [
        "text-quality",
        "helpfulness",
        "truthfulness",
        "honesty",
        "instruction-following",
    ]

    system_prompt: (
        str
    ) = "Your role is to evaluate text quality based on given criteria."

    def generate_prompt(self, input: str, generations: List[str]) -> Prompt:
        """Generates a prompt following the ULTRAFEEDBACK specification.

        Args:
            input (str): the input to be used for the prompt.
            generations (List[str]): the generations to be used for the prompt.

        Returns:
            Prompt: the generated prompt.

        Examples:
            >>> from distilabel.tasks.preference import UltraFeedbackTask
            >>> task = UltraFeedbackTask.for_text_quality()
            >>> task.generate_prompt("What are the first 5 Fibonacci numbers?", ["0 1 1 2 3", "0 1 1 2 3"])
            Prompt(
                system_prompt="Your role is to evaluate text quality based on given criteria.",
                formatted_prompt="# General Text Quality Assessment\nEvaluate the model's ...",
            )
        """
        render_kwargs = {
            "task_description": self.task_description,
            "ratings": self.ratings,
            "input": input,
            "responses": generations,
        }
        return Prompt(
            system_prompt=self.system_prompt,
            formatted_prompt=self.template.render(**render_kwargs),
        )

    def parse_output(self, output: str) -> List[UltraFeedbackOutput]:
        """Parses the output of the model into the desired format."""
        parsed_output = []
        for section in output.split("#### Output for Text ")[1:]:
            rating, rationale = section.split("\n")[1:3]
            rating = float(rating.split(": ")[1])
            rationale = rationale.split(": ")[1]
            parsed_output.append(
                UltraFeedbackOutput(rating=rating, rationale=rationale)
            )
        return parsed_output

    @classmethod
    def for_text_quality(
        cls,
        system_prompt: Optional[str] = None,
        task_description: Optional[str] = None,
        ratings: Optional[List[Rating]] = None,
    ) -> "UltraFeedbackTask":
        kwargs = {}
        if system_prompt is not None:
            kwargs.update({"system_prompt": system_prompt})
        if task_description is None:
            task_description = dedent(
                """
                # General Text Quality Assessment
                Evaluate the model's outputs based on various criteria:
                1. **Correctness & Informativeness**: Does the output provide accurate and helpful information?
                2. **Honesty & Uncertainty**: How confidently does the model convey its information, and does it express uncertainty appropriately?
                3. **Truthfulness & Hallucination**: Does the model introduce misleading or fabricated details?
                4. **Instruction Following**: Does the model's output align with given instructions and the user's intent?
                Your role is to provide a holistic assessment considering all the above factors.

                **Scoring**: Rate outputs 1 to 5 based on the overall quality, considering all aspects:
                """
            )
        kwargs.update({"task_description": task_description})

        if ratings is None:
            ratings = [
                Rating(
                    value=1,
                    description="**Low Quality**: Contains inaccuracies, may be entirely wrong or has severe hallucinations.",
                ),
                Rating(
                    value=2,
                    description="**Moderate Quality**: Addresses some aspects, but has errors or is partially aligned with instructions.",
                ),
                Rating(
                    value=3,
                    description="**Good**: Generally accurate but may contain minor errors or slight deviations.",
                ),
                Rating(
                    value=4,
                    description="**Very Good**: Near perfect, with minor issues in terms of alignment or confidence.",
                ),
                Rating(
                    value=5,
                    description="**Excellent**: Accurate, confident, aligned with instructions, and free of hallucinations.",
                ),
            ]
        kwargs.update({"ratings": ratings})
        return cls(**kwargs)

    @classmethod
    def for_helpfulness(
        cls,
        system_prompt: Optional[str] = None,
        task_description: Optional[str] = None,
        ratings: Optional[List[Rating]] = None,
    ) -> "UltraFeedbackTask":
        kwargs = {}
        if system_prompt is not None:
            kwargs.update({"system_prompt": system_prompt})

        if task_description is None:
            task_description = dedent(
                """
                # Informativeness / Helpfulness Assessment
                Evaluate if model's outputs fulfill task objectives and provide high-quality, correct, and, informative content.
                Helpfulness assessment emphasizes **Overall Quality** regarding correctness and informativeness.
                **Correctness**: Accurate computation, reasoning steps, and outputs without misunderstandings or fabrication.

                **Scoring**: Score 1 to 5 based on extent of helpfulness, regarding both informativeness and correctness:
                """
            )
        kwargs.update({"task_description": task_description})
        if ratings is None:
            ratings = [
                Rating(
                    value=1,
                    description="**Severely Incorrect**: Contains significant inaccuracies or fabricated content, even if comprehensive information is provided.",
                ),
                Rating(
                    value=2,
                    description="**Partially Incorrect**: Contains errors that may cause confusion, even though comprehensive information is present.",
                ),
                Rating(
                    value=3,
                    description="**Correct**: Accurate and provides useful information that meets the task's requirements.",
                ),
                Rating(
                    value=4,
                    description="**Highly Informative**: Accurate and extensive, providing valuable insights and detailed information.",
                ),
                Rating(
                    value=5,
                    description="**Outstandingly Helpful**: Both accurate and in-depth, offering profound insights and comprehensive information.",
                ),
            ]
        kwargs.update({"ratings": ratings})
        return cls(**kwargs)

    @classmethod
    def for_truthfulness(
        cls,
        system_prompt: Optional[str] = None,
        task_description: Optional[str] = None,
        ratings: Optional[List[Rating]] = None,
    ) -> "UltraFeedbackTask":
        kwargs = {}
        if system_prompt is not None:
            kwargs.update({"system_prompt": system_prompt})

        if task_description is None:
            task_description = dedent(
                """
                # Truthfulness and Hallucination Assessment
                Evaluate the model's accuracy in providing information without introducing misleading or fabricated details.
                Assign numeric identifier (or "None") from 1 to 3 for each type of hallucination:
                1. **Contradictory with the World (Factual Error)**: Entities, locations, concepts, or events that conflict with established knowledge.
                2. **Contradictory with Instruction and Input**: Responses diverge, introducing new facts not aligned with instructions or inputs.
                3. **Self-Contradictory / Logical Error**: Responses contain internal contradictions or logical errors within each independent text.

                **Scoring**: Rate outputs 1 to 5 based on the extent of hallucination:
                """
            )
        kwargs.update({"task_description": task_description})

        if ratings is None:
            ratings = [
                Rating(
                    value=1,
                    description="**Completely Hallucinated**: Entirely unreliable due to hallucinations.",
                ),
                Rating(
                    value=2,
                    description="**Severe Hallucination**: Nearly half contains hallucinations, severe deviation from main points.",
                ),
                Rating(
                    value=3,
                    description="**Partial Hallucination / Misunderstanding**: Overall truthful, partial misunderstanding due to hallucinations.",
                ),
                Rating(
                    value=4,
                    description="**Insignificant Hallucination**: Mostly truthful, slight hallucination not affecting main points.",
                ),
                Rating(
                    value=5, description="**No Hallucination**: Free of hallucinations."
                ),
            ]
        kwargs.update({"ratings": ratings})
        return cls(**kwargs)

    @classmethod
    def for_honesty(
        cls,
        system_prompt: Optional[str] = None,
        task_description: Optional[str] = None,
        ratings: Optional[List[Rating]] = None,
    ) -> "UltraFeedbackTask":
        kwargs = {}
        if system_prompt is not None:
            kwargs.update({"system_prompt": system_prompt})
        if task_description is None:
            task_description = dedent(
                """
                # Honesty and Uncertainty Expression Assessment
                Assess how well the model conveys honesty and uncertainty. Evaluate if the model's confidence aligns with response correctness.
                - **Indicators of Uncertainty**:
                    1. Weakeners: e.g., 'I guess,' 'probably.'
                    2. Refusal to answer.
                    3. Verbalized confidence scores: [0, 20] low; (20, 40] uncertain; (40, 60] moderate; (60, 80] leaning confident; (80, 100] high.
                - **No uncertainty expression indicate confidence.**
                - **Response Correctness**: Align with ground truth, or provide accurate content without fabrication.

                **Scoring**: Rate outputs 1 to 5 based on the extent of honesty and uncertainty:
                """
            )
        kwargs.update({"task_description": task_description})
        if ratings is None:
            ratings = [
                Rating(
                    value=1,
                    description="**Confidently Incorrect**: Confident but entirely wrong.",
                ),
                Rating(
                    value=2,
                    description="**Confident with Significant Mistakes / Unconfident Incorrect**: Confident but contains major errors. Unconfident and entirely wrong.",
                ),
                Rating(
                    value=3,
                    description="**Uncertain / 'I Don't Know' / Subtle Mistakes**: 'I don't know' or declines. Confident but contains minor errors. Unconfident and contains significant mistakes.",
                ),
                Rating(
                    value=4,
                    description="**Correct but Uncertain / Expressed Subtle Mistakes**: Correct but unconfident.",
                ),
                Rating(
                    value=5,
                    description="**Correct and Confident / Precisely Express Uncertainty**: Correct and confident. Makes mistakes, but precisely acknowledges minor errors and indicates uncertainty on potential mistakes.",
                ),
            ]
        kwargs.update({"ratings": ratings})

        return cls(**kwargs)

    @classmethod
    def for_instruction_following(
        cls,
        system_prompt: Optional[str] = None,
        task_description: Optional[str] = None,
        ratings: Optional[List[Rating]] = None,
    ) -> "UltraFeedbackTask":
        kwargs = {}
        if system_prompt is not None:
            kwargs.update({"system_prompt": system_prompt})
        if task_description is None:
            task_description = dedent(
                """
                # Instruction Following Assessment
                Evaluate alignment between output and intent. Assess understanding of task goal and restrictions.
                **Instruction Components**: Task Goal (intended outcome), Restrictions (text styles, formats, or designated methods, etc).

                **Scoring**: Rate outputs 1 to 5:
                """
            )
        kwargs.update({"task_description": task_description})
        if ratings is None:
            ratings = [
                Rating(value=1, description="**Irrelevant**: No alignment."),
                Rating(
                    value=2,
                    description="**Partial Focus**: Addresses one aspect poorly.",
                ),
                Rating(
                    value=3,
                    description="**Partial Compliance**:\n\t- (1) Meets goal or restrictions, neglecting other.\n\t- (2) Acknowledges both but slight deviations.",
                ),
                Rating(
                    value=4,
                    description="**Almost There**: Near alignment, minor deviations.",
                ),
                Rating(
                    value=5,
                    description="**Comprehensive Compliance**: Fully aligns, meets all requirements.",
                ),
            ]
        kwargs.update({"ratings": ratings})

        return cls(**kwargs)
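
The __subtasks__ listed above each map to a factory classmethod, so a task per UltraFeedback dimension can be built without spelling out the task description or ratings. A minimal sketch:

from distilabel.tasks.preference import UltraFeedbackTask

text_quality = UltraFeedbackTask.for_text_quality()
helpfulness = UltraFeedbackTask.for_helpfulness()
truthfulness = UltraFeedbackTask.for_truthfulness()
honesty = UltraFeedbackTask.for_honesty()
instruction_following = UltraFeedbackTask.for_instruction_following()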

generate_prompt(input, generations)

Generates a prompt following the ULTRAFEEDBACK specification.

    Args:
        input (str): the input to be used for the prompt.
        generations (List[str]): the generations to be used for the prompt.

    Returns:
        Prompt: the generated prompt.

    Examples:
        >>> from distilabel.tasks.preference import UltraFeedbackTask
        >>> task = UltraFeedbackTask.for_text_quality()
        >>> task.generate_prompt("What are the first 5 Fibonacci numbers?", ["0 1 1 2 3", "0 1 1 2 3"])
        Prompt(
            system_prompt="Your role is to evaluate text quality based on given criteria.",
            formatted_prompt="# General Text Quality Assessment

Evaluate the model's ...", )

Source code in src/distilabel/tasks/preference/ultrafeedback.py
def generate_prompt(self, input: str, generations: List[str]) -> Prompt:
    """Generates a prompt following the ULTRAFEEDBACK specification.

    Args:
        input (str): the input to be used for the prompt.
        generations (List[str]): the generations to be used for the prompt.

    Returns:
        Prompt: the generated prompt.

    Examples:
        >>> from distilabel.tasks.preference import UltraFeedbackTask
        >>> task = UltraFeedbackTask.for_text_quality()
        >>> task.generate_prompt("What are the first 5 Fibonacci numbers?", ["0 1 1 2 3", "0 1 1 2 3"])
        Prompt(
            system_prompt="Your role is to evaluate text quality based on given criteria.",
            formatted_prompt="# General Text Quality Assessment\nEvaluate the model's ...",
        )
    """
    render_kwargs = {
        "task_description": self.task_description,
        "ratings": self.ratings,
        "input": input,
        "responses": generations,
    }
    return Prompt(
        system_prompt=self.system_prompt,
        formatted_prompt=self.template.render(**render_kwargs),
    )

parse_output(output)

Parses the output of the model into the desired format.

Source code in src/distilabel/tasks/preference/ultrafeedback.py
def parse_output(self, output: str) -> List[UltraFeedbackOutput]:
    """Parses the output of the model into the desired format."""
    parsed_output = []
    for section in output.split("#### Output for Text ")[1:]:
        rating, rationale = section.split("\n")[1:3]
        rating = float(rating.split(": ")[1])
        rationale = rationale.split(": ")[1]
        parsed_output.append(
            UltraFeedbackOutput(rating=rating, rationale=rationale)
        )
    return parsed_output
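
A minimal sketch of the completion layout parse_output expects; the text below is hypothetical, but each "#### Output for Text N" section must contain a "Rating:" line followed by a "Rationale:" line, as the parsing logic above shows.

from distilabel.tasks.preference import UltraFeedbackTask

task = UltraFeedbackTask.for_text_quality()

# Hypothetical completion with one section per generated text.
output = (
    "#### Output for Text 1\n"
    "Rating: 5\n"
    "Rationale: Accurate and complete.\n"
    "#### Output for Text 2\n"
    "Rating: 3\n"
    "Rationale: Partially correct but incomplete.\n"
)
parsed = task.parse_output(output)
# One UltraFeedbackOutput per section, e.g. rating=5.0 and rationale="Accurate and complete."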

UltraJudgeTask dataclass

Bases: PreferenceTask

A PreferenceTask for the UltraJudge task. The UltraJudge task has been defined at Argilla specifically for a better evaluation using AI Feedback. The task is defined based on both UltraFeedback and JudgeLM, but with several improvements / modifications.

Parameters:

Name Type Description Default
system_prompt str

the system prompt to be used for generation. Defaults to None.

"You are an evaluator tasked with assessing AI assistants' responses from the perspective of typical user preferences. Your critical analysis should focus on human-like engagement, solution effectiveness, accuracy, clarity, and creativity. Approach each response as if you were the user, considering how well the response meets your needs and expectations in a real-world scenario. Provide detailed feedback that highlights strengths and areas for improvement in each response, keeping in mind the goal of simulating a human's preferred choice. Your evaluation should be impartial and thorough, reflecting a human's perspective in preferring responses that are practical, clear, authentic, and aligned with their intent. Avoid bias, and focus on the content and quality of the responses."
task_description Union[str, None]

the description of the task. Defaults to None.

"Your task is to rigorously evaluate the performance of {num_responses} AI assistants, simulating a human's perspective. You will assess each response based on four key domains, reflecting aspects that are typically valued by humans: {areas}. First provide a score between 0 and 10 and write a detailed feedback for each area and assistant. Finally, provide a list of {num_responses} scores, each separated by a space, to reflect the performance of Assistants 1 to {num_responses}."
areas List[str]

the areas to be used for the task. Defaults to a list of four areas: "Practical Accuracy", "Clarity & Transparency", "Authenticity & Reliability", and "Compliance with Intent".

field(default_factory=lambda : ['Practical Accuracy', 'Clarity & Transparency', 'Authenticity & Reliability', 'Compliance with Intent'])
Source code in src/distilabel/tasks/preference/ultrajudge.py
@dataclass
class UltraJudgeTask(PreferenceTask):
    """A `PreferenceTask` for the UltraJudge task. The `UltraJudge` task has been defined
    at Argilla specifically for a better evaluation using AI Feedback. The task is defined
    based on both UltraFeedback and JudgeLM, but with several improvements / modifications.

    Args:
        system_prompt (str, optional): the system prompt to be used for generation. Defaults to `None`.
        task_description (Union[str, None], optional): the description of the task. Defaults to `None`.
        areas (List[str], optional): the areas to be used for the task. Defaults to a list of four areas:
            "Practical Accuracy", "Clarity & Transparency", "Authenticity & Reliability", and "Compliance with Intent".
    """

    system_prompt: str = (
        "You are an evaluator tasked with assessing AI assistants' responses from the perspective of typical user preferences."
        " Your critical analysis should focus on human-like engagement, solution effectiveness, accuracy, clarity, and"
        " creativity. Approach each response as if you were the user, considering how well the response meets your needs"
        " and expectations in a real-world scenario. Provide detailed feedback that highlights strengths and areas for"
        " improvement in each response, keeping in mind the goal of simulating a human's preferred choice. "
        "Your evaluation should be impartial and thorough, reflecting a human's perspective in preferring responses that are practical,"
        " clear, authentic, and aligned with their intent. Avoid bias, and focus on the content and quality of the responses."
    )

    task_description: str = (
        "Your task is to rigorously evaluate the performance of {num_responses} AI assistants, simulating a human's perspective."
        " You will assess each response based on four key domains, reflecting aspects that are typically valued by humans:"
        " {areas}."
        " First provide a score between 0 and 10 and write a detailed feedback for each area and assistant."
        " Finally, provide a list of {num_responses} scores, each separated by a space, to reflect the performance of Assistants 1 to {num_responses}."
    )

    areas: List[str] = field(
        default_factory=lambda: [
            "Practical Accuracy",
            "Clarity & Transparency",
            "Authenticity & Reliability",
            "Compliance with Intent",
        ]
    )

    __jinja2_template__: ClassVar[str] = field(
        default=_ULTRAJUDGE_TEMPLATE, init=False, repr=False
    )

    @property
    def output_args_names(self) -> List[str]:
        """Returns the names of the output arguments of the task."""
        return ["rating", "areas"]

    @property
    def areas_str(self) -> str:
        """Returns a string representation of the areas."""
        return ", ".join(self.areas[:-1]) + ", and " + self.areas[-1]

    @property
    def extract_area_score_and_rationale_regex(self) -> str:
        """Returns a regex to extract the area, score, and rationale from the output."""
        return rf"({'|'.join(self.areas)})\s*-\s*(\d+(?:\.\d+)?)\n(.*?)(?=\n\n|\Z)"

    @property
    def extract_final_scores_regex(self) -> str:
        """Returns a regex to extract the final scores from the output."""
        return r"Final scores:\s*((?:\d+(?:\.\d+)?\s*)+)"

    def generate_prompt(self, input: str, generations: List[str]) -> Prompt:
        """Generates a prompt following the UltraJudge specification.

        Args:
            input (str): the input to be used for the prompt.
            generations (List[str]): the generations to be used for the prompt.

        Returns:
            Prompt: the generated prompt.

        Examples:
            >>> from distilabel.tasks.preference import UltraJudgeTask
            >>> task = UltraJudgeTask(system_prompt="You are a helpful assistant.")
            >>> task.generate_prompt("What are the first 5 Fibonacci numbers?", ["0 1 1 2 3", "0 1 1 2 3"])
            Prompt(
                system_prompt="You are a helpful assistant.",
                formatted_prompt="Your task is to rigorously evaluate the performance of ...",
            )
        """
        render_kwargs = {
            "task_description": self.task_description.format(
                num_responses=len(generations), areas=self.areas_str
            ),
            "instruction": input,
            "responses": generations,
        }

        return Prompt(
            system_prompt=self.system_prompt,
            formatted_prompt=self.template.render(**render_kwargs),
        )

    def parse_output(self, output: str) -> List[UltraJudgeOutput]:
        """Parses the output of the model into the desired format."""
        num_areas = len(self.areas)
        # `areas_results` includes num_generations * num_areas tuples
        areas_results = re.findall(self.extract_area_score_and_rationale_regex, output)
        final_scores = [
            float(str_score)
            for str_score in re.findall(self.extract_final_scores_regex, output)[
                0
            ].split(" ")
        ]

        outputs = []
        for i, rating in enumerate(final_scores):
            areas = {}
            # Get the areas for the i-th generation
            for area in areas_results[i * num_areas : i * num_areas + num_areas]:
                name, area_rating, rationale = area
                areas[name] = Area(rating=area_rating, rationale=rationale)
            outputs.append(UltraJudgeOutput(rating=rating, areas=areas))

        return outputs

    def _merge_rationales(
        self, rationales: List[Dict[str, Any]], generations_column: str = "generations"
    ) -> str:
        """Overwrite of the `_merge_rationales` as we need to process the areas before merging."""

        def format_area(area: Dict[str, Any]) -> str:
            sections = []
            for title, ratings in area.items():
                sections.append(title)
                for k, v in ratings.items():
                    sections.append(f"{k}:{v}")
            return "\n".join(sections)

        merged_rationales = []
        for idx, area in enumerate(rationales, start=1):
            merged_rationales.append(
                f"{generations_column}-{idx}:\n{format_area(area)}\n"
            )
        return "\n".join(merged_rationales)

areas_str: str property

Returns a string representation of the areas.

extract_area_score_and_rationale_regex: str property

Returns a regex to extract the area, score, and rationale from the output.

extract_final_scores_regex: str property

Returns a regex to extract the final scores from the output.

output_args_names: List[str] property

Returns the names of the output arguments of the task.

generate_prompt(input, generations)

Generates a prompt following the UltraJudge specification.

Parameters:

Name Type Description Default
input str

the input to be used for the prompt.

required
generations List[str]

the generations to be used for the prompt.

required

Returns:

Name Type Description
Prompt Prompt

the generated prompt.

Examples:

>>> from distilabel.tasks.preference import UltraJudgeTask
>>> task = UltraJudgeTask(system_prompt="You are a helpful assistant.")
>>> task.generate_prompt("What are the first 5 Fibonacci numbers?", ["0 1 1 2 3", "0 1 1 2 3"])
Prompt(
    system_prompt="You are a helpful assistant.",
    formatted_prompt="Your task is to rigorously evaluate the performance of ...",
)
Source code in src/distilabel/tasks/preference/ultrajudge.py
def generate_prompt(self, input: str, generations: List[str]) -> Prompt:
    """Generates a prompt following the UltraJudge specification.

    Args:
        input (str): the input to be used for the prompt.
        generations (List[str]): the generations to be used for the prompt.

    Returns:
        Prompt: the generated prompt.

    Examples:
        >>> from distilabel.tasks.preference import UltraJudgeTask
        >>> task = UltraJudgeTask(system_prompt="You are a helpful assistant.")
        >>> task.generate_prompt("What are the first 5 Fibonacci numbers?", ["0 1 1 2 3", "0 1 1 2 3"])
        Prompt(
            system_prompt="You are a helpful assistant.",
            formatted_prompt="Your task is to rigorously evaluate the performance of ...",
        )
    """
    render_kwargs = {
        "task_description": self.task_description.format(
            num_responses=len(generations), areas=self.areas_str
        ),
        "instruction": input,
        "responses": generations,
    }

    return Prompt(
        system_prompt=self.system_prompt,
        formatted_prompt=self.template.render(**render_kwargs),
    )

parse_output(output)

Parses the output of the model into the desired format.

Source code in src/distilabel/tasks/preference/ultrajudge.py
def parse_output(self, output: str) -> List[UltraJudgeOutput]:
    """Parses the output of the model into the desired format."""
    num_areas = len(self.areas)
    # `areas_results` includes num_generations * num_areas tuples
    areas_results = re.findall(self.extract_area_score_and_rationale_regex, output)
    final_scores = [
        float(str_score)
        for str_score in re.findall(self.extract_final_scores_regex, output)[
            0
        ].split(" ")
    ]

    outputs = []
    for i, rating in enumerate(final_scores):
        areas = {}
        # Get the areas for the i-th generation
        for area in areas_results[i * num_areas : i * num_areas + num_areas]:
            name, area_rating, rationale = area
            areas[name] = Area(rating=area_rating, rationale=rationale)
        outputs.append(UltraJudgeOutput(rating=rating, areas=areas))

    return outputs
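
A hedged sketch of the completion format this parser assumes: one `<area> - <score>` block per area (matching `extract_area_score_and_rationale_regex`), separated by blank lines, followed by a `Final scores:` line (matching `extract_final_scores_regex`). The mock completion below covers a single response with the four default areas and is an illustration only, not a guaranteed model output.

from distilabel.tasks.preference import UltraJudgeTask

task = UltraJudgeTask()

# Hypothetical completion for one assistant, covering all four default areas.
mock_output = (
    "Practical Accuracy - 8\n"
    "Accurate and directly useful.\n"
    "\n"
    "Clarity & Transparency - 9\n"
    "Clear, well structured explanation.\n"
    "\n"
    "Authenticity & Reliability - 8\n"
    "No fabricated details.\n"
    "\n"
    "Compliance with Intent - 9\n"
    "Answers exactly what was asked.\n"
    "\n"
    "Final scores: 8.5"
)

parsed = task.parse_output(mock_output)
print(parsed)  # one UltraJudgeOutput with rating 8.5 and an Area entry per assessed area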