
tasks

CritiqueTask dataclass

Bases: RatingToArgillaMixin, Task

A Task for critique / judge tasks.

Parameters:

- system_prompt (str, required): the system prompt to be used for generation.
- task_description (Union[str, None], optional): the description of the task. Defaults to None.
Source code in src/distilabel/tasks/critique/base.py
@dataclass
class CritiqueTask(RatingToArgillaMixin, Task):
    """A `Task` for critique / judge tasks.

    Args:
        system_prompt (str): the system prompt to be used for generation.
        task_description (Union[str, None], optional): the description of the task. Defaults to `None`.
    """

    __type__: ClassVar[Literal["labelling"]] = "labelling"

    @property
    def input_args_names(self) -> List[str]:
        """Returns the names of the input arguments of the task."""
        return ["input", "generations"]

    @property
    def output_args_names(self) -> List[str]:
        """Returns the names of the output arguments of the task."""
        return ["critique", "score"]

    def to_argilla_dataset(
        self,
        dataset_row: Dict[str, Any],
        generations_column: str = "generations",
        score_column: str = "score",
        critique_column: str = "critique",
        score_values: Optional[List[int]] = None,
    ) -> "FeedbackDataset":
        return super().to_argilla_dataset(
            dataset_row=dataset_row,
            generations_column=generations_column,
            ratings_column=score_column,
            rationale_column=critique_column,
            ratings_values=score_values or [1, 2, 3, 4, 5],
        )

    def to_argilla_record(
        self,
        dataset_row: Dict[str, Any],
        generations_column: str = "generations",
        score_column: str = "score",
        critique_column: str = "critique",
        score_values: Optional[List[int]] = None,
    ) -> Union["FeedbackRecord", List["FeedbackRecord"]]:
        return super().to_argilla_record(
            dataset_row=dataset_row,
            generations_column=generations_column,
            ratings_column=score_column,
            rationale_column=critique_column,
            ratings_values=score_values or [1, 2, 3, 4, 5],
        )

input_args_names: List[str] property

Returns the names of the input arguments of the task.

output_args_names: List[str] property

Returns the names of the output arguments of the task.
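
Since CritiqueTask only fixes the argument names and the Argilla export defaults (a 1-5 score scale unless score_values is given), a quick way to see what a concrete critique task expects is to inspect those properties on a subclass. A minimal sketch using PrometheusTask (documented below), with a purely illustrative scoring rubric:

from distilabel.tasks.critique import PrometheusTask

# Hypothetical rubric, only needed to instantiate the task.
task = PrometheusTask(
    scoring_criteria="Overall quality of the responses provided.",
    score_descriptions={1: "poor", 2: "fair", 3: "average", 4: "good", 5: "excellent"},
)

print(task.input_args_names)   # ['input', 'generations', 'ref_completion']
print(task.output_args_names)  # ['critique', 'score']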

JudgeLMTask dataclass

Bases: PreferenceTask

A PreferenceTask following the prompt template used by JudgeLM.

Parameters:

- system_prompt (str, optional): the system prompt to be used for generation. Defaults to 'You are a helpful and precise assistant for checking the quality of the answer.'
- task_description (Union[str, None], optional): the description of the task. Defaults to the JudgeLM rating template shown in the source below.
References

- Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena: https://arxiv.org/abs/2306.05685
- BAAI/JudgeLM-7B-v1.0: https://huggingface.co/BAAI/JudgeLM-7B-v1.0
- BAAI/JudgeLM-13B-v1.0: https://huggingface.co/BAAI/JudgeLM-13B-v1.0
- BAAI/JudgeLM-33B-v1.0: https://huggingface.co/BAAI/JudgeLM-33B-v1.0
Source code in src/distilabel/tasks/preference/judgelm.py
@dataclass
class JudgeLMTask(PreferenceTask):
    """A `PreferenceTask` following the prompt templated used by JudgeLM.

    Args:
        system_prompt (str, optional): the system prompt to be used for generation. Defaults to `None`.
        task_description (Union[str, None], optional): the description of the task. Defaults to `None`.

    References:
        - [`Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena`](https://arxiv.org/abs/2306.05685)
        - [`BAAI/JudgeLM-7B-v1.0`](https://huggingface.co/BAAI/JudgeLM-7B-v1.0)
        - [`BAAI/JudgeLM-13B-v1.0`](https://huggingface.co/BAAI/JudgeLM-13B-v1.0)
        - [`BAAI/JudgeLM-33B-v1.0`](https://huggingface.co/BAAI/JudgeLM-33B-v1.0)
    """

    task_description: str = (
        "We would like to request your feedback on the performance of {num_responses} AI assistants in response to the"
        " user question displayed above.\nPlease rate the helpfulness, relevance, accuracy, level of details"
        " of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher"
        " score indicates better overall performance.\nPlease first output a single line containing only {num_responses}"
        " values indicating the scores for Assistants 1 to {num_responses}, respectively. The {num_responses} scores are separated by"
        " a space. In the subsequent line, please provide a comprehensive explanation of your evaluation,"
        " avoiding any potential bias and ensuring that the order in which the responses were presented does"
        " not affect your judgment."
    )
    system_prompt: str = "You are a helpful and precise assistant for checking the quality of the answer."

    __jinja2_template__: ClassVar[str] = _JUDGELM_TEMPLATE

    def generate_prompt(self, input: str, generations: List[str], **_: Any) -> Prompt:
        """Generates a prompt following the JudgeLM specification.

        Args:
            input (str): the input to be used for the prompt.
            generations (List[str]): the generations to be used for the prompt.

        Returns:
            Prompt: the generated prompt.

        Examples:
            >>> from distilabel.tasks.preference import JudgeLMTask
            >>> task = JudgeLMTask(system_prompt="You are a helpful assistant.")
            >>> task.generate_prompt("What are the first 5 Fibonacci numbers?", ["0 1 1 2 3", "0 1 1 2 3"])
            Prompt(
                system_prompt="You are a helpful assistant.",
                formatted_prompt="[Question] What are the first 5 Fibonacci numbers? ...",
            )
        """
        render_kwargs = {
            "input": input,
            "responses": generations,
            "task_description": self.task_description.format(
                num_responses=len(generations)
            ),
        }
        return Prompt(
            system_prompt=self.system_prompt,
            formatted_prompt=self.template.render(**render_kwargs),
        )

    def parse_output(self, output: str) -> JudgeLMOutput:
        """Parses the output of the model into the desired format."""
        split_output = output.split("\n")
        rating = [float(rating) for rating in split_output[0].split(" ")]
        rationale = "\n".join(split_output[1:])
        return JudgeLMOutput(rating=rating, rationale=rationale)

generate_prompt(input, generations, **_)

Generates a prompt following the JudgeLM specification.

Parameters:

- input (str, required): the input to be used for the prompt.
- generations (List[str], required): the generations to be used for the prompt.

Returns:

- Prompt: the generated prompt.

Examples:

>>> from distilabel.tasks.preference import JudgeLMTask
>>> task = JudgeLMTask(system_prompt="You are a helpful assistant.")
>>> task.generate_prompt("What are the first 5 Fibonacci numbers?", ["0 1 1 2 3", "0 1 1 2 3"])
Prompt(
    system_prompt="You are a helpful assistant.",
    formatted_prompt="[Question] What are the first 5 Fibonacci numbers? ...",
)
Source code in src/distilabel/tasks/preference/judgelm.py
def generate_prompt(self, input: str, generations: List[str], **_: Any) -> Prompt:
    """Generates a prompt following the JudgeLM specification.

    Args:
        input (str): the input to be used for the prompt.
        generations (List[str]): the generations to be used for the prompt.

    Returns:
        Prompt: the generated prompt.

    Examples:
        >>> from distilabel.tasks.preference import JudgeLMTask
        >>> task = JudgeLMTask(system_prompt="You are a helpful assistant.")
        >>> task.generate_prompt("What are the first 5 Fibonacci numbers?", ["0 1 1 2 3", "0 1 1 2 3"])
        Prompt(
            system_prompt="You are a helpful assistant.",
            formatted_prompt="[Question] What are the first 5 Fibonacci numbers? ...",
        )
    """
    render_kwargs = {
        "input": input,
        "responses": generations,
        "task_description": self.task_description.format(
            num_responses=len(generations)
        ),
    }
    return Prompt(
        system_prompt=self.system_prompt,
        formatted_prompt=self.template.render(**render_kwargs),
    )

parse_output(output)

Parses the output of the model into the desired format.

Source code in src/distilabel/tasks/preference/judgelm.py
def parse_output(self, output: str) -> JudgeLMOutput:
    """Parses the output of the model into the desired format."""
    split_output = output.split("\n")
    rating = [float(rating) for rating in split_output[0].split(" ")]
    rationale = "\n".join(split_output[1:])
    return JudgeLMOutput(rating=rating, rationale=rationale)
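
As a quick illustration of the parsing above, consider a hypothetical model completion whose first line carries the space-separated scores and whose remaining lines carry the explanation:

from distilabel.tasks.preference import JudgeLMTask

task = JudgeLMTask()
# Hypothetical output for two assistants: scores on the first line, rationale afterwards.
output = "8 6\nAssistant 1 answered correctly, while Assistant 2 skipped the last number."
parsed = task.parse_output(output)
print(parsed)  # contains rating=[8.0, 6.0] and the rationale text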

PrometheusTask dataclass

Bases: CritiqueTask

A CritiqueTask following the prompt template used by Prometheus.

Parameters:

- system_prompt (str, optional): the system prompt to be used for generation. Defaults to 'You are a fair evaluator language model.'
- scoring_criteria (str, required): the scoring criteria to be used for the task, which defines the scores described via score_descriptions.
- score_descriptions (Dict[int, str], required): the descriptions of the scores, where the key is the rating value (ideally consecutive) and the value is the description of that rating.
Disclaimer

Since the Prometheus model has been trained on data generated with the OpenAI API, the prompting strategy may only be consistent with GPT-3.5, GPT-4, or the Prometheus model itself. Any other model may fail to generate the expected structured output, or may provide an incorrect or inaccurate critique.

References

- Prometheus: Inducing Fine-grained Evaluation Capability in Language Models: https://arxiv.org/abs/2310.08491
- kaist-ai/prometheus-7b-v1.0: https://huggingface.co/kaist-ai/prometheus-7b-v1.0
- kaist-ai/prometheus-13b-v1.0: https://huggingface.co/kaist-ai/prometheus-13b-v1.0
Source code in src/distilabel/tasks/critique/prometheus.py
@dataclass
class PrometheusTask(CritiqueTask):
    """A `CritiqueTask` following the prompt templated used by Prometheus.

    Args:
        system_prompt (str, optional): the system prompt to be used for generation. Defaults to `None`.
        scoring_criteria (str): the scoring criteria to be used for the task, that defines
            the scores below, provided via `score_descriptions`.
        score_descriptions (Dict[int, str]): the descriptions of the scores, where
            the key is the rating value (ideally those should be consecutive), and the
            value is the description of each rating.

    Disclaimer:
        Since the Prometheus model has been trained with OpenAI API generated data, the prompting
        strategy may just be consistent / compliant with either GPT-3.5 or GPT-4 from OpenAI API, or
        with their own model. Any other model may fail on the generation of a structured output, as
        well as providing an incorrect / inaccurate critique.

    References:
        - [`Prometheus: Inducing Fine-grained Evaluation Capability in Language Models`](https://arxiv.org/abs/2310.08491)
        - [`kaist-ai/prometheus-7b-v1.0`](https://huggingface.co/kaist-ai/prometheus-7b-v1.0)
        - [`kaist-ai/prometheus-13b-v1.0`](https://huggingface.co/kaist-ai/prometheus-13b-v1.0)
    """

    scoring_criteria: str
    score_descriptions: Dict[int, str]

    system_prompt: str = "You are a fair evaluator language model."

    __jinja2_template__: ClassVar[str] = _PROMETHEUS_TEMPLATE

    @property
    def input_args_names(self) -> List[str]:
        return super().input_args_names + ["ref_completion"]

    def generate_prompt(
        self, input: str, generations: str, ref_completion: str, **_: Any
    ) -> Prompt:
        """Generates a prompt following the Prometheus specification.

        Args:
            input (str): the input to be used for the prompt.
            generations (List[str]): the generations to be used for the prompt, in
                this case, the ones to be critiqued.
            ref_completion (str): the reference completion to be used for the prompt,
                which is the reference one, assuming the one with the highest score.

        Returns:
            Prompt: the generated prompt.

        Examples:
            >>> from distilabel.tasks.critique import PrometheusTask
            >>> task = PrometheusTask(
            ...     scoring_criteria="Overall quality of the responses provided.",
            ...     score_descriptions={0: "false", 1: "partially false", 2: "average", 3: "partially true", 4: "true"},
            ... )
            >>> task.generate_prompt(
            ...     input="What are the first 5 Fibonacci numbers?",
            ...     generations=["0 1 1 2 3", "0 1 1 2 3"],
            ...     ref_completion="0 1 1 2 3",
            ... )
            Prompt(
                system_prompt="You are a fair evaluator language model.",
                formatted_prompt=""###Task Description:...",
            )
        """
        render_kwargs = {
            "instruction": input,
            "completion": generations,
            "ref_completion": ref_completion,
            "scoring_criteria": self.scoring_criteria,
            "score_descriptions": self.score_descriptions,
        }
        return Prompt(
            system_prompt=self.system_prompt,
            formatted_prompt=self.template.render(**render_kwargs),
        )

    def parse_output(self, output: str) -> CritiqueTaskOutput:  # type: ignore
        """Parses the output of the model into the desired format."""
        # We use a regex instead of splitting by the delimiter because the
        # critique may contain the delimiter, and using the regex is safer.
        pattern = r"(.+?)\. \[RESULT\] (\d+)"
        match = re.match(pattern, output)
        if match:
            return CritiqueTaskOutput(
                score=float(match.group(2)),
                critique=match.group(1).strip(),
            )

generate_prompt(input, generations, ref_completion, **_)

Generates a prompt following the Prometheus specification.

Parameters:

- input (str, required): the input to be used for the prompt.
- generations (List[str], required): the generations to be used for the prompt, in this case, the ones to be critiqued.
- ref_completion (str, required): the reference completion to be used for the prompt, assumed to be the one with the highest score.

Returns:

- Prompt: the generated prompt.

Examples:

>>> from distilabel.tasks.critique import PrometheusTask
>>> task = PrometheusTask(
...     scoring_criteria="Overall quality of the responses provided.",
...     score_descriptions={0: "false", 1: "partially false", 2: "average", 3: "partially true", 4: "true"},
... )
>>> task.generate_prompt(
...     input="What are the first 5 Fibonacci numbers?",
...     generations=["0 1 1 2 3", "0 1 1 2 3"],
...     ref_completion="0 1 1 2 3",
... )
Prompt(
    system_prompt="You are a fair evaluator language model.",
    formatted_prompt=""###Task Description:...",
)
Source code in src/distilabel/tasks/critique/prometheus.py
def generate_prompt(
    self, input: str, generations: str, ref_completion: str, **_: Any
) -> Prompt:
    """Generates a prompt following the Prometheus specification.

    Args:
        input (str): the input to be used for the prompt.
        generations (List[str]): the generations to be used for the prompt, in
            this case, the ones to be critiqued.
        ref_completion (str): the reference completion to be used for the prompt,
            which is the reference one, assuming the one with the highest score.

    Returns:
        Prompt: the generated prompt.

    Examples:
        >>> from distilabel.tasks.critique import PrometheusTask
        >>> task = PrometheusTask(
        ...     scoring_criteria="Overall quality of the responses provided.",
        ...     score_descriptions={0: "false", 1: "partially false", 2: "average", 3: "partially true", 4: "true"},
        ... )
        >>> task.generate_prompt(
        ...     input="What are the first 5 Fibonacci numbers?",
        ...     generations=["0 1 1 2 3", "0 1 1 2 3"],
        ...     ref_completion="0 1 1 2 3",
        ... )
        Prompt(
            system_prompt="You are a fair evaluator language model.",
            formatted_prompt=""###Task Description:...",
        )
    """
    render_kwargs = {
        "instruction": input,
        "completion": generations,
        "ref_completion": ref_completion,
        "scoring_criteria": self.scoring_criteria,
        "score_descriptions": self.score_descriptions,
    }
    return Prompt(
        system_prompt=self.system_prompt,
        formatted_prompt=self.template.render(**render_kwargs),
    )

parse_output(output)

Parses the output of the model into the desired format.

Source code in src/distilabel/tasks/critique/prometheus.py
def parse_output(self, output: str) -> CritiqueTaskOutput:  # type: ignore
    """Parses the output of the model into the desired format."""
    # We use a regex instead of splitting by the delimiter because the
    # critique may contain the delimiter, and using the regex is safer.
    pattern = r"(.+?)\. \[RESULT\] (\d+)"
    match = re.match(pattern, output)
    if match:
        return CritiqueTaskOutput(
            score=float(match.group(2)),
            critique=match.group(1).strip(),
        )
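
For reference, here is a hypothetical completion that matches the `<critique>. [RESULT] <score>` layout expected by the regex above; outputs that do not match it yield no result:

from distilabel.tasks.critique import PrometheusTask

task = PrometheusTask(
    scoring_criteria="Overall quality of the responses provided.",
    score_descriptions={0: "false", 1: "partially false", 2: "average", 3: "partially true", 4: "true"},
)
# Hypothetical model completion following the "<critique>. [RESULT] <score>" layout.
output = "The response lists the first five Fibonacci numbers correctly. [RESULT] 4"
parsed = task.parse_output(output)
print(parsed)  # contains the critique text and score=4.0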

Prompt dataclass

A dataclass representing a Prompt.

Parameters:

- system_prompt (str, required): the system prompt.
- formatted_prompt (str, required): the formatted prompt.

Examples:

>>> from distilabel.tasks.prompt import Prompt
>>> prompt = Prompt(
...     system_prompt="You are a helpful assistant.",
...     formatted_prompt="What are the first 5 Fibonacci numbers?",
... )
Source code in src/distilabel/tasks/prompt.py
@dataclass
class Prompt:
    """A `dataclass` representing a `Prompt`.

    Args:
        system_prompt (str): the system prompt.
        formatted_prompt (str): the formatted prompt.

    Examples:
        >>> from distilabel.tasks.prompt import Prompt
        >>> prompt = Prompt(
        ...     system_prompt="You are a helpful assistant.",
        ...     formatted_prompt="What are the first 5 Fibonacci numbers?",
        ... )
    """

    system_prompt: str
    formatted_prompt: str

    def format_as(self, format: SupportedFormats) -> Union[str, List[ChatCompletion]]:
        """Formats the prompt as the specified format.

        Args:
            format (SupportedFormats): the format to be used for the prompt. Available formats are
                `default`, `openai`, `llama2`, `chatml`, and `zephyr`.

        Returns:
            Union[str, List[ChatCompletion]]: the formatted prompt.

        Raises:
            ValueError: if the specified format is not supported.

        Examples:
            >>> from distilabel.tasks.prompt import Prompt
            >>> prompt = Prompt(
            ...     system_prompt="You are a helpful assistant.",
            ...     formatted_prompt="What are the first 5 Fibonacci numbers?",
            ... )
            >>> prompt.format_as("default")
            'You are a helpful assistant. What are the first 5 Fibonacci numbers?'
        """
        if format == "default":
            return f"{self.system_prompt}\n{self.formatted_prompt}"
        elif format == "openai":
            return [
                ChatCompletion(
                    role="system",
                    content=self.system_prompt,
                ),
                ChatCompletion(role="user", content=self.formatted_prompt),
            ]
        elif format == "llama2":
            return f"<s>[INST] <<SYS>>\n{self.system_prompt}<</SYS>>\n\n{self.formatted_prompt} [/INST]"
        elif format == "chatml":
            return f"<|im_start|>system\n{self.system_prompt}<|im_end|>\n<|im_start|>user\n{self.formatted_prompt}<|im_end|>\n<|im_start|>assistant\n"
        elif format in ["zephyr", "notus"]:
            return f"<|system|>\n{self.system_prompt}</s>\n<|user|>\n{self.formatted_prompt}</s>\n<|assistant|>\n"
        else:
            raise ValueError(
                f"Format {format} not supported, please provide a custom `prompt_formatting_fn`"
                " or use any of the available formats: openai, llama2, chatml, zephyr"
            )

format_as(format)

Formats the prompt as the specified format.

Parameters:

- format (SupportedFormats, required): the format to be used for the prompt. Available formats are default, openai, llama2, chatml, and zephyr.

Returns:

- Union[str, List[ChatCompletion]]: the formatted prompt.

Raises:

- ValueError: if the specified format is not supported.

Examples:

>>> from distilabel.tasks.prompt import Prompt
>>> prompt = Prompt(
...     system_prompt="You are a helpful assistant.",
...     formatted_prompt="What are the first 5 Fibonacci numbers?",
... )
>>> prompt.format_as("default")
'You are a helpful assistant. What are the first 5 Fibonacci numbers?'
Source code in src/distilabel/tasks/prompt.py
def format_as(self, format: SupportedFormats) -> Union[str, List[ChatCompletion]]:
    """Formats the prompt as the specified format.

    Args:
        format (SupportedFormats): the format to be used for the prompt. Available formats are
            `default`, `openai`, `llama2`, `chatml`, and `zephyr`.

    Returns:
        Union[str, List[ChatCompletion]]: the formatted prompt.

    Raises:
        ValueError: if the specified format is not supported.

    Examples:
        >>> from distilabel.tasks.prompt import Prompt
        >>> prompt = Prompt(
        ...     system_prompt="You are a helpful assistant.",
        ...     formatted_prompt="What are the first 5 Fibonacci numbers?",
        ... )
        >>> prompt.format_as("default")
        'You are a helpful assistant. What are the first 5 Fibonacci numbers?'
    """
    if format == "default":
        return f"{self.system_prompt}\n{self.formatted_prompt}"
    elif format == "openai":
        return [
            ChatCompletion(
                role="system",
                content=self.system_prompt,
            ),
            ChatCompletion(role="user", content=self.formatted_prompt),
        ]
    elif format == "llama2":
        return f"<s>[INST] <<SYS>>\n{self.system_prompt}<</SYS>>\n\n{self.formatted_prompt} [/INST]"
    elif format == "chatml":
        return f"<|im_start|>system\n{self.system_prompt}<|im_end|>\n<|im_start|>user\n{self.formatted_prompt}<|im_end|>\n<|im_start|>assistant\n"
    elif format in ["zephyr", "notus"]:
        return f"<|system|>\n{self.system_prompt}</s>\n<|user|>\n{self.formatted_prompt}</s>\n<|assistant|>\n"
    else:
        raise ValueError(
            f"Format {format} not supported, please provide a custom `prompt_formatting_fn`"
            " or use any of the available formats: openai, llama2, chatml, zephyr"
        )
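
A short usage sketch of the formats above: the llama2 string is fully determined by the template in the source, while the openai format returns a list with a system and a user ChatCompletion message.

from distilabel.tasks.prompt import Prompt

prompt = Prompt(
    system_prompt="You are a helpful assistant.",
    formatted_prompt="What are the first 5 Fibonacci numbers?",
)

print(prompt.format_as("llama2"))
# <s>[INST] <<SYS>>
# You are a helpful assistant.<</SYS>>
#
# What are the first 5 Fibonacci numbers? [/INST]

messages = prompt.format_as("openai")  # a list with a "system" and a "user" message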

SelfInstructTask dataclass

Bases: TextGenerationTask

A TextGenerationTask following the Self-Instruct specification for building the prompts.

Parameters:

- system_prompt (str, optional): the system prompt to be used. Defaults to an expert prompt-writer persona (see the full default in the source below).
- principles (Dict[str, List[str]], optional): the principles to be used for the system prompt. Defaults to a mapping of the UltraFeedback principle groups (harmlessness, helpfulness, truthfulness, honesty, verbalized_calibration) to their principles.
- principles_distribution (Union[Dict[str, float], Literal["balanced"], None], optional): the distribution of principles to be used for the system prompt. Defaults to None.
- application_description (str, optional): the description of the AI application. Defaults to "AI assistant".
- num_instructions (int, optional): the number of instructions to be used for the prompt. Defaults to 5.
References

- Self-Instruct: Aligning Language Models with Self-Generated Instructions: https://arxiv.org/abs/2212.10560
- Self-Instruct (GitHub repository): https://github.com/yizhongw/self-instruct
Source code in src/distilabel/tasks/text_generation/self_instruct.py
@dataclass
class SelfInstructTask(TextGenerationTask):
    """A `TextGenerationTask` following the Self-Instruct specification for building
    the prompts.

    Args:
        system_prompt (str, optional): the system prompt to be used. Defaults to `None`.
        principles (Dict[str, List[str]], optional): the principles to be used for the system prompt.
            Defaults to `None`.
        principles_distribution (Union[Dict[str, float], Literal["balanced"], None], optional): the
            distribution of principles to be used for the system prompt. Defaults to `None`.
        application_description (str, optional): the description of the AI application. Defaults to
            "AI assistant".
        num_instructions (int, optional): the number of instructions to be used for the prompt.
            Defaults to 5.

    References:
        - [`Self-Instruct: Aligning Language Models with Self-Generated Instructions`](https://arxiv.org/abs/2212.10560)
        - [`Self-Instruct - GitHub Repository`](https://github.com/yizhongw/self-instruct)
    """

    system_prompt: str = (
        "You are an expert prompt writer, writing the best and most diverse prompts for a variety of tasks."
        " You are given a task description and a set of instructions for how to write the prompts for an"
        " specific AI application."
    )
    application_description: str = "AI assistant"
    num_instructions: int = 5

    __jinja2_template__: str = _SELF_INSTRUCT_TEMPLATE

    def generate_prompt(self, input: str, **_: Any) -> Prompt:
        """Generates a prompt following the Self-Instruct specification.

        Args:
            input (str): the input to be used for the prompt.

        Returns:
            Prompt: the generated prompt.

        Examples:
            >>> from distilabel.tasks.text_generation import SelfInstructTask
            >>> task = SelfInstructTask(system_prompt="You are a helpful assistant.", num_instructions=2)
            >>> task.generate_prompt("What are the first 5 Fibonacci numbers?")
            Prompt(
                system_prompt="You are a helpful assistant.",
                formatted_prompt="# Task Description ...",
            )
        """
        render_kwargs = {
            "application_description": self.application_description,
            "num_instructions": self.num_instructions,
            "input": input,
        }
        return Prompt(
            system_prompt=self.system_prompt,
            formatted_prompt=self.template.render(**render_kwargs),
        )

    @property
    def output_args_names(self) -> List[str]:
        return ["instructions"]

    def parse_output(self, output: str) -> Dict[str, List[str]]:
        """Parses the output of the model into the desired format."""
        pattern = re.compile(r"\d+\.\s*(.*?)\n")
        return {"instructions": pattern.findall(output)}

    def to_argilla_dataset(self, dataset_row: Dict[str, Any]) -> "FeedbackDataset":
        # First we infer the fields from the input_args_names, but we could also
        # create those manually instead using `rg.TextField(...)`
        fields = infer_fields_from_dataset_row(
            field_names=self.input_args_names,
            dataset_row=dataset_row,
        )
        # Once the input fields have been defined, then we also include the instruction
        # field which will be fulfilled with each of the instructions generated.
        fields.append(rg.TextField(name="instruction", title="instruction"))  # type: ignore
        # Then we add a default `RatingQuestion` which asks the users to provide a
        # rating for each of the generations, differing from the scenario where the inputs
        # are the fields and the outputs the ones used to formulate the questions. So on,
        # in this scenario we won't have suggestions, as the questions will be related to the
        # combination of inputs and outputs.
        questions = [
            rg.RatingQuestion(  # type: ignore
                name="instruction-rating",
                title="How would you rate the generated instruction?",
                values=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            )
        ]
        # Finally, we define some metadata properties that can be potentially used
        # while exploring the dataset within Argilla to get more insights on the data.
        metadata_properties = []
        for arg_name in self.input_args_names:
            if isinstance(dataset_row[arg_name], list):
                for idx in range(1, len(dataset_row[arg_name]) + 1):
                    metadata_properties.append(
                        rg.IntegerMetadataProperty(name=f"length-{arg_name}-{idx}")  # type: ignore
                    )
            elif isinstance(dataset_row[arg_name], str):
                metadata_properties.append(
                    rg.IntegerMetadataProperty(name=f"length-{arg_name}")  # type: ignore
                )
            else:
                warnings.warn(
                    f"Unsupported input type ({type(dataset_row[arg_name])}), skipping...",
                    UserWarning,
                    stacklevel=2,
                )
        metadata_properties.append(
            rg.IntegerMetadataProperty(name="length-instruction")  # type: ignore
        )  # type: ignore
        # Then we just return the `FeedbackDataset` with the fields, questions, and metadata properties
        # defined above.
        return rg.FeedbackDataset(
            fields=fields,
            questions=questions,  # type: ignore
            metadata_properties=metadata_properties,  # Note that these are always optional
        )

    def to_argilla_record(
        self,
        dataset_row: Dict[str, Any],
        instructions_column: Optional[str] = "instructions",
    ) -> List["FeedbackRecord"]:
        """Converts a dataset row to a list of Argilla `FeedbackRecord`s."""
        records = []
        for instructions in dataset_row[instructions_column]:  # type: ignore
            for instruction in instructions:
                fields, metadata = {}, {}
                for arg_name in self.input_args_names:
                    arg_value = dataset_row[arg_name]
                    if isinstance(arg_value, list):
                        for idx, value in enumerate(arg_value, start=1):
                            value = value.strip() if isinstance(value, str) else ""
                            fields[f"{arg_name}-{idx}"] = value
                            if value is not None:
                                metadata[f"length-{arg_name}-{idx}"] = len(value)
                    elif isinstance(arg_value, str):
                        fields[arg_name] = arg_value.strip() if arg_value else ""
                        if arg_value is not None:
                            metadata[f"length-{arg_name}"] = len(arg_value.strip())
                    else:
                        warnings.warn(
                            f"Unsupported input type ({type(arg_value)}), skipping...",
                            UserWarning,
                            stacklevel=2,
                        )
                fields["instruction"] = instruction
                metadata["length-instruction"] = len(instruction)

                # Then we add the model metadata from the `generation_model` and `labelling_model`
                # columns of the dataset, if they exist.
                metadata.update(
                    model_metadata_from_dataset_row(dataset_row=dataset_row)
                )
                # Finally, we append the `FeedbackRecord` with the fields and the metadata
                records.append(rg.FeedbackRecord(fields=fields, metadata=metadata))
        if not records:
            raise ValueError(
                f"Skipping the row {dataset_row} as the list of `FeedbackRecord` is empty as those could not be inferred."
            )
        return records

generate_prompt(input, **_)

Generates a prompt following the Self-Instruct specification.

Parameters:

- input (str, required): the input to be used for the prompt.

Returns:

- Prompt: the generated prompt.

Examples:

>>> from distilabel.tasks.text_generation import SelfInstructTask
>>> task = SelfInstructTask(system_prompt="You are a helpful assistant.", num_instructions=2)
>>> task.generate_prompt("What are the first 5 Fibonacci numbers?")
Prompt(
    system_prompt="You are a helpful assistant.",
    formatted_prompt="# Task Description ...",
)
Source code in src/distilabel/tasks/text_generation/self_instruct.py
def generate_prompt(self, input: str, **_: Any) -> Prompt:
    """Generates a prompt following the Self-Instruct specification.

    Args:
        input (str): the input to be used for the prompt.

    Returns:
        Prompt: the generated prompt.

    Examples:
        >>> from distilabel.tasks.text_generation import SelfInstructTask
        >>> task = SelfInstructTask(system_prompt="You are a helpful assistant.", num_instructions=2)
        >>> task.generate_prompt("What are the first 5 Fibonacci numbers?")
        Prompt(
            system_prompt="You are a helpful assistant.",
            formatted_prompt="# Task Description ...",
        )
    """
    render_kwargs = {
        "application_description": self.application_description,
        "num_instructions": self.num_instructions,
        "input": input,
    }
    return Prompt(
        system_prompt=self.system_prompt,
        formatted_prompt=self.template.render(**render_kwargs),
    )

parse_output(output)

Parses the output of the model into the desired format.

Source code in src/distilabel/tasks/text_generation/self_instruct.py
def parse_output(self, output: str) -> Dict[str, List[str]]:
    """Parses the output of the model into the desired format."""
    pattern = re.compile(r"\d+\.\s*(.*?)\n")
    return {"instructions": pattern.findall(output)}

to_argilla_record(dataset_row, instructions_column='instructions')

Converts a dataset row to a list of Argilla FeedbackRecords.

Source code in src/distilabel/tasks/text_generation/self_instruct.py
def to_argilla_record(
    self,
    dataset_row: Dict[str, Any],
    instructions_column: Optional[str] = "instructions",
) -> List["FeedbackRecord"]:
    """Converts a dataset row to a list of Argilla `FeedbackRecord`s."""
    records = []
    for instructions in dataset_row[instructions_column]:  # type: ignore
        for instruction in instructions:
            fields, metadata = {}, {}
            for arg_name in self.input_args_names:
                arg_value = dataset_row[arg_name]
                if isinstance(arg_value, list):
                    for idx, value in enumerate(arg_value, start=1):
                        value = value.strip() if isinstance(value, str) else ""
                        fields[f"{arg_name}-{idx}"] = value
                        if value is not None:
                            metadata[f"length-{arg_name}-{idx}"] = len(value)
                elif isinstance(arg_value, str):
                    fields[arg_name] = arg_value.strip() if arg_value else ""
                    if arg_value is not None:
                        metadata[f"length-{arg_name}"] = len(arg_value.strip())
                else:
                    warnings.warn(
                        f"Unsupported input type ({type(arg_value)}), skipping...",
                        UserWarning,
                        stacklevel=2,
                    )
            fields["instruction"] = instruction
            metadata["length-instruction"] = len(instruction)

            # Then we add the model metadata from the `generation_model` and `labelling_model`
            # columns of the dataset, if they exist.
            metadata.update(
                model_metadata_from_dataset_row(dataset_row=dataset_row)
            )
            # Finally, we append the `FeedbackRecord` with the fields and the metadata
            records.append(rg.FeedbackRecord(fields=fields, metadata=metadata))
    if not records:
        raise ValueError(
            f"Skipping the row {dataset_row} as the list of `FeedbackRecord` is empty as those could not be inferred."
        )
    return records

Task

Bases: ABC

Abstract class used to define the methods required to create a Task, to be used within an LLM.

Parameters:

- system_prompt (str, required): the system prompt to be used for generation.
- task_description (Union[str, None], optional): the description of the task. Defaults to None.

Raises:

- ValueError: if the __jinja2_template__ attribute is not provided.

Source code in src/distilabel/tasks/base.py
class Task(ABC):
    """Abstract class used to define the methods required to create a `Task`, to be used
    within an `LLM`.

    Args:
        system_prompt (str): the system prompt to be used for generation.
        task_description (Union[str, None], optional): the description of the task. Defaults to `None`.

    Raises:
        ValueError: if the `__jinja2_template__` attribute is not provided.
    """

    system_prompt: str
    task_description: Union[str, None] = None

    __jinja2_template__: Union[str, None] = None
    __type__: Union[Literal["generation", "labelling"], None] = None

    def __rich_repr__(self) -> Generator[Any, None, None]:
        yield "system_prompt", self.system_prompt
        yield "task_description", self.task_description
        yield "input_args_names", self.input_args_names
        yield "output_args_names", self.output_args_names

    @property
    def template(self) -> "Template":
        if self.__jinja2_template__ is None:
            raise ValueError(
                "You must provide a `__jinja2_template__` attribute to your Task subclass."
            )

        return Template(open(self.__jinja2_template__).read())

    @abstractmethod
    def generate_prompt(self, **kwargs: Any) -> Prompt:
        pass

    @abstractmethod
    def parse_output(self, output: str) -> Any:
        pass

    @property
    @abstractmethod
    def input_args_names(self) -> List[str]:
        pass

    @property
    @abstractmethod
    def output_args_names(self) -> List[str]:
        pass

    def validate_dataset(self, columns_in_dataset: List[str]) -> None:
        """Validates that the dataset contains the required columns for the task.

        Args:
            columns_in_dataset (List[str]): the columns in the dataset.

        Raises:
            KeyError: if the dataset does not contain the required columns.
        """
        for input_arg_name in self.input_args_names:
            if input_arg_name not in columns_in_dataset:
                raise KeyError(
                    f"LLM expects a column named '{input_arg_name}' in the provided"
                    " dataset, but it was not found."
                )

    def to_argilla_dataset(
        self, dataset_row: Dict[str, Any], *args: Any, **kwargs: Any
    ) -> "FeedbackDataset":
        raise NotImplementedError(
            "`to_argilla_dataset` is not implemented, if you want to export your dataset as an Argilla"
            " `FeedbackDataset` you will need to implement this method first."
        )

    def to_argilla_record(
        self, dataset_row: Dict[str, Any], *args: Any, **kwargs: Any
    ) -> Union["FeedbackRecord", List["FeedbackRecord"]]:
        raise NotImplementedError(
            "`to_argilla_record` is not implemented, if you want to export your dataset as an Argilla"
            " `FeedbackDataset` you will need to implement this method first."
        )

    # Renamed to _to_argilla_record instead of renaming `to_argilla_record` to protected, as that would
    # imply more breaking changes.
    def _to_argilla_record(  # noqa: C901
        self, dataset_row: Dict[str, Any], *args: Any, **kwargs: Any
    ) -> Union["FeedbackRecord", List["FeedbackRecord"]]:
        column_names = list(dataset_row.keys())
        if self.__type__ is None or self.__type__ == "generation":
            required_column_names = self.input_args_names + self.output_args_names
        elif self.__type__ == "labelling":
            required_column_names = self.output_args_names
        else:
            raise ValueError("The task type is not supported.")

        dataset_rows = [dataset_row]
        if "generation_model" in dataset_row and isinstance(
            dataset_row["generation_model"], list
        ):
            generation_columns = column_names[
                column_names.index("generation_model") : column_names.index(
                    "labelling_model"
                )
                if "labelling_model" in column_names
                else None
            ]
            if any(
                isinstance(nested, list)
                for column_name in list(
                    set(generation_columns)
                    - {
                        "generation_model",
                        "generation_prompt",
                        "raw_generation_response",
                    }
                )
                for nested in dataset_row[column_name]
            ):
                if any(
                    generation_column in required_column_names
                    for generation_column in generation_columns
                ):
                    unwrapped_dataset_rows = []
                    for row in dataset_rows:
                        for idx in range(len(dataset_row["generation_model"])):
                            unwrapped_dataset_row = {}
                            for key, value in row.items():
                                if key in generation_columns:
                                    unwrapped_dataset_row[key] = value[idx]
                                else:
                                    unwrapped_dataset_row[key] = value
                            unwrapped_dataset_rows.append(unwrapped_dataset_row)
                    dataset_rows = unwrapped_dataset_rows

        if "labelling_model" in dataset_row and isinstance(
            dataset_row["labelling_model"], list
        ):
            labelling_columns = column_names[column_names.index("labelling_model") :]
            if any(
                isinstance(nested, list)
                for column_name in list(
                    set(labelling_columns)
                    - {
                        "labelling_model",
                        "labelling_prompt",
                        "raw_labelling_response",
                    }
                )
                for nested in dataset_row[column_name]
            ):
                if any(
                    labelling_column in required_column_names
                    for labelling_column in labelling_columns
                ):
                    unwrapped_dataset_rows = []
                    for row in dataset_rows:
                        for idx in range(len(dataset_row["labelling_model"])):
                            unwrapped_dataset_row = {}
                            for key, value in row.items():
                                if key in labelling_columns:
                                    unwrapped_dataset_row[key] = value[idx]
                                else:
                                    unwrapped_dataset_row[key] = value
                            unwrapped_dataset_rows.append(unwrapped_dataset_row)
                    dataset_rows = unwrapped_dataset_rows

        if len(dataset_rows) == 1:
            return self.to_argilla_record(dataset_rows[0], *args, **kwargs)

        records = []
        for dataset_row in dataset_rows:
            generated_records = self.to_argilla_record(dataset_row, *args, **kwargs)
            if isinstance(generated_records, list):
                records.extend(generated_records)
            else:
                records.append(generated_records)
        return records
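
Because Task only fixes the interface, a custom task comes down to implementing the two abstract methods and the two argument-name properties. The following is a minimal, hypothetical sketch that builds the Prompt directly instead of going through a __jinja2_template__ file:

from dataclasses import dataclass
from typing import Any, Dict, List

from distilabel.tasks.base import Task
from distilabel.tasks.prompt import Prompt


@dataclass
class EchoTask(Task):
    """Hypothetical task that forwards the input as the prompt and returns the raw output."""

    system_prompt: str = "You are a helpful assistant."

    def generate_prompt(self, input: str, **_: Any) -> Prompt:
        return Prompt(system_prompt=self.system_prompt, formatted_prompt=input)

    def parse_output(self, output: str) -> Dict[str, str]:
        return {"generations": output}

    @property
    def input_args_names(self) -> List[str]:
        return ["input"]

    @property
    def output_args_names(self) -> List[str]:
        return ["generations"]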

validate_dataset(columns_in_dataset)

Validates that the dataset contains the required columns for the task.

Parameters:

- columns_in_dataset (List[str], required): the columns in the dataset.

Raises:

- KeyError: if the dataset does not contain the required columns.

Source code in src/distilabel/tasks/base.py
def validate_dataset(self, columns_in_dataset: List[str]) -> None:
    """Validates that the dataset contains the required columns for the task.

    Args:
        columns_in_dataset (List[str]): the columns in the dataset.

    Raises:
        KeyError: if the dataset does not contain the required columns.
    """
    for input_arg_name in self.input_args_names:
        if input_arg_name not in columns_in_dataset:
            raise KeyError(
                f"LLM expects a column named '{input_arg_name}' in the provided"
                " dataset, but it was not found."
            )
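
For instance, with the TextGenerationTask documented below, whose only required input column is input, validation passes when that column is present and raises a KeyError otherwise:

from distilabel.tasks.text_generation import TextGenerationTask

task = TextGenerationTask()
task.validate_dataset(["input", "metadata"])  # passes: the required 'input' column is present

try:
    task.validate_dataset(["instruction"])  # 'input' is missing
except KeyError as exc:
    print(exc)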

TextGenerationTask dataclass

Bases: Task

A base Task definition for text generation using LLMs.

Parameters:

- system_prompt (str, optional): the system prompt to be used. Defaults to a helpful, respectful, and honest assistant persona (see the full default in the source below).
- principles (Dict[str, List[str]], optional): the principles to be used for the system prompt. Defaults to a mapping of the UltraFeedback principle groups (harmlessness, helpfulness, truthfulness, honesty, verbalized_calibration) to their principles.
- principles_distribution (Union[Dict[str, float], Literal['balanced'], None], optional): the distribution of principles to be used for the system prompt. Defaults to None.

Examples:

>>> from distilabel.tasks.text_generation import TextGenerationTask
>>> task = TextGenerationTask()
Source code in src/distilabel/tasks/text_generation/base.py
@dataclass
class TextGenerationTask(Task):
    """A base `Task` definition for text generation using LLMs.

    Args:
        system_prompt (str, optional): the system prompt to be used. Defaults to `None`.
        principles (Dict[str, List[str]], optional): the principles to be used for the system prompt.
            Defaults to `None`.
        principles_distribution (Union[Dict[str, float], Literal["balanced"], None], optional): the
            distribution of principles to be used for the system prompt. Defaults to `None`.

    Examples:
        >>> from distilabel.tasks.text_generation import TextGenerationTask
        >>> task = TextGenerationTask()
    """

    system_prompt: str = (
        "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible,"
        " while being safe. Your answers should not include any harmful, unethical, racist, sexist,"
        " toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased"
        " and positive in nature.\nIf a question does not make any sense, or is not factually coherent,"
        " explain why instead of answering something not correct. If you don't know the answer to a"
        " question, please don't share false information."
    )
    principles: Dict[str, List[str]] = field(
        default_factory=lambda: {
            "harmlessness": UltraFeedbackPrinciples.harmlessness,
            "helpfulness": UltraFeedbackPrinciples.helpfulness,
            "truthfulness": UltraFeedbackPrinciples.truthfulness,
            "honesty": UltraFeedbackPrinciples.honesty,
            "verbalized_calibration": UltraFeedbackPrinciples.verbalized_calibration,
        },
        repr=False,
    )
    principles_distribution: Union[Dict[str, float], Literal["balanced"], None] = None

    __type__: ClassVar[Literal["generation"]] = "generation"

    def __post_init__(self) -> None:
        """Validates the `principles_distribution` if it is a dict.

        Raises:
            ValueError: if the `principles_distribution` is a dict and it does not sum to 1.0.
            ValueError: if the `principles` are not included in the `principles_distribution`.
        """
        if isinstance(self.principles_distribution, dict):
            not_included_principles = [
                principle
                for principle in self.principles
                if principle not in self.principles_distribution
            ]
            if not_included_principles:
                principles_str = ", ".join(
                    [f"'{principle}'" for principle in not_included_principles]
                )
                raise ValueError(
                    f"Principles {principles_str} included in `principles` is not in"
                    " `principles_distribution`"
                )

            if sum(self.principles_distribution.values()) != 1.0:
                raise ValueError(
                    "`principles_distribution` must sum to 1.0 if it is a dict containing"
                    " the distribution of principles to use."
                )

    def _get_principle(self) -> str:
        """Gets a principle from the `principles` dict respecting the `principal_distribution`.

        Returns:
            str: the principle to be used.
        """
        if isinstance(self.principles_distribution, dict):
            principle_group = random.choices(
                list(self.principles_distribution.keys()),
                weights=list(self.principles_distribution.values()),
                k=1,
            )[0]
        else:
            principle_group = random.choice(list(self.principles.keys()))
        return random.choice(self.principles[principle_group])

    def generate_prompt(self, input: str, **_: Any) -> Prompt:
        """Generates the prompt to be used for generation.

        Args:
            input (str): the input to be used for generation.

        Returns:
            Prompt: the generated prompt.

        Examples:
            >>> from distilabel.tasks.text_generation import TextGenerationTask
            >>> task = TextGenerationTask(system_prompt="You are a helpful assistant.")
            >>> task.generate_prompt("What are the first 5 Fibonacci numbers?")
            Prompt(system_prompt='You are a helpful assistant.', formatted_prompt='What are the first 5 Fibonacci numbers?')
        """
        system_prompt = self.system_prompt
        if self.principles_distribution is not None:
            principle = self._get_principle()
            system_prompt += " " + principle
        return Prompt(system_prompt=system_prompt, formatted_prompt=input)

    def parse_output(self, output: str) -> Dict[str, str]:
        """Parses the output of the LLM into the desired format."""
        return {"generations": output}

    @property
    def input_args_names(self) -> List[str]:
        """Returns the input args names for the task."""
        return ["input"]

    @property
    def output_args_names(self) -> List[str]:
        """Returns the output args names for the task."""
        return ["generations"]

    def to_argilla_dataset(
        self,
        dataset_row: Dict[str, Any],
        generations_column: Optional[str] = "generations",
    ) -> "FeedbackDataset":
        # First we infer the fields from the input_args_names, but we could also
        # create those manually instead using `rg.TextField(...)`
        fields = infer_fields_from_dataset_row(
            field_names=self.input_args_names + self.output_args_names,
            dataset_row=dataset_row,
        )
        # Then we add a default `RatingQuestion` which asks the users to provide a
        # rating for each of the generations, differing from the scenario where the inputs
        # are the fields and the outputs the ones used to formulate the questions. So on,
        # in this scenario we won't have suggestions, as the questions will be related to the
        # combination of inputs and outputs.
        if generations_column is None or generations_column not in dataset_row:
            raise ValueError(
                f"The `generations_column='{generations_column}'` is not present in the dataset"
                f" row. Please provide any of {list(dataset_row.keys())}.",
            )
        questions = []
        for idx in range(1, len(dataset_row[generations_column]) + 1):
            questions.append(
                rg.RatingQuestion(  # type: ignore
                    name=f"{generations_column}-{idx}-rating",
                    title=f"How would you rate the generation at `{generations_column}-{idx}`?",
                    values=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                )
            )
        # Finally, we define some metadata properties that can be potentially used
        # while exploring the dataset within Argilla to get more insights on the data.
        metadata_properties = []
        for arg_name in self.input_args_names + self.output_args_names:
            if isinstance(dataset_row[arg_name], list):
                for idx in range(1, len(dataset_row[arg_name]) + 1):
                    metadata_properties.append(
                        rg.IntegerMetadataProperty(name=f"length-{arg_name}-{idx}")  # type: ignore
                    )
            elif isinstance(dataset_row[arg_name], str):
                metadata_properties.append(
                    rg.IntegerMetadataProperty(name=f"length-{arg_name}")  # type: ignore
                )
            else:
                warnings.warn(
                    f"Unsupported input type ({type(dataset_row[arg_name])}), skipping...",
                    UserWarning,
                    stacklevel=2,
                )
        # Then we just return the `FeedbackDataset` with the fields, questions, and metadata properties
        # defined above.
        return rg.FeedbackDataset(
            fields=fields,
            questions=questions,
            metadata_properties=metadata_properties,  # Note that these are always optional
        )

    def to_argilla_record(self, dataset_row: Dict[str, Any]) -> "FeedbackRecord":
        """Converts a dataset row to an Argilla `FeedbackRecord`."""
        # We start off with the fields, which are the inputs of the LLM, but also
        # build the metadata from them, as previously specified within the
        fields, metadata = {}, {}
        for arg_name in self.input_args_names + self.output_args_names:
            arg_value = dataset_row[arg_name]
            if isinstance(arg_value, list):
                for idx, value in enumerate(arg_value, start=1):
                    # TODO: value formatting was included here due to some issues
                    # with `SelfInstructTask` but these list-parsing may not be needed
                    # anymore.
                    value = (
                        value.strip()
                        if isinstance(value, str)
                        else "\n".join(value)
                        if isinstance(value, list)
                        else ""
                    )
                    fields[f"{arg_name}-{idx}"] = value
                    if value is not None:
                        metadata[f"length-{arg_name}-{idx}"] = len(value)
            elif isinstance(arg_value, str):
                fields[arg_name] = arg_value.strip() if arg_value else ""
                if arg_value is not None:
                    metadata[f"length-{arg_name}"] = len(arg_value.strip())
            else:
                warnings.warn(
                    f"Unsupported input type ({type(arg_value)}), skipping...",
                    UserWarning,
                    stacklevel=2,
                )
        # Then we add the model metadata from the `generation_model` and `labelling_model`
        # columns of the dataset, if they exist.
        metadata.update(model_metadata_from_dataset_row(dataset_row=dataset_row))
        # Finally, we return the `FeedbackRecord` with the fields and the metadata
        return rg.FeedbackRecord(fields=fields, metadata=metadata)
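
Taken together, the two methods above are what a distilabel dataset relies on when exporting rows to Argilla. A minimal sketch of that flow, assuming an Argilla 1.x client and that `task` and `dataset` are, respectively, the task instance whose methods are shown above and an iterable of row dicts produced by a pipeline (all of these are assumptions, not part of this API):

import argilla as rg

# Assumed connection details for a local Argilla instance.
rg.init(api_url="http://localhost:6900", api_key="owner.apikey")

# Infer the fields, questions and metadata properties from the first row...
feedback_dataset = task.to_argilla_dataset(dataset_row=dataset[0])
# ...then convert every row into a `FeedbackRecord` and push the result.
feedback_dataset.add_records(
    [task.to_argilla_record(dataset_row=row) for row in dataset]
)
feedback_dataset.push_to_argilla(name="labelled-dataset", workspace="admin")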

input_args_names: List[str] property

Returns the input args names for the task.

output_args_names: List[str] property

Returns the output args names for the task.

__post_init__()

Validates the principles_distribution if it is a dict.

Raises:

Type Description
ValueError

if the principles_distribution is a dict and it does not sum to 1.0.

ValueError

if the principles are not included in the principles_distribution.

Source code in src/distilabel/tasks/text_generation/base.py
def __post_init__(self) -> None:
    """Validates the `principles_distribution` if it is a dict.

    Raises:
        ValueError: if the `principles_distribution` is a dict and it does not sum to 1.0.
        ValueError: if the `principles` are not included in the `principles_distribution`.
    """
    if isinstance(self.principles_distribution, dict):
        not_included_principles = [
            principle
            for principle in self.principles
            if principle not in self.principles_distribution
        ]
        if not_included_principles:
            principles_str = ", ".join(
                [f"'{principle}'" for principle in not_included_principles]
            )
            raise ValueError(
                f"Principles {principles_str} included in `principles` is not in"
                " `principles_distribution`"
            )

        if sum(self.principles_distribution.values()) != 1.0:
            raise ValueError(
                "`principles_distribution` must sum to 1.0 if it is a dict containing"
                " the distribution of principles to use."
            )
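
As a sketch of those two checks, assuming `principles` and `principles_distribution` are regular init fields of the task (the principle names below are hypothetical):

from distilabel.tasks.text_generation import TextGenerationTask

# Every key in `principles` appears in the distribution and the weights sum to
# exactly 1.0, so this configuration passes validation.
task = TextGenerationTask(
    system_prompt="You are a helpful assistant.",
    principles={
        "conciseness": ["Answer as concisely as possible."],
        "politeness": ["Keep a polite tone."],
    },
    principles_distribution={"conciseness": 0.5, "politeness": 0.5},
)

# Dropping a key from the distribution, or using weights that do not sum to 1.0,
# raises the `ValueError`s documented above.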

generate_prompt(input, **_)

Generates the prompt to be used for generation.

Parameters:

Name Type Description Default
input str

the input to be used for generation.

required

Returns:

Name Type Description
Prompt Prompt

the generated prompt.

Examples:

>>> from distilabel.tasks.text_generation import TextGenerationTask
>>> task = TextGenerationTask(system_prompt="You are a helpful assistant.")
>>> task.generate_prompt("What are the first 5 Fibonacci numbers?")
Prompt(system_prompt='You are a helpful assistant.', formatted_prompt='What are the first 5 Fibonacci numbers?')
Source code in src/distilabel/tasks/text_generation/base.py
def generate_prompt(self, input: str, **_: Any) -> Prompt:
    """Generates the prompt to be used for generation.

    Args:
        input (str): the input to be used for generation.

    Returns:
        Prompt: the generated prompt.

    Examples:
        >>> from distilabel.tasks.text_generation import TextGenerationTask
        >>> task = TextGenerationTask(system_prompt="You are a helpful assistant.")
        >>> task.generate_prompt("What are the first 5 Fibonacci numbers?")
        Prompt(system_prompt='You are a helpful assistant.', formatted_prompt='What are the first 5 Fibonacci numbers?')
    """
    system_prompt = self.system_prompt
    if self.principles_distribution is not None:
        principle = self._get_principle()
        system_prompt += " " + principle
    return Prompt(system_prompt=system_prompt, formatted_prompt=input)
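
The returned Prompt is just a pair of plain strings, so it can be assembled into whatever chat format the target model expects. A quick sketch (the chat template below is illustrative, not a distilabel helper):

from distilabel.tasks.text_generation import TextGenerationTask

task = TextGenerationTask(system_prompt="You are a helpful assistant.")
prompt = task.generate_prompt("What are the first 5 Fibonacci numbers?")
text = (
    f"<|system|>\n{prompt.system_prompt}\n"
    f"<|user|>\n{prompt.formatted_prompt}\n"
    f"<|assistant|>\n"
)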

parse_output(output)

Parses the output of the LLM into the desired format.

Source code in src/distilabel/tasks/text_generation/base.py
def parse_output(self, output: str) -> Dict[str, str]:
    """Parses the output of the LLM into the desired format."""
    return {"generations": output}

to_argilla_record(dataset_row)

Converts a dataset row to an Argilla FeedbackRecord.

Source code in src/distilabel/tasks/text_generation/base.py
def to_argilla_record(self, dataset_row: Dict[str, Any]) -> "FeedbackRecord":
    """Converts a dataset row to an Argilla `FeedbackRecord`."""
    # We start off with the fields, which are the inputs of the LLM, but also
    # build the metadata from them, as previously specified within the `to_argilla_dataset` method.
    fields, metadata = {}, {}
    for arg_name in self.input_args_names + self.output_args_names:
        arg_value = dataset_row[arg_name]
        if isinstance(arg_value, list):
            for idx, value in enumerate(arg_value, start=1):
                # TODO: value formatting was included here due to some issues
                # with `SelfInstructTask`, but this list parsing may no longer be needed.
                value = (
                    value.strip()
                    if isinstance(value, str)
                    else "\n".join(value)
                    if isinstance(value, list)
                    else ""
                )
                fields[f"{arg_name}-{idx}"] = value
                if value is not None:
                    metadata[f"length-{arg_name}-{idx}"] = len(value)
        elif isinstance(arg_value, str):
            fields[arg_name] = arg_value.strip() if arg_value else ""
            if arg_value is not None:
                metadata[f"length-{arg_name}"] = len(arg_value.strip())
        else:
            warnings.warn(
                f"Unsupported input type ({type(arg_value)}), skipping...",
                UserWarning,
                stacklevel=2,
            )
    # Then we add the model metadata from the `generation_model` and `labelling_model`
    # columns of the dataset, if they exist.
    metadata.update(model_metadata_from_dataset_row(dataset_row=dataset_row))
    # Finally, we return the `FeedbackRecord` with the fields and the metadata
    return rg.FeedbackRecord(fields=fields, metadata=metadata)
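
A sketch of the resulting field / metadata naming for a `TextGenerationTask` row (the row content is made up and model-related metadata columns are omitted):

from distilabel.tasks.text_generation import TextGenerationTask

task = TextGenerationTask(system_prompt="You are a helpful assistant.")
record = task.to_argilla_record(
    dataset_row={"input": "Name three primes.", "generations": ["2, 3, 5", "2, 3, 7"]}
)
# record.fields   -> {"input": "Name three primes.", "generations-1": "2, 3, 5", "generations-2": "2, 3, 7"}
# record.metadata -> {"length-input": 18, "length-generations-1": 7, "length-generations-2": 7}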

UltraCMTask dataclass

Bases: CritiqueTask

A CritiqueTask following the prompt template used by UltraCM (from UltraFeedback).

Parameters:

Name Type Description Default
system_prompt str

the system prompt to be used for generation. Defaults to None.

"User: A one-turn chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, very detailed, and polite answers to the user's questions.</s>"
Disclaimer

Since the UltraCM model has been trained on data generated with the OpenAI API, the prompting strategy may only be consistent / compliant with either GPT-3.5 or GPT-4 from the OpenAI API, or with the UltraCM model itself. Any other model may fail to generate a structured output, or may provide an incorrect / inaccurate critique.

References

UltraFeedback: Boosting Language Models with High-quality Feedback (https://arxiv.org/abs/2310.01377)
UltraFeedback - GitHub Repository (https://github.com/OpenBMB/UltraFeedback)
openbmb/UltraCM-13b (https://huggingface.co/openbmb/UltraCM-13b)

Source code in src/distilabel/tasks/critique/ultracm.py
@dataclass
class UltraCMTask(CritiqueTask):
    """A `CritiqueTask` following the prompt templated used by UltraCM (from UltraFeedback).

    Args:
        system_prompt (str, optional): the system prompt to be used for generation. Defaults to `None`.

    Disclaimer:
        Since the UltraCM model has been trained on data generated with the OpenAI API, the
        prompting strategy may only be consistent / compliant with either GPT-3.5 or GPT-4 from
        the OpenAI API, or with the UltraCM model itself. Any other model may fail to generate
        a structured output, or may provide an incorrect / inaccurate critique.

    References:
        - [`UltraFeedback: Boosting Language Models with High-quality Feedback`](https://arxiv.org/abs/2310.01377)
        - [`UltraFeedback - GitHub Repository`](https://github.com/OpenBMB/UltraFeedback)
        - [`openbmb/UltraCM-13b`](https://huggingface.co/openbmb/UltraCM-13b)
    """

    __jinja2_template__: ClassVar[str] = _ULTRACM_TEMPLATE

    system_prompt: str = (
        "User: A one-turn chat between a curious user and an artificial intelligence"
        " assistant. The assistant gives helpful, very detailed, and polite answers to"
        " the user's questions.</s>"
    )

    def generate_prompt(self, input: str, generations: str, **_: Any) -> Prompt:
        """Generates a prompt following the UltraCM specification.

        Args:
            input (str): the input to be used for the prompt.
            generations (List[str]): the generations to be used for the prompt, in
                this case, the ones to be critiqued.

        Returns:
            Prompt: the generated prompt.

        Examples:
            >>> from distilabel.tasks.critique import UltraCMTask
            >>> task = UltraCMTask()
            >>> task.generate_prompt(
            ...     input="What are the first 5 Fibonacci numbers?",
            ...     generations=["0 1 1 2 3", "0 1 1 2 3"],
            ... )
            Prompt(
                system_prompt="User: A one-turn chat between a curious user ...",
                formatted_prompt="User: Given my answer to an instruction, your role ...",
            )
        """
        render_kwargs = {
            "instruction": input,
            "completion": generations,
        }
        return Prompt(
            system_prompt=self.system_prompt,
            formatted_prompt=f"User: {self.template.render(**render_kwargs)}</s>\nAssistant: ### Feedback\nOverall Score: ",
        )

    def parse_output(self, output: str) -> CritiqueTaskOutput:  # type: ignore
        """Parses the output of the model into the desired format."""
        pattern = r"(\d+(?:\.\d+)?)\s*(.*)"
        match = re.match(pattern, output)
        if match:
            return CritiqueTaskOutput(
                score=float(match.group(1)),
                critique=match.group(2).strip(),
            )

    def to_argilla_dataset(
        self,
        dataset_row: Dict[str, Any],
        generations_column: str = "generations",
        score_column: str = "score",
        critique_column: str = "critique",
        score_values: Optional[List[int]] = None,
    ) -> "FeedbackDataset":
        return super().to_argilla_dataset(
            dataset_row=dataset_row,
            generations_column=generations_column,
            score_column=score_column,
            critique_column=critique_column,
            score_values=score_values or [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        )

generate_prompt(input, generations, **_)

Generates a prompt following the UltraCM specification.

Parameters:

Name Type Description Default
input str

the input to be used for the prompt.

required
generations List[str]

the generations to be used for the prompt, in this case, the ones to be critiqued.

required

Returns:

Name Type Description
Prompt Prompt

the generated prompt.

Examples:

>>> from distilabel.tasks.critique import UltraCMTask
>>> task = UltraCMTask()
>>> task.generate_prompt(
...     input="What are the first 5 Fibonacci numbers?",
...     generations=["0 1 1 2 3", "0 1 1 2 3"],
... )
Prompt(
    system_prompt="User: A one-turn chat between a curious user ...",
    formatted_prompt="User: Given my answer to an instruction, your role ...",
)
Source code in src/distilabel/tasks/critique/ultracm.py
def generate_prompt(self, input: str, generations: str, **_: Any) -> Prompt:
    """Generates a prompt following the UltraCM specification.

    Args:
        input (str): the input to be used for the prompt.
        generations (List[str]): the generations to be used for the prompt, in
            this case, the ones to be critiqued.

    Returns:
        Prompt: the generated prompt.

    Examples:
        >>> from distilabel.tasks.critique import UltraCMTask
        >>> task = UltraCMTask()
        >>> task.generate_prompt(
        ...     input="What are the first 5 Fibonacci numbers?",
        ...     generations=["0 1 1 2 3", "0 1 1 2 3"],
        ... )
        Prompt(
            system_prompt="User: A one-turn chat between a curious user ...",
            formatted_prompt="User: Given my answer to an instruction, your role ...",
        )
    """
    render_kwargs = {
        "instruction": input,
        "completion": generations,
    }
    return Prompt(
        system_prompt=self.system_prompt,
        formatted_prompt=f"User: {self.template.render(**render_kwargs)}</s>\nAssistant: ### Feedback\nOverall Score: ",
    )

parse_output(output)

Parses the output of the model into the desired format.

Source code in src/distilabel/tasks/critique/ultracm.py
def parse_output(self, output: str) -> CritiqueTaskOutput:  # type: ignore
    """Parses the output of the model into the desired format."""
    pattern = r"(\d+(?:\.\d+)?)\s*(.*)"
    match = re.match(pattern, output)
    if match:
        return CritiqueTaskOutput(
            score=float(match.group(1)),
            critique=match.group(2).strip(),
        )
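
Since the formatted prompt already ends with "Overall Score: ", the model is expected to continue with the score followed by the critique, which is exactly what the regex above extracts. A sketch with a made-up completion:

from distilabel.tasks.critique import UltraCMTask

task = UltraCMTask()
task.parse_output("7.5 The answer lists the first five Fibonacci numbers but gives no explanation.")
# -> score 7.5 and the remaining text as the critique; a completion that does not
#    start with a number yields no match, so the method returns None.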

UltraFeedbackTask dataclass

Bases: PreferenceTask

A PreferenceTask following the prompt template used by ULTRAFEEDBACK.

Parameters:

Name Type Description Default
system_prompt str

the system prompt to be used for generation. Defaults to None.

'Your role is to evaluate text quality based on given criteria.'
task_description Union[str, None]

the description of the task. Defaults to None.

required
ratings Union[List[Rating], None]

the ratings to be used for the task. Defaults to None.

required
References

UltraFeedback: Boosting Language Models with High-quality Feedback (https://arxiv.org/abs/2310.01377)
UltraFeedback - GitHub Repository (https://github.com/OpenBMB/UltraFeedback)

Source code in src/distilabel/tasks/preference/ultrafeedback.py
@dataclass
class UltraFeedbackTask(PreferenceTask):
    """A `PreferenceTask` following the prompt template used by ULTRAFEEDBACK.

    Args:
        system_prompt (str, optional): the system prompt to be used for generation. Defaults to `None`.
        task_description (Union[str, None], optional): the description of the task. Defaults to `None`.
        ratings (Union[List[Rating], None], optional): the ratings to be used for the task. Defaults to `None`.

    References:
        - [`UltraFeedback: Boosting Language Models with High-quality Feedback`](https://arxiv.org/abs/2310.01377)
        - [`UltraFeedback - GitHub Repository`](https://github.com/OpenBMB/UltraFeedback)
    """

    ratings: List[Rating]
    task_description: str

    system_prompt: (
        str
    ) = "Your role is to evaluate text quality based on given criteria."

    __jinja2_template__: ClassVar[str] = field(
        default=_ULTRAFEEDBACK_TEMPLATE, init=False, repr=False
    )
    __subtasks__: ClassVar[List[str]] = [
        "text-quality",
        "helpfulness",
        "truthfulness",
        "honesty",
        "instruction-following",
    ]

    def generate_prompt(self, input: str, generations: List[str], **_: Any) -> Prompt:
        """Generates a prompt following the ULTRAFEEDBACK specification.

        Args:
            input (str): the input to be used for the prompt.
            generations (List[str]): the generations to be used for the prompt.

        Returns:
            Prompt: the generated prompt.

        Examples:
            >>> from distilabel.tasks.preference import UltraFeedbackTask
            >>> task = UltraFeedbackTask.for_overall_quality()
            >>> task.generate_prompt("What are the first 5 Fibonacci numbers?", ["0 1 1 2 3", "0 1 1 2 3"])
            Prompt(
                system_prompt="Your role is to evaluate text quality based on given criteria.",
                formatted_prompt="# General Text Quality Assessment...",
            )
        """
        render_kwargs = {
            "task_description": self.task_description,
            "ratings": self.ratings,
            "input": input,
            "responses": generations,
        }
        return Prompt(
            system_prompt=self.system_prompt,
            formatted_prompt=self.template.render(**render_kwargs),
        )

    def parse_output(self, output: str) -> List[UltraFeedbackOutput]:
        """Parses the output of the model into the desired format."""
        parsed_output = []
        for section in output.split("#### Output for Text ")[1:]:
            rating, rationale = section.split("\n")[1:3]
            rating = float(rating.split(": ")[1])
            rationale = rationale.split(": ")[1]
            parsed_output.append(
                UltraFeedbackOutput(rating=rating, rationale=rationale)
            )
        return parsed_output

    # Override the default `to_argilla_dataset` method to provide the `ratings_values` of
    # UltraFeedback, as the default goes from 1-10 while UltraFeedback's default is 1-5
    # (0-4 actually, but Argilla doesn't support 0s).
    def to_argilla_dataset(
        self,
        dataset_row: Dict[str, Any],
        generations_column: str = "generations",
        ratings_column: str = "rating",
        rationale_column: str = "rationale",
        ratings_values: Optional[List[int]] = None,
    ) -> "FeedbackDataset":
        return super().to_argilla_dataset(
            dataset_row=dataset_row,
            generations_column=generations_column,
            ratings_column=ratings_column,
            rationale_column=rationale_column,
            ratings_values=ratings_values or [1, 2, 3, 4, 5],
        )

    # Override the default `to_argilla_record` method to provide the `ratings_values` of
    # UltraFeedback, as the default goes from 1-10 while UltraFeedback's default is 1-5
    # (0-4 actually, but Argilla doesn't support 0s).
    def to_argilla_record(
        self,
        dataset_row: Dict[str, Any],
        generations_column: str = "generations",
        ratings_column: str = "rating",
        rationale_column: str = "rationale",
        ratings_values: Optional[List[int]] = None,
    ) -> "FeedbackRecord":
        return super().to_argilla_record(
            dataset_row=dataset_row,
            generations_column=generations_column,
            ratings_column=ratings_column,
            rationale_column=rationale_column,
            ratings_values=ratings_values or [1, 2, 3, 4, 5],
        )

    @classmethod
    def for_overall_quality(
        cls,
        system_prompt: Optional[str] = None,
        task_description: Optional[str] = None,
        ratings: Optional[List[Rating]] = None,
    ) -> "UltraFeedbackTask":
        """Classmethod for the `UltraFeedbackTask` subtask defined by Argilla, in order to
        evaluate all the criteria originally defined in UltraFeedback at once, in a single
        subtask.
        """
        kwargs = {}
        if system_prompt is not None:
            kwargs.update({"system_prompt": system_prompt})
        if task_description is None:
            task_description = dedent(
                """
                # General Text Quality Assessment
                Evaluate the model's outputs based on various criteria:
                1. **Correctness & Informativeness**: Does the output provide accurate and helpful information?
                2. **Honesty & Uncertainty**: How confidently does the model convey its information, and does it express uncertainty appropriately?
                3. **Truthfulness & Hallucination**: Does the model introduce misleading or fabricated details?
                4. **Instruction Following**: Does the model's output align with given instructions and the user's intent?
                Your role is to provide a holistic assessment considering all the above factors.

                **Scoring**: Rate outputs 1 to 5 based on the overall quality, considering all aspects:
                """
            )
        kwargs.update({"task_description": task_description})

        if ratings is None:
            ratings = [
                Rating(
                    value=1,
                    description="**Low Quality**: Contains inaccuracies, may be entirely wrong or has severe hallucinations.",
                ),
                Rating(
                    value=2,
                    description="**Moderate Quality**: Addresses some aspects, but has errors or is partially aligned with instructions.",
                ),
                Rating(
                    value=3,
                    description="**Good**: Generally accurate but may contain minor errors or slight deviations.",
                ),
                Rating(
                    value=4,
                    description="**Very Good**: Near perfect, with minor issues in terms of alignment or confidence.",
                ),
                Rating(
                    value=5,
                    description="**Excellent**: Accurate, confident, aligned with instructions, and free of hallucinations.",
                ),
            ]
        kwargs.update({"ratings": ratings})
        return cls(**kwargs)

    @classmethod
    def for_helpfulness(
        cls,
        system_prompt: Optional[str] = None,
        task_description: Optional[str] = None,
        ratings: Optional[List[Rating]] = None,
    ) -> "UltraFeedbackTask":
        kwargs = {}
        if system_prompt is not None:
            kwargs.update({"system_prompt": system_prompt})

        if task_description is None:
            task_description = dedent(
                """
                # Informativeness / Helpfulness Assessment
                Evaluate if model's outputs fulfill task objectives and provide high-quality, correct, and, informative content.
                Helpfulness assessment emphasizes **Overall Quality** regarding correctness and informativeness.
                **Correctness**: Accurate computation, reasoning steps, and outputs without misunderstandings or fabrication.

                **Scoring**: Score 1 to 5 based on extent of helpfulness, regarding both informativeness and correctness:
                """
            )
        kwargs.update({"task_description": task_description})
        if ratings is None:
            ratings = [
                Rating(
                    value=1,
                    description="**Severely Incorrect**: Contains significant inaccuracies or fabricated content, even if comprehensive information is provided.",
                ),
                Rating(
                    value=2,
                    description="**Partially Incorrect**: Contains errors that may cause confusion, even though comprehensive information is present.",
                ),
                Rating(
                    value=3,
                    description="**Correct**: Accurate and provides useful information that meets the task's requirements.",
                ),
                Rating(
                    value=4,
                    description="**Highly Informative**: Accurate and extensive, providing valuable insights and detailed information.",
                ),
                Rating(
                    value=5,
                    description="**Outstandingly Helpful**: Both accurate and in-depth, offering profound insights and comprehensive information.",
                ),
            ]
        kwargs.update({"ratings": ratings})
        return cls(**kwargs)

    @classmethod
    def for_truthfulness(
        cls,
        system_prompt: Optional[str] = None,
        task_description: Optional[str] = None,
        ratings: Optional[List[Rating]] = None,
    ) -> "UltraFeedbackTask":
        kwargs = {}
        if system_prompt is not None:
            kwargs.update({"system_prompt": system_prompt})

        if task_description is None:
            task_description = dedent(
                """
                # Truthfulness and Hallucination Assessment
                Evaluate the model's accuracy in providing information without introducing misleading or fabricated details.
                Assign numeric identifier (or "None") from 1 to 3 for each type of hallucination:
                1. **Contradictory with the World (Factual Error)**: Entities, locations, concepts, or events that conflict with established knowledge.
                2. **Contradictory with Instruction and Input**: Responses diverge, introducing new facts not aligned with instructions or inputs.
                3. **Self-Contradictory / Logical Error**: Responses contain internal contradictions or logical errors within each independent text.

                **Scoring**: Rate outputs 1 to 5 based on the extent of hallucination:
                """
            )
        kwargs.update({"task_description": task_description})

        if ratings is None:
            ratings = [
                Rating(
                    value=1,
                    description="**Completely Hallucinated**: Entirely unreliable due to hallucinations.",
                ),
                Rating(
                    value=2,
                    description="**Severe Hallucination**: Nearly half contains hallucinations, severe deviation from main points.",
                ),
                Rating(
                    value=3,
                    description="**Partial Hallucination / Misunderstanding**: Overall truthful, partial misunderstanding due to hallucinations.",
                ),
                Rating(
                    value=4,
                    description="**Insignificant Hallucination**: Mostly truthful, slight hallucination not affecting main points.",
                ),
                Rating(
                    value=5, description="**No Hallucination**: Free of hallucinations."
                ),
            ]
        kwargs.update({"ratings": ratings})
        return cls(**kwargs)

    @classmethod
    def for_honesty(
        cls,
        system_prompt: Optional[str] = None,
        task_description: Optional[str] = None,
        ratings: Optional[List[Rating]] = None,
    ) -> "UltraFeedbackTask":
        kwargs = {}
        if system_prompt is not None:
            kwargs.update({"system_prompt": system_prompt})
        if task_description is None:
            task_description = dedent(
                """
                # Honesty and Uncertainty Expression Assessment
                Assess how well the model conveys honesty and uncertainty. Evaluate if the model's confidence aligns with response correctness.
                - **Indicators of Uncertainty**:
                    1. Weakeners: e.g., 'I guess,' 'probably.'
                    2. Refusal to answer.
                    3. Verbalized confidence scores: [0, 20] low; (20, 40] uncertain; (40, 60] moderate; (60, 80] leaning confident; (80, 100] high.
                - **No uncertainty expression indicate confidence.**
                - **Response Correctness**: Align with ground truth, or provide accurate content without fabrication.

                **Scoring**: Rate outputs 1 to 5 based on the extent of honesty and uncertainty:
                """
            )
        kwargs.update({"task_description": task_description})
        if ratings is None:
            ratings = [
                Rating(
                    value=1,
                    description="**Confidently Incorrect**: Confident but entirely wrong.",
                ),
                Rating(
                    value=2,
                    description="**Confident with Significant Mistakes / Unconfident Incorrect**: Confident but contains major errors. Unconfident and entirely wrong.",
                ),
                Rating(
                    value=3,
                    description="**Uncertain / 'I Don't Know' / Subtle Mistakes**: 'I don't know' or declines. Confident but contains minor errors. Unconfident and contains significant mistakes.",
                ),
                Rating(
                    value=4,
                    description="**Correct but Uncertain / Expressed Subtle Mistakes**: Correct but unconfident.",
                ),
                Rating(
                    value=5,
                    description="**Correct and Confident / Precisely Express Uncertainty**: Correct and confident. Makes mistakes, but precisely acknowledges minor errors and indicates uncertainty on potential mistakes.",
                ),
            ]
        kwargs.update({"ratings": ratings})

        return cls(**kwargs)

    @classmethod
    def for_instruction_following(
        cls,
        system_prompt: Optional[str] = None,
        task_description: Optional[str] = None,
        ratings: Optional[List[Rating]] = None,
    ) -> "UltraFeedbackTask":
        kwargs = {}
        if system_prompt is not None:
            kwargs.update({"system_prompt": system_prompt})
        if task_description is None:
            task_description = dedent(
                """
                # Instruction Following Assessment
                Evaluate alignment between output and intent. Assess understanding of task goal and restrictions.
                **Instruction Components**: Task Goal (intended outcome), Restrictions (text styles, formats, or designated methods, etc).

                **Scoring**: Rate outputs 1 to 5:
                """
            )
        kwargs.update({"task_description": task_description})
        if ratings is None:
            ratings = [
                Rating(value=1, description="**Irrelevant**: No alignment."),
                Rating(
                    value=2,
                    description="**Partial Focus**: Addresses one aspect poorly.",
                ),
                Rating(
                    value=3,
                    description="**Partial Compliance**:\n\t- (1) Meets goal or restrictions, neglecting other.\n\t- (2) Acknowledges both but slight deviations.",
                ),
                Rating(
                    value=4,
                    description="**Almost There**: Near alignment, minor deviations.",
                ),
                Rating(
                    value=5,
                    description="**Comprehensive Compliance**: Fully aligns, meets all requirements.",
                ),
            ]
        kwargs.update({"ratings": ratings})

        return cls(**kwargs)

for_overall_quality(system_prompt=None, task_description=None, ratings=None) classmethod

Classmethod for the UltraFeedbackTask subtask defined by Argilla, in order to evaluate all the criteria originally defined in UltraFeedback at once, in a single subtask.

Source code in src/distilabel/tasks/preference/ultrafeedback.py
@classmethod
def for_overall_quality(
    cls,
    system_prompt: Optional[str] = None,
    task_description: Optional[str] = None,
    ratings: Optional[List[Rating]] = None,
) -> "UltraFeedbackTask":
    """Classmethod for the `UltraFeedbackTask` subtask defined by Argilla, in order to
    evaluate all the criteria originally defined in UltraFeedback at once, in a single
    subtask.
    """
    kwargs = {}
    if system_prompt is not None:
        kwargs.update({"system_prompt": system_prompt})
    if task_description is None:
        task_description = dedent(
            """
            # General Text Quality Assessment
            Evaluate the model's outputs based on various criteria:
            1. **Correctness & Informativeness**: Does the output provide accurate and helpful information?
            2. **Honesty & Uncertainty**: How confidently does the model convey its information, and does it express uncertainty appropriately?
            3. **Truthfulness & Hallucination**: Does the model introduce misleading or fabricated details?
            4. **Instruction Following**: Does the model's output align with given instructions and the user's intent?
            Your role is to provide a holistic assessment considering all the above factors.

            **Scoring**: Rate outputs 1 to 5 based on the overall quality, considering all aspects:
            """
        )
    kwargs.update({"task_description": task_description})

    if ratings is None:
        ratings = [
            Rating(
                value=1,
                description="**Low Quality**: Contains inaccuracies, may be entirely wrong or has severe hallucinations.",
            ),
            Rating(
                value=2,
                description="**Moderate Quality**: Addresses some aspects, but has errors or is partially aligned with instructions.",
            ),
            Rating(
                value=3,
                description="**Good**: Generally accurate but may contain minor errors or slight deviations.",
            ),
            Rating(
                value=4,
                description="**Very Good**: Near perfect, with minor issues in terms of alignment or confidence.",
            ),
            Rating(
                value=5,
                description="**Excellent**: Accurate, confident, aligned with instructions, and free of hallucinations.",
            ),
        ]
    kwargs.update({"ratings": ratings})
    return cls(**kwargs)
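
Each of the subtasks listed in __subtasks__ has an analogous preconfigured constructor, so switching criteria is just a matter of picking a different classmethod:

from distilabel.tasks.preference import UltraFeedbackTask

overall = UltraFeedbackTask.for_overall_quality()
helpfulness = UltraFeedbackTask.for_helpfulness()
truthfulness = UltraFeedbackTask.for_truthfulness()
honesty = UltraFeedbackTask.for_honesty()
instruction_following = UltraFeedbackTask.for_instruction_following()

# All of them accept `system_prompt`, `task_description` and `ratings` overrides.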

generate_prompt(input, generations, **_)

Generates a prompt following the ULTRAFEEDBACK specification.

Parameters:

Name Type Description Default
input str

the input to be used for the prompt.

required
generations List[str]

the generations to be used for the prompt.

required

Returns:

Name Type Description
Prompt Prompt

the generated prompt.

Examples:

>>> from distilabel.tasks.preference import UltraFeedbackTask
>>> task = UltraFeedbackTask.for_overall_quality()
>>> task.generate_prompt("What are the first 5 Fibonacci numbers?", ["0 1 1 2 3", "0 1 1 2 3"])
Prompt(
    system_prompt="Your role is to evaluate text quality based on given criteria.",
    formatted_prompt="# General Text Quality Assessment...",
)
Source code in src/distilabel/tasks/preference/ultrafeedback.py
def generate_prompt(self, input: str, generations: List[str], **_: Any) -> Prompt:
    """Generates a prompt following the ULTRAFEEDBACK specification.

    Args:
        input (str): the input to be used for the prompt.
        generations (List[str]): the generations to be used for the prompt.

    Returns:
        Prompt: the generated prompt.

    Examples:
        >>> from distilabel.tasks.preference import UltraFeedbackTask
        >>> task = UltraFeedbackTask.for_overall_quality()
        >>> task.generate_prompt("What are the first 5 Fibonacci numbers?", ["0 1 1 2 3", "0 1 1 2 3"])
        Prompt(
            system_prompt="Your role is to evaluate text quality based on given criteria.",
            formatted_prompt="# General Text Quality Assessment...",
        )
    """
    render_kwargs = {
        "task_description": self.task_description,
        "ratings": self.ratings,
        "input": input,
        "responses": generations,
    }
    return Prompt(
        system_prompt=self.system_prompt,
        formatted_prompt=self.template.render(**render_kwargs),
    )

parse_output(output)

Parses the output of the model into the desired format.

Source code in src/distilabel/tasks/preference/ultrafeedback.py
def parse_output(self, output: str) -> List[UltraFeedbackOutput]:
    """Parses the output of the model into the desired format."""
    parsed_output = []
    for section in output.split("#### Output for Text ")[1:]:
        rating, rationale = section.split("\n")[1:3]
        rating = float(rating.split(": ")[1])
        rationale = rationale.split(": ")[1]
        parsed_output.append(
            UltraFeedbackOutput(rating=rating, rationale=rationale)
        )
    return parsed_output
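
A sketch of the completion format this parser expects, using the section headers it splits on (the ratings and rationales below are made up):

from distilabel.tasks.preference import UltraFeedbackTask

task = UltraFeedbackTask.for_overall_quality()
output = (
    "#### Output for Text 1\n"
    "Rating: 5\n"
    "Rationale: Correct, complete and concise.\n"
    "\n"
    "#### Output for Text 2\n"
    "Rating: 3\n"
    "Rationale: Starts the sequence at 1 instead of 0.\n"
)
task.parse_output(output)
# -> two UltraFeedbackOutput entries with ratings 5.0 and 3.0 and the corresponding rationales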

UltraJudgeTask dataclass

Bases: PreferenceTask

A PreferenceTask for the UltraJudge task. The UltraJudge task has been defined by Argilla specifically to provide a better evaluation using AI Feedback. The task is defined based on both UltraFeedback and JudgeLM, but with several improvements / modifications.

Parameters:

Name Type Description Default
system_prompt str

the system prompt to be used for generation. Defaults to None.

"You are an evaluator tasked with assessing AI assistants' responses from the perspective of typical user preferences. Your critical analysis should focus on human-like engagement, solution effectiveness, accuracy, clarity, and creativity. Approach each response as if you were the user, considering how well the response meets your needs and expectations in a real-world scenario. Provide detailed feedback that highlights strengths and areas for improvement in each response, keeping in mind the goal of simulating a human's preferred choice. Your evaluation should be impartial and thorough, reflecting a human's perspective in preferring responses that are practical, clear, authentic, and aligned with their intent. Avoid bias, and focus on the content and quality of the responses."
task_description Union[str, None]

the description of the task. Defaults to None.

"Your task is to rigorously evaluate the performance of {num_responses} AI assistants, simulating a human's perspective. You will assess each response based on four key domains, reflecting aspects that are typically valued by humans: {areas}. First provide a score between 0 and 10 and write a detailed feedback for each area and assistant. Finally, provide a list of {num_responses} scores, each separated by a space, to reflect the performance of Assistants 1 to {num_responses}."
areas List[str]

the areas to be used for the task. Defaults to a list of four areas: "Practical Accuracy", "Clarity & Transparency", "Authenticity & Reliability", and "Compliance with Intent".

field(default_factory=lambda : ['Practical Accuracy', 'Clarity & Transparency', 'Authenticity & Reliability', 'Compliance with Intent'])
References

UltraFeedback: Boosting Language Models with High-quality Feedback (https://arxiv.org/abs/2310.01377)
Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena (https://arxiv.org/abs/2306.05685)

Source code in src/distilabel/tasks/preference/ultrajudge.py
@dataclass
class UltraJudgeTask(PreferenceTask):
    """A `PreferenceTask` for the UltraJudge task. The `UltraJudge` task has been defined
    at Argilla specifically for a better evaluation using AI Feedback. The task is defined
    based on both UltraFeedback and JudgeLM, but with several improvements / modifications.

    Args:
        system_prompt (str, optional): the system prompt to be used for generation. Defaults to `None`.
        task_description (Union[str, None], optional): the description of the task. Defaults to `None`.
        areas (List[str], optional): the areas to be used for the task. Defaults to a list of four areas:
            "Practical Accuracy", "Clarity & Transparency", "Authenticity & Reliability", and "Compliance with Intent".

    References:
        - [`UltraFeedback: Boosting Language Models with High-quality Feedback`](https://arxiv.org/abs/2310.01377)
        - [`Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena`](https://arxiv.org/abs/2306.05685)
    """

    system_prompt: str = (
        "You are an evaluator tasked with assessing AI assistants' responses from the perspective of typical user preferences."
        " Your critical analysis should focus on human-like engagement, solution effectiveness, accuracy, clarity, and"
        " creativity. Approach each response as if you were the user, considering how well the response meets your needs"
        " and expectations in a real-world scenario. Provide detailed feedback that highlights strengths and areas for"
        " improvement in each response, keeping in mind the goal of simulating a human's preferred choice. "
        "Your evaluation should be impartial and thorough, reflecting a human's perspective in preferring responses that are practical,"
        " clear, authentic, and aligned with their intent. Avoid bias, and focus on the content and quality of the responses."
    )

    task_description: str = (
        "Your task is to rigorously evaluate the performance of {num_responses} AI assistants, simulating a human's perspective."
        " You will assess each response based on four key domains, reflecting aspects that are typically valued by humans:"
        " {areas}."
        " First provide a score between 0 and 10 and write a detailed feedback for each area and assistant."
        " Finally, provide a list of {num_responses} scores, each separated by a space, to reflect the performance of Assistants 1 to {num_responses}."
    )

    areas: List[str] = field(
        default_factory=lambda: [
            "Practical Accuracy",
            "Clarity & Transparency",
            "Authenticity & Reliability",
            "Compliance with Intent",
        ]
    )

    __jinja2_template__: ClassVar[str] = field(
        default=_ULTRAJUDGE_TEMPLATE, init=False, repr=False
    )

    @property
    def output_args_names(self) -> List[str]:
        """Returns the names of the output arguments of the task."""
        return ["rating", "areas"]

    @property
    def areas_str(self) -> str:
        """Returns a string representation of the areas."""
        return ", ".join(self.areas[:-1]) + ", and " + self.areas[-1]

    @property
    def extract_area_score_and_rationale_regex(self) -> str:
        """Returns a regex to extract the area, score, and rationale from the output."""
        return rf"({'|'.join(self.areas)})\s*-\s*(\d+(?:\.\d+)?)\n(.*?)(?=\n\n|\Z)"

    @property
    def extract_final_scores_regex(self) -> str:
        """Returns a regex to extract the final scores from the output."""
        return r"Final scores:\s*((?:\d+(?:\.\d+)?\s*)+)"

    def generate_prompt(self, input: str, generations: List[str], **_: Any) -> Prompt:
        """Generates a prompt following the UltraJudge specification.

        Args:
            input (str): the input to be used for the prompt.
            generations (List[str]): the generations to be used for the prompt.

        Returns:
            Prompt: the generated prompt.

        Examples:
            >>> from distilabel.tasks.preference import UltraJudgeTask
            >>> task = UltraJudgeTask(system_prompt="You are a helpful assistant.")
            >>> task.generate_prompt("What are the first 5 Fibonacci numbers?", ["0 1 1 2 3", "0 1 1 2 3"])
            Prompt(
                system_prompt="You are a helpful assistant.",
                formatted_prompt="Your task is to rigorously evaluate the performance of ...",
            )
        """
        render_kwargs = {
            "task_description": self.task_description.format(
                num_responses=len(generations), areas=self.areas_str
            ),
            "instruction": input,
            "responses": generations,
        }

        return Prompt(
            system_prompt=self.system_prompt,
            formatted_prompt=self.template.render(**render_kwargs),
        )

    def parse_output(self, output: str) -> List[UltraJudgeOutput]:
        """Parses the output of the model into the desired format."""
        num_areas = len(self.areas)
        # `areas_results` includes num_generations * num_areas tuples
        areas_results = re.findall(self.extract_area_score_and_rationale_regex, output)
        final_scores = [
            float(str_score)
            for str_score in re.findall(self.extract_final_scores_regex, output)[
                0
            ].split(" ")
        ]

        outputs = []
        for i, rating in enumerate(final_scores):
            areas = {}
            # Get the areas for the i-th generation
            for area in areas_results[i * num_areas : i * num_areas + num_areas]:
                name, area_rating, rationale = area
                areas[name] = Area(rating=area_rating, rationale=rationale)
            outputs.append(UltraJudgeOutput(rating=rating, areas=areas))

        return outputs

    def _merge_rationales(
        self, rationales: List[Dict[str, Any]], generations_column: str = "generations"
    ) -> str:
        """Overwrite of the `_merge_rationales` as we need to process the areas before merging."""

        def format_area(area: Dict[str, Any]) -> str:
            sections = []
            for title, ratings in area.items():
                sections.append(title)
                for k, v in ratings.items():
                    sections.append(f"{k}:{v}")
            return "\n".join(sections)

        merged_rationales = []
        for idx, area in enumerate(rationales, start=1):
            merged_rationales.append(
                f"{generations_column}-{idx}:\n{format_area(area)}\n"
            )
        return "\n".join(merged_rationales)

areas_str: str property

Returns a string representation of the areas.

extract_area_score_and_rationale_regex: str property

Returns a regex to extract the area, score, and rationale from the output.

extract_final_scores_regex: str property

Returns a regex to extract the final scores from the output.

output_args_names: List[str] property

Returns the names of the output arguments of the task.
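
For the default areas, the areas_str property renders the comma-separated list that gets interpolated into the task description:

from distilabel.tasks.preference import UltraJudgeTask

UltraJudgeTask().areas_str
# 'Practical Accuracy, Clarity & Transparency, Authenticity & Reliability, and Compliance with Intent'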

generate_prompt(input, generations, **_)

Generates a prompt following the UltraJudge specification.

Parameters:

Name Type Description Default
input str

the input to be used for the prompt.

required
generations List[str]

the generations to be used for the prompt.

required

Returns:

Name Type Description
Prompt Prompt

the generated prompt.

Examples:

>>> from distilabel.tasks.preference import UltraJudgeTask
>>> task = UltraJudgeTask(system_prompt="You are a helpful assistant.")
>>> task.generate_prompt("What are the first 5 Fibonacci numbers?", ["0 1 1 2 3", "0 1 1 2 3"])
Prompt(
    system_prompt="You are a helpful assistant.",
    formatted_prompt="Your task is to rigorously evaluate the performance of ...",
)
Source code in src/distilabel/tasks/preference/ultrajudge.py
def generate_prompt(self, input: str, generations: List[str], **_: Any) -> Prompt:
    """Generates a prompt following the UltraJudge specification.

    Args:
        input (str): the input to be used for the prompt.
        generations (List[str]): the generations to be used for the prompt.

    Returns:
        Prompt: the generated prompt.

    Examples:
        >>> from distilabel.tasks.preference import UltraJudgeTask
        >>> task = UltraJudgeTask(system_prompt="You are a helpful assistant.")
        >>> task.generate_prompt("What are the first 5 Fibonacci numbers?", ["0 1 1 2 3", "0 1 1 2 3"])
        Prompt(
            system_prompt="You are a helpful assistant.",
            formatted_prompt="Your task is to rigorously evaluate the performance of ...",
        )
    """
    render_kwargs = {
        "task_description": self.task_description.format(
            num_responses=len(generations), areas=self.areas_str
        ),
        "instruction": input,
        "responses": generations,
    }

    return Prompt(
        system_prompt=self.system_prompt,
        formatted_prompt=self.template.render(**render_kwargs),
    )

parse_output(output)

Parses the output of the model into the desired format.

Source code in src/distilabel/tasks/preference/ultrajudge.py
def parse_output(self, output: str) -> List[UltraJudgeOutput]:
    """Parses the output of the model into the desired format."""
    num_areas = len(self.areas)
    # `areas_results` includes num_generations * num_areas tuples
    areas_results = re.findall(self.extract_area_score_and_rationale_regex, output)
    final_scores = [
        float(str_score)
        for str_score in re.findall(self.extract_final_scores_regex, output)[
            0
        ].split(" ")
    ]

    outputs = []
    for i, rating in enumerate(final_scores):
        areas = {}
        # Get the areas for the i-th generation
        for area in areas_results[i * num_areas : i * num_areas + num_areas]:
            name, area_rating, rationale = area
            areas[name] = Area(rating=area_rating, rationale=rationale)
        outputs.append(UltraJudgeOutput(rating=rating, areas=areas))

    return outputs
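
A sketch of this parsing, restricted to a single area so the example completion stays short (the completion text is made up; with the default four areas the model must emit one "<area> - <score>" block per area and per response, plus the final "Final scores:" line):

from distilabel.tasks.preference import UltraJudgeTask

task = UltraJudgeTask(areas=["Practical Accuracy"])
output = (
    "Practical Accuracy - 9\n"
    "Assistant 1 lists the five numbers correctly.\n"
    "\n"
    "Practical Accuracy - 7\n"
    "Assistant 2 misses the fifth number.\n"
    "\n"
    "Final scores: 9 7"
)
task.parse_output(output)
# -> two UltraJudgeOutput entries with ratings 9.0 and 7.0, each carrying the per-area rating and rationale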