Skip to content

self_instruct

SelfInstructTask dataclass

Bases: TextGenerationTask

A TextGenerationTask following the Self-Instruct specification for building the prompts.

Parameters:

Name Type Description Default
system_prompt str

the system prompt to be used. Defaults to the expert prompt-writer prompt shown below.

'You are an expert prompt writer, writing the best and most diverse prompts for a variety of tasks. You are given a task description and a set of instructions for how to write the prompts for an specific AI application.'
principles Dict[str, List[str]]

the principles to be used for the system prompt. Defaults to the built-in harmlessness, helpfulness, truthfulness, honesty and verbalized calibration principles.

field(default_factory=lambda : {'harmlessness': harmlessness, 'helpfulness': helpfulness, 'truthfulness': truthfulness, 'honesty': honesty, 'verbalized_calibration': verbalized_calibration}, repr=False)
principles_distribution Union[Dict[str, float], Literal[balanced], None]

the distribution of principles to be used for the system prompt. Defaults to None.

None
application_description str

the description of the AI application. Defaults to "AI assistant".

'AI assistant'
num_instructions int

the number of instructions to be used for the prompt. Defaults to 5.

5
criteria_for_query_generation str

the criteria for query generation that we want our model to have. Default value covers default behaviour for SelfInstructTask. This value is passed to the .jinja template, where extra instructions are added to ensure correct output format.

'Incorporate a diverse range of verbs, avoiding repetition.\nEnsure queries are compatible with AI model\'s text generation functions and are limited to 1-2 sentences.\nDesign queries to be self-contained and standalone.\nBlend interrogative (e.g., "What is the significance of x?") and imperative (e.g., "Detail the process of x.") styles.'
References
Source code in src/distilabel/tasks/text_generation/self_instruct.py
@dataclass
class SelfInstructTask(TextGenerationTask):
    """A `TextGenerationTask` following the Self-Instruct specification for building
    the prompts.

    Args:
        system_prompt (str, optional): the system prompt to be used. Defaults to a prompt
            describing an expert prompt writer (see the field default below).
        principles (Dict[str, List[str]], optional): the principles to be used for the system prompt.
            Defaults to the harmlessness, helpfulness, truthfulness, honesty and verbalized
            calibration principles.
        principles_distribution (Union[Dict[str, float], Literal["balanced"], None], optional): the
            distribution of principles to be used for the system prompt. Defaults to `None`.
        application_description (str, optional): the description of the AI application. Defaults to
            "AI assistant".
        num_instructions (int, optional): the number of instructions to be used for the prompt.
            Defaults to 5.
        criteria_for_query_generation (str, optional): the criteria for query generation that we want
            our model to have. Default value covers default behaviour for SelfInstructTask. This value is
            passed to the .jinja template, where extra instructions are added to ensure correct output format.

    References:
        - [`Self-Instruct: Aligning Language Models with Self-Generated Instructions`](https://arxiv.org/abs/2212.10560)
        - [`Self-Instruct - GitHub Repository`](https://github.com/yizhongw/self-instruct)
    """

    system_prompt: str = (
        "You are an expert prompt writer, writing the best and most diverse prompts for a variety of tasks."
        " You are given a task description and a set of instructions for how to write the prompts for an"
        " specific AI application."
    )

    application_description: str = "AI assistant"
    num_instructions: int = 5

    criteria_for_query_generation: str = (
        "Incorporate a diverse range of verbs, avoiding repetition.\n"
        "Ensure queries are compatible with AI model's text generation functions and are limited to 1-2 sentences.\n"
        "Design queries to be self-contained and standalone.\n"
        'Blend interrogative (e.g., "What is the significance of x?") and imperative (e.g., "Detail the process of x.") styles.'
    )

    __jinja2_template__: str = _SELF_INSTRUCT_TEMPLATE

    def generate_prompt(self, input: str, **_: Any) -> Prompt:
        """Generates a prompt following the Self-Instruct specification.

        Args:
            input (str): the input to be used for the prompt.

        Returns:
            Prompt: the generated prompt.

        Examples:
            >>> from distilabel.tasks.text_generation import SelfInstructTask
            >>> task = SelfInstructTask(system_prompt="You are a helpful assistant.", num_instructions=2)
            >>> task.generate_prompt("What are the first 5 Fibonacci numbers?")
            Prompt(
                system_prompt="You are a helpful assistant.",
                formatted_prompt="# Task Description ...",
            )
        """
        render_kwargs = {
            "application_description": self.application_description,
            "num_instructions": self.num_instructions,
            "criteria_for_query_generation": self.criteria_for_query_generation,
            "input": input,
        }
        return Prompt(
            system_prompt=self.system_prompt,
            formatted_prompt=self.template.render(**render_kwargs),
        )

    @property
    def output_args_names(self) -> List[str]:
        """Names of the output arguments produced by `parse_output`."""
        return ["instructions"]

    def parse_output(self, output: str) -> Dict[str, List[str]]:
        """Parses the output of the model into the desired format.

        Every `<number>.` marker starts an item whose text runs up to the end of
        that line.

        Args:
            output (str): the raw text generated by the model.

        Returns:
            Dict[str, List[str]]: a dictionary with the key `instructions` holding the
                extracted items in order of appearance.
        """
        # Accept either a newline or the end of the text as the item terminator, so
        # the last item is not dropped when the output lacks a trailing newline.
        pattern = re.compile(r"\d+\.\s*(.*?)(?:\n|\Z)")
        return {"instructions": pattern.findall(output)}

    def to_argilla_dataset(self, dataset_row: Dict[str, Any]) -> "FeedbackDataset":
        """Builds the Argilla `FeedbackDataset` definition for this task.

        Args:
            dataset_row (Dict[str, Any]): a sample row, used to infer the input fields
                and the metadata properties.

        Returns:
            FeedbackDataset: a dataset with the inferred input fields plus an
                `instruction` field, one rating question and integer length metadata.
        """
        # First we infer the fields from the input_args_names, but we could also
        # create those manually instead using `rg.TextField(...)`
        fields = infer_fields_from_dataset_row(
            field_names=self.input_args_names,
            dataset_row=dataset_row,
        )
        # Once the input fields have been defined, then we also include the instruction
        # field which will be fulfilled with each of the instructions generated.
        fields.append(rg.TextField(name="instruction", title="instruction"))  # type: ignore
        # Then we add a default `RatingQuestion` which asks the users to provide a
        # rating for each of the generations, differing from the scenario where the inputs
        # are the fields and the outputs the ones used to formulate the questions. So on,
        # in this scenario we won't have suggestions, as the questions will be related to the
        # combination of inputs and outputs.
        questions = [
            rg.RatingQuestion(  # type: ignore
                name="instruction-rating",
                title="How would you rate the generated instruction?",
                values=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            )
        ]
        # Finally, we define some metadata properties that can be potentially used
        # while exploring the dataset within Argilla to get more insights on the data.
        metadata_properties = []
        for arg_name in self.input_args_names:
            if isinstance(dataset_row[arg_name], list):
                for idx in range(1, len(dataset_row[arg_name]) + 1):
                    metadata_properties.append(
                        rg.IntegerMetadataProperty(name=f"length-{arg_name}-{idx}")  # type: ignore
                    )
            elif isinstance(dataset_row[arg_name], str):
                metadata_properties.append(
                    rg.IntegerMetadataProperty(name=f"length-{arg_name}")  # type: ignore
                )
            else:
                warnings.warn(
                    f"Unsupported input type ({type(dataset_row[arg_name])}), skipping...",
                    UserWarning,
                    stacklevel=2,
                )
        metadata_properties.append(
            rg.IntegerMetadataProperty(name="length-instruction")  # type: ignore
        )  # type: ignore
        # Then we just return the `FeedbackDataset` with the fields, questions, and metadata properties
        # defined above.
        return rg.FeedbackDataset(
            fields=fields,
            questions=questions,  # type: ignore
            metadata_properties=metadata_properties,  # Note that these are always optional
        )

    def to_argilla_record(
        self,
        dataset_row: Dict[str, Any],
        instructions_column: Optional[str] = "instructions",
    ) -> List["FeedbackRecord"]:
        """Converts a dataset row to a list of Argilla `FeedbackRecord`s.

        One record is created per generated instruction; each record carries the
        input fields of the row plus the instruction itself, along with length
        metadata for every field.

        Args:
            dataset_row (Dict[str, Any]): the row to convert.
            instructions_column (Optional[str], optional): the key of `dataset_row`
                holding the generated instructions. Defaults to "instructions".

        Returns:
            List[FeedbackRecord]: one record per instruction.

        Raises:
            ValueError: if the row holds no instructions (empty or `None` column).
        """
        records = []
        # Treat a `None` column like an empty list so that the explicit `ValueError`
        # below is raised instead of an opaque `TypeError` from iterating `None`.
        instructions = dataset_row[instructions_column] or []  # type: ignore
        for instruction in instructions:
            fields, metadata = {}, {}
            for arg_name in self.input_args_names:
                arg_value = dataset_row[arg_name]
                if isinstance(arg_value, list):
                    for idx, value in enumerate(arg_value, start=1):
                        # Non-string items are rendered as an empty text field.
                        value = value.strip() if isinstance(value, str) else ""
                        fields[f"{arg_name}-{idx}"] = value
                        metadata[f"length-{arg_name}-{idx}"] = len(value)
                elif isinstance(arg_value, str):
                    stripped = arg_value.strip()
                    fields[arg_name] = stripped
                    metadata[f"length-{arg_name}"] = len(stripped)
                else:
                    warnings.warn(
                        f"Unsupported input type ({type(arg_value)}), skipping...",
                        UserWarning,
                        stacklevel=2,
                    )
            fields["instruction"] = instruction
            metadata["length-instruction"] = len(instruction)

            # Then we add the model metadata from the `generation_model` and `labelling_model`
            # columns of the dataset, if they exist.
            metadata.update(model_metadata_from_dataset_row(dataset_row=dataset_row))
            # Finally, we append the `FeedbackRecord` with the fields and the metadata
            records.append(rg.FeedbackRecord(fields=fields, metadata=metadata))
        if not records:
            raise ValueError(
                f"Skipping the row {dataset_row} as the list of `FeedbackRecord` is empty as those could not be inferred."
            )
        return records

generate_prompt(input, **_)

Generates a prompt following the Self-Instruct specification.

Parameters:

Name Type Description Default
input str

the input to be used for the prompt.

required

Returns:

Name Type Description
Prompt Prompt

the generated prompt.

Examples:

>>> from distilabel.tasks.text_generation import SelfInstructTask
>>> task = SelfInstructTask(system_prompt="You are a helpful assistant.", num_instructions=2)
>>> task.generate_prompt("What are the first 5 Fibonacci numbers?")
Prompt(
    system_prompt="You are a helpful assistant.",
    formatted_prompt="# Task Description ...",
)
Source code in src/distilabel/tasks/text_generation/self_instruct.py
def generate_prompt(self, input: str, **_: Any) -> Prompt:
    """Build the Self-Instruct prompt for a single input.

    The task's application description, number of instructions and query
    generation criteria are rendered into the task template together with
    `input`; the task's system prompt is passed through unchanged.

    Args:
        input (str): the input to be used for the prompt.

    Returns:
        Prompt: the generated prompt.

    Examples:
        >>> from distilabel.tasks.text_generation import SelfInstructTask
        >>> task = SelfInstructTask(system_prompt="You are a helpful assistant.", num_instructions=2)
        >>> task.generate_prompt("What are the first 5 Fibonacci numbers?")
        Prompt(
            system_prompt="You are a helpful assistant.",
            formatted_prompt="# Task Description ...",
        )
    """
    # Template variables, gathered in one place before rendering.
    context = dict(
        application_description=self.application_description,
        num_instructions=self.num_instructions,
        criteria_for_query_generation=self.criteria_for_query_generation,
        input=input,
    )
    system_prompt = self.system_prompt
    formatted_prompt = self.template.render(**context)
    return Prompt(system_prompt=system_prompt, formatted_prompt=formatted_prompt)

parse_output(output)

Parses the output of the model into the desired format.

Source code in src/distilabel/tasks/text_generation/self_instruct.py
def parse_output(self, output: str) -> Dict[str, List[str]]:
    """Parses the output of the model into the desired format.

    Every `<number>.` marker starts an item whose text runs up to the end of
    that line.

    Args:
        output (str): the raw text generated by the model.

    Returns:
        Dict[str, List[str]]: a dictionary with the key `instructions` holding the
            extracted items in order of appearance.
    """
    # Accept either a newline or the end of the text as the item terminator, so
    # the last item is not dropped when the output lacks a trailing newline.
    pattern = re.compile(r"\d+\.\s*(.*?)(?:\n|\Z)")
    return {"instructions": pattern.findall(output)}

to_argilla_record(dataset_row, instructions_column='instructions')

Converts a dataset row to a list of Argilla FeedbackRecords.

Source code in src/distilabel/tasks/text_generation/self_instruct.py
def to_argilla_record(
    self,
    dataset_row: Dict[str, Any],
    instructions_column: Optional[str] = "instructions",
) -> List["FeedbackRecord"]:
    """Converts a dataset row to a list of Argilla `FeedbackRecord`s.

    One record is created per generated instruction; each record carries the
    input fields of the row plus the instruction itself, along with length
    metadata for every field.

    Args:
        dataset_row (Dict[str, Any]): the row to convert.
        instructions_column (Optional[str], optional): the key of `dataset_row`
            holding the generated instructions. Defaults to "instructions".

    Returns:
        List[FeedbackRecord]: one record per instruction.

    Raises:
        ValueError: if the row holds no instructions (empty or `None` column).
    """
    records = []
    # Treat a `None` column like an empty list so that the explicit `ValueError`
    # below is raised instead of an opaque `TypeError` from iterating `None`.
    instructions = dataset_row[instructions_column] or []  # type: ignore
    for instruction in instructions:
        fields, metadata = {}, {}
        for arg_name in self.input_args_names:
            arg_value = dataset_row[arg_name]
            if isinstance(arg_value, list):
                for idx, value in enumerate(arg_value, start=1):
                    # Non-string items are rendered as an empty text field.
                    value = value.strip() if isinstance(value, str) else ""
                    fields[f"{arg_name}-{idx}"] = value
                    metadata[f"length-{arg_name}-{idx}"] = len(value)
            elif isinstance(arg_value, str):
                stripped = arg_value.strip()
                fields[arg_name] = stripped
                metadata[f"length-{arg_name}"] = len(stripped)
            else:
                warnings.warn(
                    f"Unsupported input type ({type(arg_value)}), skipping...",
                    UserWarning,
                    stacklevel=2,
                )
        fields["instruction"] = instruction
        metadata["length-instruction"] = len(instruction)

        # Then we add the model metadata from the `generation_model` and `labelling_model`
        # columns of the dataset, if they exist.
        metadata.update(model_metadata_from_dataset_row(dataset_row=dataset_row))
        # Finally, we append the `FeedbackRecord` with the fields and the metadata
        records.append(rg.FeedbackRecord(fields=fields, metadata=metadata))
    if not records:
        raise ValueError(
            f"Skipping the row {dataset_row} as the list of `FeedbackRecord` is empty as those could not be inferred."
        )
    return records