Skip to content

dataset

CustomDataset

Bases: Dataset

A custom dataset class that extends from datasets.Dataset and is used to generate an Argilla FeedbackDataset instance from the pre-defined configuration within the task provided to Pipeline.generate.

Source code in src/distilabel/dataset.py
class CustomDataset(Dataset):
    """A `datasets.Dataset` subclass that keeps a reference to a task so that its rows
    can be exported as an Argilla `FeedbackDataset`.

    The conversion relies on the pre-defined configuration of the task provided to
    `Pipeline.generate`, which is expected to be stored in the `task` attribute.
    """

    # Task used to build the Argilla dataset and its records; `None` until assigned.
    task: Union["Task", None] = None

    def to_argilla(self) -> "FeedbackDataset":
        """Exports this dataset as an Argilla `FeedbackDataset` using `self.task`.

        The first row is handed to the task to create the `FeedbackDataset`; afterwards
        every row is converted to a record and added to it. Rows where any of the task's
        input arguments is `None` are skipped.

        Raises:
            ImportError: if the argilla library is not installed.
            ValueError: if the task is not set, or if the task fails to create the
                `FeedbackDataset` from the first row.

        Returns:
            FeedbackDataset: the Argilla `FeedbackDataset` instance.
        """
        if not _ARGILLA_AVAILABLE:
            missing_argilla_msg = (
                "To use `to_argilla` method is required to have `argilla` installed. "
                "Please install it with `pip install argilla`."
            )
            raise ImportError(missing_argilla_msg)

        if self.task is None:
            missing_task_msg = (
                "The task is not set. Please set it with `dataset.task = <task>`."
            )
            raise ValueError(missing_task_msg)

        # The task derives the `FeedbackDataset` from the first row; any failure while
        # doing so (including indexing the row) is re-raised as a chained `ValueError`.
        try:
            feedback_dataset = self.task.to_argilla_dataset(dataset_row=self[0])  # type: ignore
        except Exception as err:
            raise ValueError(
                f"Error while converting the dataset to an Argilla `FeedbackDataset` instance: {err}"
            ) from err

        for row in self:
            # Only rows with every task input present are turned into records.
            has_all_inputs = all(
                row[arg_name] is not None  # type: ignore
                for arg_name in self.task.input_args_names
            )
            if has_all_inputs:
                record = self.task.to_argilla_record(dataset_row=row)  # type: ignore
                feedback_dataset.add_records(record)
        return feedback_dataset

to_argilla()

Converts the dataset to an Argilla FeedbackDataset instance, based on the task defined in the dataset as part of Pipeline.generate.

Raises:

| Type | Description |
| --- | --- |
| ImportError | if the argilla library is not installed. |
| ValueError | if the task is not set. |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| FeedbackDataset | FeedbackDataset | the Argilla FeedbackDataset instance. |

Source code in src/distilabel/dataset.py
def to_argilla(self) -> "FeedbackDataset":
    """Exports this dataset as an Argilla `FeedbackDataset` using `self.task`.

    The first row is handed to the task to create the `FeedbackDataset`; afterwards
    every row is converted to a record and added to it. Rows where any of the task's
    input arguments is `None` are skipped.

    Raises:
        ImportError: if the argilla library is not installed.
        ValueError: if the task is not set, or if the task fails to create the
            `FeedbackDataset` from the first row.

    Returns:
        FeedbackDataset: the Argilla `FeedbackDataset` instance.
    """
    if not _ARGILLA_AVAILABLE:
        missing_argilla_msg = (
            "To use `to_argilla` method is required to have `argilla` installed. "
            "Please install it with `pip install argilla`."
        )
        raise ImportError(missing_argilla_msg)

    if self.task is None:
        missing_task_msg = (
            "The task is not set. Please set it with `dataset.task = <task>`."
        )
        raise ValueError(missing_task_msg)

    # The task derives the `FeedbackDataset` from the first row; any failure while
    # doing so (including indexing the row) is re-raised as a chained `ValueError`.
    try:
        feedback_dataset = self.task.to_argilla_dataset(dataset_row=self[0])  # type: ignore
    except Exception as err:
        raise ValueError(
            f"Error while converting the dataset to an Argilla `FeedbackDataset` instance: {err}"
        ) from err

    for row in self:
        # Only rows with every task input present are turned into records.
        has_all_inputs = all(
            row[arg_name] is not None  # type: ignore
            for arg_name in self.task.input_args_names
        )
        if has_all_inputs:
            record = self.task.to_argilla_record(dataset_row=row)  # type: ignore
            feedback_dataset.add_records(record)
    return feedback_dataset