llm

InferenceEndpointsLLM

Bases: LLM

Source code in src/distilabel/llm/huggingface/inference_endpoints.py
class InferenceEndpointsLLM(LLM):
    def __init__(
        self,
        endpoint_name: str,
        task: "Task",
        endpoint_namespace: Union[str, None] = None,
        token: Union[str, None] = None,
        max_new_tokens: int = 128,
        repetition_penalty: Union[float, None] = None,
        seed: Union[int, None] = None,
        do_sample: bool = False,
        temperature: Union[float, None] = None,
        top_k: Union[int, None] = None,
        top_p: Union[float, None] = None,
        typical_p: Union[float, None] = None,
        num_threads: Union[int, None] = None,
        prompt_format: Union["SupportedFormats", None] = None,
        prompt_formatting_fn: Union[Callable[..., str], None] = None,
    ) -> None:
        """Initializes the InferenceEndpointsLLM class.

        Args:
            endpoint_name (str): The name of the endpoint.
            task (Task): The task to be performed by the LLM.
            endpoint_namespace (Union[str, None]): The namespace of the endpoint. Defaults to None.
            token (Union[str, None]): The token for the endpoint. Defaults to None.
            max_new_tokens (int): The maximum number of tokens to be generated. Defaults to 128.
            repetition_penalty (Union[float, None]): The repetition penalty to be used for generation. Defaults to None.
            seed (Union[int, None]): The seed for generation. Defaults to None.
            do_sample (bool): Whether to do sampling. Defaults to False.
            temperature (Union[float, None]): The temperature for generation. Defaults to None.
            top_k (Union[int, None]): The top_k for generation. Defaults to None.
            top_p (Union[float, None]): The top_p for generation. Defaults to None.
            typical_p (Union[float, None]): The typical_p for generation. Defaults to None.
            num_threads (Union[int, None]): The number of threads. Defaults to None.
            prompt_format (Union["SupportedFormats", None]): The format of the prompt. Defaults to None.
            prompt_formatting_fn (Union[Callable[..., str], None]): The function for formatting the prompt. Defaults to None.

        Examples:
            >>> from distilabel.tasks.text_generation import TextGenerationTask as Task
            >>> from distilabel.llm import InferenceEndpointsLLM
            >>> task = Task()
            >>> llm = InferenceEndpointsLLM(
            ...     endpoint_name="<INFERENCE_ENDPOINT_NAME>",
            ...     task=task,
            ... )
        """
        super().__init__(
            task=task,
            num_threads=num_threads,
            prompt_format=prompt_format,
            prompt_formatting_fn=prompt_formatting_fn,
        )

        if not _HUGGINGFACE_HUB_AVAILABLE:
            raise ImportError(
                "`InferenceEndpointsLLM` cannot be used as `huggingface-hub` is not "
                "installed, please install it with `pip install huggingface-hub`."
            )

        self.do_sample = do_sample
        self.max_new_tokens = max_new_tokens
        self.repetition_penalty = repetition_penalty
        self.seed = seed
        self.temperature = temperature
        self.top_k = top_k
        self.top_p = top_p
        self.typical_p = typical_p

        self.inference_endpoint = get_inference_endpoint(
            name=endpoint_name, namespace=endpoint_namespace, token=token
        )
        self.inference_endpoint.wait(timeout=30)

    def __rich_repr__(self) -> Generator[Any, None, None]:
        yield from super().__rich_repr__()
        yield (
            "parameters",
            {
                "do_sample": self.do_sample,
                "max_new_tokens": self.max_new_tokens,
                "repetition_penalty": self.repetition_penalty,
                "seed": self.seed,
                "temperature": self.temperature,
                "top_k": self.top_k,
                "top_p": self.top_p,
                "typical_p": self.typical_p,
            },
        )

    @property
    def model_name(self) -> str:
        """Returns the model name of the endpoint."""
        return self.inference_endpoint.repository

    @retry(
        retry=retry_if_exception_type(_INFERENCE_ENDPOINTS_API_RETRY_ON_EXCEPTIONS),
        stop=stop_after_attempt(_INFERENCE_ENDPOINTS_API_STOP_AFTER_ATTEMPT),
        wait=wait_random_exponential(
            multiplier=_INFERENCE_ENDPOINTS_API_WAIT_RANDOM_EXPONENTIAL_MULTIPLIER,
            max=_INFERENCE_ENDPOINTS_API_WAIT_RANDOM_EXPONENTIAL_MAX,
        ),
        before_sleep=before_sleep_log(logger, logging.INFO),
        after=after_log(logger, logging.INFO),
    )
    def _text_generation_with_backoff(self, **kwargs: Any) -> Any:
        """Performs text generation with backoff in case of an error."""
        return self.inference_endpoint.client.text_generation(**kwargs)  # type: ignore

    def _generate(
        self, inputs: List[Dict[str, Any]], num_generations: int = 1
    ) -> List[List[LLMOutput]]:
        """Generates `num_generations` for each input in `inputs`.

        Args:
            inputs (List[Dict[str, Any]]): the inputs to be used for generation.
            num_generations (int, optional): the number of generations to be performed for each
                input. Defaults to 1.

        Returns:
            List[List[LLMOutput]]: the outputs of the LLM.
        """
        prompts = self._generate_prompts(
            inputs, default_format=None, expected_output_type=str
        )
        outputs = []
        for prompt in prompts:
            raw_responses = [
                self._text_generation_with_backoff(
                    prompt=prompt,
                    do_sample=self.do_sample,
                    max_new_tokens=self.max_new_tokens,
                    repetition_penalty=self.repetition_penalty,
                    seed=self.seed,
                    temperature=self.temperature,
                    top_k=self.top_k,
                    top_p=self.top_p,
                    typical_p=self.typical_p,
                )
                for _ in range(num_generations)
            ]
            output = []
            for raw_response in raw_responses:
                try:
                    parsed_response = self.task.parse_output(raw_response)
                except Exception as e:
                    logger.error(f"Error parsing Inference Endpoints output: {e}")
                    parsed_response = None
                output.append(
                    LLMOutput(
                        model_name=self.model_name,
                        prompt_used=prompt,
                        raw_output=raw_response,
                        parsed_output=parsed_response,
                    )
                )
            outputs.append(output)
        return outputs

model_name: str property

Returns the model name of the endpoint.

__init__(endpoint_name, task, endpoint_namespace=None, token=None, max_new_tokens=128, repetition_penalty=None, seed=None, do_sample=False, temperature=None, top_k=None, top_p=None, typical_p=None, num_threads=None, prompt_format=None, prompt_formatting_fn=None)

Initializes the InferenceEndpointsLLM class.

Parameters:

    endpoint_name (str): The name of the endpoint. Required.
    task (Task): The task to be performed by the LLM. Required.
    endpoint_namespace (Union[str, None]): The namespace of the endpoint. Defaults to None.
    token (Union[str, None]): The token for the endpoint. Defaults to None.
    max_new_tokens (int): The maximum number of tokens to be generated. Defaults to 128.
    repetition_penalty (Union[float, None]): The repetition penalty to be used for generation. Defaults to None.
    seed (Union[int, None]): The seed for generation. Defaults to None.
    do_sample (bool): Whether to do sampling. Defaults to False.
    temperature (Union[float, None]): The temperature for generation. Defaults to None.
    top_k (Union[int, None]): The top_k for generation. Defaults to None.
    top_p (Union[float, None]): The top_p for generation. Defaults to None.
    typical_p (Union[float, None]): The typical_p for generation. Defaults to None.
    num_threads (Union[int, None]): The number of threads. Defaults to None.
    prompt_format (Union[SupportedFormats, None]): The format of the prompt. Defaults to None.
    prompt_formatting_fn (Union[Callable[..., str], None]): The function for formatting the prompt. Defaults to None.

Examples:

>>> from distilabel.tasks.text_generation import TextGenerationTask as Task
>>> from distilabel.llm import InferenceEndpointsLLM
>>> task = Task()
>>> llm = InferenceEndpointsLLM(
...     endpoint_name="<INFERENCE_ENDPOINT_NAME>",
...     task=task,
... )
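
A hedged continuation of the example above: the "input" key is assumed to match TextGenerationTask's generate_prompt signature, and since no num_threads was passed, generate returns the nested list of LLMOutputs directly (outer list per input, inner list per generation).

>>> outputs = llm.generate(
...     inputs=[{"input": "Describe the Hugging Face Inference Endpoints service."}],
...     num_generations=1,
... )
>>> outputs[0][0]["raw_output"]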
Source code in src/distilabel/llm/huggingface/inference_endpoints.py
def __init__(
    self,
    endpoint_name: str,
    task: "Task",
    endpoint_namespace: Union[str, None] = None,
    token: Union[str, None] = None,
    max_new_tokens: int = 128,
    repetition_penalty: Union[float, None] = None,
    seed: Union[int, None] = None,
    do_sample: bool = False,
    temperature: Union[float, None] = None,
    top_k: Union[int, None] = None,
    top_p: Union[float, None] = None,
    typical_p: Union[float, None] = None,
    num_threads: Union[int, None] = None,
    prompt_format: Union["SupportedFormats", None] = None,
    prompt_formatting_fn: Union[Callable[..., str], None] = None,
) -> None:
    """Initializes the InferenceEndpointsLLM class.

    Args:
        endpoint_name (str): The name of the endpoint.
        task (Task): The task to be performed by the LLM.
        endpoint_namespace (Union[str, None]): The namespace of the endpoint. Defaults to None.
        token (Union[str, None]): The token for the endpoint. Defaults to None.
        max_new_tokens (int): The maximum number of tokens to be generated. Defaults to 128.
        repetition_penalty (Union[float, None]): The repetition penalty to be used for generation. Defaults to None.
        seed (Union[int, None]): The seed for generation. Defaults to None.
        do_sample (bool): Whether to do sampling. Defaults to False.
        temperature (Union[float, None]): The temperature for generation. Defaults to None.
        top_k (Union[int, None]): The top_k for generation. Defaults to None.
        top_p (Union[float, None]): The top_p for generation. Defaults to None.
        typical_p (Union[float, None]): The typical_p for generation. Defaults to None.
        num_threads (Union[int, None]): The number of threads. Defaults to None.
        prompt_format (Union["SupportedFormats", None]): The format of the prompt. Defaults to None.
        prompt_formatting_fn (Union[Callable[..., str], None]): The function for formatting the prompt. Defaults to None.

    Examples:
        >>> from distilabel.tasks.text_generation import TextGenerationTask as Task
        >>> from distilabel.llm import InferenceEndpointsLLM
        >>> task = Task()
        >>> llm = InferenceEndpointsLLM(
        ...     endpoint_name="<INFERENCE_ENDPOINT_NAME>",
        ...     task=task,
        ... )
    """
    super().__init__(
        task=task,
        num_threads=num_threads,
        prompt_format=prompt_format,
        prompt_formatting_fn=prompt_formatting_fn,
    )

    if not _HUGGINGFACE_HUB_AVAILABLE:
        raise ImportError(
            "`InferenceEndpointsLLM` cannot be used as `huggingface-hub` is not "
            "installed, please install it with `pip install huggingface-hub`."
        )

    self.do_sample = do_sample
    self.max_new_tokens = max_new_tokens
    self.repetition_penalty = repetition_penalty
    self.seed = seed
    self.temperature = temperature
    self.top_k = top_k
    self.top_p = top_p
    self.typical_p = typical_p

    self.inference_endpoint = get_inference_endpoint(
        name=endpoint_name, namespace=endpoint_namespace, token=token
    )
    self.inference_endpoint.wait(timeout=30)

LLM

Bases: ABC

Source code in src/distilabel/llm/base.py
class LLM(ABC):
    def __init__(
        self,
        task: Task,
        num_threads: Union[int, None] = None,
        prompt_format: Union["SupportedFormats", None] = None,
        prompt_formatting_fn: Union[Callable[..., str], None] = None,
    ) -> None:
        """Initializes the LLM base class.

        Note:
            This class is intended to be used internally, but anyone can still create
            a subclass, implement the `abstractmethod`s, and use it.

        Args:
            task (Task): the task to be performed by the LLM.
            num_threads (Union[int, None], optional): the number of threads to be used
                for parallel generation. If `None`, no parallel generation will be performed.
                Defaults to `None`.
            prompt_format (Union["SupportedFormats", None], optional): the format to be used
                for the prompt. If `None`, the default format of the task will be used, available
                formats are `openai`, `chatml`, `llama2`, `zephyr`, and `default`. Defaults to `None`,
                but `default` (concatenation of `system_prompt` and `formatted_prompt` with a line-break)
                will be used if no `prompt_formatting_fn` is provided.
            prompt_formatting_fn (Union[Callable[..., str], None], optional): a function to be
                applied to the prompt before generation. If `None`, no formatting will be applied.
                Defaults to `None`.
        """
        self.task = task

        self.thread_pool_executor = (
            ThreadPoolExecutor(max_workers=num_threads)
            if num_threads is not None
            else None
        )

        self.prompt_format = prompt_format
        self.prompt_formatting_fn = prompt_formatting_fn

    def __del__(self) -> None:
        """Shuts down the thread pool executor if it is not `None`."""
        if self.thread_pool_executor is not None:
            self.thread_pool_executor.shutdown()

    @property
    def num_threads(self) -> Union[int, None]:
        if self.thread_pool_executor:
            return self.thread_pool_executor._max_workers

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(task={self.task.__class__.__name__}, num_threads={self.num_threads}, promp_format='{self.prompt_format}', model='{self.model_name}')"

    def __rich_repr__(self) -> Generator[Any, None, None]:
        yield "task", self.task
        yield "num_threads", self.num_threads
        yield "prompt_format", self.prompt_format
        if self.prompt_formatting_fn is not None:
            args = f"({', '.join(self.prompt_formatting_fn.__code__.co_varnames)})"
            representation = self.prompt_formatting_fn.__name__ + args
            yield "prompt_formatting_fn", representation
        yield "model", self.model_name

    @property
    @abstractmethod
    def model_name(self) -> str:
        pass

    def _generate_prompts(
        self,
        inputs: List[Dict[str, Any]],
        default_format: Union["SupportedFormats", None] = None,
        expected_output_type: Type = str,
    ) -> List[Any]:
        """Generates the prompts to be used for generation.

        Args:
            inputs (List[Dict[str, Any]]): the inputs to be used for generation.
            default_format (Union["SupportedFormats", None], optional): the default format to be used
                for the prompt if no `prompt_format` is specified. Defaults to `None`.
            expected_output_type (Type, optional): the expected type of the prompt. Defaults to `str`.

        Returns:
            List[Any]: the generated prompts.

        Raises:
            ValueError: if the generated prompt is not of the expected type.
        """
        prompts = []
        for input in inputs:
            prompt = self.task.generate_prompt(**input)
            if not isinstance(prompt, Prompt) and self.prompt_formatting_fn is not None:
                warnings.warn(
                    "The method `generate_prompt` is not returning a `Prompt` class but a prompt"
                    f" of `type={type(prompt)}`, meaning that a pre-formatting has already been"
                    " applied in the `task.generate_prompt` method, so the usage of a `prompt_formatting_fn`"
                    " is discouraged.",
                    UserWarning,
                    stacklevel=2,
                )
                prompt = self.prompt_formatting_fn(prompt)
            elif isinstance(prompt, Prompt) and self.prompt_formatting_fn is None:
                if self.prompt_format is not None or default_format is not None:
                    prompt = prompt.format_as(
                        format=self.prompt_format or default_format  # type: ignore
                    )
                else:
                    warnings.warn(
                        "No `prompt_format` has been specified and no `default_format` is set, so"
                        " the prompt will be concatenated with a line-break and no specific formatting"
                        " by default.",
                        UserWarning,
                        stacklevel=2,
                    )
                    prompt = prompt.format_as(format="default")
            if not isinstance(prompt, expected_output_type):
                raise ValueError(
                    f"The provided `prompt={prompt}` is of `type={type(prompt)}`, but it must be of"
                    f" `type={expected_output_type}`, so make sure that `task.generate_prompt` returns"
                    f" a `{expected_output_type}` or that the `formatting_fn` formats the prompt as a "
                    f" `{expected_output_type}`."
                )
            prompts.append(prompt)
        return prompts

    @abstractmethod
    def _generate(
        self, inputs: List[Dict[str, Any]], num_generations: int = 1
    ) -> List[List["LLMOutput"]]:
        pass

    def generate(
        self,
        inputs: List[Dict[str, Any]],
        num_generations: int = 1,
        progress_callback_func: Union[Callable, None] = None,
    ) -> Union[List[List["LLMOutput"]], Future[List[List["LLMOutput"]]]]:
        """Generates the outputs for the given inputs using the LLM.

        Args:
            inputs (List[Dict[str, Any]]): the inputs to be used for generation.
            num_generations (int, optional): the number of generations to be performed for each input.
                Defaults to `1`.
            progress_callback_func (Union[Callable, None], optional): a function to be called at each
                generation step. Defaults to `None`.

        Returns:
            Union[List[List["LLMOutput"]], Future[List[List["LLMOutput"]]]]: the generated outputs.
        """

        def _progress():
            if progress_callback_func is not None:
                progress_callback_func(advance=num_generations * len(inputs))

        if self.thread_pool_executor is not None:
            futures = []
            for input in inputs:
                future = self.thread_pool_executor.submit(
                    self._generate, [input], num_generations
                )
                futures.append(future)
            future = when_all_complete(futures)
            future.add_done_callback(lambda _: _progress())
            return future

        generations = self._generate(inputs, num_generations)
        _progress()
        return generations

    @property
    def return_futures(self) -> bool:
        """Whether the `LLM` returns futures"""
        return True

return_futures: bool property

Whether the LLM returns futures

__del__()

Shuts down the thread pool executor if it is not None.

Source code in src/distilabel/llm/base.py
def __del__(self) -> None:
    """Shuts down the thread pool executor if it is not `None`."""
    if self.thread_pool_executor is not None:
        self.thread_pool_executor.shutdown()

__init__(task, num_threads=None, prompt_format=None, prompt_formatting_fn=None)

Initializes the LLM base class.

Note

This class is intended to be used internally, but anyone can still create a subclass, implement the abstractmethods, and use it.

Parameters:

    task (Task): the task to be performed by the LLM. Required.
    num_threads (Union[int, None]): the number of threads to be used for parallel generation. If None, no parallel generation will be performed. Defaults to None.
    prompt_format (Union[SupportedFormats, None]): the format to be used for the prompt. If None, the default format of the task will be used; available formats are openai, chatml, llama2, zephyr, and default. Defaults to None, but default (concatenation of system_prompt and formatted_prompt with a line break) will be used if no prompt_formatting_fn is provided.
    prompt_formatting_fn (Union[Callable[..., str], None]): a function to be applied to the prompt before generation. If None, no formatting will be applied. Defaults to None.
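
A hedged illustration of the prompt formats listed above, assuming the Prompt helper lives at distilabel.tasks.prompt and exposes the format_as method used internally by _generate_prompts; the import path and exact output are assumptions, not taken from this page.

from distilabel.tasks.prompt import Prompt

prompt = Prompt(
    system_prompt="You are a helpful assistant.",
    formatted_prompt="What is the capital of France?",
)
# "default" joins the system prompt and formatted prompt with a line break,
# while e.g. "llama2" applies that model's chat template instead.
print(prompt.format_as(format="default"))
print(prompt.format_as(format="llama2"))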
Source code in src/distilabel/llm/base.py
def __init__(
    self,
    task: Task,
    num_threads: Union[int, None] = None,
    prompt_format: Union["SupportedFormats", None] = None,
    prompt_formatting_fn: Union[Callable[..., str], None] = None,
) -> None:
    """Initializes the LLM base class.

    Note:
        This class is intended to be used internally, but anyone can still create
        a subclass, implement the `abstractmethod`s, and use it.

    Args:
        task (Task): the task to be performed by the LLM.
        num_threads (Union[int, None], optional): the number of threads to be used
            for parallel generation. If `None`, no parallel generation will be performed.
            Defaults to `None`.
        prompt_format (Union["SupportedFormats", None], optional): the format to be used
            for the prompt. If `None`, the default format of the task will be used, available
            formats are `openai`, `chatml`, `llama2`, `zephyr`, and `default`. Defaults to `None`,
            but `default` (concatenation of `system_prompt` and `formatted_prompt` with a line-break)
            will be used if no `prompt_formatting_fn` is provided.
        prompt_formatting_fn (Union[Callable[..., str], None], optional): a function to be
            applied to the prompt before generation. If `None`, no formatting will be applied.
            Defaults to `None`.
    """
    self.task = task

    self.thread_pool_executor = (
        ThreadPoolExecutor(max_workers=num_threads)
        if num_threads is not None
        else None
    )

    self.prompt_format = prompt_format
    self.prompt_formatting_fn = prompt_formatting_fn

generate(inputs, num_generations=1, progress_callback_func=None)

Generates the outputs for the given inputs using the LLM.

Parameters:

    inputs (List[Dict[str, Any]]): the inputs to be used for generation. Required.
    num_generations (int): the number of generations to be performed for each input. Defaults to 1.
    progress_callback_func (Union[Callable, None]): a function to be called at each generation step. Defaults to None.

Returns:

    Union[List[List["LLMOutput"]], Future[List[List["LLMOutput"]]]]: the generated outputs.
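
A minimal sketch of consuming this return value, assuming llm is any concrete LLM subclass instance, that its task expects an "input" key, and that LLMOutput is dict-like; when num_threads is set, a single Future wrapping the nested list is returned, otherwise the list itself.

from concurrent.futures import Future

result = llm.generate(
    inputs=[{"input": "Write a haiku about distillation."}],
    num_generations=2,
)
# Resolve the Future when a thread pool was configured, otherwise use the list as-is.
generations = result.result() if isinstance(result, Future) else result
for row in generations:      # one row per input
    for output in row:       # one LLMOutput per generation of that input
        print(output["parsed_output"])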

Source code in src/distilabel/llm/base.py
def generate(
    self,
    inputs: List[Dict[str, Any]],
    num_generations: int = 1,
    progress_callback_func: Union[Callable, None] = None,
) -> Union[List[List["LLMOutput"]], Future[List[List["LLMOutput"]]]]:
    """Generates the outputs for the given inputs using the LLM.

    Args:
        inputs (List[Dict[str, Any]]): the inputs to be used for generation.
        num_generations (int, optional): the number of generations to be performed for each input.
            Defaults to `1`.
        progress_callback_func (Union[Callable, None], optional): a function to be called at each
            generation step. Defaults to `None`.

    Returns:
        Union[List[List["LLMOutput"]], Future[List[List["LLMOutput"]]]]: the generated outputs.
    """

    def _progress():
        if progress_callback_func is not None:
            progress_callback_func(advance=num_generations * len(inputs))

    if self.thread_pool_executor is not None:
        futures = []
        for input in inputs:
            future = self.thread_pool_executor.submit(
                self._generate, [input], num_generations
            )
            futures.append(future)
        future = when_all_complete(futures)
        future.add_done_callback(lambda _: _progress())
        return future

    generations = self._generate(inputs, num_generations)
    _progress()
    return generations

LLMPool

LLMPool is a class that wraps multiple ProcessLLMs and performs generation in parallel using them. Depending on the number of LLMs and the parameter num_generations, the LLMPool will decide how many generations to perform for each LLM:

  • If num_generations is less than the number of LLMs, then num_generations LLMs will be chosen randomly and each of them will perform 1 generation.

  • If num_generations is equal to the number of LLMs, then each LLM will perform 1 generation.

  • If num_generations is greater than the number of LLMs, then each LLM will perform num_generations // num_llms generations, and the remaining num_generations % num_llms generations will be performed by num_generations % num_llms randomly chosen LLMs.
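
For example, a standalone sketch of this allocation rule (mirroring the behaviour described above, not a call into the library):

import random

def generations_per_llm(num_generations: int, num_llms: int) -> dict:
    # Base share for every LLM; the remainder goes to randomly chosen LLMs.
    allocation = {i: num_generations // num_llms for i in range(num_llms)}
    for i in random.sample(range(num_llms), k=num_generations % num_llms):
        allocation[i] += 1
    return allocation

print(generations_per_llm(num_generations=5, num_llms=3))  # e.g. {0: 2, 1: 2, 2: 1}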

Attributes:

    llms (List[ProcessLLM]): the ProcessLLMs to be used for generation.

Source code in src/distilabel/llm/base.py
class LLMPool:
    """LLMPool is a class that wraps multiple `ProcessLLM`s and performs generation in
    parallel using them. Depending on the number of `LLM`s and the parameter `num_generations`,
    the `LLMPool` will decide how many generations to perform for each `LLM`:

    - If `num_generations` is less than the number of `LLM`s, then `num_generations` LLMs
    will be chosen randomly and each of them will perform 1 generation.


    - If `num_generations` is equal to the number of `LLM`s, then each `LLM` will perform
    1 generation.

    - If `num_generations` is greater than the number of `LLM`s, then each `LLM` will
    perform `num_generations // num_llms` generations, and the remaining `num_generations % num_llms`
    generations will be performed by `num_generations % num_llms` randomly chosen `LLM`s.

    Attributes:
        llms (List[ProcessLLM]): the `ProcessLLM`s to be used for generation.
    """

    def __init__(self, llms: List[ProcessLLM]) -> None:
        """Initializes the `LLMPool` class.

        Args:
            llms: the `ProcessLLM`s to be used for generation. The list must contain at
                least 2 `ProcessLLM`s.

        Raises:
            ValueError: if the `llms` argument contains less than 2 `ProcessLLM`s, the
                `llms` argument contains objects that are not `ProcessLLM`s, or
                if the `llms` argument contains `ProcessLLM`s with different tasks.
        """
        if len(llms) < 2:
            raise ValueError(
                "The `llms` argument must contain at least 2 `ProcessLLM`s. If you want"
                " to use a single `ProcessLLM`, use the `ProcessLLM` directly instead."
            )

        if not all(isinstance(llm, ProcessLLM) for llm in llms):
            raise ValueError("The `llms` argument must contain only `ProcessLLM`s.")

        if not all(llm.task == llms[0].task for llm in llms):
            raise ValueError(
                "The `llms` argument must contain `ProcessLLM`s with the same task."
            )

        self.llms = llms
        self.num_llms = len(llms)

    def _get_num_generations_per_llm(self, num_generations: int) -> Dict[int, int]:
        """Returns the number of generations to be performed by each `LLM`.

        Args:
            num_generations: the number of generations to be performed.

        Returns:
            Dict[int, int]: a dictionary where the keys are the ids of the `LLM`s and the
            values are the number of generations to be performed by each `LLM`.
        """
        llms_ids = list(range(self.num_llms))
        generations_per_llm = {i: num_generations // self.num_llms for i in llms_ids}

        for i in random.sample(llms_ids, k=num_generations % self.num_llms):
            generations_per_llm[i] += 1

        return generations_per_llm

    def generate(
        self,
        inputs: List[Dict[str, Any]],
        num_generations: int = 1,
        progress_callback_func: Union[Callable, None] = None,
    ) -> List[List["LLMOutput"]]:
        """Generates the outputs for the given inputs using the pool of `ProcessLLM`s.

        Args:
            inputs (List[Dict[str, Any]]): the inputs to be used for generation.
            num_generations (int, optional): the number of generations to be performed for each input.
                Defaults to `1`.
            progress_callback_func (Union[Callable, None], optional): a function to be called at each
                generation step. Defaults to `None`.

        Returns:
            List[List["LLMOutput"]]: the generated outputs.
        """
        num_generations_per_llm = self._get_num_generations_per_llm(num_generations)

        futures = [
            llm.generate(
                inputs,
                num_generations=num_generations_per_llm[i],
                progress_callback_func=progress_callback_func,
            )
            for i, llm in enumerate(self.llms)
            if num_generations_per_llm[i] > 0
        ]
        llms_generations = [future.result() for future in futures]

        generations = []
        for llms_row_generations in zip(*llms_generations):
            row_generations = []
            for llm_row_generations in llms_row_generations:
                for generation in llm_row_generations:
                    row_generations.append(generation)
            generations.append(row_generations)

        return generations

    def teardown(self) -> None:
        """Stops the `ProcessLLM`s."""
        for llm in self.llms:
            llm.teardown()

    @property
    def task(self) -> "Task":
        """Returns the task that will be used by the `ProcessLLM`s of this pool.

        Returns:
            Task: the task that will be used by the `ProcessLLM`s of this pool.
        """
        return self.llms[0].task

    @property
    def return_futures(self) -> bool:
        """Whether the `LLM` returns futures"""
        return False

return_futures: bool property

Whether the LLM returns futures

task: 'Task' property

Returns the task that will be used by the ProcessLLMs of this pool.

Returns:

    Task: the task that will be used by the ProcessLLMs of this pool.

__init__(llms)

Initializes the LLMPool class.

Parameters:

    llms (List[ProcessLLM]): the ProcessLLMs to be used for generation. The list must contain at least 2 ProcessLLMs. Required.

Raises:

    ValueError: if the llms argument contains less than 2 ProcessLLMs, contains objects that are not ProcessLLMs, or contains ProcessLLMs with different tasks.

Source code in src/distilabel/llm/base.py
def __init__(self, llms: List[ProcessLLM]) -> None:
    """Initializes the `LLMPool` class.

    Args:
        llms: the `ProcessLLM`s to be used for generation. The list must contain at
            least 2 `ProcessLLM`s.

    Raises:
        ValueError: if the `llms` argument contains less than 2 `ProcessLLM`s, the
            `llms` argument contains objects that are not `ProcessLLM`s, or
            if the `llms` argument contains `ProcessLLM`s with different tasks.
    """
    if len(llms) < 2:
        raise ValueError(
            "The `llms` argument must contain at least 2 `ProcessLLM`s. If you want"
            " to use a single `ProcessLLM`, use the `ProcessLLM` directly instead."
        )

    if not all(isinstance(llm, ProcessLLM) for llm in llms):
        raise ValueError("The `llms` argument must contain only `ProcessLLM`s.")

    if not all(llm.task == llms[0].task for llm in llms):
        raise ValueError(
            "The `llms` argument must contain `ProcessLLM`s with the same task."
        )

    self.llms = llms
    self.num_llms = len(llms)

generate(inputs, num_generations=1, progress_callback_func=None)

Generates the outputs for the given inputs using the pool of ProcessLLMs.

Parameters:

    inputs (List[Dict[str, Any]]): the inputs to be used for generation. Required.
    num_generations (int): the number of generations to be performed for each input. Defaults to 1.
    progress_callback_func (Union[Callable, None]): a function to be called at each generation step. Defaults to None.

Returns:

    List[List["LLMOutput"]]: the generated outputs.

Source code in src/distilabel/llm/base.py
def generate(
    self,
    inputs: List[Dict[str, Any]],
    num_generations: int = 1,
    progress_callback_func: Union[Callable, None] = None,
) -> List[List["LLMOutput"]]:
    """Generates the outputs for the given inputs using the pool of `ProcessLLM`s.

    Args:
        inputs (List[Dict[str, Any]]): the inputs to be used for generation.
        num_generations (int, optional): the number of generations to be performed for each input.
            Defaults to `1`.
        progress_callback_func (Union[Callable, None], optional): a function to be called at each
            generation step. Defaults to `None`.

    Returns:
        List[List["LLMOutput"]]: the generated outputs.
    """
    num_generations_per_llm = self._get_num_generations_per_llm(num_generations)

    futures = [
        llm.generate(
            inputs,
            num_generations=num_generations_per_llm[i],
            progress_callback_func=progress_callback_func,
        )
        for i, llm in enumerate(self.llms)
        if num_generations_per_llm[i] > 0
    ]
    llms_generations = [future.result() for future in futures]

    generations = []
    for llms_row_generations in zip(*llms_generations):
        row_generations = []
        for llm_row_generations in llms_row_generations:
            for generation in llm_row_generations:
                row_generations.append(generation)
        generations.append(row_generations)

    return generations

teardown()

Stops the ProcessLLMs.

Source code in src/distilabel/llm/base.py
def teardown(self) -> None:
    """Stops the `ProcessLLM`s."""
    for llm in self.llms:
        llm.teardown()

LlamaCppLLM

Bases: LLM

Source code in src/distilabel/llm/llama_cpp.py
class LlamaCppLLM(LLM):
    def __init__(
        self,
        model: "Llama",
        task: "Task",
        max_new_tokens: int = 128,
        temperature: float = 0.8,
        top_p: float = 0.95,
        top_k: int = 40,
        repeat_penalty: float = 1.1,
        seed: int = 1337,
        prompt_format: Union[SupportedFormats, None] = None,
        prompt_formatting_fn: Union[Callable[..., str], None] = None,
    ) -> None:
        """Initializes the LlamaCppLLM class.

        Args:
            model (Llama): the llama-cpp model to be used.
            task (Task): the task to be performed by the LLM.
            max_new_tokens (int, optional): the maximum number of tokens to be generated.
                Defaults to 128.
            temperature (float, optional): the temperature to be used for generation.
                Defaults to 0.8.
            top_p (float, optional): the top-p value to be used for generation.
                Defaults to 0.95.
            top_k (int, optional): the top-k value to be used for generation.
                Defaults to 40.
            repeat_penalty (float, optional): the repeat penalty to be used for generation.
                Defaults to 1.1.
            seed (int, optional): the seed to be used for generation, setting it to -1 implies
                that a different response will be generated on each generation, similarly to
                HuggingFace's `do_sample` arg. Defaults to 1337.
            prompt_format (Union[SupportedFormats, None], optional): the format to be used
                for the prompt. If `None`, the default format of the task will be used, available
                formats are `openai`, `chatml`, `llama2`, `zephyr`, and `default`. Defaults to `None`,
                but `default` (concatenation of `system_prompt` and `formatted_prompt` with a line-break)
                will be used if no `prompt_formatting_fn` is provided.
            prompt_formatting_fn (Union[Callable[..., str], None], optional): a function to be
                applied to the prompt before generation. If `None`, no formatting will be applied.
                Defaults to `None`.

        Examples:
            >>> from llama_cpp import Llama
            >>> from distilabel.tasks.text_generation import TextGenerationTask as Task
            >>> from distilabel.llm import LlamaCppLLM
            >>> model = Llama(model_path="path/to/model")
            >>> task = Task()
            >>> llm = LlamaCppLLM(model=model, task=task)
        """
        super().__init__(
            task=task,
            prompt_format=prompt_format,
            prompt_formatting_fn=prompt_formatting_fn,
        )

        if not _LLAMA_CPP_AVAILABLE:
            raise ImportError(
                "`LlamaCppLLM` cannot be used as `llama_cpp` is not installed, please "
                " install it with `pip install llama-cpp-python`."
            )

        self.max_tokens = max_new_tokens
        self.temperature = temperature
        self.top_p = top_p
        self.top_k = top_k
        self.repeat_penalty = repeat_penalty
        self.seed = seed

        self.model = model

    def __rich_repr__(self) -> Generator[Any, None, None]:
        yield from super().__rich_repr__()
        yield (
            "parameters",
            {
                "max_new_tokens": self.max_tokens,
                "temperature": self.temperature,
                "top_p": self.top_p,
                "top_k": self.top_k,
                "repeat_penalty": self.repeat_penalty,
            },
        )

    @property
    def model_name(self) -> str:
        """Returns the name of the llama-cpp model, which is the same as the model path."""
        return self.model.model_path

    def _generate(
        self, inputs: List[Dict[str, Any]], num_generations: int = 1
    ) -> List[List[LLMOutput]]:
        """Generates `num_generations` for each input in `inputs`.

        Args:
            inputs (List[Dict[str, Any]]): the inputs to be used for generation.
            num_generations (int, optional): the number of generations to be performed for each
                input. Defaults to 1.

        Returns:
            List[List[LLMOutput]]: the generated outputs.
        """
        prompts = self._generate_prompts(
            inputs, default_format=None, expected_output_type=str
        )
        outputs = []
        for prompt in prompts:
            output = []
            for _ in range(num_generations):
                raw_output = self.model.create_completion(
                    prompt,
                    max_tokens=self.max_tokens,
                    temperature=self.temperature,
                    top_p=self.top_p,
                    top_k=self.top_k,
                    repeat_penalty=self.repeat_penalty,
                )
                try:
                    parsed_output = self.task.parse_output(
                        raw_output["choices"][0]["text"].strip()
                    )
                except Exception as e:
                    logger.error(f"Error parsing llama-cpp output: {e}")
                    parsed_output = None
                output.append(
                    LLMOutput(
                        model_name=self.model_name,
                        prompt_used=prompt,
                        raw_output=raw_output,
                        parsed_output=parsed_output,
                    )
                )
            outputs.append(output)
        return outputs

model_name: str property

Returns the name of the llama-cpp model, which is the same as the model path.

__init__(model, task, max_new_tokens=128, temperature=0.8, top_p=0.95, top_k=40, repeat_penalty=1.1, seed=1337, prompt_format=None, prompt_formatting_fn=None)

Initializes the LlamaCppLLM class.

Parameters:

    model (Llama): the llama-cpp model to be used. Required.
    task (Task): the task to be performed by the LLM. Required.
    max_new_tokens (int): the maximum number of tokens to be generated. Defaults to 128.
    temperature (float): the temperature to be used for generation. Defaults to 0.8.
    top_p (float): the top-p value to be used for generation. Defaults to 0.95.
    top_k (int): the top-k value to be used for generation. Defaults to 40.
    repeat_penalty (float): the repeat penalty to be used for generation. Defaults to 1.1.
    seed (int): the seed to be used for generation; setting it to -1 implies that a different response will be generated on each generation, similarly to HuggingFace's do_sample arg. Defaults to 1337.
    prompt_format (Union[SupportedFormats, None]): the format to be used for the prompt. If None, the default format of the task will be used; available formats are openai, chatml, llama2, zephyr, and default. Defaults to None, but default (concatenation of system_prompt and formatted_prompt with a line break) will be used if no prompt_formatting_fn is provided.
    prompt_formatting_fn (Union[Callable[..., str], None]): a function to be applied to the prompt before generation. If None, no formatting will be applied. Defaults to None.

Examples:

>>> from llama_cpp import Llama
>>> from distilabel.tasks.text_generation import TextGenerationTask as Task
>>> from distilabel.llm import LlamaCppLLM
>>> model = Llama(model_path="path/to/model")
>>> task = Task()
>>> llm = LlamaCppLLM(model=model, task=task)
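
A hedged continuation of the example above (the "input" key is assumed to match TextGenerationTask's generate_prompt signature): raw_output holds the full llama-cpp completion dict, while parsed_output is the task-parsed text.

>>> outputs = llm.generate(inputs=[{"input": "Explain what a GGUF file is."}])
>>> outputs[0][0]["parsed_output"]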
Source code in src/distilabel/llm/llama_cpp.py
def __init__(
    self,
    model: "Llama",
    task: "Task",
    max_new_tokens: int = 128,
    temperature: float = 0.8,
    top_p: float = 0.95,
    top_k: int = 40,
    repeat_penalty: float = 1.1,
    seed: int = 1337,
    prompt_format: Union[SupportedFormats, None] = None,
    prompt_formatting_fn: Union[Callable[..., str], None] = None,
) -> None:
    """Initializes the LlamaCppLLM class.

    Args:
        model (Llama): the llama-cpp model to be used.
        task (Task): the task to be performed by the LLM.
        max_new_tokens (int, optional): the maximum number of tokens to be generated.
            Defaults to 128.
        temperature (float, optional): the temperature to be used for generation.
            Defaults to 0.8.
        top_p (float, optional): the top-p value to be used for generation.
            Defaults to 0.95.
        top_k (int, optional): the top-k value to be used for generation.
            Defaults to 40.
        repeat_penalty (float, optional): the repeat penalty to be used for generation.
            Defaults to 1.1.
        seed (int, optional): the seed to be used for generation, setting it to -1 implies
            that a different response will be generated on each generation, similarly to
            HuggingFace's `do_sample` arg. Defaults to 1337.
        prompt_format (Union[SupportedFormats, None], optional): the format to be used
            for the prompt. If `None`, the default format of the task will be used, available
            formats are `openai`, `chatml`, `llama2`, `zephyr`, and `default`. Defaults to `None`,
            but `default` (concatenation of `system_prompt` and `formatted_prompt` with a line-break)
            will be used if no `prompt_formatting_fn` is provided.
        prompt_formatting_fn (Union[Callable[..., str], None], optional): a function to be
            applied to the prompt before generation. If `None`, no formatting will be applied.
            Defaults to `None`.

    Examples:
        >>> from llama_cpp import Llama
        >>> from distilabel.tasks.text_generation import TextGenerationTask as Task
        >>> from distilabel.llm import LlamaCppLLM
        >>> model = Llama(model_path="path/to/model")
        >>> task = Task()
        >>> llm = LlamaCppLLM(model=model, task=task)
    """
    super().__init__(
        task=task,
        prompt_format=prompt_format,
        prompt_formatting_fn=prompt_formatting_fn,
    )

    if not _LLAMA_CPP_AVAILABLE:
        raise ImportError(
            "`LlamaCppLLM` cannot be used as `llama_cpp` is not installed, please "
            " install it with `pip install llama-cpp-python`."
        )

    self.max_tokens = max_new_tokens
    self.temperature = temperature
    self.top_p = top_p
    self.top_k = top_k
    self.repeat_penalty = repeat_penalty
    self.seed = seed

    self.model = model

OpenAILLM

Bases: LLM

Source code in src/distilabel/llm/openai.py
class OpenAILLM(LLM):
    def __init__(
        self,
        task: "Task",
        model: str = "gpt-3.5-turbo",
        client: Union["OpenAI", None] = None,
        openai_api_key: Union[str, None] = None,
        max_new_tokens: int = 128,
        frequency_penalty: float = 0.0,
        presence_penalty: float = 0.0,
        temperature: float = 1.0,
        top_p: float = 1.0,
        num_threads: Union[int, None] = None,
        prompt_format: Union["SupportedFormats", None] = None,
        prompt_formatting_fn: Union[Callable[..., str], None] = None,
    ) -> None:
        """Initializes the OpenAILLM class.

        Args:
            task (Task): the task to be performed by the LLM.
            model (str, optional): the model to be used for generation. Defaults to "gpt-3.5-turbo".
            client (Union[OpenAI, None], optional): an OpenAI client to be used for generation.
                If `None`, a new client will be created. Defaults to `None`.
            openai_api_key (Union[str, None], optional): the OpenAI API key to be used for generation.
                If `None`, the `OPENAI_API_KEY` environment variable will be used. Defaults to `None`.
            max_new_tokens (int, optional): the maximum number of tokens to be generated.
                Defaults to 128.
            frequency_penalty (float, optional): the frequency penalty to be used for generation.
                Defaults to 0.0.
            presence_penalty (float, optional): the presence penalty to be used for generation.
                Defaults to 0.0.
            temperature (float, optional): the temperature to be used for generation.
                Defaults to 1.0.
            top_p (float, optional): the top-p value to be used for generation.
                Defaults to 1.0.
            num_threads (Union[int, None], optional): the number of threads to be used
                for parallel generation. If `None`, no parallel generation will be performed.
                Defaults to `None`.
            prompt_format (Union[SupportedFormats, None], optional): the format to be used
                for the prompt. If `None`, the default format of the task will be used, available
                formats are `openai`, `chatml`, `llama2`, `zephyr`, and `default`. Defaults to `None`,
                but `default` (concatenation of `system_prompt` and `formatted_prompt` with a line-break)
                will be used if no `prompt_formatting_fn` is provided.
            prompt_formatting_fn (Union[Callable[..., str], None], optional): a function to be
                applied to the prompt before generation. If `None`, no formatting will be applied.
                Defaults to `None`.

        Raises:
            AssertionError: if the provided `model` is not available in your OpenAI account.

        Examples:
            >>> from distilabel.tasks.text_generation import TextGenerationTask as Task
            >>> from distilabel.llm import OpenAILLM
            >>> task = Task()
            >>> llm = OpenAILLM(model="gpt-3.5-turbo", task=task)
        """
        super().__init__(
            task=task,
            num_threads=num_threads,
            prompt_format=prompt_format,
            prompt_formatting_fn=prompt_formatting_fn,
        )

        if not _OPENAI_AVAILABLE:
            raise ImportError(
                "`OpenAILLM` cannot be used as `openai` is not installed, please "
                " install it with `pip install openai`."
            )

        self.max_tokens = max_new_tokens
        self.frequency_penalty = frequency_penalty
        self.presence_penalty = presence_penalty
        self.temperature = temperature
        self.top_p = top_p

        self.client = client or OpenAI(api_key=openai_api_key, max_retries=6)

        assert (
            model in self.available_models
        ), f"Provided `model` is not available in your OpenAI account, available models are {self.available_models}"
        self.model = model

    def __rich_repr__(self) -> Generator[Any, None, None]:
        yield from super().__rich_repr__()
        yield (
            "parameters",
            {
                "max_tokens": self.max_tokens,
                "frequency_penalty": self.frequency_penalty,
                "presence_penalty": self.presence_penalty,
                "temperature": self.temperature,
                "top_p": self.top_p,
            },
        )

    @cached_property
    def available_models(self) -> List[str]:
        """Returns the list of available models in your OpenAI account."""
        return [model.id for model in self.client.models.list().data]

    @property
    def model_name(self) -> str:
        """Returns the name of the OpenAI model."""
        return self.model

    def _generate(
        self,
        inputs: List[Dict[str, Any]],
        num_generations: int = 1,
    ) -> List[List[LLMOutput]]:
        """Generates `num_generations` for each input in `inputs`.

        Args:
            inputs (List[Dict[str, Any]]): the inputs to be used for generation.
            num_generations (int, optional): the number of generations to be performed for each
                input. Defaults to 1.

        Returns:
            List[List[LLMOutput]]: the generated outputs.
        """
        prompts = self._generate_prompts(
            inputs, default_format="openai", expected_output_type=list
        )
        outputs = []
        for prompt in prompts:
            chat_completions = self.client.chat.completions.create(
                messages=prompt,
                model=self.model,
                n=num_generations,
                max_tokens=self.max_tokens,
                frequency_penalty=self.frequency_penalty,
                presence_penalty=self.presence_penalty,
                temperature=self.temperature,
                top_p=self.top_p,
                timeout=50,
            )

            output = []
            for chat_completion in chat_completions.choices:
                try:
                    parsed_response = self.task.parse_output(
                        chat_completion.message.content.strip()
                    )
                except Exception as e:
                    logger.error(f"Error parsing OpenAI response: {e}")
                    parsed_response = None
                output.append(
                    LLMOutput(
                        model_name=self.model_name,
                        prompt_used=prompt,
                        raw_output=chat_completion.message.content,
                        parsed_output=parsed_response,
                    )
                )
            outputs.append(output)
        return outputs

available_models: List[str] cached property

Returns the list of available models in your OpenAI account.

model_name: str property

Returns the name of the OpenAI model.

__init__(task, model='gpt-3.5-turbo', client=None, openai_api_key=None, max_new_tokens=128, frequency_penalty=0.0, presence_penalty=0.0, temperature=1.0, top_p=1.0, num_threads=None, prompt_format=None, prompt_formatting_fn=None)

Initializes the OpenAILLM class.

Parameters:

    task (Task): the task to be performed by the LLM. Required.
    model (str): the model to be used for generation. Defaults to "gpt-3.5-turbo".
    client (Union[OpenAI, None]): an OpenAI client to be used for generation. If None, a new client will be created. Defaults to None.
    openai_api_key (Union[str, None]): the OpenAI API key to be used for generation. If None, the OPENAI_API_KEY environment variable will be used. Defaults to None.
    max_new_tokens (int): the maximum number of tokens to be generated. Defaults to 128.
    frequency_penalty (float): the frequency penalty to be used for generation. Defaults to 0.0.
    presence_penalty (float): the presence penalty to be used for generation. Defaults to 0.0.
    temperature (float): the temperature to be used for generation. Defaults to 1.0.
    top_p (float): the top-p value to be used for generation. Defaults to 1.0.
    num_threads (Union[int, None]): the number of threads to be used for parallel generation. If None, no parallel generation will be performed. Defaults to None.
    prompt_format (Union[SupportedFormats, None]): the format to be used for the prompt. If None, the default format of the task will be used; available formats are openai, chatml, llama2, zephyr, and default. Defaults to None, but default (concatenation of system_prompt and formatted_prompt with a line break) will be used if no prompt_formatting_fn is provided.
    prompt_formatting_fn (Union[Callable[..., str], None]): a function to be applied to the prompt before generation. If None, no formatting will be applied. Defaults to None.

Raises:

    AssertionError: if the provided model is not available in your OpenAI account.

Examples:

>>> from distilabel.tasks.text_generation import TextGenerationTask as Task
>>> from distilabel.llm import OpenAILLM
>>> task = Task()
>>> llm = OpenAILLM(model="gpt-3.5-turbo", task=task)
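
A hedged continuation of the example above: since no num_threads was passed, generate returns the nested list of LLMOutputs directly; the "input" key is assumed to match TextGenerationTask's generate_prompt signature, and LLMOutput is assumed to be dict-like.

>>> outputs = llm.generate(
...     inputs=[{"input": "Give me three names for a data-labelling project."}],
...     num_generations=2,
... )
>>> for generation in outputs[0]:
...     print(generation["model_name"], generation["parsed_output"])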
Source code in src/distilabel/llm/openai.py
def __init__(
    self,
    task: "Task",
    model: str = "gpt-3.5-turbo",
    client: Union["OpenAI", None] = None,
    openai_api_key: Union[str, None] = None,
    max_new_tokens: int = 128,
    frequency_penalty: float = 0.0,
    presence_penalty: float = 0.0,
    temperature: float = 1.0,
    top_p: float = 1.0,
    num_threads: Union[int, None] = None,
    prompt_format: Union["SupportedFormats", None] = None,
    prompt_formatting_fn: Union[Callable[..., str], None] = None,
) -> None:
    """Initializes the OpenAILLM class.

    Args:
        task (Task): the task to be performed by the LLM.
        model (str, optional): the model to be used for generation. Defaults to "gpt-3.5-turbo".
        client (Union[OpenAI, None], optional): an OpenAI client to be used for generation.
            If `None`, a new client will be created. Defaults to `None`.
        openai_api_key (Union[str, None], optional): the OpenAI API key to be used for generation.
            If `None`, the `OPENAI_API_KEY` environment variable will be used. Defaults to `None`.
        max_new_tokens (int, optional): the maximum number of tokens to be generated.
            Defaults to 128.
        frequency_penalty (float, optional): the frequency penalty to be used for generation.
            Defaults to 0.0.
        presence_penalty (float, optional): the presence penalty to be used for generation.
            Defaults to 0.0.
        temperature (float, optional): the temperature to be used for generation.
            Defaults to 1.0.
        top_p (float, optional): the top-p value to be used for generation.
            Defaults to 1.0.
        num_threads (Union[int, None], optional): the number of threads to be used
            for parallel generation. If `None`, no parallel generation will be performed.
            Defaults to `None`.
        prompt_format (Union[SupportedFormats, None], optional): the format to be used
            for the prompt. If `None`, the default format of the task will be used, available
            formats are `openai`, `chatml`, `llama2`, `zephyr`, and `default`. Defaults to `None`,
            but `default` (concatenation of `system_prompt` and `formatted_prompt` with a line-break)
            will be used if no `prompt_formatting_fn` is provided.
        prompt_formatting_fn (Union[Callable[..., str], None], optional): a function to be
            applied to the prompt before generation. If `None`, no formatting will be applied.
            Defaults to `None`.

    Raises:
        AssertionError: if the provided `model` is not available in your OpenAI account.

    Examples:
        >>> from distilabel.tasks.text_generation import TextGenerationTask as Task
        >>> from distilabel.llm import OpenAILLM
        >>> task = Task()
        >>> llm = OpenAILLM(model="gpt-3.5-turbo", task=task)
    """
    super().__init__(
        task=task,
        num_threads=num_threads,
        prompt_format=prompt_format,
        prompt_formatting_fn=prompt_formatting_fn,
    )

    if not _OPENAI_AVAILABLE:
        raise ImportError(
            "`OpenAILLM` cannot be used as `openai` is not installed, please "
            " install it with `pip install openai`."
        )

    self.max_tokens = max_new_tokens
    self.frequency_penalty = frequency_penalty
    self.presence_penalty = presence_penalty
    self.temperature = temperature
    self.top_p = top_p

    self.client = client or OpenAI(api_key=openai_api_key, max_retries=6)

    assert (
        model in self.available_models
    ), f"Provided `model` is not available in your OpenAI account, available models are {self.available_models}"
    self.model = model

ProcessLLM

A class that wraps an LLM and performs generation in a separate process. The result is a Future that will be set when the generation is completed.

This class creates a new child process that will load the LLM and perform the text generation. In order to communicate with this child process, a bridge thread is created in the main process. The bridge thread will send and receive the results from the child process using multiprocessing.Queues. The communication between the bridge thread and the main process is done using Futures. This architecture was inspired by the ProcessPoolExecutor from the concurrent.futures module and it's a simplified version of it.
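A minimal usage sketch (illustrative, not part of the library source): `load_llm_fn` runs inside the child process and must return an `LLM` instance, `generate` returns a `Future`, and `teardown` stops the bridge thread and the child process. The `{"input": ...}` key assumes a `TextGenerationTask`.

>>> from distilabel.tasks.text_generation import TextGenerationTask
>>> from distilabel.llm import OpenAILLM, ProcessLLM
>>> def load_openai_llm(task):
...     # Executed in the child process; must return an `LLM` instance.
...     return OpenAILLM(model="gpt-3.5-turbo", task=task)
>>> llm = ProcessLLM(task=TextGenerationTask(), load_llm_fn=load_openai_llm)
>>> future = llm.generate([{"input": "What is synthetic data?"}], num_generations=1)
>>> results = future.result()  # blocks until the child process finishes generating
>>> llm.teardown()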

Source code in src/distilabel/llm/base.py
class ProcessLLM:
    """A class that wraps an `LLM` and performs generation in a separate process. The
    result is a `Future` that will be set when the generation is completed.

    This class creates a new child process that will load the `LLM` and perform the
    text generation. In order to communicate with this child process, a bridge thread
    is created in the main process. The bridge thread will send and receive the results
    from the child process using `multiprocessing.Queue`s. The communication between the
    bridge thread and the main process is done using `Future`s. This architecture was
    inspired by the `ProcessPoolExecutor` from the `concurrent.futures` module and it's
    a simplified version of it.
    """

    def __init__(self, task: Task, load_llm_fn: Callable[[Task], LLM]) -> None:
        """Initializes the `ProcessLLM` class.

        Args:
            task: the task to be performed by the `LLM`. This task will be used by the
                child process when calling the `load_llm_fn`.
            load_llm_fn (Callable[[Task], LLM]): a function that will be executed in the
                child process to load the `LLM`. It must return an `LLM` instance.
        """
        self.task = task

        self._load_llm_fn = load_llm_fn

        # The bridge thread will act as a bridge between the main process and the child
        # process for communication. It will send the generation requests to the child
        # process and receive the results from the child process.
        self._bridge_thread = None

        # The child process which will load the `LLM` and perform the generation.
        self._generation_process = None

        # The `Semaphore` that will be used to synchronize the loading of the `LLM`.
        # `_BridgeThread` will be blocked until `_GenerationProcess` has called the
        # `load_llm_fn` and the `LLM` has been loaded.
        self._load_llm_sem = mp.Semaphore(0)

        # This thread will create text generation requests
        self.pending_text_generation_request: Dict[int, _TextGenerationRequest] = {}
        self.text_generation_request_count = 0
        self.text_generation_request_ids_queue: queue.Queue[int] = queue.Queue()

        # Queues for the communication between the `_BridgeThread` and the `_GenerationProcess`
        self._call_queue = mp.Queue()
        self._result_queue = mp.Queue()

        # Shared memory object for transferring the `model_name` to the main process
        # once the `LLM` is loaded
        self._model_name = mp.Array(c_char, MAX_MODEL_NAME_LENGTH)

    def _start_bridge_thread(self) -> None:
        """Starts the bridge thread and the generation process."""
        if self._bridge_thread is None:
            self._generation_process = _GenerationProcess(self)
            self._generation_process.start()
            pid = self._generation_process.pid
            logger.debug(f"Generation process with PID {pid} started!")

            self._bridge_thread = _BridgeThread(self)
            self._bridge_thread.start()
            logger.debug("Bridge thread for process with PID {pid} started!")

    def _add_text_generation_request(
        self,
        inputs: List[Dict[str, Any]],
        num_generations: int = 1,
        progress_callback_func: Union[Callable, None] = None,
    ) -> Future[List[List["LLMOutput"]]]:
        """Creates and send a new text generation request to the bridge thread. This thread
        and the bridge thread shares a dictionary used to store the text generation requests.
        This thread will add the text generation requests to the dictionary and the bridge
        thread will only read from it. In order for the bridge thread to know that a new
        text generation request has been added to the dictionary, this thread will put the
        id of the request in a queue. The bridge thread will read from this queue and get
        the text generation request from the dictionary.
        """

        def _progress():
            if progress_callback_func is not None:
                progress_callback_func(advance=num_generations * len(inputs))

        text_generation_request = _TextGenerationRequest(
            inputs=inputs, num_generations=num_generations
        )
        # Put the request information in the dictionary associated to the request id
        self.pending_text_generation_request[
            self.text_generation_request_count
        ] = text_generation_request
        # Put the request id in the queue (for the `_BridgeThread` to consume it)
        self.text_generation_request_ids_queue.put(self.text_generation_request_count)
        self.text_generation_request_count += 1
        text_generation_request.future.add_done_callback(lambda _: _progress())
        return text_generation_request.future

    def generate(
        self,
        inputs: List[Dict[str, Any]],
        num_generations: int = 1,
        progress_callback_func: Union[Callable, None] = None,
    ) -> Future[List[List["LLMOutput"]]]:
        """Generates the outputs for the given inputs using the `ProcessLLM` and its loaded
        `LLM`.

        Args:
            inputs (List[Dict[str, Any]]): the inputs to be used for generation.
            num_generations (int, optional): the number of generations to be performed for each input.
                Defaults to `1`.
            progress_callback_func (Union[Callable, None], optional): a function to be called at each
                generation step. Defaults to `None`.

        Returns:
            Future[List[List["LLMOutput"]]]: the generated outputs as a `Future`.
        """
        self._start_bridge_thread()
        return self._add_text_generation_request(
            inputs, num_generations, progress_callback_func
        )

    def teardown(self) -> None:
        """Stops the bridge thread and the generation process."""
        if self._generation_process is not None:
            self._generation_process.stop()
            self._generation_process.join()

        if self._bridge_thread is not None:
            self._bridge_thread.stop()
            self._bridge_thread.join()

    @cached_property
    def model_name(self) -> str:
        """Returns the model name of the `LLM` once it has been loaded."""
        with self._model_name:
            return "".join([c.decode() for c in self._model_name if c != b"\0"])

    @property
    def return_futures(self) -> bool:
        """Whether the `LLM` returns futures"""
        return True

model_name: str cached property

Returns the model name of the LLM once it has been loaded.

return_futures: bool property

Whether the LLM returns futures.

__init__(task, load_llm_fn)

Initializes the ProcessLLM class.

Parameters:

Name Type Description Default
task Task

the task to be performed by the LLM. This task will be used by the child process when calling the load_llm_fn.

required
load_llm_fn Callable[[Task], LLM]

a function that will be executed in the child process to load the LLM. It must return an LLM instance.

required
Source code in src/distilabel/llm/base.py
def __init__(self, task: Task, load_llm_fn: Callable[[Task], LLM]) -> None:
    """Initializes the `ProcessLLM` class.

    Args:
        task: the task to be performed by the `LLM`. This task will be used by the
            child process when calling the `load_llm_fn`.
        load_llm_fn (Callable[[Task], LLM]): a function that will be executed in the
            child process to load the `LLM`. It must return an `LLM` instance.
    """
    self.task = task

    self._load_llm_fn = load_llm_fn

    # The bridge thread will act as a bridge between the main process and the child
    # process for communication. It will send the generation requests to the child
    # process and receive the results from the child process.
    self._bridge_thread = None

    # The child process which will load the `LLM` and perform the generation.
    self._generation_process = None

    # The `Semaphore` that will be used to synchronize the loading of the `LLM`.
    # `_BridgeThread` will be blocked until `_GenerationProcess` has called the
    # `load_llm_fn` and the `LLM` has been loaded.
    self._load_llm_sem = mp.Semaphore(0)

    # This thread will create text generation requests
    self.pending_text_generation_request: Dict[int, _TextGenerationRequest] = {}
    self.text_generation_request_count = 0
    self.text_generation_request_ids_queue: queue.Queue[int] = queue.Queue()

    # Queues for the communication between the `_BridgeThread` and the `_GenerationProcess`
    self._call_queue = mp.Queue()
    self._result_queue = mp.Queue()

    # Shared memory object for transferring the `model_name` to the main process
    # once the `LLM` is loaded
    self._model_name = mp.Array(c_char, MAX_MODEL_NAME_LENGTH)

generate(inputs, num_generations=1, progress_callback_func=None)

Generates the outputs for the given inputs using the ProcessLLM and its loaded LLM.

Parameters:

Name Type Description Default
inputs List[Dict[str, Any]]

the inputs to be used for generation.

required
num_generations int

the number of generations to be performed for each input. Defaults to 1.

1
progress_callback_func Union[Callable, None]

a function to be called at each generation step. Defaults to None.

None

Returns:

Type Description
Future[List[List['LLMOutput']]]

Future[List[List["LLMOutput"]]]: the generated outputs as a Future.
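A hedged illustration of the `progress_callback_func` hook, reusing the `llm` from the sketch above: according to the `_add_text_generation_request` source, the callback is invoked with a single `advance` keyword argument once the request's `Future` completes, so any callable accepting `advance` works.

>>> def report_progress(advance):
...     print(f"Finished {advance} generations")
>>> future = llm.generate(
...     [{"input": "Name three dataset formats."}],
...     num_generations=2,
...     progress_callback_func=report_progress,
... )
>>> outputs = future.result()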

Source code in src/distilabel/llm/base.py
def generate(
    self,
    inputs: List[Dict[str, Any]],
    num_generations: int = 1,
    progress_callback_func: Union[Callable, None] = None,
) -> Future[List[List["LLMOutput"]]]:
    """Generates the outputs for the given inputs using the `ProcessLLM` and its loaded
    `LLM`.

    Args:
        inputs (List[Dict[str, Any]]): the inputs to be used for generation.
        num_generations (int, optional): the number of generations to be performed for each input.
            Defaults to `1`.
        progress_callback_func (Union[Callable, None], optional): a function to be called at each
            generation step. Defaults to `None`.

    Returns:
        Future[List[List["LLMOutput"]]]: the generated outputs as a `Future`.
    """
    self._start_bridge_thread()
    return self._add_text_generation_request(
        inputs, num_generations, progress_callback_func
    )

teardown()

Stops the bridge thread and the generation process.
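Because the wrapped `LLM` lives in a separate process, it is worth guaranteeing that `teardown` runs even if generation fails. A small sketch of that pattern, assuming the `llm` from the earlier example:

>>> try:
...     future = llm.generate([{"input": "Summarize distilabel in one sentence."}])
...     print(future.result())
... finally:
...     llm.teardown()  # always stop the bridge thread and the generation process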

Source code in src/distilabel/llm/base.py
def teardown(self) -> None:
    """Stops the bridge thread and the generation process."""
    if self._generation_process is not None:
        self._generation_process.stop()
        self._generation_process.join()

    if self._bridge_thread is not None:
        self._bridge_thread.stop()
        self._bridge_thread.join()

TransformersLLM

Bases: LLM

Source code in src/distilabel/llm/huggingface/transformers.py
class TransformersLLM(LLM):
    def __init__(
        self,
        model: "PreTrainedModel",
        tokenizer: "PreTrainedTokenizer",
        task: "Task",
        max_new_tokens: int = 128,
        do_sample: bool = False,
        temperature: float = 1.0,
        top_k: int = 50,
        top_p: float = 1.0,
        typical_p: float = 1.0,
        num_threads: Union[int, None] = None,
        prompt_format: Union["SupportedFormats", None] = None,
        prompt_formatting_fn: Union[Callable[..., str], None] = None,
    ) -> None:
        """Initializes the TransformersLLM class.

        Args:
            model (PreTrainedModel): the model to be used for generation.
            tokenizer (PreTrainedTokenizer): the tokenizer to be used for generation.
            task (Task): the task to be performed by the LLM.
            max_new_tokens (int, optional): the maximum number of tokens to be generated.
                Defaults to 128.
            do_sample (bool, optional): whether to sample from the model or not.
                Defaults to False.
            temperature (float, optional): the temperature to be used for generation.
                Defaults to 1.0.
            top_k (int, optional): the top-k value to be used for generation.
                Defaults to 50.
            top_p (float, optional): the top-p value to be used for generation.
                Defaults to 1.0.
            typical_p (float, optional): the typical-p value to be used for generation.
                Defaults to 1.0.
            num_threads (Union[int, None], optional): the number of threads to be used for generation.
                If `None`, the number of threads will be set to the number of available CPUs.
                Defaults to `None`.
            prompt_format (Union[SupportedFormats, None], optional): the format to be used
                for formatting the prompts. If `None`, the prompts will not be formatted.
                Defaults to `None`.
            prompt_formatting_fn (Union[Callable[..., str], None], optional): the function to be used
                for formatting the prompts. If `None`, the prompts will not be formatted.

        Examples:
            >>> from transformers import AutoModelForCausalLM, AutoTokenizer
            >>> from distilabel.tasks.text_generation import TextGenerationTask as Task
            >>> from distilabel.llm import TransformersLLM
            >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
            >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
            >>> task = Task()
            >>> llm = TransformersLLM(
            ...     model=model,
            ...     tokenizer=tokenizer,
            ...     task=task,
            ... )
        """
        super().__init__(
            task=task,
            num_threads=num_threads,
            prompt_format=prompt_format,
            prompt_formatting_fn=prompt_formatting_fn,
        )

        self.max_new_tokens = max_new_tokens
        self.do_sample = do_sample
        self.temperature = temperature
        self.top_k = top_k
        self.top_p = top_p
        self.typical_p = typical_p

        self.model = model
        self.tokenizer = tokenizer

        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        if (
            hasattr(self.tokenizer, "use_default_system_prompt")
            and self.tokenizer.use_default_system_prompt  # type: ignore
        ):
            # The `tokenizer` also has a method named `apply_chat_template` that expects a `Conversation` as OpenAI does with the ChatML format
            warnings.warn(
                "The provided `tokenizer` has `use_default_system_prompt=True` which means that the default system prompt will be used, which may collide with the `task` provided as an arg to this class.",
                UserWarning,
                stacklevel=2,
            )

    def __rich_repr__(self) -> Generator[Any, None, None]:
        yield from super().__rich_repr__()
        yield (
            "parameters",
            {
                "max_new_tokens": self.max_new_tokens,
                "do_sample": self.do_sample,
                "temperature": self.temperature,
                "top_k": self.top_k,
                "top_p": self.top_p,
                "typical_p": self.typical_p,
            },
        )

    @property
    def model_name(self) -> str:
        """Returns the name of the Transformers model."""
        return self.model.config.name_or_path

    def _generate(
        self, inputs: List[Dict[str, Any]], num_generations: int = 1
    ) -> List[List[LLMOutput]]:
        """Generates `num_generations` for each input in `inputs`.

        Args:
            inputs (List[Dict[str, Any]]): the inputs to be used for generation.
            num_generations (int, optional): the number of generations to be performed for each
                input. Defaults to 1.

        Returns:
            List[List[LLMOutput]]: the outputs of the LLM.
        """
        prompts = self._generate_prompts(
            inputs, default_format=None, expected_output_type=str
        )
        encodings = self.tokenizer(prompts, padding=True, return_tensors="pt")
        encodings = encodings.to(self.model.device)
        with torch.inference_mode():
            generated_ids = self.model.generate(
                **encodings,  # type: ignore
                pad_token_id=self.tokenizer.eos_token_id,
                generation_config=GenerationConfig(
                    do_sample=self.do_sample,
                    temperature=self.temperature,
                    max_new_tokens=self.max_new_tokens,
                    top_k=self.top_k,
                    top_p=self.top_p,
                    typical_p=self.typical_p,
                    num_return_sequences=num_generations,
                ),
            )
        raw_outputs = self.tokenizer.batch_decode(
            generated_ids[:, encodings.input_ids.shape[1] :],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        outputs = []
        for prompt, i in zip(prompts, range(0, len(raw_outputs), num_generations)):
            output = []
            for raw_output in raw_outputs[i : i + num_generations]:
                try:
                    parsed_output = self.task.parse_output(raw_output)
                except Exception as e:
                    logger.error(f"Error parsing Transformers output: {e}")
                    parsed_output = None
                output.append(
                    LLMOutput(
                        model_name=self.model_name,
                        prompt_used=prompt,
                        raw_output=raw_output,
                        parsed_output=parsed_output,
                    )
                )
            outputs.append(output)
        return outputs

model_name: str property

Returns the name of the Transformers model.

__init__(model, tokenizer, task, max_new_tokens=128, do_sample=False, temperature=1.0, top_k=50, top_p=1.0, typical_p=1.0, num_threads=None, prompt_format=None, prompt_formatting_fn=None)

Initializes the TransformersLLM class.

Parameters:

Name Type Description Default
model PreTrainedModel

the model to be used for generation.

required
tokenizer PreTrainedTokenizer

the tokenizer to be used for generation.

required
task Task

the task to be performed by the LLM.

required
max_new_tokens int

the maximum number of tokens to be generated. Defaults to 128.

128
do_sample bool

whether to sample from the model or not. Defaults to False.

False
temperature float

the temperature to be used for generation. Defaults to 1.0.

1.0
top_k int

the top-k value to be used for generation. Defaults to 50.

50
top_p float

the top-p value to be used for generation. Defaults to 1.0.

1.0
typical_p float

the typical-p value to be used for generation. Defaults to 1.0.

1.0
num_threads Union[int, None]

the number of threads to be used for generation. If None, the number of threads will be set to the number of available CPUs. Defaults to None.

None
prompt_format Union[SupportedFormats, None]

the format to be used for formatting the prompts. If None, the prompts will not be formatted. Defaults to None.

None
prompt_formatting_fn Union[Callable[..., str], None]

the function to be used for formatting the prompts. If None, the prompts will not be formatted.

None

Examples:

>>> from transformers import AutoModelForCausalLM, AutoTokenizer
>>> from distilabel.tasks.text_generation import TextGenerationTask as Task
>>> from distilabel.llm import TransformersLLM
>>> model = AutoModelForCausalLM.from_pretrained("gpt2")
>>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
>>> task = Task()
>>> llm = TransformersLLM(
...     model=model,
...     tokenizer=tokenizer,
...     task=task,
... )
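A follow-up sketch for running generation with the instance created above (illustrative, not library source): the `{"input": ...}` key assumes a `TextGenerationTask`, and each element of the nested result is an `LLMOutput` record built as in `_generate`, assumed to expose `raw_output` and `parsed_output` as dictionary keys. `num_generations` is kept at 1 because this example keeps the default `do_sample=False`.

>>> outputs = llm.generate([{"input": "Tell me a short joke."}], num_generations=1)
>>> for generation in outputs[0]:
...     print(generation["model_name"], generation["raw_output"])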
Source code in src/distilabel/llm/huggingface/transformers.py
def __init__(
    self,
    model: "PreTrainedModel",
    tokenizer: "PreTrainedTokenizer",
    task: "Task",
    max_new_tokens: int = 128,
    do_sample: bool = False,
    temperature: float = 1.0,
    top_k: int = 50,
    top_p: float = 1.0,
    typical_p: float = 1.0,
    num_threads: Union[int, None] = None,
    prompt_format: Union["SupportedFormats", None] = None,
    prompt_formatting_fn: Union[Callable[..., str], None] = None,
) -> None:
    """Initializes the TransformersLLM class.

    Args:
        model (PreTrainedModel): the model to be used for generation.
        tokenizer (PreTrainedTokenizer): the tokenizer to be used for generation.
        task (Task): the task to be performed by the LLM.
        max_new_tokens (int, optional): the maximum number of tokens to be generated.
            Defaults to 128.
        do_sample (bool, optional): whether to sample from the model or not.
            Defaults to False.
        temperature (float, optional): the temperature to be used for generation.
            Defaults to 1.0.
        top_k (int, optional): the top-k value to be used for generation.
            Defaults to 50.
        top_p (float, optional): the top-p value to be used for generation.
            Defaults to 1.0.
        typical_p (float, optional): the typical-p value to be used for generation.
            Defaults to 1.0.
        num_threads (Union[int, None], optional): the number of threads to be used for generation.
            If `None`, the number of threads will be set to the number of available CPUs.
            Defaults to `None`.
        prompt_format (Union[SupportedFormats, None], optional): the format to be used
            for formatting the prompts. If `None`, the prompts will not be formatted.
            Defaults to `None`.
        prompt_formatting_fn (Union[Callable[..., str], None], optional): the function to be used
            for formatting the prompts. If `None`, the prompts will not be formatted.

    Examples:
        >>> from transformers import AutoModelForCausalLM, AutoTokenizer
        >>> from distilabel.tasks.text_generation import TextGenerationTask as Task
        >>> from distilabel.llm import TransformersLLM
        >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
        >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
        >>> task = Task()
        >>> llm = TransformersLLM(
        ...     model=model,
        ...     tokenizer=tokenizer,
        ...     task=task,
        ... )
    """
    super().__init__(
        task=task,
        num_threads=num_threads,
        prompt_format=prompt_format,
        prompt_formatting_fn=prompt_formatting_fn,
    )

    self.max_new_tokens = max_new_tokens
    self.do_sample = do_sample
    self.temperature = temperature
    self.top_k = top_k
    self.top_p = top_p
    self.typical_p = typical_p

    self.model = model
    self.tokenizer = tokenizer

    if self.tokenizer.pad_token is None:
        self.tokenizer.pad_token = self.tokenizer.eos_token
    if (
        hasattr(self.tokenizer, "use_default_system_prompt")
        and self.tokenizer.use_default_system_prompt  # type: ignore
    ):
        # The `tokenizer` also has a method named `apply_chat_template` that expects a `Conversation` as OpenAI does with the ChatML format
        warnings.warn(
            "The provided `tokenizer` has `use_default_system_prompt=True` which means that the default system prompt will be used, which may collide with the `task` provided as an arg to this class.",
            UserWarning,
            stacklevel=2,
        )

vLLM

Bases: LLM

Source code in src/distilabel/llm/vllm.py
class vLLM(LLM):
    def __init__(
        self,
        vllm: "_vLLM",
        task: "Task",
        max_new_tokens: int = 128,
        presence_penalty: float = 0.0,
        frequency_penalty: float = 0.0,
        temperature: float = 1.0,
        top_p: float = 1.0,
        top_k: int = -1,
        prompt_format: Union["SupportedFormats", None] = None,
        prompt_formatting_fn: Union[Callable[..., str], None] = None,
    ) -> None:
        """Initializes the vLLM class.

        Args:
            vllm (_vLLM): the vLLM model to be used.
            task (Task): the task to be performed by the LLM.
            max_new_tokens (int, optional): the maximum number of tokens to be generated.
                Defaults to 128.
            presence_penalty (float, optional): the presence penalty to be used for generation.
                Defaults to 0.0.
            frequency_penalty (float, optional): the frequency penalty to be used for generation.
                Defaults to 0.0.
            temperature (float, optional): the temperature to be used for generation.
                Defaults to 1.0.
            top_p (float, optional): the top-p value to be used for generation.
                Defaults to 1.0.
            top_k (int, optional): the top-k value to be used for generation.
                Defaults to -1.
            prompt_format (Union[SupportedFormats, None], optional): the format to be used
                for the prompt. If `None`, the default format of the task will be used, available
                formats are `openai`, `chatml`, `llama2`, `zephyr`, and `default`. Defaults to `None`,
                but `default` (concatenation of `system_prompt` and `formatted_prompt` with a line-break)
                will be used if no `prompt_formatting_fn` is provided.
            prompt_formatting_fn (Union[Callable[..., str], None], optional): a function to be
                applied to the prompt before generation. If `None`, no formatting will be applied.

        Examples:
            >>> from vllm import LLM
            >>> from distilabel.tasks.text_generation import TextGenerationTask as Task
            >>> from distilabel.llm import vLLM
            >>> model = LLM(model="gpt2")
            >>> task = Task()
            >>> llm = vLLM(model=model, task=task)
        """
        super().__init__(
            task=task,
            prompt_format=prompt_format,
            prompt_formatting_fn=prompt_formatting_fn,
        )

        if not _VLLM_AVAILABLE:
            raise ImportError(
                "`vLLM` cannot be used as `vllm` is not installed, please "
                " install it with `pip install vllm`."
            )

        self.presence_penalty = presence_penalty
        self.frequency_penalty = frequency_penalty
        self.temperature = temperature
        self.top_p = top_p
        self.top_k = top_k
        self.max_tokens = max_new_tokens

        self.vllm = vllm

    def __rich_repr__(self) -> Generator[Any, None, None]:
        yield from super().__rich_repr__()
        yield (
            "parameters",
            {
                "max_tokens": self.max_tokens,
                "presence_penalty": self.presence_penalty,
                "frequency_penalty": self.frequency_penalty,
                "temperature": self.temperature,
                "top_p": self.top_p,
                "top_k": self.top_k,
            },
        )

    @property
    def model_name(self) -> str:
        """Returns the name of the vLLM model."""
        return self.vllm.llm_engine.model_config.model  # type: ignore

    def _generate(
        self, inputs: List[Dict[str, Any]], num_generations: int = 1
    ) -> List[List[LLMOutput]]:
        """Generates `num_generations` for each input in `inputs`.

        Args:
            inputs (List[Dict[str, Any]]): the inputs to be used for generation.
            num_generations (int, optional): the number of generations to be performed for each
                input. Defaults to 1.

        Returns:
            List[List[LLMOutput]]: the outputs of the LLM.
        """
        prompts = self._generate_prompts(
            inputs, default_format=None, expected_output_type=str
        )
        requests = self.vllm.generate(
            prompts,
            SamplingParams(  # type: ignore
                n=num_generations,
                presence_penalty=self.presence_penalty,
                frequency_penalty=self.frequency_penalty,
                temperature=self.temperature,
                top_p=self.top_p,
                top_k=self.top_k,
                max_tokens=self.max_tokens,
            ),
            use_tqdm=False,  # type: ignore
        )
        outputs = []
        for request, prompt in zip(requests, prompts):
            output = []
            for request_output in request.outputs:
                try:
                    parsed_output = self.task.parse_output(request_output.text)
                except Exception as e:
                    logger.error(f"Error parsing vLLM output: {e}")
                    parsed_output = None
                output.append(
                    LLMOutput(
                        model_name=self.model_name,
                        prompt_used=prompt,
                        raw_output=request_output.text,
                        parsed_output=parsed_output,
                    )
                )
            outputs.append(output)
        return outputs

model_name: str property

Returns the name of the vLLM model.

__init__(vllm, task, max_new_tokens=128, presence_penalty=0.0, frequency_penalty=0.0, temperature=1.0, top_p=1.0, top_k=-1, prompt_format=None, prompt_formatting_fn=None)

Initializes the vLLM class.

Parameters:

Name Type Description Default
vllm LLM

the vLLM model to be used.

required
task Task

the task to be performed by the LLM.

required
max_new_tokens int

the maximum number of tokens to be generated. Defaults to 128.

128
presence_penalty float

the presence penalty to be used for generation. Defaults to 0.0.

0.0
frequency_penalty float

the frequency penalty to be used for generation. Defaults to 0.0.

0.0
temperature float

the temperature to be used for generation. Defaults to 1.0.

1.0
top_p float

the top-p value to be used for generation. Defaults to 1.0.

1.0
top_k int

the top-k value to be used for generation. Defaults to -1.

-1
prompt_format Union[SupportedFormats, None]

the format to be used for the prompt. If None, the default format of the task will be used, available formats are openai, chatml, llama2, zephyr, and default. Defaults to None, but default (concatenation of system_prompt and formatted_prompt with a line-break) will be used if no prompt_formatting_fn is provided.

None
prompt_formatting_fn Union[Callable[..., str], None]

a function to be applied to the prompt before generation. If None, no formatting will be applied.

None

Examples:

>>> from vllm import LLM
>>> from distilabel.tasks.text_generation import TextGenerationTask as Task
>>> from distilabel.llm import vLLM
>>> model = LLM(model="gpt2")
>>> task = Task()
>>> llm = vLLM(model=model, task=task)
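A short generation sketch continuing the example above (illustrative only): the `{"input": ...}` key again assumes a `TextGenerationTask`, and the nested `LLMOutput` records mirror the ones built in `_generate`.

>>> results = llm.generate(
...     [{"input": "Explain top-p sampling in one sentence."}],
...     num_generations=1,
... )
>>> print(results[0][0]["raw_output"])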
Source code in src/distilabel/llm/vllm.py
def __init__(
    self,
    vllm: "_vLLM",
    task: "Task",
    max_new_tokens: int = 128,
    presence_penalty: float = 0.0,
    frequency_penalty: float = 0.0,
    temperature: float = 1.0,
    top_p: float = 1.0,
    top_k: int = -1,
    prompt_format: Union["SupportedFormats", None] = None,
    prompt_formatting_fn: Union[Callable[..., str], None] = None,
) -> None:
    """Initializes the vLLM class.

    Args:
        vllm (_vLLM): the vLLM model to be used.
        task (Task): the task to be performed by the LLM.
        max_new_tokens (int, optional): the maximum number of tokens to be generated.
            Defaults to 128.
        presence_penalty (float, optional): the presence penalty to be used for generation.
            Defaults to 0.0.
        frequency_penalty (float, optional): the frequency penalty to be used for generation.
            Defaults to 0.0.
        temperature (float, optional): the temperature to be used for generation.
            Defaults to 1.0.
        top_p (float, optional): the top-p value to be used for generation.
            Defaults to 1.0.
        top_k (int, optional): the top-k value to be used for generation.
            Defaults to -1.
        prompt_format (Union[SupportedFormats, None], optional): the format to be used
            for the prompt. If `None`, the default format of the task will be used, available
            formats are `openai`, `chatml`, `llama2`, `zephyr`, and `default`. Defaults to `None`,
            but `default` (concatenation of `system_prompt` and `formatted_prompt` with a line-break)
            will be used if no `prompt_formatting_fn` is provided.
        prompt_formatting_fn (Union[Callable[..., str], None], optional): a function to be
            applied to the prompt before generation. If `None`, no formatting will be applied.

    Examples:
        >>> from vllm import LLM
        >>> from distilabel.tasks.text_generation import TextGenerationTask as Task
        >>> from distilabel.llm import vLLM
        >>> model = LLM(model="gpt2")
        >>> task = Task()
        >>> llm = vLLM(model=model, task=task)
    """
    super().__init__(
        task=task,
        prompt_format=prompt_format,
        prompt_formatting_fn=prompt_formatting_fn,
    )

    if not _VLLM_AVAILABLE:
        raise ImportError(
            "`vLLM` cannot be used as `vllm` is not installed, please "
            " install it with `pip install vllm`."
        )

    self.presence_penalty = presence_penalty
    self.frequency_penalty = frequency_penalty
    self.temperature = temperature
    self.top_p = top_p
    self.top_k = top_k
    self.max_tokens = max_new_tokens

    self.vllm = vllm