llm

InferenceEndpointsLLM

Bases: LLM

Source code in src/distilabel/llm/huggingface/inference_endpoints.py
class InferenceEndpointsLLM(LLM):
    def __init__(
        self,
        endpoint_name: str,
        task: "Task",
        endpoint_namespace: Union[str, None] = None,
        token: Union[str, None] = None,
        max_new_tokens: int = 128,
        repetition_penalty: Union[float, None] = None,
        seed: Union[int, None] = None,
        do_sample: bool = False,
        temperature: Union[float, None] = None,
        top_k: Union[int, None] = None,
        top_p: Union[float, None] = None,
        typical_p: Union[float, None] = None,
        num_threads: Union[int, None] = None,
        prompt_format: Union["SupportedFormats", None] = None,
        prompt_formatting_fn: Union[Callable[..., str], None] = None,
    ) -> None:
        """Initializes the InferenceEndpointsLLM class.

        Args:
            endpoint_name (str): The name of the endpoint.
            task (Task): The task to be performed by the LLM.
            endpoint_namespace (Union[str, None]): The namespace of the endpoint. Defaults to None.
            token (Union[str, None]): The token for the endpoint. Defaults to None.
            max_new_tokens (int): The maximum number of tokens to be generated. Defaults to 128.
            repetition_penalty (Union[float, None]): The repetition penalty to be used for generation. Defaults to None.
            seed (Union[int, None]): The seed for generation. Defaults to None.
            do_sample (bool): Whether to do sampling. Defaults to False.
            temperature (Union[float, None]): The temperature for generation. Defaults to None.
            top_k (Union[int, None]): The top_k for generation. Defaults to None.
            top_p (Union[float, None]): The top_p for generation. Defaults to None.
            typical_p (Union[float, None]): The typical_p for generation. Defaults to None.
            num_threads (Union[int, None]): The number of threads. Defaults to None.
            prompt_format (Union["SupportedFormats", None]): The format of the prompt. Defaults to None.
            prompt_formatting_fn (Union[Callable[..., str], None]): The function for formatting the prompt. Defaults to None.

        Examples:
            >>> from distilabel.tasks.text_generation import TextGenerationTask as Task
            >>> from distilabel.llm import InferenceEndpointsLLM
            >>> task = Task()
            >>> llm = InferenceEndpointsLLM(
            ...     endpoint_name="<INFERENCE_ENDPOINT_NAME>",
            ...     task=task,
            ... )
        """
        super().__init__(
            task=task,
            num_threads=num_threads,
            prompt_format=prompt_format,
            prompt_formatting_fn=prompt_formatting_fn,
        )

        if not _HUGGINGFACE_HUB_AVAILABLE:
            raise ImportError(
                "`InferenceEndpointsLLM` cannot be used as `huggingface-hub` is not "
                "installed, please install it with `pip install huggingface-hub`."
            )

        self.do_sample = do_sample
        self.max_new_tokens = max_new_tokens
        self.repetition_penalty = repetition_penalty
        self.seed = seed
        self.temperature = temperature
        self.top_k = top_k
        self.top_p = top_p
        self.typical_p = typical_p

        self.inference_endpoint = get_inference_endpoint(
            name=endpoint_name, namespace=endpoint_namespace, token=token
        )
        self.inference_endpoint.wait(timeout=30)

    def __rich_repr__(self) -> Generator[Any, None, None]:
        yield from super().__rich_repr__()
        yield (
            "parameters",
            {
                "do_sample": self.do_sample,
                "max_new_tokens": self.max_new_tokens,
                "repetition_penalty": self.repetition_penalty,
                "seed": self.seed,
                "temperature": self.temperature,
                "top_k": self.top_k,
                "top_p": self.top_p,
                "typical_p": self.typical_p,
            },
        )

    @property
    def model_name(self) -> str:
        """Returns the model name of the endpoint."""
        return self.inference_endpoint.repository

    @retry(
        retry=retry_if_exception_type(_INFERENCE_ENDPOINTS_API_RETRY_ON_EXCEPTIONS),
        stop=stop_after_attempt(_INFERENCE_ENDPOINTS_API_STOP_AFTER_ATTEMPT),
        wait=wait_random_exponential(
            multiplier=_INFERENCE_ENDPOINTS_API_WAIT_RANDOM_EXPONENTIAL_MULTIPLIER,
            max=_INFERENCE_ENDPOINTS_API_WAIT_RANDOM_EXPONENTIAL_MAX,
        ),
        before_sleep=before_sleep_log(logger, logging.INFO),
        after=after_log(logger, logging.INFO),
    )
    def _text_generation_with_backoff(self, **kwargs: Any) -> Any:
        """Performs text generation with backoff in case of an error."""
        return self.inference_endpoint.client.text_generation(**kwargs)  # type: ignore

    def _generate(
        self, inputs: List[Dict[str, Any]], num_generations: int = 1
    ) -> List[List[LLMOutput]]:
        """Generates `num_generations` for each input in `inputs`.

        Args:
            inputs (List[Dict[str, Any]]): the inputs to be used for generation.
            num_generations (int, optional): the number of generations to be performed for each
                input. Defaults to 1.

        Returns:
            List[List[LLMOutput]]: the outputs of the LLM.
        """
        prompts = self._generate_prompts(
            inputs, default_format=None, expected_output_type=str
        )
        outputs = []
        for prompt in prompts:
            raw_responses = [
                self._text_generation_with_backoff(
                    prompt=prompt,
                    do_sample=self.do_sample,
                    max_new_tokens=self.max_new_tokens,
                    repetition_penalty=self.repetition_penalty,
                    seed=self.seed,
                    temperature=self.temperature,
                    top_k=self.top_k,
                    top_p=self.top_p,
                    typical_p=self.typical_p,
                )
                for _ in range(num_generations)
            ]
            output = []
            for raw_response in raw_responses:
                try:
                    parsed_response = self.task.parse_output(raw_response)
                except Exception as e:
                    logger.error(f"Error parsing Inference Endpoints output: {e}")
                    parsed_response = None
                output.append(
                    LLMOutput(
                        model_name=self.model_name,
                        prompt_used=prompt,
                        raw_output=raw_response,
                        parsed_output=parsed_response,
                    )
                )
            outputs.append(output)
        return outputs

model_name: str property

Returns the model name of the endpoint.

__init__(endpoint_name, task, endpoint_namespace=None, token=None, max_new_tokens=128, repetition_penalty=None, seed=None, do_sample=False, temperature=None, top_k=None, top_p=None, typical_p=None, num_threads=None, prompt_format=None, prompt_formatting_fn=None)

Initializes the InferenceEndpointsLLM class.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| endpoint_name | str | The name of the endpoint. | required |
| task | Task | The task to be performed by the LLM. | required |
| endpoint_namespace | Union[str, None] | The namespace of the endpoint. | None |
| token | Union[str, None] | The token for the endpoint. | None |
| max_new_tokens | int | The maximum number of tokens to be generated. | 128 |
| repetition_penalty | Union[float, None] | The repetition penalty to be used for generation. | None |
| seed | Union[int, None] | The seed for generation. | None |
| do_sample | bool | Whether to do sampling. | False |
| temperature | Union[float, None] | The temperature for generation. | None |
| top_k | Union[int, None] | The top_k for generation. | None |
| top_p | Union[float, None] | The top_p for generation. | None |
| typical_p | Union[float, None] | The typical_p for generation. | None |
| num_threads | Union[int, None] | The number of threads. | None |
| prompt_format | Union[SupportedFormats, None] | The format of the prompt. | None |
| prompt_formatting_fn | Union[Callable[..., str], None] | The function for formatting the prompt. | None |

Examples:

>>> from distilabel.tasks.text_generation import TextGenerationTask as Task
>>> from distilabel.llm import InferenceEndpointsLLM
>>> task = Task()
>>> llm = InferenceEndpointsLLM(
...     endpoint_name="<INFERENCE_ENDPOINT_NAME>",
...     task=task,
... )
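
For completeness, here is a hedged sketch of a full generation round-trip with the endpoint configured above. It assumes that the base `LLM` class exposes a public `generate()` wrapper around the `_generate()` method shown in the source, that `TextGenerationTask` reads an `input` key from each input dict, and that `LLMOutput` behaves like a dict; none of these details are confirmed on this page.

>>> # Hypothetical usage sketch; method and key names are assumptions.
>>> inputs = [{"input": "Write a short poem about synthetic data."}]
>>> results = llm.generate(inputs, num_generations=2)  # List[List[LLMOutput]]
>>> for generations in results:
...     for generation in generations:
...         # Each LLMOutput carries the model name, the prompt used, the raw
...         # endpoint response, and the task-parsed output (or None on failure).
...         print(generation["model_name"], generation["parsed_output"])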

LlamaCppLLM

Bases: LLM

Source code in src/distilabel/llm/llama_cpp.py
class LlamaCppLLM(LLM):
    def __init__(
        self,
        model: "Llama",
        task: "Task",
        max_new_tokens: int = 128,
        temperature: float = 0.8,
        top_p: float = 0.95,
        top_k: int = 40,
        repeat_penalty: float = 1.1,
        prompt_format: Union[SupportedFormats, None] = None,
        prompt_formatting_fn: Union[Callable[..., str], None] = None,
    ) -> None:
        """Initializes the LlamaCppLLM class.

        Args:
            model (Llama): the llama-cpp model to be used.
            task (Task): the task to be performed by the LLM.
            max_new_tokens (int, optional): the maximum number of tokens to be generated.
                Defaults to 128.
            temperature (float, optional): the temperature to be used for generation.
                Defaults to 0.8.
            top_p (float, optional): the top-p value to be used for generation.
                Defaults to 0.95.
            top_k (int, optional): the top-k value to be used for generation.
                Defaults to 40.
            repeat_penalty (float, optional): the repeat penalty to be used for generation.
                Defaults to 1.1.
            prompt_format (Union[SupportedFormats, None], optional): the format to be used
                for the prompt. If `None`, the default format of the task will be used, available
                formats are `openai`, `chatml`, `llama2`, `zephyr`, and `default`. Defaults to `None`,
                but `default` (concatenation of `system_prompt` and `formatted_prompt` with a line-break)
                will be used if no `prompt_formatting_fn` is provided.
            prompt_formatting_fn (Union[Callable[..., str], None], optional): a function to be
                applied to the prompt before generation. If `None`, no formatting will be applied.
                Defaults to `None`.

        Examples:
            >>> from llama_cpp import Llama
            >>> from distilabel.tasks.text_generation import TextGenerationTask as Task
            >>> from distilabel.llm import LlamaCppLLM
            >>> model = Llama(model_path="path/to/model")
            >>> task = Task()
            >>> llm = LlamaCppLLM(model=model, task=task)
        """
        super().__init__(
            task=task,
            prompt_format=prompt_format,
            prompt_formatting_fn=prompt_formatting_fn,
        )

        if not _LLAMA_CPP_AVAILABLE:
            raise ImportError(
                "`LlamaCppLLM` cannot be used as `llama_cpp` is not installed, please "
                " install it with `pip install llama-cpp-python`."
            )

        self.max_tokens = max_new_tokens
        self.temperature = temperature
        self.top_p = top_p
        self.top_k = top_k
        self.repeat_penalty = repeat_penalty

        self.model = model

    def __rich_repr__(self) -> Generator[Any, None, None]:
        yield from super().__rich_repr__()
        yield (
            "parameters",
            {
                "max_new_tokens": self.max_tokens,
                "temperature": self.temperature,
                "top_p": self.top_p,
                "top_k": self.top_k,
                "repeat_penalty": self.repeat_penalty,
            },
        )

    @property
    def model_name(self) -> str:
        """Returns the name of the llama-cpp model, which is the same as the model path."""
        return self.model.model_path

    def _generate(
        self, inputs: List[Dict[str, Any]], num_generations: int = 1
    ) -> List[List[LLMOutput]]:
        """Generates `num_generations` for each input in `inputs`.

        Args:
            inputs (List[Dict[str, Any]]): the inputs to be used for generation.
            num_generations (int, optional): the number of generations to be performed for each
                input. Defaults to 1.

        Returns:
            List[List[LLMOutput]]: the generated outputs.
        """
        prompts = self._generate_prompts(
            inputs, default_format=None, expected_output_type=str
        )
        outputs = []
        for prompt in prompts:
            output = []
            for _ in range(num_generations):
                raw_output = self.model.create_completion(
                    prompt,
                    max_tokens=self.max_tokens,
                    temperature=self.temperature,
                    top_p=self.top_p,
                    top_k=self.top_k,
                    repeat_penalty=self.repeat_penalty,
                )
                try:
                    parsed_output = self.task.parse_output(
                        raw_output["choices"][0]["text"].strip()
                    )
                except Exception as e:
                    logger.error(f"Error parsing llama-cpp output: {e}")
                    parsed_output = None
                output.append(
                    LLMOutput(
                        model_name=self.model_name,
                        prompt_used=prompt,
                        raw_output=raw_output,
                        parsed_output=parsed_output,
                    )
                )
            outputs.append(output)
        return outputs

model_name: str property

Returns the name of the llama-cpp model, which is the same as the model path.

__init__(model, task, max_new_tokens=128, temperature=0.8, top_p=0.95, top_k=40, repeat_penalty=1.1, prompt_format=None, prompt_formatting_fn=None)

Initializes the LlamaCppLLM class.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| model | Llama | the llama-cpp model to be used. | required |
| task | Task | the task to be performed by the LLM. | required |
| max_new_tokens | int | the maximum number of tokens to be generated. | 128 |
| temperature | float | the temperature to be used for generation. | 0.8 |
| top_p | float | the top-p value to be used for generation. | 0.95 |
| top_k | int | the top-k value to be used for generation. | 40 |
| repeat_penalty | float | the repeat penalty to be used for generation. | 1.1 |
| prompt_format | Union[SupportedFormats, None] | the format to be used for the prompt. If None, the default format of the task will be used; available formats are openai, chatml, llama2, zephyr, and default (concatenation of system_prompt and formatted_prompt with a line-break), the latter being used if no prompt_formatting_fn is provided. | None |
| prompt_formatting_fn | Union[Callable[..., str], None] | a function to be applied to the prompt before generation. If None, no formatting will be applied. | None |

Examples:

>>> from llama_cpp import Llama
>>> from distilabel.tasks.text_generation import TextGenerationTask as Task
>>> from distilabel.llm import LlamaCppLLM
>>> model = Llama(model_path="path/to/model")
>>> task = Task()
>>> llm = LlamaCppLLM(model=model, task=task)
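
Because the wrapper forwards `max_tokens`, `temperature`, `top_p`, `top_k`, and `repeat_penalty` straight to `Llama.create_completion`, settings such as the context window and GPU offloading belong on the `Llama` object itself. A minimal sketch, assuming llama-cpp-python's `n_ctx` and `n_gpu_layers` options and a placeholder GGUF path:

>>> from llama_cpp import Llama
>>> from distilabel.tasks.text_generation import TextGenerationTask as Task
>>> from distilabel.llm import LlamaCppLLM
>>> # n_ctx and n_gpu_layers are llama-cpp-python options, not distilabel ones;
>>> # the model path is a placeholder.
>>> model = Llama(model_path="path/to/model.gguf", n_ctx=2048, n_gpu_layers=-1)
>>> task = Task()
>>> llm = LlamaCppLLM(model=model, task=task, max_new_tokens=256, temperature=0.7)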

OpenAILLM

Bases: LLM

Source code in src/distilabel/llm/openai.py
class OpenAILLM(LLM):
    def __init__(
        self,
        task: "Task",
        model: str = "gpt-3.5-turbo",
        client: Union["OpenAI", None] = None,
        openai_api_key: Union[str, None] = None,
        max_new_tokens: int = 128,
        frequency_penalty: float = 0.0,
        presence_penalty: float = 0.0,
        temperature: float = 1.0,
        top_p: float = 1.0,
        num_threads: Union[int, None] = None,
        prompt_format: Union["SupportedFormats", None] = None,
        prompt_formatting_fn: Union[Callable[..., str], None] = None,
    ) -> None:
        """Initializes the OpenAILLM class.

        Args:
            task (Task): the task to be performed by the LLM.
            model (str, optional): the model to be used for generation. Defaults to "gpt-3.5-turbo".
            client (Union[OpenAI, None], optional): an OpenAI client to be used for generation.
                If `None`, a new client will be created. Defaults to `None`.
            openai_api_key (Union[str, None], optional): the OpenAI API key to be used for generation.
                If `None`, the `OPENAI_API_KEY` environment variable will be used. Defaults to `None`.
            max_new_tokens (int, optional): the maximum number of tokens to be generated.
                Defaults to 128.
            frequency_penalty (float, optional): the frequency penalty to be used for generation.
                Defaults to 0.0.
            presence_penalty (float, optional): the presence penalty to be used for generation.
                Defaults to 0.0.
            temperature (float, optional): the temperature to be used for generation.
                Defaults to 1.0.
            top_p (float, optional): the top-p value to be used for generation.
                Defaults to 1.0.
            num_threads (Union[int, None], optional): the number of threads to be used
                for parallel generation. If `None`, no parallel generation will be performed.
                Defaults to `None`.
            prompt_format (Union[SupportedFormats, None], optional): the format to be used
                for the prompt. If `None`, the default format of the task will be used, available
                formats are `openai`, `chatml`, `llama2`, `zephyr`, and `default`. Defaults to `None`,
                but `default` (concatenation of `system_prompt` and `formatted_prompt` with a line-break)
                will be used if no `prompt_formatting_fn` is provided.
            prompt_formatting_fn (Union[Callable[..., str], None], optional): a function to be
                applied to the prompt before generation. If `None`, no formatting will be applied.
                Defaults to `None`.

        Raises:
            AssertionError: if the provided `model` is not available in your OpenAI account.

        Examples:
            >>> from distilabel.tasks.text_generation import TextGenerationTask as Task
            >>> from distilabel.llm import OpenAILLM
            >>> task = Task()
            >>> llm = OpenAILLM(model="gpt-3.5-turbo", task=task)
        """
        super().__init__(
            task=task,
            num_threads=num_threads,
            prompt_format=prompt_format,
            prompt_formatting_fn=prompt_formatting_fn,
        )

        if not _OPENAI_AVAILABLE:
            raise ImportError(
                "`OpenAILLM` cannot be used as `openai` is not installed, please "
                " install it with `pip install openai`."
            )

        self.max_tokens = max_new_tokens
        self.frequency_penalty = frequency_penalty
        self.presence_penalty = presence_penalty
        self.temperature = temperature
        self.top_p = top_p

        self.client = client or OpenAI(api_key=openai_api_key, max_retries=6)

        assert (
            model in self.available_models
        ), f"Provided `model` is not available in your OpenAI account, available models are {self.available_models}"
        self.model = model

    def __rich_repr__(self) -> Generator[Any, None, None]:
        yield from super().__rich_repr__()
        yield (
            "parameters",
            {
                "max_tokens": self.max_tokens,
                "frequency_penalty": self.frequency_penalty,
                "presence_penalty": self.presence_penalty,
                "temperature": self.temperature,
                "top_p": self.top_p,
            },
        )

    @cached_property
    def available_models(self) -> List[str]:
        """Returns the list of available models in your OpenAI account."""
        return [model.id for model in self.client.models.list().data]

    @property
    def model_name(self) -> str:
        """Returns the name of the OpenAI model."""
        return self.model

    def _generate(
        self,
        inputs: List[Dict[str, Any]],
        num_generations: int = 1,
    ) -> List[List[LLMOutput]]:
        """Generates `num_generations` for each input in `inputs`.

        Args:
            inputs (List[Dict[str, Any]]): the inputs to be used for generation.
            num_generations (int, optional): the number of generations to be performed for each
                input. Defaults to 1.

        Returns:
            List[List[LLMOutput]]: the generated outputs.
        """
        prompts = self._generate_prompts(
            inputs, default_format="openai", expected_output_type=list
        )
        outputs = []
        for prompt in prompts:
            chat_completions = self.client.chat.completions.create(
                messages=prompt,
                model=self.model,
                n=num_generations,
                max_tokens=self.max_tokens,
                frequency_penalty=self.frequency_penalty,
                presence_penalty=self.presence_penalty,
                temperature=self.temperature,
                top_p=self.top_p,
                timeout=50,
            )

            output = []
            for chat_completion in chat_completions.choices:
                try:
                    parsed_response = self.task.parse_output(
                        chat_completion.message.content.strip()
                    )
                except Exception as e:
                    logger.error(f"Error parsing OpenAI response: {e}")
                    parsed_response = None
                output.append(
                    LLMOutput(
                        model_name=self.model_name,
                        prompt_used=prompt,
                        raw_output=chat_completion.message.content,
                        parsed_output=parsed_response,
                    )
                )
            outputs.append(output)
        return outputs

available_models: List[str] cached property

Returns the list of available models in your OpenAI account.

model_name: str property

Returns the name of the OpenAI model.

__init__(task, model='gpt-3.5-turbo', client=None, openai_api_key=None, max_new_tokens=128, frequency_penalty=0.0, presence_penalty=0.0, temperature=1.0, top_p=1.0, num_threads=None, prompt_format=None, prompt_formatting_fn=None)

Initializes the OpenAILLM class.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| task | Task | the task to be performed by the LLM. | required |
| model | str | the model to be used for generation. | 'gpt-3.5-turbo' |
| client | Union[OpenAI, None] | an OpenAI client to be used for generation. If None, a new client will be created. | None |
| openai_api_key | Union[str, None] | the OpenAI API key to be used for generation. If None, the OPENAI_API_KEY environment variable will be used. | None |
| max_new_tokens | int | the maximum number of tokens to be generated. | 128 |
| frequency_penalty | float | the frequency penalty to be used for generation. | 0.0 |
| presence_penalty | float | the presence penalty to be used for generation. | 0.0 |
| temperature | float | the temperature to be used for generation. | 1.0 |
| top_p | float | the top-p value to be used for generation. | 1.0 |
| num_threads | Union[int, None] | the number of threads to be used for parallel generation. If None, no parallel generation will be performed. | None |
| prompt_format | Union[SupportedFormats, None] | the format to be used for the prompt. If None, the default format of the task will be used; available formats are openai, chatml, llama2, zephyr, and default (concatenation of system_prompt and formatted_prompt with a line-break), the latter being used if no prompt_formatting_fn is provided. | None |
| prompt_formatting_fn | Union[Callable[..., str], None] | a function to be applied to the prompt before generation. If None, no formatting will be applied. | None |

Raises:

| Type | Description |
|---|---|
| AssertionError | if the provided model is not available in your OpenAI account. |

Examples:

>>> from distilabel.tasks.text_generation import TextGenerationTask as Task
>>> from distilabel.llm import OpenAILLM
>>> task = Task()
>>> llm = OpenAILLM(model="gpt-3.5-turbo", task=task)
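
If you need control over the underlying client (retries, timeouts, base URL), you can build it yourself and pass it via `client`; otherwise the class creates one from `openai_api_key` or the `OPENAI_API_KEY` environment variable. A minimal sketch, assuming the standard `openai` v1 client used in the source above:

>>> import os
>>> from openai import OpenAI
>>> from distilabel.tasks.text_generation import TextGenerationTask as Task
>>> from distilabel.llm import OpenAILLM
>>> # Reuse a pre-configured client instead of letting OpenAILLM build one.
>>> client = OpenAI(api_key=os.environ["OPENAI_API_KEY"], max_retries=3)
>>> task = Task()
>>> llm = OpenAILLM(
...     task=task,
...     model="gpt-3.5-turbo",
...     client=client,
...     max_new_tokens=256,
...     temperature=0.7,
... )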

TransformersLLM

Bases: LLM

Source code in src/distilabel/llm/huggingface/transformers.py
class TransformersLLM(LLM):
    def __init__(
        self,
        model: "PreTrainedModel",
        tokenizer: "PreTrainedTokenizer",
        task: "Task",
        max_new_tokens: int = 128,
        do_sample: bool = False,
        temperature: float = 1.0,
        top_k: int = 50,
        top_p: float = 1.0,
        typical_p: float = 1.0,
        num_threads: Union[int, None] = None,
        prompt_format: Union["SupportedFormats", None] = None,
        prompt_formatting_fn: Union[Callable[..., str], None] = None,
    ) -> None:
        """Initializes the TransformersLLM class.

        Args:
            model (PreTrainedModel): the model to be used for generation.
            tokenizer (PreTrainedTokenizer): the tokenizer to be used for generation.
            task (Task): the task to be performed by the LLM.
            max_new_tokens (int, optional): the maximum number of tokens to be generated.
                Defaults to 128.
            do_sample (bool, optional): whether to sample from the model or not.
                Defaults to False.
            temperature (float, optional): the temperature to be used for generation.
                Defaults to 1.0.
            top_k (int, optional): the top-k value to be used for generation.
                Defaults to 50.
            top_p (float, optional): the top-p value to be used for generation.
                Defaults to 1.0.
            typical_p (float, optional): the typical-p value to be used for generation.
                Defaults to 1.0.
            num_threads (Union[int, None], optional): the number of threads to be used for generation.
                If `None`, the number of threads will be set to the number of available CPUs.
                Defaults to `None`.
            prompt_format (Union[SupportedFormats, None], optional): the format to be used
                for formatting the prompts. If `None`, the prompts will not be formatted.
                Defaults to `None`.
            prompt_formatting_fn (Union[Callable[..., str], None], optional): the function to be used
                for formatting the prompts. If `None`, the prompts will not be formatted.

        Examples:
            >>> from transformers import AutoModelForCausalLM, AutoTokenizer
            >>> from distilabel.tasks.text_generation import TextGenerationTask as Task
            >>> from distilabel.llm import TransformersLLM
            >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
            >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
            >>> task = Task()
            >>> llm = TransformersLLM(
            ...     model=model,
            ...     tokenizer=tokenizer,
            ...     task=task,
            ... )
        """
        super().__init__(
            task=task,
            num_threads=num_threads,
            prompt_format=prompt_format,
            prompt_formatting_fn=prompt_formatting_fn,
        )

        self.max_new_tokens = max_new_tokens
        self.do_sample = do_sample
        self.temperature = temperature
        self.top_k = top_k
        self.top_p = top_p
        self.typical_p = typical_p

        self.model = model
        if self.device != "cpu":
            self.model.to(self.device)

        self.tokenizer = tokenizer
        self.tokenizer.padding_side = "left"
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        if (
            hasattr(self.tokenizer, "use_default_system_prompt")
            and self.tokenizer.use_default_system_prompt  # type: ignore
        ):
            # The `tokenizer` also has a method named `apply_chat_template` that expects a `Conversation` as OpenAI does with the ChatML format
            warnings.warn(
                "The provided `tokenizer` has `use_default_system_prompt=True` which means that the default system prompt will be used, which may collide with the `task` provided as an arg to this class.",
                UserWarning,
                stacklevel=2,
            )

    def __rich_repr__(self) -> Generator[Any, None, None]:
        yield from super().__rich_repr__()
        yield (
            "parameters",
            {
                "max_new_tokens": self.max_new_tokens,
                "do_sample": self.do_sample,
                "temperature": self.temperature,
                "top_k": self.top_k,
                "top_p": self.top_p,
                "typical_p": self.typical_p,
            },
        )

    @property
    def model_name(self) -> str:
        """Returns the name of the Transformers model."""
        return self.model.config.name_or_path

    @cached_property
    def device(self) -> "device":
        """Returns the device to be used for generation."""
        if torch.cuda.is_available():
            return torch.device("cuda")
        if torch.backends.mps.is_available() and torch.backends.mps.is_built():  # type: ignore
            return torch.device("mps")
        return torch.device("cpu")

    def _generate(
        self, inputs: List[Dict[str, Any]], num_generations: int = 1
    ) -> List[List[LLMOutput]]:
        """Generates `num_generations` for each input in `inputs`.

        Args:
            inputs (List[Dict[str, Any]]): the inputs to be used for generation.
            num_generations (int, optional): the number of generations to be performed for each
                input. Defaults to 1.

        Returns:
            List[List[LLMOutput]]: the outputs of the LLM.
        """
        prompts = self._generate_prompts(
            inputs, default_format=None, expected_output_type=str
        )
        encodings = self.tokenizer(prompts, padding=True, return_tensors="pt")
        if self.device != "cpu":
            encodings = encodings.to(self.device)
        with torch.inference_mode():
            generated_ids = self.model.generate(
                **encodings,  # type: ignore
                pad_token_id=self.tokenizer.eos_token_id,
                generation_config=GenerationConfig(
                    do_sample=self.do_sample,
                    temperature=self.temperature,
                    max_new_tokens=self.max_new_tokens,
                    top_k=self.top_k,
                    top_p=self.top_p,
                    typical_p=self.typical_p,
                    num_return_sequences=num_generations,
                ),
            )
        raw_outputs = self.tokenizer.batch_decode(
            generated_ids[:, encodings.input_ids.shape[1] :],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        outputs = []
        for prompt, i in zip(prompts, range(0, len(raw_outputs), num_generations)):
            output = []
            for raw_output in raw_outputs[i : i + num_generations]:
                try:
                    parsed_output = self.task.parse_output(raw_output)
                except Exception as e:
                    logger.error(f"Error parsing Transformers output: {e}")
                    parsed_output = None
                output.append(
                    LLMOutput(
                        model_name=self.model_name,
                        prompt_used=prompt,
                        raw_output=raw_output,
                        parsed_output=parsed_output,
                    )
                )
            outputs.append(output)
        return outputs

device: device cached property

Returns the device to be used for generation.

model_name: str property

Returns the name of the Transformers model.

__init__(model, tokenizer, task, max_new_tokens=128, do_sample=False, temperature=1.0, top_k=50, top_p=1.0, typical_p=1.0, num_threads=None, prompt_format=None, prompt_formatting_fn=None)

Initializes the TransformersLLM class.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| model | PreTrainedModel | the model to be used for generation. | required |
| tokenizer | PreTrainedTokenizer | the tokenizer to be used for generation. | required |
| task | Task | the task to be performed by the LLM. | required |
| max_new_tokens | int | the maximum number of tokens to be generated. | 128 |
| do_sample | bool | whether to sample from the model or not. | False |
| temperature | float | the temperature to be used for generation. | 1.0 |
| top_k | int | the top-k value to be used for generation. | 50 |
| top_p | float | the top-p value to be used for generation. | 1.0 |
| typical_p | float | the typical-p value to be used for generation. | 1.0 |
| num_threads | Union[int, None] | the number of threads to be used for generation. If None, the number of threads will be set to the number of available CPUs. | None |
| prompt_format | Union[SupportedFormats, None] | the format to be used for formatting the prompts. If None, the prompts will not be formatted. | None |
| prompt_formatting_fn | Union[Callable[..., str], None] | the function to be used for formatting the prompts. If None, the prompts will not be formatted. | None |

Examples:

>>> from transformers import AutoModelForCausalLM, AutoTokenizer
>>> from distilabel.tasks.text_generation import TextGenerationTask as Task
>>> from distilabel.llm import TransformersLLM
>>> model = AutoModelForCausalLM.from_pretrained("gpt2")
>>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
>>> task = Task()
>>> llm = TransformersLLM(
...     model=model,
...     tokenizer=tokenizer,
...     task=task,
... )
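
The class moves the model to CUDA or MPS automatically when available (see the `device` property), so no manual `.to(...)` call is needed. The sketch below, with `gpt2` only as a placeholder checkpoint, enables sampling; note that in `transformers` the `temperature`, `top_p`, and `typical_p` values only take effect when `do_sample=True`, otherwise decoding is greedy.

>>> from transformers import AutoModelForCausalLM, AutoTokenizer
>>> from distilabel.tasks.text_generation import TextGenerationTask as Task
>>> from distilabel.llm import TransformersLLM
>>> model = AutoModelForCausalLM.from_pretrained("gpt2")
>>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
>>> task = Task()
>>> # Sampling parameters are passed through to GenerationConfig in _generate.
>>> llm = TransformersLLM(
...     model=model,
...     tokenizer=tokenizer,
...     task=task,
...     do_sample=True,
...     temperature=0.7,
...     top_p=0.9,
...     max_new_tokens=256,
... )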

vLLM

Bases: LLM

Source code in src/distilabel/llm/vllm.py
class vLLM(LLM):
    def __init__(
        self,
        vllm: "_vLLM",
        task: "Task",
        max_new_tokens: int = 128,
        presence_penalty: float = 0.0,
        frequency_penalty: float = 0.0,
        temperature: float = 1.0,
        top_p: float = 1.0,
        top_k: int = -1,
        prompt_format: Union["SupportedFormats", None] = None,
        prompt_formatting_fn: Union[Callable[..., str], None] = None,
    ) -> None:
        """Initializes the vLLM class.

        Args:
            vllm (_vLLM): the vLLM model to be used.
            task (Task): the task to be performed by the LLM.
            max_new_tokens (int, optional): the maximum number of tokens to be generated.
                Defaults to 128.
            presence_penalty (float, optional): the presence penalty to be used for generation.
                Defaults to 0.0.
            frequency_penalty (float, optional): the frequency penalty to be used for generation.
                Defaults to 0.0.
            temperature (float, optional): the temperature to be used for generation.
                Defaults to 1.0.
            top_p (float, optional): the top-p value to be used for generation.
                Defaults to 1.0.
            top_k (int, optional): the top-k value to be used for generation.
                Defaults to -1.
            prompt_format (Union[SupportedFormats, None], optional): the format to be used
                for the prompt. If `None`, the default format of the task will be used, available
                formats are `openai`, `chatml`, `llama2`, `zephyr`, and `default`. Defaults to `None`,
                but `default` (concatenation of `system_prompt` and `formatted_prompt` with a line-break)
                will be used if no `prompt_formatting_fn` is provided.
            prompt_formatting_fn (Union[Callable[..., str], None], optional): a function to be
                applied to the prompt before generation. If `None`, no formatting will be applied.

        Examples:
            >>> from vllm import LLM
            >>> from distilabel.tasks.text_generation import TextGenerationTask as Task
            >>> from distilabel.llm import vLLM
            >>> model = LLM(model="gpt2")
            >>> task = Task()
            >>> llm = vLLM(vllm=model, task=task)
        """
        super().__init__(
            task=task,
            prompt_format=prompt_format,
            prompt_formatting_fn=prompt_formatting_fn,
        )

        if not _VLLM_AVAILABLE:
            raise ImportError(
                "`vLLM` cannot be used as `vllm` is not installed, please "
                " install it with `pip install vllm`."
            )

        self.presence_penalty = presence_penalty
        self.frequency_penalty = frequency_penalty
        self.temperature = temperature
        self.top_p = top_p
        self.top_k = top_k
        self.max_tokens = max_new_tokens

        self.vllm = vllm

    def __rich_repr__(self) -> Generator[Any, None, None]:
        yield from super().__rich_repr__()
        yield (
            "parameters",
            {
                "max_tokens": self.max_tokens,
                "presence_penalty": self.presence_penalty,
                "frequency_penalty": self.frequency_penalty,
                "temperature": self.temperature,
                "top_p": self.top_p,
                "top_k": self.top_k,
            },
        )

    @property
    def model_name(self) -> str:
        """Returns the name of the vLLM model."""
        return self.vllm.llm_engine.model_config.model  # type: ignore

    def _generate(
        self, inputs: List[Dict[str, Any]], num_generations: int = 1
    ) -> List[List[LLMOutput]]:
        """Generates `num_generations` for each input in `inputs`.

        Args:
            inputs (List[Dict[str, Any]]): the inputs to be used for generation.
            num_generations (int, optional): the number of generations to be performed for each
                input. Defaults to 1.

        Returns:
            List[List[LLMOutput]]: the outputs of the LLM.
        """
        prompts = self._generate_prompts(
            inputs, default_format=None, expected_output_type=str
        )
        requests = self.vllm.generate(
            prompts,
            SamplingParams(  # type: ignore
                n=num_generations,
                presence_penalty=self.presence_penalty,
                frequency_penalty=self.frequency_penalty,
                temperature=self.temperature,
                top_p=self.top_p,
                top_k=self.top_k,
                max_tokens=self.max_tokens,
            ),
            use_tqdm=False,  # type: ignore
        )
        outputs = []
        for request, prompt in zip(requests, prompts):
            output = []
            for request_output in request.outputs:
                try:
                    parsed_output = self.task.parse_output(request_output.text)
                except Exception as e:
                    logger.error(f"Error parsing vLLM output: {e}")
                    parsed_output = None
                output.append(
                    LLMOutput(
                        model_name=self.model_name,
                        prompt_used=prompt,
                        raw_output=request_output.text,
                        parsed_output=parsed_output,
                    )
                )
            outputs.append(output)
        return outputs

model_name: str property

Returns the name of the vLLM model.

__init__(vllm, task, max_new_tokens=128, presence_penalty=0.0, frequency_penalty=0.0, temperature=1.0, top_p=1.0, top_k=-1, prompt_format=None, prompt_formatting_fn=None)

Initializes the vLLM class.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| vllm | LLM | the vLLM model to be used. | required |
| task | Task | the task to be performed by the LLM. | required |
| max_new_tokens | int | the maximum number of tokens to be generated. | 128 |
| presence_penalty | float | the presence penalty to be used for generation. | 0.0 |
| frequency_penalty | float | the frequency penalty to be used for generation. | 0.0 |
| temperature | float | the temperature to be used for generation. | 1.0 |
| top_p | float | the top-p value to be used for generation. | 1.0 |
| top_k | int | the top-k value to be used for generation. | -1 |
| prompt_format | Union[SupportedFormats, None] | the format to be used for the prompt. If None, the default format of the task will be used; available formats are openai, chatml, llama2, zephyr, and default (concatenation of system_prompt and formatted_prompt with a line-break), the latter being used if no prompt_formatting_fn is provided. | None |
| prompt_formatting_fn | Union[Callable[..., str], None] | a function to be applied to the prompt before generation. If None, no formatting will be applied. | None |

Examples:

>>> from vllm import LLM
>>> from distilabel.tasks.text_generation import TextGenerationTask as Task
>>> from distilabel.llm import vLLM
>>> model = LLM(model="gpt2")
>>> task = Task()
>>> llm = vLLM(vllm=model, task=task)
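
The generation parameters map directly onto `vllm.SamplingParams` in `_generate`, with `num_generations` becoming `n` and `max_new_tokens` becoming `max_tokens`. A minimal sketch, with `gpt2` as a placeholder model name:

>>> from vllm import LLM
>>> from distilabel.tasks.text_generation import TextGenerationTask as Task
>>> from distilabel.llm import vLLM
>>> model = LLM(model="gpt2")
>>> task = Task()
>>> # top_k=-1 leaves top-k filtering disabled (the class default used here).
>>> llm = vLLM(
...     vllm=model,
...     task=task,
...     temperature=0.7,
...     top_p=0.9,
...     top_k=-1,
...     max_new_tokens=256,
... )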