together

TogetherInferenceLLM

Bases: LLM

Source code in src/distilabel/llm/together.py
class TogetherInferenceLLM(LLM):
    def __init__(
        self,
        model: str,
        task: "Task",
        api_key: Union[str, None] = None,
        max_new_tokens: int = 128,
        repetition_penalty: float = 1.0,
        temperature: float = 1.0,
        top_p: float = 1.0,
        top_k: int = 1,
        stop: Union[List[str], None] = None,
        logprobs: int = 0,
        num_threads: Union[int, None] = None,
        prompt_format: Union["SupportedFormats", None] = None,
        prompt_formatting_fn: Union[Callable[..., str], None] = None,
    ) -> None:
        """Initializes the TogetherInferenceLLM class.

        Args:
            model (str): the model to be used for generation.
            task (Task): the task to be performed by the LLM.
            api_key (Union[str, None], optional): the Together API key to be used for
                authentication. If `None`, the `TOGETHER_API_KEY` environment variable
                will be used instead. Defaults to `None`.
            max_new_tokens (int, optional): the maximum number of tokens to be generated.
                Defaults to 128.
            temperature (float, optional): the temperature to be used for generation. From the Together
                Inference docs: "A decimal number that determines the degree of randomness in the response.
                A value of 0 will always yield the same output. A temperature much less than 1 favors more
                correctness and is appropriate for question answering or summarization. A value approaching
                1 introduces more randomness in the output.". Defaults to 1.0.
            repetition_penalty (float, optional): the repetition penalty to be used for generation. From the
                Together Inference docs: "Controls the diversity of generated text by reducing the likelihood
                of repeated sequences. Higher values decrease repetition.". Defaults to 1.0.
            top_p (float, optional): the top-p value to be used for generation. From the Together
                Inference docs: "used to dynamically adjust the number of choices for each predicted
                token based on the cumulative probabilities. It specifies a probability threshold,
                below which all less likely tokens are filtered out. This technique helps to maintain
                diversity and generate more fluent and natural-sounding text.". Defaults to 1.0.
            top_k (int, optional): the top-k value to be used for generation. From the Together Inference
                docs: "used to limit the number of choices for the next predicted word or token. It specifies
                the maximum number of tokens to consider at each step, based on their probability of occurrence.
                This technique helps to speed up the generation process and can improve the quality of the
                generated text by focusing on the most likely options.". Defaults to 1.
            stop (List[str], optional): strings that delimit the generation process, so that when the
                model generates any of the provided strings, the generation process is considered completed.
                Defaults to None.
            logprobs (int, optional): the number of logprobs to be returned for each token. From the
                Together Inference docs: "An integer that specifies how many top token log probabilities
                are included in the response for each token generation step.". Defaults to 0.
            num_threads (Union[int, None], optional): the number of threads to be used
                for parallel generation. If `None`, no parallel generation will be performed.
                Defaults to `None`.
            prompt_format (Union[SupportedFormats, None], optional): the format to be used
                for the prompt. If `None`, the default format of the task will be used, available
                formats are `openai`, `chatml`, `llama2`, `zephyr`, and `default`. Defaults to `None`,
                but `default` (concatenation of `system_prompt` and `formatted_prompt` with a line-break)
                will be used if no `prompt_formatting_fn` is provided.
            prompt_formatting_fn (Union[Callable[..., str], None], optional): a function to be
                applied to the prompt before generation. If `None`, no formatting will be applied.
                Defaults to `None`.

        Raises:
            AssertionError: if the provided `model` is not available in Together Inference.

        Examples:
            >>> from distilabel.tasks import TextGenerationTask
            >>> from distilabel.llm import TogetherInferenceLLM
            >>> llm = TogetherInferenceLLM(model="togethercomputer/llama-2-7b", task=TextGenerationTask(), prompt_format="llama2")
            >>> llm.generate([{"input": "What's the capital of Spain?"}])
        """
        if not _TOGETHER_AVAILABLE:
            raise ImportError(
                "`TogetherInferenceLLM` cannot be used as `together` is not installed, please "
                " install it with `pip install together`."
            )

        together.api_key = api_key or os.getenv("TOGETHER_API_KEY", None)
        if together.api_key is None:
            raise ValueError(
                "No `api_key` provided, please provide one or set the `TOGETHER_API_KEY` "
                "environment variable."
            )

        super().__init__(
            task=task,
            num_threads=num_threads,
            prompt_format=prompt_format,
            prompt_formatting_fn=prompt_formatting_fn,
        )

        assert (
            model in self.available_models
        ), f"Provided `model` is not available in Together Inference, available models are {self.available_models}"
        self.model = model

        self.max_new_tokens = max_new_tokens
        self.temperature = temperature
        self.top_p = top_p
        self.top_k = top_k
        self.repetition_penalty = repetition_penalty
        self.stop = stop
        self.logprobs = logprobs

    def __rich_repr__(self) -> Generator[Any, None, None]:
        yield from super().__rich_repr__()
        yield (
            "parameters",
            {
                "max_new_tokens": self.max_new_tokens,
                "temperature": self.temperature,
                "repetition_penalty": self.repetition_penalty,
                "top_p": self.top_p,
                "top_k": self.top_k,
                "stop": self.stop,
                "logprobs": self.logprobs,
            },
        )

    @cached_property
    def available_models(self) -> List[str]:
        """Returns the list of available models in Together Inference."""
        return [
            model["name"]
            for model in together.Models.list()
            if model["display_type"] != "image"
        ]

    @property
    def model_name(self) -> str:
        """Returns the name of the Together Inference model."""
        return self.model

    def _generate_single_output(self, prompt: str) -> LLMOutput:
        """Runs the Together Inference text generation function over a single prompt
        producing a single `LLMOutput`.

        Args:
            prompt (str): the formatted prompt to be provided to the Together Inference
                endpoint.

        Raises:
            RuntimeError: raised if the Together Inference endpoint fails.
        """
        try:
            output = together.Complete.create(
                prompt=prompt,
                model=self.model,
                max_tokens=self.max_new_tokens,
                stop=self.stop,
                temperature=self.temperature,
                top_k=self.top_k,
                top_p=self.top_p,
                repetition_penalty=self.repetition_penalty,
                logprobs=self.logprobs,
            )
        except Exception as e:
            raise RuntimeError(
                f"Together Inference generation failed with exception: {e}"
            ) from e

        if output["output"]["choices"] is None or len(output["output"]["choices"]) < 1:  # type: ignore
            raise RuntimeError("Together Inference generation returned no generations.")

        choice = output["output"]["choices"][0]  # type: ignore
        try:
            parsed_response = self.task.parse_output(choice["text"].strip())
        except Exception as e:
            logger.error(f"Error parsing Together Inference response: {e}")
            parsed_response = None

        return LLMOutput(
            model_name=self.model_name,
            prompt_used=prompt,
            raw_output=choice["text"] or None,
            parsed_output=parsed_response,
        )

    def _generate(
        self,
        inputs: List[Dict[str, Any]],
        num_generations: int = 1,
    ) -> List[List[LLMOutput]]:
        """Generates `num_generations` for each input in `inputs`.

        Args:
            inputs (List[Dict[str, Any]]): the inputs to be used for generation.
            num_generations (int, optional): the number of generations to be performed for each
                input. Defaults to 1.

        Returns:
            List[List[LLMOutput]]: the generated outputs.
        """
        prompts = self._generate_prompts(inputs, default_format=None)
        outputs = []
        for prompt in prompts:
            outputs.append(
                [self._generate_single_output(prompt) for _ in range(num_generations)]
            )
        return outputs
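
The snippet below is a minimal usage sketch, not part of the rendered API reference: it assumes the `together` package is installed, that a valid Together API key is available, and that the base `LLM.generate` method accepts a `num_generations` argument, as `_generate` above suggests. The model name is illustrative and must be one of `available_models`.

import os

from distilabel.llm import TogetherInferenceLLM
from distilabel.tasks import TextGenerationTask

# Either export TOGETHER_API_KEY beforehand or pass `api_key=...` explicitly.
os.environ.setdefault("TOGETHER_API_KEY", "<your-together-api-key>")

llm = TogetherInferenceLLM(
    model="togethercomputer/llama-2-7b",  # checked against `available_models` in `__init__`
    task=TextGenerationTask(),
    prompt_format="llama2",
    max_new_tokens=128,
    temperature=0.7,
)

# `generate` returns one list per input, each holding `num_generations` `LLMOutput`
# records; they are assumed to be dict-like, as the keyword construction in
# `_generate_single_output` suggests.
outputs = llm.generate([{"input": "What's the capital of Spain?"}], num_generations=2)
print(outputs[0][0]["parsed_output"])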

available_models: List[str] cached property

Returns the list of available models in Together Inference.
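
As a quick sanity check before instantiating the class, the same query the cached property performs can be run directly against the `together` SDK. A small sketch, assuming `together` is installed and the API key is set:

import together

together.api_key = "<your-together-api-key>"  # or set the TOGETHER_API_KEY env var

# Mirrors the `available_models` property above: keep every non-image model.
text_models = [
    model["name"]
    for model in together.Models.list()
    if model["display_type"] != "image"
]
print(f"{len(text_models)} text models available, e.g. {text_models[:5]}")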

model_name: str property

Returns the name of the Together Inference model.

__init__(model, task, api_key=None, max_new_tokens=128, repetition_penalty=1.0, temperature=1.0, top_p=1.0, top_k=1, stop=None, logprobs=0, num_threads=None, prompt_format=None, prompt_formatting_fn=None)

Initializes the TogetherInferenceLLM class.

Parameters:

Name Type Description Default
model str

the model to be used for generation.

required
task Task

the task to be performed by the LLM.

required
api_key Union[str, None]

the Together API key to be used for authentication. If None, the TOGETHER_API_KEY environment variable will be used instead. Defaults to None.

None
max_new_tokens int

the maximum number of tokens to be generated. Defaults to 128.

128
temperature float

the temperature to be used for generation. From the Together Inference docs: "A decimal number that determines the degree of randomness in the response. A value of 0 will always yield the same output. A temperature much less than 1 favors more correctness and is appropriate for question answering or summarization. A value approaching 1 introduces more randomness in the output.". Defaults to 1.0.

1.0
repetition_penalty float

the repetition penalty to be used for generation. From the Together Inference docs: "Controls the diversity of generated text by reducing the likelihood of repeated sequences. Higher values decrease repetition.". Defaults to 1.0.

1.0
top_p float

the top-p value to be used for generation. From the Together Inference docs: "used to dynamically adjust the number of choices for each predicted token based on the cumulative probabilities. It specifies a probability threshold, below which all less likely tokens are filtered out. This technique helps to maintain diversity and generate more fluent and natural-sounding text.". Defaults to 1.0.

1.0
top_k int

the top-k value to be used for generation. From the Together Inference docs: "used to limit the number of choices for the next predicted word or token. It specifies the maximum number of tokens to consider at each step, based on their probability of occurrence. This technique helps to speed up the generation process and can improve the quality of the generated text by focusing on the most likely options.". Defaults to 1.

1
stop List[str]

strings that delimit the generation process, so that when the model generates any of the provided strings, the generation process is considered completed. Defaults to None.

None
logprobs int

the number of logprobs to be returned for each token. From the Together Inference docs: "An integer that specifies how many top token log probabilities are included in the response for each token generation step.". Defaults to 0.

0
num_threads Union[int, None]

the number of threads to be used for parallel generation. If None, no parallel generation will be performed. Defaults to None.

None
prompt_format Union[SupportedFormats, None]

the format to be used for the prompt. If None, the default format of the task will be used, available formats are openai, chatml, llama2, zephyr, and default. Defaults to None, but default (concatenation of system_prompt and formatted_prompt with a line-break) will be used if no prompt_formatting_fn is provided.

None
prompt_formatting_fn Union[Callable[..., str], None]

a function to be applied to the prompt before generation. If None, no formatting will be applied. Defaults to None.

None

Raises:

Type Description
AssertionError

if the provided model is not available in Together Inference.

Examples:

>>> from distilabel.tasks import TextGenerationTask
>>> from distilabel.llm import TogetherInferenceLLM
>>> llm = TogetherInferenceLLM(model="togethercomputer/llama-2-7b", task=TextGenerationTask(), prompt_format="llama2")
>>> llm.generate([{"input": "What's the capital of Spain?"}])
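
Building on the example above, the following sketch shows how the sampling parameters documented in this table might be combined; the concrete values are illustrative only, not recommendations from the Together Inference docs.

from distilabel.llm import TogetherInferenceLLM
from distilabel.tasks import TextGenerationTask

llm = TogetherInferenceLLM(
    model="togethercomputer/llama-2-7b",
    task=TextGenerationTask(),
    prompt_format="llama2",
    max_new_tokens=256,
    temperature=0.2,         # close to 0: favor deterministic, "more correct" output
    top_p=0.9,               # keep tokens within the top 90% cumulative probability
    top_k=50,                # consider at most 50 candidate tokens per step
    repetition_penalty=1.1,  # values above 1.0 discourage repeated sequences
    stop=["</s>"],           # stop as soon as any of these strings is generated
    logprobs=1,              # return the top-1 log probability for each generated token
)
outputs = llm.generate([{"input": "Summarize the history of the Roman Empire in two sentences."}])
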
Source code in src/distilabel/llm/together.py
def __init__(
    self,
    model: str,
    task: "Task",
    api_key: Union[str, None] = None,
    max_new_tokens: int = 128,
    repetition_penalty: float = 1.0,
    temperature: float = 1.0,
    top_p: float = 1.0,
    top_k: int = 1,
    stop: Union[List[str], None] = None,
    logprobs: int = 0,
    num_threads: Union[int, None] = None,
    prompt_format: Union["SupportedFormats", None] = None,
    prompt_formatting_fn: Union[Callable[..., str], None] = None,
) -> None:
    """Initializes the TogetherInferenceLLM class.

    Args:
        model (str): the model to be used for generation.
        task (Task): the task to be performed by the LLM.
        api_key (Union[str, None], optional): the Together API key to be used for
            authentication. If `None`, the `TOGETHER_API_KEY` environment variable
            will be used instead. Defaults to `None`.
        max_new_tokens (int, optional): the maximum number of tokens to be generated.
            Defaults to 128.
        temperature (float, optional): the temperature to be used for generation. From the Together
            Inference docs: "A decimal number that determines the degree of randomness in the response.
            A value of 0 will always yield the same output. A temperature much less than 1 favors more
            correctness and is appropriate for question answering or summarization. A value approaching
            1 introduces more randomness in the output.". Defaults to 1.0.
        repetition_penalty (float, optional): the repetition penalty to be used for generation. From the
            Together Inference docs: "Controls the diversity of generated text by reducing the likelihood
            of repeated sequences. Higher values decrease repetition.". Defaults to 1.0.
        top_p (float, optional): the top-p value to be used for generation. From the Together
            Inference docs: "used to dynamically adjust the number of choices for each predicted
            token based on the cumulative probabilities. It specifies a probability threshold,
            below which all less likely tokens are filtered out. This technique helps to maintain
            diversity and generate more fluent and natural-sounding text.". Defaults to 1.0.
        top_k (int, optional): the top-k value to be used for generation. From the Together Inference
            docs: "used to limit the number of choices for the next predicted word or token. It specifies
            the maximum number of tokens to consider at each step, based on their probability of occurrence.
            This technique helps to speed up the generation process and can improve the quality of the
            generated text by focusing on the most likely options.". Defaults to 1.
        stop (List[str], optional): strings that delimit the generation process, so that when the
            model generates any of the provided strings, the generation process is considered completed.
            Defaults to None.
        logprobs (int, optional): the number of logprobs to be returned for each token. From the
            Together Inference docs: "An integer that specifies how many top token log probabilities
            are included in the response for each token generation step.". Defaults to 0.
        num_threads (Union[int, None], optional): the number of threads to be used
            for parallel generation. If `None`, no parallel generation will be performed.
            Defaults to `None`.
        prompt_format (Union[SupportedFormats, None], optional): the format to be used
            for the prompt. If `None`, the default format of the task will be used, available
            formats are `openai`, `chatml`, `llama2`, `zephyr`, and `default`. Defaults to `None`,
            but `default` (concatenation of `system_prompt` and `formatted_prompt` with a line-break)
            will be used if no `prompt_formatting_fn` is provided.
        prompt_formatting_fn (Union[Callable[..., str], None], optional): a function to be
            applied to the prompt before generation. If `None`, no formatting will be applied.
            Defaults to `None`.

    Raises:
        AssertionError: if the provided `model` is not available in Together Inference.

    Examples:
        >>> from distilabel.tasks import TextGenerationTask
        >>> from distilabel.llm import TogetherInferenceLLM
        >>> llm = TogetherInferenceLLM(model="togethercomputer/llama-2-7b", task=TextGenerationTask(), prompt_format="llama2")
        >>> llm.generate([{"input": "What's the capital of Spain?"}])
    """
    if not _TOGETHER_AVAILABLE:
        raise ImportError(
            "`TogetherInferenceLLM` cannot be used as `together` is not installed, please "
            " install it with `pip install together`."
        )

    together.api_key = api_key or os.getenv("TOGETHER_API_KEY", None)
    if together.api_key is None:
        raise ValueError(
            "No `api_key` provided, please provide one or set the `TOGETHER_API_KEY` "
            "environment variable."
        )

    super().__init__(
        task=task,
        num_threads=num_threads,
        prompt_format=prompt_format,
        prompt_formatting_fn=prompt_formatting_fn,
    )

    assert (
        model in self.available_models
    ), f"Provided `model` is not available in Together Inference, available models are {self.available_models}"
    self.model = model

    self.max_new_tokens = max_new_tokens
    self.temperature = temperature
    self.top_p = top_p
    self.top_k = top_k
    self.repetition_penalty = repetition_penalty
    self.stop = stop
    self.logprobs = logprobs