Skip to content

Command Line Interface (CLI)

This section contains the API reference for the CLI. For more information on how to use the CLI, see Tutorial - CLI.

Utility functions for the distilabel pipeline sub-commands

These utility functions help you work with pipelines from the console.

utils

parse_runtime_parameters(params)

Parses the runtime parameters from the CLI format to the format expected by the Pipeline.run method. The CLI format is a list of tuples, where the first element is a list of keys and the second element is the value.

Parameters:

Name Type Description Default
params List[Tuple[List[str], str]]

A list of tuples, where the first element is a list of keys and the second element is the value.

required

Returns:

Type Description
Dict[str, Dict[str, Any]]

A dictionary with the runtime parameters in the format expected by the Pipeline.run method.

Source code in src/distilabel/cli/pipeline/utils.py
def parse_runtime_parameters(
    params: List[Tuple[List[str], str]],
) -> Dict[str, Dict[str, Any]]:
    """Converts runtime parameters from the CLI format into the nested dictionary
    expected by the `Pipeline.run` method.

    Every entry of `params` pairs a list of keys with a value. The keys describe a
    path into the resulting dictionary: all keys but the last one select (or create)
    nested dictionaries, and the value is stored under the last key. Entries with an
    empty list of keys are ignored, and a later entry overwrites an earlier one that
    targets the same path.

    Args:
        params: A list of tuples, where the first element is a list of keys and the
            second element is the value.

    Returns:
        A dictionary with the runtime parameters in the format expected by the
        `Pipeline.run` method.
    """
    parsed: Dict[str, Any] = {}
    for key_path, raw_value in params:
        # Nothing to store when no key was provided.
        if not key_path:
            continue

        # Walk (creating on demand) the intermediate dictionaries.
        node = parsed
        for parent_key in key_path[:-1]:
            node = node.setdefault(parent_key, {})

        # The last key holds the actual value.
        node[key_path[-1]] = raw_value
    return parsed

valid_http_url(url)

Check if the URL is a valid HTTP URL.

Parameters:

Name Type Description Default
url str

The URL to check.

required

Returns:

Type Description
bool

True, if the URL is a valid HTTP URL. False, otherwise.

Source code in src/distilabel/cli/pipeline/utils.py
def valid_http_url(url: str) -> bool:
    """Tells whether the given string is a valid HTTP URL.

    The check is delegated to the `HttpUrl` validation performed through a
    `TypeAdapter`.

    Args:
        url: The URL to check.

    Returns:
        `True`, if the URL is a valid HTTP URL. `False`, otherwise.
    """
    try:
        TypeAdapter(HttpUrl).validate_python(url)  # type: ignore
    except ValidationError:
        # The validation rejected the value, so it is not an HTTP URL.
        is_valid = False
    else:
        is_valid = True
    return is_valid

get_config_from_url(url)

Loads the pipeline configuration from a URL pointing to a JSON or YAML file.

Parameters:

Name Type Description Default
url str

The URL pointing to the pipeline configuration file.

required

Returns:

Type Description
Dict[str, Any]

The pipeline configuration as a dictionary.

Raises:

Type Description
ValueError

If the file format is not supported.

Source code in src/distilabel/cli/pipeline/utils.py
def get_config_from_url(url: str) -> Dict[str, Any]:
    """Downloads a pipeline configuration file (JSON or YAML) and parses it.

    The format is decided from the URL suffix: `.yaml`/`.yml` content is decoded as
    UTF-8 and parsed with `yaml.safe_load`, while `.json` content is parsed with the
    `json` method of the downloaded response.

    Args:
        url: The URL pointing to the pipeline configuration file.

    Returns:
        The pipeline configuration as a dictionary.

    Raises:
        DistilabelUserError: If the URL does not end with `.json`, `.yaml` or `.yml`.
    """
    is_yaml = url.endswith((".yaml", ".yml"))
    is_json = url.endswith(".json")
    if not (is_json or is_yaml):
        raise DistilabelUserError(
            f"Unsupported file format for '{url}'. Only JSON and YAML are supported",
            page="sections/how_to_guides/basic/pipeline/?h=seriali#serializing-the-pipeline",
        )

    # The format is validated before downloading, so no request is made for
    # unsupported files.
    remote_file = _download_remote_file(url)

    if not is_yaml:
        return remote_file.json()

    yaml_text = remote_file.content.decode("utf-8")
    return yaml.safe_load(yaml_text)

get_pipeline_from_url(url, pipeline_name='pipeline')

Downloads the file to the current working directory and loads the pipeline object from a python script.

Parameters:

Name Type Description Default
url str

The URL pointing to the python script with the pipeline definition.

required
pipeline_name str

The name of the pipeline in the script, i.e. the variable bound in with Pipeline(...) as pipeline: ...

'pipeline'

Returns:

Type Description
BasePipeline

The pipeline instantiated.

Raises:

Type Description
ValueError

If the file format is not supported.

Source code in src/distilabel/cli/pipeline/utils.py
def get_pipeline_from_url(url: str, pipeline_name: str = "pipeline") -> "BasePipeline":
    """Downloads the file to the current working directory and loads the pipeline object
    from a python script.

    The script is saved next to the current working directory using the last path
    component of the URL as file name, and then imported as a module. Importing runs
    the top-level code of the downloaded script, so only use URLs pointing to scripts
    you trust.

    Args:
        url: The URL pointing to the python script with the pipeline definition.
        pipeline_name: The name of the pipeline in the script.
            I.e: `with Pipeline(...) as pipeline:...`.

    Returns:
        The pipeline instantiated.

    Raises:
        DistilabelUserError: If the URL does not point to a `.py` file.
        ImportError: If the imported script does not expose an object named
            `pipeline_name` (or that object is falsy).
    """
    if not url.endswith(".py"):
        raise DistilabelUserError(
            f"Unsupported file format for '{url}'. It must be a python file.",
            page="sections/how_to_guides/advanced/cli/#distilabel-pipeline-run",
        )
    response = _download_remote_file(url)

    content = response.content.decode("utf-8")
    script_local = Path.cwd() / Path(url).name
    # Write with the same encoding used to decode the download. Relying on the
    # locale default could fail or corrupt non-ASCII source on non-UTF-8 systems.
    script_local.write_text(content, encoding="utf-8")

    # Make the current working directory importable, without growing sys.path on
    # every call when it is already the first entry.
    cwd = os.getcwd()
    if not sys.path or sys.path[0] != cwd:
        sys.path.insert(0, cwd)

    # The module file was created after the interpreter started, so the import
    # system caches must be refreshed for it to be found reliably.
    importlib.invalidate_caches()

    # NOTE(security): this executes the downloaded script's top-level code.
    module = importlib.import_module(str(Path(url).stem))
    pipeline = getattr(module, pipeline_name, None)
    if not pipeline:
        raise ImportError(
            f"The script must contain an object with the pipeline named: '{pipeline_name}' that can be imported"
        )

    return pipeline

get_pipeline(config_or_script, pipeline_name='pipeline')

Get a pipeline from a configuration file or a remote python script.

Parameters:

Name Type Description Default
config_or_script str

The path or URL to the pipeline configuration file or URL to a python script.

required
pipeline_name str

The name of the pipeline in the script, i.e. the variable bound in with Pipeline(...) as pipeline: ...

'pipeline'

Returns:

Type Description
BasePipeline

The pipeline.

Raises:

Type Description
ValueError

If the file format is not supported.

FileNotFoundError

If the configuration file does not exist.

Source code in src/distilabel/cli/pipeline/utils.py
def get_pipeline(
    config_or_script: str, pipeline_name: str = "pipeline"
) -> "BasePipeline":
    """Builds a pipeline from a configuration file (local or remote) or from a remote
    python script.

    The kind of input is decided from its suffix: `.json`, `.yaml` and `.yml` are
    treated as configuration files, and `.py` as a python script. Remote inputs
    (valid HTTP URLs) are downloaded; local inputs are only supported for
    configuration files.

    Args:
        config_or_script: The path or URL to the pipeline configuration file
            or URL to a python script.
        pipeline_name: The name of the pipeline in the script.
            I.e: `with Pipeline(...) as pipeline:...`.

    Returns:
        The pipeline.

    Raises:
        DistilabelUserError: If the input is neither a configuration file nor a
            python script, judging by its suffix.
        ValueError: If a python script is given as a local path instead of a URL.
        FileNotFoundError: If the configuration file does not exist.
    """
    is_config = config_or_script.endswith((".json", ".yaml", ".yml"))
    is_script = not is_config and config_or_script.endswith(".py")

    if not (is_config or is_script):
        raise DistilabelUserError(
            "The file must be a valid config file or python script with a pipeline.",
            page="sections/how_to_guides/advanced/cli/#distilabel-pipeline-run",
        )

    # Remote input: download it and build the pipeline from the result.
    if valid_http_url(config_or_script):
        if is_script:
            return get_pipeline_from_url(config_or_script, pipeline_name=pipeline_name)
        remote_config = get_config_from_url(config_or_script)
        return Pipeline.from_dict(remote_config)

    # Local input: only configuration files can be loaded from disk.
    if is_script:
        raise ValueError(
            f"To run a pipeline from a python script, run it as `python {config_or_script}`"
        )

    if not Path(config_or_script).is_file():
        raise FileNotFoundError(f"File '{config_or_script}' does not exist.")

    return Pipeline.from_file(config_or_script)

display_pipeline_information(pipeline)

Displays the pipeline information to the console.

Parameters:

Name Type Description Default
pipeline BasePipeline

The pipeline.

required
Source code in src/distilabel/cli/pipeline/utils.py
def display_pipeline_information(pipeline: "BasePipeline") -> None:
    """Prints the information of the pipeline to the console.

    The output is the panel built by `_build_pipeline_panel`, rendered with a
    `rich` console.

    Args:
        pipeline: The pipeline.
    """
    # Imported here so `rich` is only loaded when the information is displayed.
    from rich.console import Console

    console = Console()
    panel = _build_pipeline_panel(pipeline)
    console.print(panel)