# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------
import json
import os
import shutil
import subprocess
import warnings
from typing import List, Optional, Tuple
from QEfficient.compile.qnn_compiler import compile as qnn_compile
from QEfficient.utils import constants
from QEfficient.utils._utils import load_json, load_yaml
from QEfficient.utils.logging_utils import logger
def create_and_dump_specializations(
batch_size: int, prompt_len: int, ctx_len: int, path: str, full_batch_size: Optional[int] = None
):
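"""
Creates a specializations JSON file describing the graphs to compile: a
prefill specialization (seq_len = prompt_len) and a decode specialization
(seq_len = 1), both bounded by ctx_len. When full_batch_size is given
(continuous batching), it is added to both entries and the decode batch
size is set to full_batch_size; when prompt_len is 1 and no full_batch_size
is given, the duplicate decode entry is dropped. The result is dumped to `path`.
"""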
# Create specialization file.
specializations = {
"specializations": [
{
"batch_size": str(batch_size),
"seq_len": str(prompt_len),
"ctx_len": str(ctx_len),
},
{"batch_size": str(batch_size), "seq_len": "1", "ctx_len": str(ctx_len)},
]
}
# If continuous batching is enabled by providing full_batch_size, add it to both
# specializations and set the decode specialization's batch size to full_batch_size.
if full_batch_size is not None:
specializations["specializations"][0]["full_batch_size"] = str(full_batch_size)
specializations["specializations"][1]["full_batch_size"] = str(full_batch_size)
specializations["specializations"][1]["batch_size"] = str(full_batch_size)
# Drop the duplicate decode specialization when prompt_len is 1 (the prefill and decode entries would be identical).
if prompt_len == 1 and full_batch_size is None:
specializations["specializations"].pop()
# Dump
with open(path, "w") as file:
json.dump(specializations, file, indent=4)
def compile_kv_model_on_cloud_ai_100(
onnx_path: str,
specializations_json: str,
num_cores: int,
base_path: str,
mxfp6: bool,
custom_io_path: str,
aic_enable_depth_first: bool,
allow_mxint8_mdp_io: bool,
mos: int = -1,
device_group: Optional[List[int]] = None,
**kwargs,
) -> Tuple[bool, str]:
"""
Compiles an ONNX Key-Value (KV) model for Cloud AI 100 hardware using `qaic-exec`.
This function sets up and executes the Qualcomm AI 100 compiler with various options
to generate a QPC package.
Parameters
----------
onnx_path : str
Path to the ONNX model file to be compiled.
specializations_json : str
Path to the JSON file defining compilation specializations (batch size, sequence length, etc.).
num_cores : int
Number of cores to use for compilation on Cloud AI 100.
base_path : str
Base directory where QPC binaries will be stored (a `qpcs` subdirectory will be created).
mxfp6 : bool
If True, enables MXFP6 precision for MatMul weights.
custom_io_path : str
Path to the Custom IO list file (e.g., YAML format) specifying input/output data types.
aic_enable_depth_first : bool
If True, enables Depth-First Search (DFS) optimization with default memory size.
allow_mxint8_mdp_io : bool
If True, allows MXINT8 compression of MDP IO traffic.
Other Parameters
----------------
mos : int, optional
Effort level to reduce on-chip memory. A value greater than 0 applies this effort. Default is -1 (no effort).
device_group : List[int], optional
List of device IDs for multi-device compilation (tensor slicing). If `len(device_group) > 1`,
a multi-device partition configuration is generated. Default is None.
**kwargs :
Additional compiler options passed directly to `qaic-exec`. These are formatted as
`-key=value` or `-key` for boolean flags.
Returns
-------
Tuple[bool, str]
A tuple containing:
- bool: True on successful compilation (the function raises on failure).
- str: Path to the generated QPC binary directory.
Raises
------
FileNotFoundError
If the `specializations_json` or `custom_io_path` files are not found.
RuntimeError
If the `qaic-exec` compilation process fails.
Warnings
--------
DeprecationWarning
This method will be removed soon; use `QEFFAutoModelForCausalLM.compile` instead.
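Examples
--------
A minimal sketch; the paths below are illustrative placeholders, not files
shipped with this package, and compilation requires the Cloud AI 100 SDK:

>>> ok, qpc_dir = compile_kv_model_on_cloud_ai_100(  # doctest: +SKIP
...     onnx_path="model.onnx",
...     specializations_json="specializations.json",
...     num_cores=14,
...     base_path="./build",
...     mxfp6=True,
...     custom_io_path="custom_io_fp16.yaml",
...     aic_enable_depth_first=False,
...     allow_mxint8_mdp_io=False,
... )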
"""
warnings.warn(
"\033[93mUse `QEFFAutoModelForCausalLM.compile` instead, this method will be removed soon.\033[0m",
DeprecationWarning,
stacklevel=2,
)
aic_binary_dir = os.path.join(base_path, "qpcs")
if os.path.isdir(aic_binary_dir):
shutil.rmtree(aic_binary_dir)
if not os.path.isfile(specializations_json):
raise FileNotFoundError(f"Please use 'QEfficient.compile', as {specializations_json} file was not found")
if not os.path.isfile(custom_io_path):
raise FileNotFoundError(f"{custom_io_path} file was not found!")
command = [
"/opt/qti-aic/exec/qaic-exec",
f"-m={onnx_path}",
"-aic-hw",
f"-aic-hw-version={kwargs.pop('aic_hw_version', kwargs.pop('aic-hw-version', constants.DEFAULT_AIC_HW_VERSION))}",
f"-network-specialization-config={specializations_json}",
"-convert-to-fp16",
"-retained-state",
f"-aic-num-cores={num_cores}",
f"-custom-IO-list-file={custom_io_path}",
"-compile-only",
f"-aic-binary-dir={aic_binary_dir}",
]
if mxfp6:
command.append("-mxfp6-matmul")
if mos > 0:
command.append(f"-mos={mos}")
if aic_enable_depth_first:
command.append("-aic-enable-depth-first")
if allow_mxint8_mdp_io:
command.append("-allow-mxint8-mdp-io")
if device_group is not None and len(device_group) > 1:
mdp_ts_config = {
"connections": [{"devices": list(range(len(device_group))), "type": "p2p"}],
"partitions": [
{
"name": "Partition0",
"devices": [{"deviceId": device, "numCores": num_cores} for device in range(len(device_group))],
}
],
}
mdp_ts_config_path = os.path.join(base_path, "mdp_ts_config.json")
with open(mdp_ts_config_path, "w") as file:
json.dump(mdp_ts_config, file, indent=4)
command.append(f"-mdp-load-partition-config={mdp_ts_config_path}")
for key, value in kwargs.items():
option = "-" + key.replace("_", "-")
if isinstance(value, bool):
if value:
command.append(option)
continue
command.append(f"{option}={value}")
print("Running AI 100 compiler:", " ".join(command))
result = subprocess.run(command, capture_output=True, text=True)
if result.returncode != 0:
raise RuntimeError(f"Compilation Failed!!\n\nSTDOUT\n{result.stdout}\n\nSTDERR\n{result.stderr}")
print("\n===================== Compilation Done! =====================\n")
return result.returncode == 0, aic_binary_dir
def compile(
onnx_path: str,
qpc_path: str,
num_cores: int,
device_group: Optional[List[int]] = None, # FIXME: use num_devices instead
aic_enable_depth_first: bool = False,
mos: int = -1,
batch_size: int = 1,
prompt_len: int = 32,
ctx_len: int = 128,
mxfp6: bool = True,
mxint8: bool = False,
custom_io_file_path: Optional[str] = None,
full_batch_size: Optional[int] = None,
allow_mxint8_mdp_io: Optional[bool] = False,
enable_qnn: Optional[bool] = False,
qnn_config: Optional[str] = None,
**kwargs,
) -> str:
"""
Compiles the given ONNX model using either the Cloud AI 100 platform SDK compiler
or the QNN compiler, and saves the compiled QPC package.
This function handles the creation of specialization files, selection of custom IO
configurations, and execution of the appropriate compiler (QAIC or QNN).
It supports multi-device compilation for tensor slicing.
Parameters
----------
onnx_path : str
Path to the generated ONNX model file.
qpc_path : str
Target directory path for saving the compiled QPC binaries.
num_cores : int
Number of cores to use for compilation.
Other Parameters
----------------
device_group : List[int], optional
List of device IDs. Used to determine the number of devices for multi-device compilation.
Default is None.
aic_enable_depth_first : bool, optional
If True, enables Depth-First Search (DFS) optimization with default memory size during QAIC compilation.
Default is False.
mos : int, optional
Effort level to reduce on-chip memory during QAIC compilation. A value greater than 0 applies this effort.
Default is -1 (no effort).
batch_size : int, optional
Batch size to compile the model for. Default is 1.
full_batch_size : int, optional
Sets the full batch size to enable continuous batching mode. If provided, `batch_size` must be 1.
Default is None.
prompt_len : int, optional
Prompt length for the model to compile. Default is 32.
ctx_len : int, optional
Maximum context length to compile the model for. Default is 128.
mxfp6 : bool, optional
If True, enables MXFP6 precision for MatMul weights during compilation. Default is True.
mxint8 : bool, optional
If True, compresses Present/Past KV to MXINT8 using a CustomIO configuration. Default is False.
custom_io_file_path : str, optional
Explicit path to a Custom IO file (e.g., YAML format). If None, it's inferred based on `mxint8`.
Default is None.
allow_mxint8_mdp_io : bool, optional
If True, allows MXINT8 compression of MDP IO traffic during QAIC compilation. Default is False.
enable_qnn : bool, optional
If True, enables compilation using the QNN compiler instead of QAIC. Default is False.
qnn_config : str, optional
Path to the QNN Config parameters file, used if `enable_qnn` is True. Default is None.
**kwargs :
Additional compiler options passed directly to the chosen compiler.
Returns
-------
str
Path to the compiled QPC package directory.
Raises
------
ValueError
If `full_batch_size` is provided while `batch_size` is not 1 (the two are mutually exclusive).
FileNotFoundError
If required Custom IO files are not found.
Warnings
--------
DeprecationWarning
This method will be removed soon; use `QEFFAutoModelForCausalLM.compile` instead.
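Examples
--------
A minimal sketch; paths and parameter values are illustrative, and
compilation requires the Cloud AI 100 SDK:

>>> qpc_dir = compile(  # doctest: +SKIP
...     onnx_path="model.onnx",
...     qpc_path="./qpc",
...     num_cores=14,
...     batch_size=1,
...     prompt_len=32,
...     ctx_len=128,
... )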
"""
if full_batch_size and batch_size != 1:
raise ValueError("Only either batch_size or full_batch_size should be greater than one")
os.makedirs(qpc_path, exist_ok=True)
specialization_json_path = os.path.join(qpc_path, "specializations.json")
create_and_dump_specializations(
batch_size=batch_size,
prompt_len=prompt_len,
ctx_len=ctx_len,
path=specialization_json_path,
full_batch_size=full_batch_size,
)
# Select the customIO config based on the mx flag.
custom_io_file_name = "custom_io_int8.yaml" if mxint8 else "custom_io_fp16.yaml"
if custom_io_file_path is None:
custom_io_file_path = os.path.join(os.path.dirname(onnx_path), custom_io_file_name)
if not os.path.isfile(custom_io_file_path):
raise FileNotFoundError(
f"Custom IO file {custom_io_file_name} is not present at the expected path {custom_io_file_path}. Please pass the correct file path or rerun infer/export API"
)
if enable_qnn:
qpc_path = qnn_compile(
onnx_path=onnx_path,
qpc_base_path=qpc_path,
qnn_binary_dir=os.path.join(qpc_path, "qpcs"),
num_cores=num_cores,
mxfp6=mxfp6,
mxint8=mxint8,
allow_mxint8_mdp_io=allow_mxint8_mdp_io,
aic_enable_depth_first=aic_enable_depth_first,
mos=mos,
device_group=device_group,
qnn_config=qnn_config,
specializations=(load_json(specialization_json_path))["specializations"],
custom_io=load_yaml(custom_io_file_path),
)
logger.info(f"QNN Compiled QPC files can be found here: {qpc_path}")
else:
_, qpc_path = compile_kv_model_on_cloud_ai_100(
onnx_path=onnx_path,
specializations_json=specialization_json_path,
num_cores=num_cores,
custom_io_path=custom_io_file_path,
base_path=qpc_path,
mxfp6=mxfp6,
aic_enable_depth_first=aic_enable_depth_first,
allow_mxint8_mdp_io=allow_mxint8_mdp_io,
mos=mos,
device_group=device_group,
**kwargs,
)
if kwargs.get("io_encrypt", None):
logger.warning(
f"Compilation for IO-Encrypt has been successfully completed at path: {qpc_path}. However, Efficient-Transformers do not support IO-Encrypt execution. Please run the execution separately"
)
else:
logger.info(f"Compiled QPC files can be found here: {qpc_path}")
return qpc_path