Source code for QEfficient.compile.compile_helper

# -----------------------------------------------------------------------------
#
# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------

import json
import os
import shutil
import subprocess
import warnings
from typing import List, Optional, Tuple

from QEfficient.utils.logging_utils import logger


def create_and_dump_specializations(
    batch_size: int, prompt_len: int, ctx_len: int, path: str, full_batch_size: Optional[int] = None
):
    # Create specialization file.
    specializations = {
        "specializations": [
            {
                "batch_size": str(batch_size),
                "seq_len": str(prompt_len),
                "ctx_len": str(ctx_len),
            },
            {"batch_size": str(batch_size), "seq_len": "1", "ctx_len": str(ctx_len)},
        ]
    }
    # If continuous batching is enabled by providing full_batch_size, add full_batch_size (FBS)
    # to both specializations and set the decode specialization's batch_size to FBS.
    if full_batch_size is not None:
        specializations["specializations"][0]["full_batch_size"] = str(full_batch_size)
        specializations["specializations"][1]["full_batch_size"] = str(full_batch_size)
        specializations["specializations"][1]["batch_size"] = str(full_batch_size)

    # Dump
    with open(path, "w") as file:
        json.dump(specializations, file, indent=4)
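

# A sketch of the file this helper writes (illustrative values: batch_size=1,
# prompt_len=32, ctx_len=128, no full_batch_size; the path is hypothetical):
#
#   create_and_dump_specializations(1, 32, 128, "/tmp/specializations.json")
#
# produces:
#   {
#       "specializations": [
#           {"batch_size": "1", "seq_len": "32", "ctx_len": "128"},
#           {"batch_size": "1", "seq_len": "1", "ctx_len": "128"}
#       ]
#   }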


def compile_kv_model_on_cloud_ai_100(
    onnx_path: str,
    specializations_json: str,
    num_cores: int,
    base_path: str,
    mxfp6: bool,
    custom_io_path: str,
    aic_enable_depth_first: bool,
    mos: int = -1,
    device_group: Optional[List[int]] = None,
    **kwargs,
) -> Tuple[bool, str]:
    warnings.warn(
        "\033[93mUse `QEFFAutoModelForCausalLM.compile` instead, this method will be removed soon.\033[0m",
        DeprecationWarning,
        stacklevel=2,
    )
    if kwargs:
        # FIXME
        raise NotImplementedError("Can't handle extra compilation args now!")
    aic_binary_dir = os.path.join(base_path, "qpcs")

    if os.path.isdir(aic_binary_dir):
        shutil.rmtree(aic_binary_dir)

    if not os.path.isfile(specializations_json):
        raise FileNotFoundError(f"Please use 'QEfficient.compile', as {specializations_json} file was not found")
    if not os.path.isfile(custom_io_path):
        raise FileNotFoundError(f"{custom_io_path} file was not found!")
    command = [
        "/opt/qti-aic/exec/qaic-exec",
        f"-m={onnx_path}",
        "-aic-hw",
        "-aic-hw-version=2.0",
        f"-network-specialization-config={specializations_json}",
        "-convert-to-fp16",
        "-retained-state",
        f"-aic-num-cores={num_cores}",
        f"-custom-IO-list-file={custom_io_path}",
        "-compile-only",
        f"-aic-binary-dir={aic_binary_dir}",
    ]
    if mxfp6:
        command.append("-mxfp6-matmul")
    if mos > 0:
        command.append(f"-mos={mos}")
    if aic_enable_depth_first:
        command.append("-aic-enable-depth-first")
    if device_group is not None and len(device_group) > 1:
        mdp_ts_config = {
            "connections": [{"devices": list(range(len(device_group))), "type": "p2p"}],
            "partitions": [
                {
                    "name": "Partition0",
                    "devices": [{"deviceId": device, "numCores": num_cores} for device in range(len(device_group))],
                }
            ],
        }
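        # For example, device_group=[0, 1] with num_cores=14 (illustrative
        # values) yields a tensor-slicing config like:
        #   {
        #       "connections": [{"devices": [0, 1], "type": "p2p"}],
        #       "partitions": [{"name": "Partition0",
        #                       "devices": [{"deviceId": 0, "numCores": 14},
        #                                   {"deviceId": 1, "numCores": 14}]}]
        #   }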
        mdp_ts_config_path = os.path.join(base_path, "mdp_ts_config.json")
        with open(mdp_ts_config_path, "w") as file:
            json.dump(mdp_ts_config, file, indent=4)
        command.append(f"-mdp-load-partition-config={mdp_ts_config_path}")
    print("Running AI 100 compiler:", " ".join(command))
    result = subprocess.run(command, capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError(f"Compilation Failed!!\n\nSTDOUT\n{result.stdout}\n\nSTDERR\n{result.stderr}")

    print("\n===================== Compilation Done! =====================\n")
    return result.returncode == 0, aic_binary_dir
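
# A sketch of the qaic-exec invocation assembled above (illustrative values:
# num_cores=14, FP16 custom IO, mxfp6=True, single device; paths are hypothetical):
#
#   /opt/qti-aic/exec/qaic-exec -m=model.onnx -aic-hw -aic-hw-version=2.0 \
#       -network-specialization-config=specializations.json -convert-to-fp16 \
#       -retained-state -aic-num-cores=14 -custom-IO-list-file=custom_io_fp16.yaml \
#       -compile-only -aic-binary-dir=qpcs -mxfp6-matmul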


def compile(
    onnx_path: str,
    qpc_path: str,
    num_cores: int,
    device_group: Optional[List[int]] = None,  # FIXME: use num_devices instead
    aic_enable_depth_first: bool = False,
    mos: int = -1,
    batch_size: int = 1,
    prompt_len: int = 32,
    ctx_len: int = 128,
    mxfp6: bool = True,
    mxint8: bool = False,
    custom_io_file_path: Optional[str] = None,
    full_batch_size: Optional[int] = None,
    **kwargs,
) -> str:
    """
    Compiles the given ``ONNX`` model using the Cloud AI 100 platform SDK compiler and saves the compiled ``qpc`` package at ``qpc_path``.
    Generates a tensor-slicing configuration if multiple devices are passed in ``device_group``.

    This function will be deprecated soon and will be replaced by ``QEFFAutoModelForCausalLM.compile``.

    ``Mandatory`` Args:
        :onnx_path (str): Generated ``ONNX`` model path.
        :qpc_path (str): Path for saving compiled qpc binaries.
        :num_cores (int): Number of cores to compile the model on.

    ``Optional`` Args:
        :device_group (List[int]): Used for finding the number of devices to compile for. ``Defaults to None.``
        :aic_enable_depth_first (bool): Enables ``DFS`` with default memory size. ``Defaults to False.``
        :mos (int): Effort level to reduce on-chip memory. ``Defaults to -1.``
        :batch_size (int): Batch size to compile the model for. ``Defaults to 1.``
        :full_batch_size (int): Set full batch size to enable continuous batching mode. ``Defaults to None.``
        :prompt_len (int): Prompt length for the model to compile. ``Defaults to 32.``
        :ctx_len (int): Maximum context length to compile the model for. ``Defaults to 128.``
        :mxfp6 (bool): Enable compilation for ``MXFP6`` precision. ``Defaults to True.``
        :mxint8 (bool): Compress Present/Past KV to ``MXINT8`` using a ``CustomIO`` config. ``Defaults to False.``
        :custom_io_file_path (str): Path to the ``customIO`` file (formatted as a string). ``Defaults to None.``

    Returns:
        :str: Path to the compiled ``qpc`` package.
    """
    if full_batch_size and batch_size != 1:
        raise ValueError("Only one of batch_size or full_batch_size should be greater than one")

    os.makedirs(qpc_path, exist_ok=True)
    specialization_json_path = os.path.join(qpc_path, "specializations.json")
    create_and_dump_specializations(
        batch_size=batch_size,
        prompt_len=prompt_len,
        ctx_len=ctx_len,
        path=specialization_json_path,
        full_batch_size=full_batch_size,
    )

    # Select the customIO config based on the mxint8 flag.
    custom_io_file_name = "custom_io_int8.yaml" if mxint8 else "custom_io_fp16.yaml"

    if custom_io_file_path is None:
        custom_io_file_path = os.path.join(os.path.dirname(onnx_path), custom_io_file_name)

    if not os.path.isfile(custom_io_file_path):
        raise FileNotFoundError(
            f"Custom IO file {custom_io_file_name} is not present at the expected path {custom_io_file_path}. "
            "Please pass the correct file path or rerun the infer/export API."
        )

    _, qpc_path = compile_kv_model_on_cloud_ai_100(
        onnx_path=onnx_path,
        specializations_json=specialization_json_path,
        num_cores=num_cores,
        custom_io_path=custom_io_file_path,
        base_path=qpc_path,
        mxfp6=mxfp6,
        aic_enable_depth_first=aic_enable_depth_first,
        mos=mos,
        device_group=device_group,
    )

    logger.info(f"Compiled QPC files can be found here: {qpc_path}")
    return qpc_path
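

# A minimal usage sketch (hypothetical paths; the num_cores value is illustrative):
#
#   from QEfficient.compile.compile_helper import compile
#
#   qpc_dir = compile(
#       onnx_path="model_kv.onnx",  # hypothetical path to an exported ONNX model
#       qpc_path="qpcs",
#       num_cores=14,
#   )
#   # qpc_dir points to the directory holding the compiled qpc binaries.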