Source code for QEfficient.cloud.execute

# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------

import argparse
from typing import List, Optional

from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv
from QEfficient.utils import load_hf_tokenizer


def main(
    model_name: str,
    qpc_path: str,
    device_group: Optional[List[int]] = None,
    local_model_dir: Optional[str] = None,
    prompt: Optional[str] = None,  # type: ignore
    prompts_txt_file_path: Optional[str] = None,
    generation_len: Optional[int] = None,
    cache_dir: Optional[str] = None,
    hf_token: Optional[str] = None,
    full_batch_size: Optional[int] = None,
):
    """
    Main function for the QEfficient execution CLI application.

    This function serves as the entry point for running a compiled model
    (QPC package) on the Cloud AI 100 Platform. It loads the necessary
    tokenizer and then orchestrates the text generation inference.

    Parameters
    ----------
    model_name : str
        Hugging Face Model Card name (e.g., ``gpt2``) for loading the tokenizer.
    qpc_path : str
        Path to the generated binary (QPC package) after compilation.

    Other Parameters
    ----------------
    device_group : List[int], optional
        List of device IDs to be used for inference. If ``len(device_group) > 1``,
        a multi-card setup is enabled. Default is None.
    local_model_dir : str, optional
        Path to custom model weights and config files, used if the tokenizer is
        not loaded from Hugging Face Hub. Default is None.
    prompt : str, optional
        Sample prompt(s) for the model text generation. For batch size > 1, pass
        multiple prompts separated by a pipe (``|``) symbol. Default is None.
    prompts_txt_file_path : str, optional
        Path to a text file containing multiple input prompts, one per line.
        Default is None.
    generation_len : int, optional
        Maximum number of tokens to be generated during inference. Default is None.
    cache_dir : str, optional
        Cache directory where downloaded Hugging Face files (like the tokenizer)
        are stored. Default is None.
    hf_token : str, optional
        Hugging Face login token to access private repositories. Default is None.
    full_batch_size : int, optional
        Ignored in this context, as continuous batching is managed by the compiled
        QPC; it may still be passed through from the CLI arguments. Default is None.

    Example
    -------
    To execute a compiled model from the command line:

    .. code-block:: bash

        python -m QEfficient.cloud.execute --model-name gpt2 --qpc-path /path/to/qpc/binaries --prompt "Hello world"

    For multi-device inference:

    .. code-block:: bash

        python -m QEfficient.cloud.execute --model-name gpt2 --qpc-path /path/to/qpc/binaries --device-group "[0,1]" --prompt "Hello | Hi"
    """
    # Load the tokenizer from the local directory if given, else from the HF Hub.
    tokenizer = load_hf_tokenizer(
        pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name),
        cache_dir=cache_dir,
        hf_token=hf_token,
    )

    # Execute the compiled QPC on the Cloud AI 100 device(s).
    cloud_ai_100_exec_kv(
        tokenizer=tokenizer,
        qpc_path=qpc_path,
        device_id=device_group,
        prompt=prompt,
        prompts_txt_file_path=prompts_txt_file_path,
        generation_len=generation_len,
    )
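
# Illustrative sketch (not part of the original module): ``main`` can also be
# invoked programmatically instead of through the CLI. The model name, QPC
# path, and prompt below are placeholder values; substitute your own compiled
# artifacts.
#
#     from QEfficient.cloud.execute import main
#
#     main(
#         model_name="gpt2",
#         qpc_path="/path/to/qpc/binaries",
#         device_group=[0],
#         prompt="Hello world",
#         generation_len=32,
#     )
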
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Execution script.")
    parser.add_argument(
        "--model_name", "--model-name", required=False, type=str, help="HF model card name for tokenizing the inputs"
    )
    parser.add_argument("--qpc_path", "--qpc-path", required=True, help="Path to generated QPC")
    parser.add_argument(
        "--device_group",
        "--device-group",
        type=lambda device_ids: [int(x) for x in device_ids.strip("[]").split(",")],
        help="Cloud AI 100 device IDs (comma-separated), e.g. [0]",
    )
    parser.add_argument(
        "--prompt",
        type=lambda prompt: prompt.split("|"),
        help="Input prompt; for batch size > 1, pass the prompts in a single string separated by a pipe (|) symbol",
    )
    parser.add_argument(
        "--prompts_txt_file_path",
        "--prompts-txt-file-path",
        type=str,
        help="Path to a txt file with input prompts; a sample prompts.txt file is present in the examples folder",
    )
    parser.add_argument("--generation_len", "--generation-len", type=int, help="Number of tokens to generate")
    parser.add_argument(
        "--local-model-dir", "--local_model_dir", required=False, help="Path to custom model weights and config files"
    )
    parser.add_argument(
        "--cache-dir",
        "--cache_dir",
        default=None,
        required=False,
        help="Cache dir to store HF downloads",
    )
    parser.add_argument(
        "--full_batch_size",
        "--full-batch-size",
        type=int,
        default=None,
        help="Set full batch size to enable continuous batching mode; default is None",
    )
    parser.add_argument(
        "--hf-token", "--hf_token", default=None, type=str, required=False, help="HF token id for private HF models"
    )
    args = parser.parse_args()
    main(**args.__dict__)
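
# Illustrative sketch of how the custom ``type`` lambdas above transform raw
# CLI strings (example values, not output from a real run). Note that the
# prompt split on ``|`` keeps the surrounding whitespace of each prompt:
#
#     >>> [int(x) for x in "[0,1]".strip("[]").split(",")]   # --device-group "[0,1]"
#     [0, 1]
#     >>> "Hello | Hi".split("|")                            # --prompt "Hello | Hi"
#     ['Hello ', ' Hi']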