
# -----------------------------------------------------------------------------
#
# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------

import numpy as np
import torch

from QEfficient.utils import get_num_layers_from_config, get_padding_shape_from_config, padding_check_and_fix


class InputHandler:
    def __init__(self, batch_size, tokenizer, config, prompt, prompt_len, ctx_len, full_batch_size):
        """
        Initialization

        ``Mandatory`` Args:
            :batch_size (int): Number of prompts to run in one batch.
            :tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): Pass model tokenizer.
            :config (AutoConfig): From pretrained model.
            :prompt (List[str]): Prompts to be used as input for the model.
            :prompt_len (int): Prompt length for the model to compile.
            :ctx_len (int): Maximum context length to compile the model.
            :full_batch_size (int): Continuous batching batch size.
        """
        # Check and fix tokenizer viability (e.g. ensure a pad token is defined)
        padding_check_and_fix(tokenizer)
        self.tokenizer = tokenizer
        self.prompt = prompt
        self.prompt_len = prompt_len
        self.ctx_len = ctx_len
        self.full_batch_size = full_batch_size
        self.n_layer = get_num_layers_from_config(config)
        # KV-cache padding shape; continuous batching uses full_batch_size instead of batch_size
        self.padding_shape = get_padding_shape_from_config(
            config=config, batch_size=full_batch_size if full_batch_size else batch_size, seq_len=ctx_len
        )
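    # A minimal construction sketch (illustrative, not part of the original module);
    # "gpt2" is a stand-in checkpoint and full_batch_size=None disables continuous batching:
    #
    #     from transformers import AutoConfig, AutoTokenizer
    #     tokenizer = AutoTokenizer.from_pretrained("gpt2")
    #     config = AutoConfig.from_pretrained("gpt2")
    #     handler = InputHandler(
    #         batch_size=1, tokenizer=tokenizer, config=config,
    #         prompt=["My name is"], prompt_len=32, ctx_len=128, full_batch_size=None,
    #     )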
    def prepare_pytorch_inputs(self):
        """
        Function responsible for creating Prefill stage tensor inputs for PyTorch model.

        Return:
            :Dict: input_ids, position_ids, past_key_values
        """
        inputs = self.tokenizer(
            self.prompt,
            return_tensors="pt",
            padding=True,
        )
        input_ids = inputs["input_ids"]
        batch_size, input_len = input_ids.shape
        inputs.pop("attention_mask")
        inputs.pop("token_type_ids", None)
        position_ids = torch.arange(input_len).view(1, -1)
        # Right-pad input_ids with the pad token and position_ids with -1 up to prompt_len
        inputs["input_ids"] = torch.concat(
            [
                input_ids,
                torch.ones((batch_size, self.prompt_len - input_len), dtype=torch.int64)
                * (self.tokenizer.pad_token_id),
            ],
            1,
        )
        inputs["position_ids"] = torch.concat(
            [
                position_ids,
                torch.ones((batch_size, self.prompt_len - input_len), dtype=torch.int64) * (-1),
            ],
            1,
        )
        # For continuous batching, keep the unpadded prompt and add an explicit batch index
        if self.full_batch_size:
            inputs["input_ids"] = input_ids
            inputs["position_ids"] = torch.arange(input_len).view(1, input_len)
            inputs["batch_index"] = torch.arange(1).view(-1, 1)

        # Zero-initialized past KV cache: one (key, value) pair per decoder layer
        past_key_values = []
        for i in range(self.n_layer):
            past_key = torch.zeros(self.padding_shape, dtype=torch.float32)
            past_value = torch.zeros(self.padding_shape, dtype=torch.float32)
            past_key_values.append((past_key, past_value))
        inputs["past_key_values"] = tuple(past_key_values)

        return inputs
    def update_pytorch_inputs(self, inputs, pt_outputs):
        """
        Function responsible for updating Prefill stage inputs to create decode stage inputs for PyTorch model.

        ``Mandatory`` Args:
            :inputs (Dict): PyTorch inputs from previous iteration
            :pt_outputs (Dict): PyTorch outputs from previous iteration

        Return:
            :Dict: Updated input_ids, position_ids and past_key_values
        """
        updated_inputs = {}
        if self.full_batch_size:
            batch_index = torch.arange(1).view(-1, 1)

            # Greedy next token, scattered into the active slot; inactive slots hold the pad token
            input_ids = pt_outputs.logits.detach().argmax(2)
            updated_inputs["input_ids"] = torch.full((self.full_batch_size, 1), self.tokenizer.pad_token_id)
            updated_inputs["input_ids"][batch_index.view(-1)] = input_ids

            position_ids = inputs["position_ids"].max(1, keepdim=True).values + 1
            updated_inputs["position_ids"] = torch.full((self.full_batch_size, 1), 0)
            updated_inputs["position_ids"][batch_index.view(-1)] = position_ids

            updated_inputs["batch_index"] = torch.arange(self.full_batch_size).view(-1, 1)
        else:
            updated_inputs["input_ids"] = pt_outputs["logits"].argmax(-1).reshape(-1, 1)
            updated_inputs["position_ids"] = inputs["position_ids"].max(1, keepdim=True).values + 1

        updated_inputs["past_key_values"] = tuple(
            [(key.detach(), value.detach()) for key, value in pt_outputs["past_key_values"]]
        )

        return updated_inputs
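    # A hedged sketch of the prefill -> decode loop these two methods support;
    # `model` and `generation_len` are assumptions (not part of the original module):
    # a KV-cache-retaining PyTorch model accepting input_ids, position_ids and
    # past_key_values, and the number of tokens to generate.
    #
    #     inputs = handler.prepare_pytorch_inputs()            # prefill inputs
    #     for _ in range(generation_len):
    #         pt_outputs = model(**inputs)
    #         inputs = handler.update_pytorch_inputs(inputs, pt_outputs)  # next decode step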
    def prepare_ort_inputs(self):
        """
        Function responsible for creating Prefill stage NumPy inputs for ONNX model to be run on ONNXRT.

        Return:
            :Dict: input_ids, position_ids, past_key_values
        """
        inputs = self.tokenizer(
            self.prompt,
            return_tensors="np",
            padding=True,
        )
        input_ids = inputs["input_ids"]
        batch_size, input_len = input_ids.shape
        inputs.pop("attention_mask")
        inputs.pop("token_type_ids", None)
        position_ids = np.arange(input_len).reshape(1, -1)
        # Right-pad input_ids with the pad token and position_ids with -1 up to prompt_len
        inputs["input_ids"] = np.concatenate(
            [input_ids, np.full((batch_size, self.prompt_len - input_len), self.tokenizer.pad_token_id)],
            axis=1,
        ).astype(np.int64)
        inputs["position_ids"] = np.concatenate(
            [position_ids, np.full((batch_size, self.prompt_len - input_len), -1)],
            axis=1,
        ).astype(np.int64)
        # Zero-initialized KV cache: one past_key.<i>/past_value.<i> pair per decoder layer
        for i in range(self.n_layer):
            inputs["past_key." + str(i)] = np.zeros(self.padding_shape, dtype=np.float32)
            inputs["past_value." + str(i)] = np.zeros(self.padding_shape, dtype=np.float32)
        return inputs
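    # A sketch of running the prefill inputs with onnxruntime; the import and model
    # path are assumptions, and the ONNX graph's input names must match the keys
    # produced above (input_ids, position_ids, past_key.<i>, past_value.<i>):
    #
    #     import onnxruntime
    #     session = onnxruntime.InferenceSession("path/to/model.onnx")
    #     ort_inputs = handler.prepare_ort_inputs()
    #     raw_outputs = session.run(None, dict(ort_inputs))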
    def update_ort_inputs(self, inputs, ort_outputs):
        """
        Function responsible for updating Prefill stage inputs to create decode stage inputs for ONNX model to be run on ONNXRT.

        ``Mandatory`` Args:
            :inputs (Dict): NumPy inputs of ONNX model from previous iteration
            :ort_outputs (Dict): NumPy outputs of ONNX model from previous iteration

        Return:
            :Dict: Updated input_ids, position_ids and past_key_values
        """
        updated_inputs = {}
        updated_inputs["input_ids"] = ort_outputs["logits"].argmax(-1)
        updated_inputs["position_ids"] = np.max(inputs["position_ids"], axis=1, keepdims=True) + 1
        # past_key_values alternates key/value tensors layer by layer
        for i in range(self.n_layer):
            updated_inputs["past_key." + str(i)] = ort_outputs["past_key_values"][i * 2]
            updated_inputs["past_value." + str(i)] = ort_outputs["past_key_values"][i * 2 + 1]
        return updated_inputs
    def update_ort_outputs(self, ort_outputs):
        """
        Function responsible for updating ONNXRT session outputs.

        ``Mandatory`` Args:
            :ort_outputs (Dict): NumPy outputs of ONNX model from current iteration

        Return:
            updated_outputs (Dict): Updated past_key_values, logits
        """
        # Collect retained-state KV outputs in key/value order, layer by layer
        present_key_values = []
        for i in range(self.n_layer):
            if "past_key." + str(i) + "_RetainedState" in ort_outputs:
                present_key_values.append(ort_outputs["past_key." + str(i) + "_RetainedState"])
            if "past_value." + str(i) + "_RetainedState" in ort_outputs:
                present_key_values.append(ort_outputs["past_value." + str(i) + "_RetainedState"])

        outputs = {}
        outputs["past_key_values"] = present_key_values
        outputs["logits"] = ort_outputs["logits"]

        return outputs
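
# A minimal, hypothetical end-to-end greedy-decode sketch showing how the three
# ONNXRT helpers above chain together. Not part of the original module: `session`
# is assumed to be an onnxruntime.InferenceSession for the exported model, and
# output names/ordering are read from the graph itself.
def _example_ort_generate(handler, session, generation_len):
    output_names = [out.name for out in session.get_outputs()]
    inputs = handler.prepare_ort_inputs()  # prefill inputs
    generated_ids = []
    for _ in range(generation_len):
        raw = session.run(None, dict(inputs))
        # Map positional outputs back to names, then normalize retained KV states
        ort_outputs = handler.update_ort_outputs(dict(zip(output_names, raw)))
        generated_ids.append(int(ort_outputs["logits"].argmax(-1).flatten()[0]))
        inputs = handler.update_ort_inputs(inputs, ort_outputs)
    return generated_ids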