# -*- mode: python -*-# =============================================================================# @@-COPYRIGHT-START-@@## Copyright (c) 2022-2023, Qualcomm Innovation Center, Inc. All rights reserved.## Redistribution and use in source and binary forms, with or without# modification, are permitted provided that the following conditions are met:## 1. Redistributions of source code must retain the above copyright notice,# this list of conditions and the following disclaimer.## 2. Redistributions in binary form must reproduce the above copyright notice,# this list of conditions and the following disclaimer in the documentation# and/or other materials provided with the distribution.## 3. Neither the name of the copyright holder nor the names of its contributors# may be used to endorse or promote products derived from this software# without specific prior written permission.## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE# POSSIBILITY OF SUCH DAMAGE.## SPDX-License-Identifier: BSD-3-Clause## @@-COPYRIGHT-END-@@# =============================================================================""" Quantsim for Keras """from__future__importannotationsimportcontextlibfromdataclassesimportdataclassimportjsonimportosfromtypingimportUnion,Dict,Tuple,Optional,Listimporttensorflowastffromaimet_commonimportlibpymofromaimet_common.defsimportQuantScheme,QuantizationDataTypefromaimet_common.utilsimportAimetLogger,save_json_yamlfromaimet_commonimportquantsimfromaimet_common.quantsimimportextract_global_quantizer_argsfromaimet_tensorflow.keras.connectedgraphimportConnectedGraphfromaimet_tensorflow.keras.graphsearchtuilsimportGraphSearchUtilsfromaimet_tensorflow.keras.quant_sim.qc_quantize_wrapperimportQcQuantizeWrapper,QuantizerSettingsfromaimet_tensorflow.keras.quant_sim.qc_mha_wrapperimportQcQuantizableMultiHeadAttentionfromaimet_tensorflow.keras.rnn.qc_quant_LSTMimportQuantizedLSTMfromaimet_tensorflow.keras.quant_sim.tensor_quantizerimportTensorQuantizer,ActivationTensorQuantizer, \
ParamPerTensorQuantizer,StaticGridPerChannelQuantizer,ParamPerChannelQuantizerfromaimet_tensorflow.keras.quantsim_config.quantsim_configimportQuantSimConfigurator,INPUT_QUANTIZERS, \
OUTPUT_QUANTIZERS,PARAM_QUANTIZERSfromaimet_tensorflow.keras.utils.commonimportconvert_h5_model_to_pb_modelfromaimet_tensorflow.keras.defsimportAxisHandlingimportaimet_tensorflow.keras.utils.commonaskeras_common_utils_logger=AimetLogger.get_area_logger(AimetLogger.LogAreas.Quant)unquantizable_modules=(tf.keras.layers.InputLayer,QcQuantizeWrapper)substitutable_modules={tf.keras.layers.MultiHeadAttention:QcQuantizableMultiHeadAttention,tf.keras.layers.LSTM:QuantizedLSTM}@dataclassclassQuantizationSimModelParams:""" Data class that holds parameters for QuantizationSimModel. Used specifically to rebuild after converting to TF frozen pb """quant_scheme:Union[QuantScheme,str]='tf_enhanced'rounding_mode:str='nearest'default_output_bw:int=8default_param_bw:int=8in_place:bool=Falseconfig_file:str=Nonedefault_data_type:QuantizationDataType=QuantizationDataType.int# pylint: disable=too-many-ancestors# pylint: disable=too-many-instance-attributes
[docs]classQuantizationSimModel(tf.keras.Model):""" Implements mechanism to add quantization simulations ops to a model. This allows for off-target simulation of inference accuracy. Also allows the model to be fine-tuned to counter the effects of quantization. """# pylint: disable=too-many-arguments# pylint: disable=unused-argumentdef__init__(self,model,quant_scheme:Union[QuantScheme,str]='tf_enhanced',rounding_mode:str='nearest',default_output_bw:int=8,default_param_bw:int=8,in_place:bool=False,config_file:str=None,default_data_type:QuantizationDataType=QuantizationDataType.int):""" :param model: Model to quantize :param quant_scheme: Quantization Scheme, currently supported schemes are post_training_tf and post_training_tf_enhanced, defaults to post_training_tf_enhanced :param rounding_mode: The round scheme to used. One of: 'nearest' or 'stochastic', defaults to 'nearest'. :param default_output_bw: bitwidth to use for activation tensors, defaults to 8 :param default_param_bw: bitwidth to use for parameter tensors, defaults to 8 :param in_place: If True, then the given 'model' is modified in-place to add quant-sim nodes. Only suggested use of this option is when the user wants to avoid creating a copy of the model :param config_file: Path to a config file to use to specify rules for placing quant ops in the model :param default_data_type: Default data type to use for quantizing all layer parameters. Possible options are QuantizationDataType.int and QuantizationDataType.float. Note that the mode default_data_type=QuantizationDataType.float is only supported with default_output_bw=16 and default_param_bw=16 """super().__init__()self._model_without_wrappers=modelifnotin_place:self._model_without_wrappers=tf.keras.models.clone_model(model)n_weights=len(self._model_without_wrappers.weights)self._model_without_wrappers.set_weights(model.get_weights()[:n_weights])self._layer_name_to_quant_wrapper={}self._substituted_layer={}# to hold the substituted layersself._validate_model()self.connected_graph=ConnectedGraph(self._model_without_wrappers)self._quantsim_configurator=self._initialize_quantsim_configurator(quant_scheme,rounding_mode,default_output_bw,default_param_bw,default_data_type,config_file)self.quant_scheme=quant_schemeself._percentile_value=100# default percentile valueself.per_channel_quantization_enabled=self._quantsim_configurator.per_channel_quantization_flagself.model=self._add_quantization_wrappers(quant_scheme,rounding_mode,default_output_bw,default_param_bw,default_data_type)self.quant_args=extract_global_quantizer_args(quant_scheme,self._quantsim_configurator)self._params=QuantizationSimModelParams(quant_scheme,rounding_mode,default_output_bw,default_param_bw,in_place,config_file,default_data_type)def_validate_model(self):""" Check that model is appropriate for quantsim. """multiple_inbound_node_layers=[]forlayerinself._model_without_wrappers.layers:iflen(layer.inbound_nodes)>1:multiple_inbound_node_layers.append(layer.name)ifmultiple_inbound_node_layers:error_msg=(f'Layers with more than one inbound nodes are unsupported. This may occur if a layer is 'f'reused multiple times in the model definition.\n'f'Layers with multiple inbound nodes: {multiple_inbound_node_layers}')_logger.error(error_msg)raiseNotImplementedError(error_msg)sep_conv_found=self.check_separable_conv(self._model_without_wrappers)ifsep_conv_found:# Raising an assertion error incase there's SeparableConv2D in the model because in this case we have two sets of weights: Depthwise# and Pointwise. For depthwise kernels, LAST TWO AXIS should be considered and for pointwise kernels LAST AXIS# should be considered, which is not handled here. Running model preparer beforehand will resolve this as there the# SeparableConv2D is splitted into two layers Depthwise and Pointwise seperately.raiseAssertionError("SeparableConv2D found in the model. Please run model preparer before calling QuantizationSimModel")defcheck_separable_conv(self,model:tf.keras.models.Model|tf.keras.Sequential)->bool:""" Checks for SeparableConv2D layer in the model :param model: Keras Model :return: Boolean value, True if SeperableConv layer is found else False """forlayerinmodel.layers:ifisinstance(layer,tf.keras.Sequential):ifself.check_separable_conv(layer):returnTrueelifisinstance(layer,tf.keras.layers.SeparableConv2D):returnTruereturnFalsedef_get_quantizer_list(self)->Tuple[List,List,List]:""" Method to provide a list of input, output and parameter quantizers :return: Three lists containing input, paramater and output quantizers respectively """input_quantizers=[]parameter_quantizers=[]output_quantizers=[]forwrapperinself.quant_wrappers():forquantizerinwrapper.input_quantizers:input_quantizers.append(quantizer)forquantizerinwrapper.param_quantizers:parameter_quantizers.append(quantizer)forquantizerinwrapper.output_quantizers:output_quantizers.append(quantizer)returninput_quantizers,parameter_quantizers,output_quantizersdefset_percentile_value(self,percentile_value:float):""" Set the percentile value to be used while computing encodings for quantizers having percentile quant scheme. :param percentile_value: Percentile value to be set to """ifpercentile_value<90orpercentile_value>100:raiseValueError("Percentile value must be in range [90, 100]")self._percentile_value=percentile_value# Set the percentile value to the activation quantizersinput_quantizers,_,output_quantizers=self._get_quantizer_list()forquantizerininput_quantizers+output_quantizers:ifquantizer.quant_scheme==QuantScheme.post_training_percentile:quantizer.set_percentile_value(self._percentile_value)def_initialize_quantsim_configurator(self,quant_scheme:Union[QuantScheme,str],rounding_mode:str,default_output_bw:int,default_param_bw:int,default_data_type:QuantizationDataType=QuantizationDataType.int,config_file:str=None)->QuantSimConfigurator:""" Initialize quantsim configurator :param quant_scheme: Quantization Scheme :param rounding_mode: The round scheme to used :param default_output_bw: bitwidth to use for activation tensors :param default_param_bw: bitwidth to use for parameter tensors :param default_data_type: data type to use for the parameter tensors :param config_file: Path to a config file to use to specify rules for placing quant ops in the model :return: QuantSimConfigurator """returnQuantSimConfigurator(self.connected_graph,quant_scheme,rounding_mode,default_output_bw,default_param_bw,default_data_type,config_file)def_add_quantization_wrappers(self,quant_scheme,rounding_mode,default_output_bw,default_param_bw,default_data_type):""" Add quantization wrappers to the model and return a new model with the wrappers inserted. :param quant_scheme: Quantization scheme to use :param rounding_mode: Rounding mode to use :param default_output_bw: Default bitwidth for activation quantizers :param default_param_bw: Default bitwidth for param quantizers :param default_data_type: data type to use for param quantizers """defwrap_layer(layer)->tf.keras.layers.Layer:""" Function to wrap layers with QcQuantizeWrappers, used by keras clone_model() :param layer: Layer to wrap :return: Wrapped layer, or original layer if layer is not to be wrapped """ifisinstance(layer,tuple(substitutable_modules.keys())):new_class=substitutable_modules[type(layer)]config=layer.get_config()config["copy_source_weights"]=layer.get_weights()ifisinstance(layer,tf.keras.layers.LSTM):ifisinstance(self._model_without_wrappers,tf.keras.Sequential):config["is_sequential_model"]=True# pylint: disable=protected-accessifself._quantsim_configurator._layer_to_config_dict[layer]["is_input_quantized"]["setting"]:config["is_input_quantized"]=Trueconfig["quant_scheme"]=quant_schemeconfig["rounding_mode"]=rounding_modeconfig["default_output_bw"]=default_output_bwconfig["default_param_bw"]=default_param_bwconfig["default_data_type"]=default_data_typewrapped_layer=new_class.from_config(config)self._substituted_layer[layer]=wrapped_layerreturnwrapped_layerifisinstance(layer,tf.keras.Sequential):returntf.keras.models.clone_model(layer,clone_function=wrap_layer)ifisinstance(layer,unquantizable_modules)orlayer.submodules:returnlayeractivation_quant_settings=QuantizerSettings(default_output_bw,default_data_type,rounding_mode,quant_scheme,False,False,False)param_quant_settings=QuantizerSettings(default_param_bw,default_data_type,rounding_mode,quant_scheme,False,False,False)input_quantizers,output_quantizers,param_quantizers=self._get_quantizers_by_layer(layer)wrapper=QcQuantizeWrapper(layer,activation_quant_settings,param_quant_settings,num_inputs=len(layer.inbound_nodes[0].keras_inputs),input_quantizers=input_quantizers,output_quantizers=output_quantizers,param_quantizers=param_quantizers,per_channel_quantization_enabled=self.per_channel_quantization_enabled)self._layer_name_to_quant_wrapper[layer.name]=wrapperreturnwrapperreturntf.keras.models.clone_model(self._model_without_wrappers,clone_function=wrap_layer)def_get_quantizers_by_layer(self,layer:tf.keras.layers.Layer)->Tuple[Optional[ActivationTensorQuantizer],Optional[ActivationTensorQuantizer],Union[ParamPerTensorQuantizer,ParamPerChannelQuantizer]]:""" Get input/output/param quantizers from quantizers dictionary or initialize quantizers if layer is not found :param layer: Target layer :return: tuple of input, output, param quantizers """quantizers_dict=self._quantsim_configurator.get_quantizers_dict(layer)ifquantizers_dictisNone:_logger.warning("%s not found in quantizers dict, will generate quantizers automatically",layer.name)input_quantizers=Noneoutput_quantizers=Noneparam_quantizers=Noneelse:input_quantizers=quantizers_dict.get(INPUT_QUANTIZERS)output_quantizers=quantizers_dict.get(OUTPUT_QUANTIZERS)param_quantizers=quantizers_dict.get(PARAM_QUANTIZERS)returninput_quantizers,output_quantizers,param_quantizers@staticmethoddef_quantizer_to_name_tuple(quantizers:List[TensorQuantizer])->Tuple[Optional[List[str]]]:""" Converts a list of quantizers to a tuple of quantizer names :param quantizers: quantizers :return: tuple of quantizer names """quant_list=[]ifnotquantizers:returnNoneforquantizerinquantizers:quant_list.append(quantizer.name)returntuple(quant_list)defget_quantizer_name_by_layer(self,layer:tf.keras.layers.Layer)->Tuple[Optional[List[str]],Optional[List[str]],Optional[List[str]]]:""" Get the names of input, output and param quantizers :param layer: the keras layer :return: Tuple of quantizer names """input_quantizers,output_quantizers,param_quantizers=self._get_quantizers_by_layer(layer)output_quantizers_names=self._quantizer_to_name_tuple(output_quantizers)input_quantizers_names=self._quantizer_to_name_tuple(input_quantizers)parameter_quantizers_names=self._quantizer_to_name_tuple(param_quantizers)returninput_quantizers_names,output_quantizers_names,parameter_quantizers_namesdef_disable_quantizers_in_folded_batchnorm(self):""" Disable input/output/param quantizers if layer is folded batch normalization """forquantsim_wrapperinself._layer_name_to_quant_wrapper.values():ifGraphSearchUtils.is_folded_batch_normalization(quantsim_wrapper.original_layer):forqinquantsim_wrapper.input_quantizers:q.disable()forqinquantsim_wrapper.output_quantizers:q.disable()forqinquantsim_wrapper.param_quantizers:q.disable()@staticmethoddef_get_encoding_dict_for_quantizer(quantizer:TensorQuantizer)->Union[List[Dict[str,Union[str,int,float]]],Dict[str,Union[str,int,float]]]:""" Get encoding dict for a tensor quantizer. :param quantizer: Quantizer to get encoding info from :return: Dictionary or List of dictionaries containing encodings info for the tensor quantizer """ifnotisinstance(quantizer,ParamPerChannelQuantizer)orquantizer.data_type==QuantizationDataType.float:quantizer_encodings=[quantizer.encoding]else:quantizer_encodings=quantizer.encodingreturn[{'min':encoding.min,'max':encoding.max,'scale':encoding.delta,'offset':int(encoding.offset),'bitwidth':encoding.bw,'is_symmetric':str(quantizer.is_symmetric),'dtype':'int'}ifquantizer.data_type==QuantizationDataType.intelse{'dtype':'float','bitwidth':int(quantizer.bitwidth)}forencodinginquantizer_encodings]defget_encodings_dict(self)->Dict[str,Union[str,Dict]]:""" Get encodings dict containing all activation and parameter encodings info in the model :return: Dictionary containing all activation and parameter encodings info in the model """# pylint: disable=protected-access, too-many-branchesmodel_input_tensor_names=[inp.nameforinpinself.model.inputs]activation_encodings={}param_encodings={}forwrapperinself.quant_wrappers():foridx,input_quantizerinenumerate(wrapper.input_quantizers):ifinput_quantizer.is_encoding_valid()orinput_quantizer.data_type==QuantizationDataType.float:# because dense layers in quantizable MHA are not explicitly sublayers, they don't have their# inbound_nodes parameter populated, so the name of the quantizer is used insteadifnotwrapper._layer_to_wrap.inbound_nodes:tensor_name=wrapper.name+"/"+input_quantizer.name+":0"else:tensor_name=wrapper._layer_to_wrap.inbound_nodes[0].keras_inputs[idx].nameencoding_dict=self._get_encoding_dict_for_quantizer(input_quantizer)iftensor_nameinmodel_input_tensor_names:tensor_name+=":0"activation_encodings[tensor_name]=encoding_dictforidx,param_quantizerinenumerate(wrapper.param_quantizers):ifparam_quantizer.is_encoding_valid()orparam_quantizer.data_type==QuantizationDataType.float:param_name=wrapper._layer_to_wrap.weights[idx].nameencoding_dict=self._get_encoding_dict_for_quantizer(param_quantizer)param_encodings[param_name]=encoding_dictforidx,output_quantizerinenumerate(wrapper.output_quantizers):ifoutput_quantizer.is_encoding_valid()oroutput_quantizer.data_type==QuantizationDataType.float:# because dense layers in quantizable MHA are not explicitly sublayers, they don't have their# inbound_nodes parameter populated, so the name of the quantizer is used insteadifnotwrapper._layer_to_wrap.inbound_nodes:tensor_name=wrapper.name+":0"elifisinstance(wrapper._layer_to_wrap.output,List):tensor_name=wrapper._layer_to_wrap.output[idx].nameelse:tensor_name=wrapper._layer_to_wrap.output.nameencoding_dict=self._get_encoding_dict_for_quantizer(output_quantizer)activation_encodings[tensor_name]=encoding_dictreturn{'version':quantsim.encoding_version,'activation_encodings':activation_encodings,'param_encodings':param_encodings,'quantizer_args':self.quant_argsifhasattr(self,"quant_args")else{}}
[docs]defcompute_encodings(self,forward_pass_callback,forward_pass_callback_args):""" Computes encodings for all quantization sim nodes in the model. :param forward_pass_callback: A callback function that is expected to run forward passes on a model. This callback function should use representative data for the forward pass, so the calculated encodings work for all data samples. :param forward_pass_callback_args: These argument(s) are passed to the forward_pass_callback as-is. Up to the user to determine the type of this parameter. E.g. could be simply an integer representing the number of data samples to use. Or could be a tuple of parameters or an object representing something more complex. """ops_with_invalid_encodings=[]self._compute_and_set_parameter_encodings(ops_with_invalid_encodings)self._set_op_mode_parameters(libpymo.TensorQuantizerOpMode.quantizeDequantize)forward_pass_callback(self.model,forward_pass_callback_args)forquant_wrapperinself.quant_wrappers():quant_wrapper.compute_encoding(ops_with_invalid_encodings)op_mode=self._param_op_mode_after_analysis(self.quant_scheme)self._set_op_mode_parameters(op_mode)ifops_with_invalid_encodings:_logger.info('The following quantizers did not have valid encodings and have been set to passThrough mode: ''%s',ops_with_invalid_encodings)_logger.info('This can be due to the quantizers not having been evaluated during the forward pass in ''compute encodings. Evaluation is required to collect statistics needed to compute valid ''encodings.\n''As a result, the quantizers have been set to passThrough mode, meaning no quantization noise ''will be simulated for these ops if they are evaluated in the future.\n''If this is not desired, amend the forward pass to evaluate tensors which require these ops ''to be evaluated, and recompute encodings.')
def_set_op_mode_parameters(self,op_mode:libpymo.TensorQuantizerOpMode):""" Sets quant mode for parameters and if the encodings are invalid, then adds those wrappers to wrappers_with_invalid_encodings :param op_mode: Quant mode to set to """forquantizer_infoinself.quant_wrappers():forparam_quantizerinquantizer_info.param_quantizers:ifparam_quantizer.is_enabled():param_quantizer.quant_mode=op_mode@staticmethod@contextlib.contextmanagerdef_set_encoding_version_to_0_6_1():assertquantsim.encoding_versionin{'0.6.1','1.0.0'}ifquantsim.encoding_version=='1.0.0':_logger.info('Exporting to encoding version 1.0.0 is not yet supported. Exporting using version 0.6.1 ''instead.')old_encoding_version=quantsim.encoding_versionquantsim.encoding_version='0.6.1'yieldquantsim.encoding_version=old_encoding_version
[docs]defexport(self,path,filename_prefix,custom_objects=None,convert_to_pb=True):""" This method exports out the quant-sim model so it is ready to be run on-target. Specifically, the following are saved 1. The sim-model is exported to a regular Keras model without any simulation ops 2. The quantization encodings are exported to a separate JSON-formatted file that can then be imported by the on-target runtime (if desired) :param path: path where to store model pth and encodings :param filename_prefix: Prefix to use for filenames of the model pth and encodings files :param custom_objects: If there are custom objects to load, Keras needs a dict of them to map them """withself._set_encoding_version_to_0_6_1():model_path=os.path.join(path,filename_prefix)#TF Version 2.4 has bug i.e. save() in tf format don't work for unrolled LSTM.forlayerinself._model_without_wrappers.layers:ifisinstance(layer,tf.keras.layers.LSTM):breakelse:self._model_without_wrappers.save(model_path)self._model_without_wrappers.save(model_path+'.h5',save_format='h5')# Conversion of saved h5 model to pb model for consumption by SNPE/QNNtry:ifconvert_to_pb:convert_h5_model_to_pb_model(f'{model_path}.h5',custom_objects=custom_objects)exceptValueError:_logger.error("Could not convert h5 to frozen pb. ""Please call export() again with custom_objects defined.")raisefinally:encodings_dict=self.get_encodings_dict()encoding_file_path=os.path.join(path,filename_prefix+'.encodings')save_json_yaml(encoding_file_path,encodings_dict)
def_compute_and_set_parameter_encodings(self,ops_with_invalid_encodings:List):# pylint: disable=too-many-nested-blocksforquantizer_wrapperinself.quant_wrappers():foridx,param_quantizerinenumerate(quantizer_wrapper.param_quantizers):ifparam_quantizer.is_enabled()andparam_quantizer.data_type==QuantizationDataType.int:# 0th input to our quant wrapper is the tensor being quantizedweight_tensor=quantizer_wrapper.original_layer.get_weights()[idx]# Per-channelifisinstance(param_quantizer,StaticGridPerChannelQuantizer):forindex,tensor_quantizerinenumerate(param_quantizer.tensor_quantizer):ifparam_quantizer.axis_handling==AxisHandling.LAST_TWO_AXES.value:last_two_axes_combined_shape=list(weight_tensor.shape[:-2])+[-1]channel_slice=weight_tensor.reshape(*last_two_axes_combined_shape)channel_slice=channel_slice.take(index,channel_slice.ndim-1)elifisinstance(quantizer_wrapper.original_layer,tf.keras.layers.Conv2DTranspose):ifweight_tensor.ndim==4:channel_slice=weight_tensor.take(index,weight_tensor.ndim-2)else:# For bias in Transpose layerschannel_slice=weight_tensor.take(index,weight_tensor.ndim-1)else:channel_slice=weight_tensor.take(index,weight_tensor.ndim-1)tensor_quantizer.updateStats(channel_slice,False)# Per-tensorelse:tensor_quantizer=param_quantizer.tensor_quantizertensor_quantizer.updateStats(weight_tensor,False)param_quantizer.compute_encoding(ops_with_invalid_encodings)defset_and_freeze_param_encodings(self,encoding_path:str):""" Set and freeze parameter encodings from encodings JSON file :param encoding_path: path from where to load parameter encodings file """# Load parameter encodings filewithopen(encoding_path)asjson_file:param_encodings=json.load(json_file)forquant_wrapperinself.quant_wrappers():quant_wrapper.set_and_freeze_param_encoding(param_encodings)# pylint: disable=too-many-nested-blocks
[docs]defload_encodings_to_sim(self,encoding_file_path:str):""" Loads the saved encodings to quant sim model :param encoding_file_path: path from where to load encodings file :return: """# pylint: disable=protected-access, too-many-branches, too-many-locals, too-many-statements# Load encodings filewithopen(encoding_file_path)asjson_file:encodings=json.load(json_file)param_encodings=encodings['param_encodings']activation_encodings=encodings['activation_encodings']model_input_tensor_names=[inp.nameforinpinself.model.inputs]forwrapperinself.quant_wrappers():foridx,input_quantizerinenumerate(wrapper.input_quantizers):# because dense layers in quantizable MHA and RNN are not explicitly sublayers, they don't have their# inbound_nodes parameter populated, so the name of the quantizer is used insteadifnotwrapper._layer_to_wrap.inbound_nodes:tensor_name=wrapper.name+"/"+input_quantizer.name+":0"else:tensor_name=wrapper._layer_to_wrap.inbound_nodes[0].keras_inputs[idx].nameiftensor_nameinmodel_input_tensor_names:tensor_name+=":0"iftensor_nameinactivation_encodings:ifnotinput_quantizer.is_enabled():_logger.info("Not loading encodings for quantizer: %s as it is disabled",tensor_name)continueencoding_dict=activation_encodings[tensor_name][0]ifencoding_dict['dtype']=='int':encoding,is_symmetric=keras_common_utils.create_encoding_from_dict(encoding_dict)input_quantizer.tensor_quantizer.isEncodingValid=Trueinput_quantizer.set_quantizer_encodings(encoding.bw,is_symmetric,encoding,libpymo.TensorQuantizerOpMode.quantizeDequantize)_logger.info("Setting encodings for : %s",tensor_name)elifencoding_dict['dtype']=='float':input_quantizer.data_type=QuantizationDataType.floatinput_quantizer.bitwidth=encoding_dict['bitwidth']_logger.info("Setting quantizer dtype to float for : %s",tensor_name)else:raiseRuntimeError("Unrecognized dtype %s for: %s"%(encoding_dict['dtype'],tensor_name))else:ifinput_quantizer.is_enabled():input_quantizer.disable()_logger.info("Encoding for quantizer: %s is not present thus disabling it.",tensor_name)foridx,param_quantizerinenumerate(wrapper.param_quantizers):param_name=wrapper._layer_to_wrap.weights[idx].nameifparam_nameinparam_encodings:ifnotparam_quantizer.is_enabled():_logger.info("Not loading encodings for parameter: %s as quantizer is disabled",param_name)continueifisinstance(param_quantizer,StaticGridPerChannelQuantizer):ifparam_encodings[param_name][0]['dtype']=='float':wrapper.param_quantizers[idx]=ParamPerTensorQuantizer(layer=param_quantizer._original_layer,name=param_quantizer.name,quant_scheme=param_quantizer._quant_scheme,round_mode='nearest',bitwidth=param_encodings[param_name][0]['bitwidth'],data_type=QuantizationDataType.float,is_symmetric=param_quantizer.is_symmetric,use_strict_symmetric=param_quantizer.use_strict_symmetric,use_unsigned_symmetric=param_quantizer.use_unsigned_symmetric,enabled=False)else:encoding,is_symmetric=keras_common_utils.create_encoding_from_dict(param_encodings[param_name])fortensor_quantizerinparam_quantizer.tensor_quantizer:tensor_quantizer.isEncodingValid=Truebw=encoding[0].bwparam_quantizer.set_quantizer_encodings(bw,is_symmetric,encoding,libpymo.TensorQuantizerOpMode.oneShotQuantizeDequantize)_logger.info("Setting encodings for : %s",param_name)else:encoding_dict=param_encodings[param_name][0]ifencoding_dict['dtype']=='int':encoding,is_symmetric=keras_common_utils.create_encoding_from_dict(encoding_dict)param_quantizer.tensor_quantizer.isEncodingValid=Truebw=encoding.bwparam_quantizer.set_quantizer_encodings(bw,is_symmetric,encoding,libpymo.TensorQuantizerOpMode.oneShotQuantizeDequantize)_logger.info("Setting encodings for : %s",param_name)elifencoding_dict['dtype']=='float':param_quantizer.data_type=QuantizationDataType.floatparam_quantizer.bitwidth=encoding_dict['bitwidth']_logger.info("Setting quantizer to float for : %s",param_name)else:raiseRuntimeError("Unrecognized dtype %s for: %s"%(encoding_dict['dtype'],tensor_name))else:ifparam_quantizer.is_enabled():param_quantizer.disable()_logger.info("Encoding for parameter: %s not present thus disabling this quantizer.",param_name)# Loading encodings means that compute encodings was called. Therefore, these two lines set the correct# op mode for the correct quant scheme and if the quantization was per channel or not.op_mode=self._param_op_mode_after_analysis(self.quant_scheme)self._set_op_mode_parameters(op_mode)foridx,output_quantizerinenumerate(wrapper.output_quantizers):# because dense layers in quantizable MHA are not explicitly sublayers, they don't have their# inbound_nodes parameter populated, so the name of the quantizer is used insteadifnotwrapper._layer_to_wrap.inbound_nodes:tensor_names=[wrapper.name+":0"]else:# There can be multiple outputs if there is a# `tf.split` in the model.ifisinstance(wrapper._layer_to_wrap.output,list):tensor_names=[output.nameforoutputinwrapper._layer_to_wrap.output]else:tensor_names=[wrapper._layer_to_wrap.output.name]fortensor_nameintensor_names:iftensor_nameinactivation_encodings:ifnotoutput_quantizer.is_enabled():_logger.info("Not loading encodings for quantizer: %s as it is disabled",tensor_name)continueencoding_dict=activation_encodings[tensor_name][0]ifencoding_dict['dtype']=='int':encoding,is_symmetric=keras_common_utils.create_encoding_from_dict(encoding_dict)output_quantizer.tensor_quantizer.isEncodingValid=Trueoutput_quantizer.set_quantizer_encodings(encoding.bw,is_symmetric,encoding,libpymo.TensorQuantizerOpMode.quantizeDequantize)_logger.info("Setting encodings for : %s",tensor_name)elifencoding_dict['dtype']=='float':output_quantizer.data_type=QuantizationDataType.floatoutput_quantizer.bitwidth=encoding_dict['bitwidth']_logger.info("Setting quantizer dtype to float for : %s",tensor_name)else:raiseRuntimeError("Unrecognized dtype %s for: %s"%(encoding_dict['dtype'],tensor_name))else:ifoutput_quantizer.is_enabled():output_quantizer.disable()_logger.info("Encoding for quantizer: %s is not present thus disabling it.",tensor_name)
def_param_op_mode_after_analysis(self,quant_scheme)->libpymo.TensorQuantizerOpMode:""" Returns quant mode to use for parameters after encodings have been computed :param quant_scheme: Quantization scheme to use :return: Quant mode to use """ifquant_schemein[QuantScheme.training_range_learning_with_tf_init,QuantScheme.training_range_learning_with_tf_enhanced_init] \
orself.per_channel_quantization_enabled:returnlibpymo.TensorQuantizerOpMode.quantizeDequantizereturnlibpymo.TensorQuantizerOpMode.oneShotQuantizeDequantizedefquant_wrappers(self):""" Generator for yielding all quantization wrappers """forlayerinself.model.layers:ifisinstance(layer,QcQuantizeWrapper):yieldlayerifisinstance(layer,tuple(substitutable_modules.values())):yield fromlayer.quant_wrappers()# For Getting Quantizers from Sequantial Blockifisinstance(layer,tf.keras.Sequential):yield fromquant_wrappers_for_sequential_block(layer)defget_quant_wrapper_for_layer_name(self,layer_name:str)->QcQuantizeWrapper:""" Return qc quant wrapper corresponding to a layer name :param layer_name: Layer name to get quantize wrapper for :return: Qc quant wrapper corresponding to a layer name """returnself._layer_name_to_quant_wrapper.get(layer_name)# pylint: disable=too-many-localsdef_fill_missing_encoding_min_max_gradients(self,gradients:list):""" Computes the encoding min/max gradients and populates the gradients list :param gradients: gradients computed using GradientTape(gradients for encoding min/max will be `None`) """def_find_weight_in_layer(weight_name:str,model_layer:tf.keras.layers.Layer):forweightinmodel_layer.weights:ifweight.name.split(":")[0]==weight_name:returnweightreturnNone# Mapping used to get the gradients of weights(kernel, bias etc)weight_name_to_gradient=dict(zip([weight.name.split(":")[0]forweightinself.model.trainable_weights],gradients))# Mapping used to get index of encoding min/max gradients (which would be `None`) and fill themweight_name_to_index=dict(zip([weight.nameforweightinself.model.trainable_weights],range(len(self.model.trainable_weights))))# Only process layers where 'param_quantizers' is defined (i.e. QcQuantizeWrapper layers)forlayerinfilter(lambda_layer:hasattr(_layer,'param_quantizers'),self.model.layers):forparam_quantizerinlayer.param_quantizers:ifparam_quantizer.nameinweight_name_to_gradient:# Value of weight associated with this param quantizerweight_tensor=_find_weight_in_layer(param_quantizer.name,layer.original_layer)# Gradients of the weightsgrad=weight_name_to_gradient[param_quantizer.name]# Using the weights and it's gradients, compute gradients for encoding min/maxdloss_by_dmin,dloss_by_dmax=param_quantizer.get_gradients_for_encoding_min_max(weight_tensor,grad)enc_min_index=weight_name_to_index[param_quantizer.encoding_min.name]enc_max_index=weight_name_to_index[param_quantizer.encoding_max.name]gradients[enc_min_index]=dloss_by_dmingradients[enc_max_index]=dloss_by_dmax# TODO: Remove this logic once this has been resolved in QNN/SNPE# Go through activation quantizers (Input/Output) and set any ReLU's encoding min to 0relu_quantize_wrappers=[_layerfor_layerinself.model.layersifisinstance(_layer,QcQuantizeWrapper)andisinstance(_layer.original_layer,tf.keras.layers.ReLU)]def_set_encoding_min_grad_to_None(quantizer):enc_min_index=weight_name_to_index[quantizer.encoding_min.name]gradients[enc_min_index]=Noneforrelu_quantizerinrelu_quantize_wrappers:foroutput_quantizerinrelu_quantizer.output_quantizers:_set_encoding_min_grad_to_None(output_quantizer)# pylint: disable=useless-super-delegationdefget_config(self):returnsuper().get_config()defcall(self,inputs,training=None,mask=None):returnself.model.call(inputs,training,mask)deftrain_step(self,data):""" Custom training loop, equivalent to overriding `keras.Model.fit` function Reference: https://keras.io/guides/customizing_what_happens_in_fit/ Only relevant when using range-learning, otherwise equivalent to `keras.Model.fit` Param quantizers are disconnected in the op graph of the wrapped model Because of this, the gradients are not computed for encoding min/max(when range learning is enabled) This custom train_step function computes the missing gradients for encoding min/max of param quantizers """x,y=datawithtf.GradientTape()astape:predictions=self(x,training=True)loss=self.compiled_loss(y,predictions)gradients=tape.gradient(loss,self.model.trainable_weights)# Manually compute missing gradients for encoding min/max when using range learningifself.quant_schemein[QuantScheme.training_range_learning_with_tf_init,QuantScheme.training_range_learning_with_tf_enhanced_init]:self._fill_missing_encoding_min_max_gradients(gradients)gradients_to_apply=[(gradient,weight)forgradient,weightinzip(gradients,self.model.trainable_weights)ifgradientisnotNone]self.optimizer.apply_gradients(gradients_to_apply)self.compiled_metrics.update_state(y,predictions)return{m.name:m.result()forminself.metrics}
defquant_wrappers_for_sequential_block(seq_block:tf.keras.Sequential):""" Generator for yielding all quantization wrappers for a Sequantial Block """forlayerinseq_block.layers:ifisinstance(layer,QcQuantizeWrapper):yieldlayerifisinstance(layer,tuple(substitutable_modules.values())):yield fromlayer.quant_wrappers()# in cases of nested Sequential Blockifisinstance(layer,tf.keras.Sequential):yield fromquant_wrappers_for_sequential_block(layer)