# -*- mode: python -*-
# =============================================================================
# @@-COPYRIGHT-START-@@
#
# Copyright (c) 2023-2024, Qualcomm Innovation Center, Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# SPDX-License-Identifier: BSD-3-Clause
#
# @@-COPYRIGHT-END-@@
# =============================================================================
# pylint: disable=redefined-builtin
""" Affine quantizers """
import abc
from itertools import chain, repeat
from typing import Optional, List, Dict, Tuple, overload
import contextlib
import functools
import torch
from torch import nn
from aimet_torch.v2.utils import patch_attr, _is_expandable, StatisticsNotFoundError, docstring
from aimet_torch.v2.quantization.encoding_analyzer import EncodingAnalyzer, MinMaxEncodingAnalyzer, _flag_extreme_min_max
from aimet_torch.v2.quantization.affine import AffineEncoding, GroupedBlockEncoding
from aimet_torch.v2.quantization.tensor import QuantizedTensor, DequantizedTensor
from aimet_torch.v2.quantization.base import QuantizerBase
from aimet_torch.v2.quantization.affine.backends import quantize, quantize_dequantize, torch_builtins, _derive_qmin_qmax
from aimet_torch.v2.utils import ste_round
from aimet_torch.v2.deepspeed_utils import SafeGatheredParameters
from ._utils import _GridMixin, _register_signature # pylint: disable=import-error
# Names exported by this module when it is star-imported.
__all__ = ['AffineQuantizerBase', 'MinMaxQuantizer', 'Quantize', 'QuantizeDequantize',
           'GroupedBlockQuantizeDequantize']
class AffineQuantizerBase(QuantizerBase, _GridMixin):
    """
    Base class for linear quantization modules.

    The constructor accepts either ``(shape, bitwidth, symmetric, ...)`` or
    ``(shape, qmin, qmax, symmetric, ...)``.

    Args:
        shape (tuple): Shape of the quantization parameters
        bitwidth (int): Quantization bitwidth
        symmetric (bool): If True, performs symmetric quantization;
                          otherwise, performs asymmetric quantization
        encoding_analyzer (EncodingAnalyzer, optional): Encoding analyzer for calibrating quantization encodings
                                                        (default: absolute min-max encoding analyzer)
        block_size (Tuple[int, ...], optional): Block size
    """
    _init_signatures = []

    @overload
    @_register_signature(_init_signatures)
    def __init__(self, shape, qmin: int, qmax: int, symmetric: bool, encoding_analyzer: EncodingAnalyzer = None,
                 block_size: Optional[Tuple[int, ...]] = None):
        ...

    @overload
    @_register_signature(_init_signatures)
    def __init__(self, shape, bitwidth: int, symmetric: bool, encoding_analyzer: EncodingAnalyzer = None,
                 block_size: Optional[Tuple[int, ...]] = None):
        ...

    def __init__(self, shape, *args, **kwargs):
        super().__init__()
        if isinstance(shape, int):
            shape = (shape,)
        self.shape = tuple(shape)
        full_args = (shape, *args)

        # Pad positional args with None's such that len(args) == 5
        args = tuple(chain(args, repeat(None, 5 - len(args))))
        arg0 = kwargs.pop('qmin', kwargs.pop('bitwidth', args[0]))
        arg1 = kwargs.pop('qmax', args[1])

        # The two signatures are told apart by the second argument:
        # a non-bool value can only be ``qmax``; a bool (or nothing) is ``symmetric``.
        if arg1 is not None and not isinstance(arg1, bool):
            # (arg0, arg1, arg2) == (qmin, qmax, symmetric)
            qmin, qmax = arg0, arg1
            symmetric = kwargs.pop('symmetric', args[2])

            if (qmin is None) or (qmax is None) or (symmetric is None):
                raise self._arg_parsing_error(full_args, kwargs)

            encoding_analyzer = kwargs.pop('encoding_analyzer', args[3])
            block_size = kwargs.pop('block_size', args[4])
        else:
            # (arg0, arg1) == (bitwidth, symmetric)
            bitwidth = arg0
            symmetric = kwargs.pop('symmetric', args[1])

            if (bitwidth is None) or (symmetric is None):
                raise self._arg_parsing_error(full_args, kwargs)

            # We support two quantization modes: (unsigned) asymmetric and signed-symmetric
            qmin, qmax = _derive_qmin_qmax(bitwidth=bitwidth, signed=symmetric)
            encoding_analyzer = kwargs.pop('encoding_analyzer', args[2])
            block_size = kwargs.pop('block_size', args[3])

        assert qmin is not None
        assert qmax is not None

        # Every recognized keyword has been popped by now; anything left is unknown.
        if kwargs:
            cls = type(self).__qualname__
            unexpected_keys = ', '.join(kwargs.keys())
            raise TypeError(f"{cls}.__init__ got unexpected keyword argument: {unexpected_keys}")

        if qmin >= qmax:
            raise ValueError(f"qmax should be strictly larger than qmin. Got qmax={qmax}, qmin={qmin}")

        self.qmin = qmin
        self.qmax = qmax
        self._symmetric = symmetric
        self.block_size = block_size

        self.encoding_analyzer = encoding_analyzer or \
                                 MinMaxEncodingAnalyzer(torch_builtins.get_encoding_shape_with_blocks(self.shape,
                                                                                                      self.block_size))

        # The observer-shape compatibility check is only applied to non-blockwise quantizers.
        if self.block_size is None and not _is_expandable(self.encoding_analyzer.observer.shape, self.shape):
            raise RuntimeError(f'Encoding analyzer of shape {self.encoding_analyzer.observer.shape} '
                               f'is incompatible with quantizer of shape {self.shape}.')

    @abc.abstractmethod
    def get_min(self, dtype=None) -> Optional[torch.Tensor]:
        """
        Compute quantization min to be used for forward pass.
        Return None if the quantizer is not initialized yet.

        Args:
            dtype (torch.dtype): dtype of the computed min

        Returns:
            Quantization min
        """

    @abc.abstractmethod
    def get_max(self, dtype=None) -> Optional[torch.Tensor]:
        """
        Compute quantization max to be used for forward pass.
        Return None if the quantizer is not initialized yet.

        Args:
            dtype (torch.dtype): dtype of the computed max

        Returns:
            Quantization max
        """

    @abc.abstractmethod
    def get_scale(self, dtype=None) -> Optional[torch.Tensor]:
        """
        Compute quantization scale to be used for forward pass.
        Return None if the quantizer is not initialized yet.

        Args:
            dtype (torch.dtype): dtype of the computed scale

        Returns:
            Quantization scale
        """

    @abc.abstractmethod
    def get_offset(self, dtype=None) -> Optional[torch.Tensor]:
        """
        Compute quantization offset to be used for forward pass.
        Return None if the quantizer is not initialized yet.

        Args:
            dtype (torch.dtype): dtype of the computed offset

        Returns:
            Quantization offset
        """

    @abc.abstractmethod
    def set_range(self, min: torch.Tensor, max: torch.Tensor):
        """
        Set quantization parameters to the given min-max range
        """

    def get_encodings(self) -> Optional[AffineEncoding]:
        """
        Return the quantizer's encodings as an AffineEncoding object,
        or None if the quantizer is not initialized yet.
        """
        if self.is_initialized():
            return AffineEncoding(self.get_scale(dtype=torch.float32),
                                  self.get_offset(dtype=torch.float32),
                                  self.qmin, self.qmax, self._symmetric, self.block_size)
        return None

    @torch.no_grad()
    def get_legacy_encodings(self) -> Optional[List[Dict]]:
        """
        Returns a list of encodings, each represented as a List of Dicts.
        Returns None if the quantizer is not initialized yet.
        """
        # pylint: disable=redefined-builtin, protected-access

        if not self.is_initialized():
            return None

        return self.get_encodings()._to_legacy_format()

    @torch.no_grad()
    def set_legacy_encodings(self, encodings: List[Dict]):
        """
        Set encodings represented in the same format as the output of get_legacy_encodings as below:

        [
            {'min': float, 'max': float, 'scale': float, 'offset': float,
                     'bitwidth': int, 'dtype': str, 'is_symmetric': str},
            {'min': float, 'max': float, 'scale': float, 'offset': float,
                     'bitwidth': int, 'dtype': str, 'is_symmetric': str},
            ...
        ]

        Only the first entry's 'bitwidth' and 'is_symmetric' are consulted; 'min' and 'max'
        are read from every entry and reshaped to the quantizer's shape.

        Raises:
            ValueError: If 'is_symmetric' is neither a bool nor the string 'True'/'False'
                (case-insensitive).
        """
        def str_to_bool(s):
            # Tolerate an actual bool in addition to its string spelling.
            if isinstance(s, bool):
                return s
            lowered = s.lower()
            if lowered == "false":
                return False
            if lowered == "true":
                return True
            raise ValueError(f"Expected 'is_symmetric' to be 'True' or 'False'. Got {s!r}")

        bitwidth = encodings[0]['bitwidth']
        symmetric = str_to_bool(encodings[0]['is_symmetric'])
        # We support two quantization modes: (unsigned) asymmetric and signed-symmetric
        self.qmin, self.qmax = _derive_qmin_qmax(bitwidth=bitwidth, signed=symmetric)
        self.symmetric = symmetric
        # Note: We can only accurately infer signed-ness in the symmetric case, but AIMET uses unsigned for asymmetric
        min_ = torch.tensor([e['min'] for e in encodings]).view(self.shape)
        max_ = torch.tensor([e['max'] for e in encodings]).view(self.shape)
        self.set_range(min_, max_)

    def extra_repr(self) -> str:
        """
        Build the summary string (shape, optional block size, grid and symmetry) for this quantizer.
        """
        extra_repr = f'shape={self.shape}'

        if self.block_size is not None:
            extra_repr += f", block_size={self.block_size}"

        extra_repr += f', qmin={self.qmin}, qmax={self.qmax}, symmetric={self.symmetric}'
        return extra_repr

    @property
    def symmetric(self) -> bool:
        """
        Indicates whether this quantizer uses symmetric quantization
        """
        return self._symmetric

    @symmetric.setter
    def symmetric(self, symmetric: bool):
        """
        Set the quantizer symmetry

        :param symmetric: If True, use symmetric encodings. Else, use asymmetric encodings
        """
        self._symmetric = symmetric

    @property
    @docstring(_GridMixin._get_bitwidth.__doc__)
    def bitwidth(self) -> int: # pylint: disable=missing-function-docstring
        return self._get_bitwidth()

    @bitwidth.setter
    def bitwidth(self, bitwidth: int):
        self._set_bitwidth(bitwidth)

    @property
    @docstring(_GridMixin._get_signed.__doc__)
    def signed(self) -> bool: # pylint: disable=missing-function-docstring
        return self._get_signed()

    @signed.setter
    def signed(self, signed: bool):
        self._set_signed(signed)
class MinMaxQuantizer(AffineQuantizerBase): # pylint: disable=abstract-method
    """
    Affine quantizer with min-max as trainable parameters
    """

    min: torch.nn.Parameter
    max: torch.nn.Parameter

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Placeholder values of -1 / +1; real ranges come from compute_encodings() or set_range().
        self.register_quantization_parameter('min', nn.Parameter(-torch.ones(self.shape)))
        self.register_quantization_parameter('max', nn.Parameter(torch.ones(self.shape)))

    @contextlib.contextmanager
    def compute_encodings(self):
        """
        Observe inputs and update quantization parameters based on the input statistics.
        While ``compute_encodings`` is enabled, the quantizer forward pass performs
        dynamic quantization using the batch statistics.

        If the body of the ``with`` block raises, the exception propagates and the
        quantization parameters are left untouched.
        """
        if not self._allow_overwrite:
            # Overwriting is disabled: behave as a no-op context manager.
            yield
            return

        original_forward = self.forward

        @functools.wraps(original_forward)
        def forward_wrapper(input):
            input = input.as_subclass(torch.Tensor)
            expanded_input = torch_builtins.reshape_tensor_for_blocks(input, self.shape, self.block_size)
            batch_statistics = self.encoding_analyzer.update_stats(expanded_input)
            num_steps = self.qmax - self.qmin
            dynamic_min, dynamic_max =\
                    self.encoding_analyzer.compute_encodings_from_stats(batch_statistics,
                                                                        num_steps,
                                                                        self.symmetric)
            if self.block_size is not None:
                dynamic_min = dynamic_min.view(self.min.shape)
                dynamic_max = dynamic_max.view(self.max.shape)
            dynamic_min = dynamic_min.to(dtype=self.min.dtype,
                                         device=self.min.device).expand_as(self.min)
            dynamic_max = dynamic_max.to(dtype=self.max.dtype,
                                         device=self.max.device).expand_as(self.max)

            # Temporarily swap in the batch-derived range so that this forward pass
            # quantizes with it, then restore the real parameters.
            with patch_attr(self, 'min', dynamic_min),\
                    patch_attr(self, 'max', dynamic_max):
                return original_forward(input)

        self.encoding_analyzer.reset_stats()

        # An exception raised inside the caller's ``with`` body surfaces at this ``yield``
        # and propagates, so the code below only runs on a clean exit.
        with patch_attr(self, 'forward', forward_wrapper):
            yield

        try:
            num_steps = self.qmax - self.qmin
            enc_min, enc_max = self.encoding_analyzer.compute_encodings(num_steps, self.symmetric)

            # Check for missing encodings *before* touching them; otherwise ``.view`` and
            # the extreme-value check would fail on None.
            if enc_min is None or enc_max is None:
                return

            if self.block_size is not None:
                enc_min = enc_min.view(self.min.shape)
                enc_max = enc_max.view(self.max.shape)
            _flag_extreme_min_max(enc_min, enc_max)
        except StatisticsNotFoundError:
            # No statistics were observed; keep the current parameters.
            return

        self.set_range(enc_min, enc_max)

    def get_min(self, dtype=None) -> Optional[torch.Tensor]:
        """
        Compute quantization min to be used for forward pass.

        NOTE: self.min may not be equal to self.get_min().
              self.get_min() returns slightly recalibrated version of self.min.

        :param dtype: dtype of the computed min. Defaults to torch.float32.
        :return: Quantization min, or None if the quantizer is not initialized
        """
        if not self.is_initialized():
            return None
        return self.get_scale(dtype) * (self.get_offset(dtype) + self.qmin)

    def get_max(self, dtype=None) -> Optional[torch.Tensor]:
        """
        Compute quantization max to be used for forward pass.

        NOTE: self.max may not be equal to self.get_max()
              self.get_max() returns slightly recalibrated version of self.max.

        :param dtype: dtype of the computed max. Defaults to torch.float32.
        :return: Quantization max, or None if the quantizer is not initialized
        """
        if not self.is_initialized():
            return None
        return self.get_scale(dtype) * (self.get_offset(dtype) + self.qmax)

    def get_scale(self, dtype=None) -> Optional[torch.Tensor]:
        """
        Compute quantization scale to be used for forward pass.

        :param dtype: dtype of the computed scale. Defaults to torch.float32.
        :return: Quantization scale, or None if the quantizer is not initialized
        """
        if not self.is_initialized():
            return None

        dtype = dtype or torch.float32
        num_steps = self.qmax - self.qmin

        scale = (self.max.to(dtype) - self.min.to(dtype)) / num_steps
        return scale.to(dtype)

    def get_offset(self, dtype=None) -> Optional[torch.Tensor]:
        """
        Compute quantization offset to be used for forward pass.

        :param dtype: dtype of the computed offset. Defaults to torch.float32.
        :return: Quantization offset, or None if the quantizer is not initialized
        """
        if not self.is_initialized():
            return None

        dtype = dtype or torch.float32

        if self.symmetric:
            # Symmetric mode uses a constant offset determined by the integer grid alone.
            offset = torch.full_like(self.min,
                                     fill_value=-round((self.qmin + self.qmax) / 2),
                                     requires_grad=False,
                                     dtype=dtype)
        else:
            offset = ste_round(self.min.to(dtype) / self.get_scale(dtype)) - self.qmin

        return offset.to(dtype)

    def set_range(self, min: torch.Tensor, max: torch.Tensor):
        """
        Set quantization parameters to the given min-max range
        """
        # Copy in place, without recording gradients, into the existing parameters.
        with torch.no_grad(), SafeGatheredParameters(self.parameters(recurse=False), modifier_rank=0):
            self.min.copy_(min)
            self.max.copy_(max)
class Quantize(MinMaxQuantizer):
    r"""Applies quantization to the input.

    Precisely,

    .. math::
        out = clamp\left(\left\lceil\frac{input}{scale}\right\rfloor - offset, qmin, qmax\right)

    where :math:`scale` and :math:`offset` are derived from learnable parameters
    :math:`\theta_{min}` and :math:`\theta_{max}`.

    If block size :math:`B = \begin{pmatrix} B_0 & B_1 & \cdots & B_{D-1} \end{pmatrix}` is specified,
    this equation will be further generalized as

    .. math::
        out_{j_0 \cdots j_{D-1}} & = clamp\left(
            \left\lceil\frac{input_{j_0 \cdots j_{D-1}}}{scale_{i_0 \cdots i_{D-1}}}\right\rfloor
            - offset_{i_0 \cdots i_{D-1}}, qmin, qmax\right)\\

        \text{where} \quad \forall_{0 \leq d < D} \quad i_d = \left\lfloor \frac{j_d}{B_d} \right\rfloor

    Args:
        shape (tuple): Shape of the quantization parameters
        bitwidth (int): Quantization bitwidth
        symmetric (bool): If True, performs symmetric quantization;
                          otherwise, performs asymmetric quantization
        encoding_analyzer (EncodingAnalyzer, optional): Encoding analyzer for calibrating quantization encodings
                                                        (default: absolute min-max encoding analyzer)
        block_size (Tuple[int, ...], optional): Block size

    :ivar Tensor min: :math:`\theta_{min}` from which scale and offset will be derived.
    :ivar Tensor max: :math:`\theta_{max}` from which scale and offset will be derived.

    .. note::
        :class:`Quantize` cannot run :meth:`forward` until :attr:`min` and :attr:`max` are properly initialized,
        which can be done based on input statistics using :meth:`compute_encodings` or
        by manually assigning a new value to :attr:`min` and :attr:`max`.
        See the examples below.

    Examples:

        >>> import aimet_torch.v2.quantization as Q
        >>> input = torch.randn(5, 10)
        >>> q = Q.affine.Quantize(shape=(5, 2), bitwidth=8, symmetric=False, block_size=(1, 5))
        >>> q.is_initialized()
        False
        >>> with q.compute_encodings():
        ...     _ = q(input)
        ...
        >>> q.is_initialized()
        True
        >>> q(input)
        QuantizedTensor([[129.,  64., 255., 122.,   0., 192., 106.,  94., 255.,   0.],
                         [  0., 145., 181., 255., 144., 255., 194.,   0.,  74.,  86.],
                         [122.,   0., 255., 150.,  33., 103., 103.,   0.,  37., 255.],
                         [255., 111., 237., 218.,   0.,  49., 155., 255.,   0., 179.],
                         [  0.,  66., 255.,  89., 110.,  17.,  36.,  83., 255.,   0.]],
                        grad_fn=<AliasBackward0>)


        >>> import aimet_torch.v2.quantization as Q
        >>> input = torch.randn(5, 10)
        >>> q = Q.affine.Quantize(shape=(5, 2), bitwidth=8, symmetric=False, block_size=(1, 5))
        >>> q.is_initialized()
        False
        >>> q.min = torch.nn.Parameter(-torch.ones_like(q.min))
        >>> q.max = torch.nn.Parameter(torch.ones_like(q.max))
        >>> q.is_initialized()
        True
        >>> q(input)
        QuantizedTensor([[187., 186., 131.,   0., 203.,  64.,  80.,   0., 143., 152.],
                         [ 16.,   0., 255.,   0.,   0., 150.,   0., 255.,  32., 255.],
                         [255., 226.,   0., 255.,  55., 172.,   0., 255., 145., 255.],
                         [207., 146., 216., 238.,   0.,   0., 141., 178., 255., 188.],
                         [ 63.,  59.,  19., 162.,  30., 255., 109., 255.,   0., 255.]],
                        grad_fn=<AliasBackward0>)
    """
    # NOTE: Deepspeed has a bug where it will inadvertently patch __init__ method permanently
    #       unless each leaf class explicitly defines its own __init__ separately.
    #       As a temporary workaround, we define __init__ to avoid triggering this bug.
    # pylint: disable=useless-super-delegation
    def __init__(self, shape, *args, **kwargs):
        super().__init__(shape, *args, **kwargs)

    def forward(self, input: torch.Tensor) -> QuantizedTensor:
        """Quantizes the input tensor

        Args:
            input (torch.Tensor): Input to quantize

        Returns:
            Quantized output

        Raises:
            RuntimeError: If the quantization parameters are not initialized
        """
        if not self.is_initialized():
            raise RuntimeError(
                'Failed to run Quantize since quantization parameters are not initialized.'
                ' Please initialize the quantization parameters using `compute_encodings()`.'
            )

        encoding = self.get_encodings()

        # Subclasses of torch.Tensor with custom __torch_function__ (in our case, QuantizedTensorBase)
        # is known to introduce substantial CPU overhead.
        # Cast types of the inputs to plain torch.Tensor for faster execution.
        input = input.as_subclass(torch.Tensor)

        output = quantize(input,
                          encoding.scale,
                          encoding.offset,
                          encoding.qmin,
                          encoding.qmax,
                          block_size=self.block_size)

        # Re-wrap the plain tensor and attach the encoding that produced it.
        output = output.as_subclass(QuantizedTensor)
        output.encoding = encoding
        return output
class QuantizeDequantize(MinMaxQuantizer):
    r"""Applies fake-quantization by quantizing and dequantizing the input.

    Precisely,

    .. math::
        out = (\overline{input} + offset) * scale

    where

    .. math::
        \overline{input} = clamp\left(\left\lceil\frac{input}{scale}\right\rfloor - offset, qmin, qmax\right)

    and :math:`scale` and :math:`offset` are derived from learnable parameters
    :math:`\theta_{min}` and :math:`\theta_{max}`.

    If block size :math:`B = \begin{pmatrix} B_0 & B_1 & \cdots & B_{D-1} \end{pmatrix}` is specified,
    this equation will be further generalized as

    .. math::
        out_{j_0 \cdots j_{D-1}} &= (\overline{input}_{j_0 \cdots j_{D-1}} + offset_{i_0 \cdots i_{D-1}}) * scale_{i_0 \cdots i_{D-1}}\\
        \overline{input}_{j_0 \cdots j_{D-1}} &= clamp\left(
            \left\lceil\frac{input_{j_0 \cdots j_{D-1}}}{scale_{i_0 \cdots i_{D-1}}}\right\rfloor
            - offset_{i_0 \cdots i_{D-1}}, qmin, qmax\right)\\

        \text{where} \quad \forall_{0 \leq d < D} \quad i_d = \left\lfloor \frac{j_d}{B_d} \right\rfloor

    Args:
        shape (tuple): Shape of the quantization parameters
        bitwidth (int): Quantization bitwidth
        symmetric (bool): If True, performs symmetric quantization;
                          otherwise, performs asymmetric quantization
        encoding_analyzer (EncodingAnalyzer, optional): Encoding analyzer for calibrating quantization encodings
                                                        (default: absolute min-max encoding analyzer)
        block_size (Tuple[int, ...], optional): Block size

    :ivar Tensor min: :math:`\theta_{min}` from which scale and offset will be derived.
    :ivar Tensor max: :math:`\theta_{max}` from which scale and offset will be derived.

    .. note::
        :class:`QuantizeDequantize` cannot run :meth:`forward` until :attr:`min` and :attr:`max` are properly initialized,
        which can be done based on input statistics using :meth:`compute_encodings` or
        by manually assigning a new value to :attr:`min` and :attr:`max`.
        See the examples below.

    Examples:

        >>> import aimet_torch.v2.quantization as Q
        >>> input = torch.randn(5, 10)
        >>> qdq = Q.affine.QuantizeDequantize(shape=(5, 2), bitwidth=8, symmetric=False, block_size=(1, 5))
        >>> qdq.is_initialized()
        False
        >>> with qdq.compute_encodings():
        ...     _ = qdq(input)
        ...
        >>> qdq.is_initialized()
        True
        >>> qdq(input)
        DequantizedTensor([[-0.2771,  0.3038,  1.0819,  0.9700,  0.9487, -0.1307,
                            -1.7894, -0.1709, -0.2212,  0.7741],
                           [-1.0295, -1.2265, -1.0295,  1.0564,  0.6177, -1.0386,
                            -0.0176, -2.6054,  1.8836, -0.1232],
                           [-0.8229,  0.5540,  0.3992, -0.2363,  1.2546, -1.0036,
                             0.2355,  0.1741,  1.6079,  0.6247],
                           [-1.0115,  1.2458,  0.9157, -1.4694, -0.0639, -0.2568,
                             0.0680,  1.6695,  0.7932, -0.1889],
                           [ 0.0158,  0.5695,  0.5220,  0.1977, -1.4475, -0.0424,
                            -1.1128, -0.8796, -0.1060,  1.5897]],
                          grad_fn=<AliasBackward0>)


        >>> import aimet_torch.v2.quantization as Q
        >>> input = torch.randn(5, 10)
        >>> qdq = Q.affine.QuantizeDequantize(shape=(5, 2), bitwidth=8, symmetric=False, block_size=(1, 5))
        >>> qdq.is_initialized()
        False
        >>> qdq.min = torch.nn.Parameter(-torch.ones_like(qdq.min))
        >>> qdq.max = torch.nn.Parameter(torch.ones_like(qdq.max))
        >>> qdq.is_initialized()
        True
        >>> qdq(input)
        DequantizedTensor([[-0.6196, -0.9961,  0.0549, -0.6431,  1.0039, -0.8706,
                             1.0039,  0.4706, -0.2353,  0.8078],
                           [ 0.3451, -0.1176, -0.9961, -0.4549, -0.0549, -0.0471,
                            -0.5255, -0.2353,  1.0039, -0.9961],
                           [-0.4157,  0.0784,  0.5333,  0.1647, -0.9961, -0.9961,
                            -0.2118, -0.2196,  0.9176,  0.9490],
                           [ 1.0039, -0.7765,  0.4784, -0.8706,  1.0039,  0.6039,
                            -0.4157, -0.2118, -0.9961,  0.3137],
                           [ 1.0039,  0.3216, -0.2353, -0.7765, -0.9961,  0.8000,
                             1.0039,  0.4157,  0.4392,  0.4863]],
                          grad_fn=<AliasBackward0>)
    """
    # NOTE: Deepspeed has a bug where it will inadvertently patch __init__ method permanently
    #       unless each leaf class explicitly defines its own __init__ separately.
    #       As a temporary workaround, we define __init__ to avoid triggering this bug.
    # pylint: disable=useless-super-delegation
    def __init__(self, shape, *args, **kwargs):
        super().__init__(shape, *args, **kwargs)

    def forward(self, input: torch.Tensor) -> DequantizedTensor:
        """Quantizes and dequantizes the input tensor

        Args:
            input (torch.Tensor): Input to quantize and dequantize

        Returns:
            Quantize-dequantized output

        Raises:
            RuntimeError: If the quantization parameters are not initialized
        """
        if not self.is_initialized():
            raise RuntimeError(
                'Failed to run QuantizeDequantize since quantization parameters are not initialized.'
                ' Please initialize the quantization parameters using `compute_encodings()`.'
            )

        encoding = self.get_encodings()

        # Subclasses of torch.Tensor with custom __torch_function__ (in our case, QuantizedTensorBase)
        # is known to introduce substantial CPU overhead.
        # Cast types of the inputs to plain torch.Tensor for faster execution.
        input = input.as_subclass(torch.Tensor)

        output = quantize_dequantize(input,
                                     encoding.scale,
                                     encoding.offset,
                                     encoding.qmin,
                                     encoding.qmax,
                                     block_size=self.block_size)

        # Re-wrap the plain tensor and attach the encoding that produced it.
        output = output.as_subclass(DequantizedTensor)
        output.encoding = encoding
        return output
class GroupedBlockQuantizeDequantize(QuantizeDequantize): # pylint: disable=too-many-ancestors
    """ Class for performing Grouped Block Quantize Dequantize """
    def __init__(self, shape, bitwidth: int, symmetric: bool, decompressed_bw: int,
                 encoding_analyzer: EncodingAnalyzer = None, block_size: Optional[Tuple[int, ...]] = None,
                 block_grouping: Optional[Tuple[int, ...]] = None):
        """
        Grouped Block Quantize Dequantize constructor.

        :param shape: Shape of the quantization parameters
        :type shape: tuple
        :param bitwidth: Quantization bitwidth
        :type bitwidth: int
        :param symmetric: If True, performs symmetric quantization;
                          otherwise, performs asymmetric quantization
        :type symmetric: bool
        :param decompressed_bw: Bitwidth used for decompression
        :type decompressed_bw: int
        :param encoding_analyzer: Encoding analyzer for calibrating quantization encodings
                                  (default: absolute min-max encoding analyzer)
        :type encoding_analyzer: EncodingAnalyzer, optional
        :param block_size: Block size per dimension.
        :type block_size: Tuple
        :param block_grouping: Block grouping per dimension. If provided, every set of block_group scales will be
                               grouped together, and the maximum scale for all blocks in the group will be used to find
                               the scale in the decompressed_grid to be shared by all blocks in the group.
                               If no block_grouping is provided, default behavior uses a block group of 1 for all dims,
                               equivalent to Blockwise Quantization.
                               A value of -1 for a block group for a dimension is equivalent to grouping all blocks in
                               the dimension in one group. This is also equivalent to a block group value equal to the
                               number of blocks for that dimension.
        :type block_grouping: Tuple
        :raises RuntimeError: If block_grouping does not match the shape, if decompressed_bw is smaller than
                              bitwidth, or if symmetric is False
        """
        super().__init__(shape, bitwidth, symmetric, encoding_analyzer, block_size)
        self.decompressed_bw = decompressed_bw
        self.block_grouping = block_grouping
        if self.block_grouping is None:
            # Default to BQ behavior with 1 for all block grouping dims if not provided
            self.block_grouping = tuple(1 for _ in self.shape)

        if block_grouping is not None:
            # Validate against self.shape (always a tuple) rather than the raw ``shape``
            # argument, which the base class also accepts as a plain int.
            if len(block_grouping) != len(self.shape):
                raise RuntimeError(f'Length of block grouping {block_grouping} must equal length of shape {shape}.')
            for idx, block_group in enumerate(block_grouping):
                if block_group == -1:
                    continue
                # Reject zero / negative groups up front; they would otherwise fail later
                # with a division-by-zero or an invalid view shape.
                if block_group <= 0:
                    raise RuntimeError(f'Block grouping values must be positive or -1. '
                                       f'Got block grouping {block_grouping}.')
                if self.shape[idx] % block_group != 0:
                    raise RuntimeError(f'Quantizer shape dimensions must divide evenly with corresponding block '
                                       f'grouping values for shapes {shape} and block grouping {block_grouping}.')

        if self.decompressed_bw < self.bitwidth:
            raise RuntimeError(f'Decompressed bitwidth {decompressed_bw} cannot be smaller than self.bitwidth '
                               f'{bitwidth}')

        if not symmetric:
            raise RuntimeError('GroupedBlockQuantizeDequantize only supports symmetric quantization.')

    def _get_grouped_scales(self, dtype=None):
        """
        Compute the intermediate tensors shared by get_scale() and get_per_channel_scale().

        :param dtype: dtype forwarded to the parent class's get_scale()
        :return: Tuple of (original scale shape, scale viewed in the expanded shape, per channel scale),
                 or None if the parent class reports no scale (quantizer not initialized)
        """
        orig_scale = super().get_scale(dtype)
        if orig_scale is None:
            return None

        reshaped_scale = orig_scale.view(self.get_expanded_scale_shape())
        # In the expanded shape every odd axis holds the members of one block group;
        # reduce over those axes to get the largest scale within each group.
        group_axes = list(range(1, len(orig_scale.shape) * 2, 2))
        max_scale = torch.amax(reshaped_scale, group_axes, keepdim=True)
        per_channel_scale = max_scale / 2 ** (self.decompressed_bw - self.bitwidth)
        return orig_scale.shape, reshaped_scale, per_channel_scale

    def get_scale(self, dtype=None) -> Optional[torch.Tensor]:
        """
        Compute quantization scale to be used for forward pass.
        Overrides QuantizeDequantize self.get_scale() to apply the grouped block algorithm for calculating modified
        scales.

        :param dtype: dtype of the computed scale, forwarded to the parent class's get_scale()
        :return: Updated scale, or None if the quantizer is not initialized
        """
        grouped = self._get_grouped_scales(dtype)
        if grouped is None:
            return None

        orig_scale_shape, reshaped_scale, per_channel_scale = grouped
        # Snap each block scale onto an integer multiple (1 .. 2^(decompressed_bw - bitwidth))
        # of its group's per channel scale.
        updated_scale = quantize_dequantize(reshaped_scale,
                                            scale=per_channel_scale,
                                            offset=torch.zeros_like(per_channel_scale),
                                            qmin=1,
                                            qmax=2 ** (self.decompressed_bw - self.bitwidth))
        return updated_scale.view(orig_scale_shape)

    def get_expanded_scale_shape(self) -> Tuple[int, ...]:
        """
        Get expanded scale shape which breaks each scale dimension into a pair of dimensions with sizes
        (original_shape / block_grouping, block_grouping).

        :return: Expanded scale shape
        """
        expanded_shape = []
        for idx, block_group in enumerate(self.block_grouping):
            # Block group of -1 is equivalent to grouping all blocks together
            if block_group == -1:
                expanded_shape.append(1)
                expanded_shape.append(self.shape[idx])
            else:
                expanded_shape.append(self.shape[idx] // block_group)
                expanded_shape.append(block_group)
        # Return a tuple so that the result matches the declared return type.
        return tuple(expanded_shape)

    def get_per_channel_scale(self, dtype=None) -> Optional[torch.Tensor]:
        """
        Get per channel scale.

        :param dtype: dtype forwarded to the parent class's get_scale()
        :return: Per channel scale, or None if the quantizer is not initialized
        """
        grouped = self._get_grouped_scales(dtype)
        if grouped is None:
            return None
        return grouped[2]

    def get_per_block_integer_scale(self) -> Optional[torch.Tensor]:
        """
        Get per block integer scale.

        :return: Per block integer scale, or None if the quantizer is not initialized
        """
        per_channel_scale = self.get_per_channel_scale()
        scale = self.get_scale()
        if per_channel_scale is None or scale is None:
            return None

        expanded_scale = scale.view(self.get_expanded_scale_shape())
        integer_scale = torch.round(expanded_scale / per_channel_scale).int().view(scale.shape)
        return integer_scale

    def get_encodings(self) -> Optional[GroupedBlockEncoding]:
        """
        Return the quantizer's encodings as a GroupedBlockEncoding object,
        or None if the quantizer is not initialized yet.
        """
        if self.is_initialized():
            return GroupedBlockEncoding(scale=self.get_scale(dtype=torch.float32),
                                        offset=self.get_offset(dtype=torch.float32),
                                        bitwidth=self.bitwidth,
                                        signed=self.signed,
                                        symmetry=self.symmetric,
                                        block_size=self.block_size,
                                        block_grouping=self.block_grouping,
                                        decompressed_bw=self.decompressed_bw,
                                        per_channel_scale=self.get_per_channel_scale(dtype=torch.float32),
                                        per_block_int_scale=self.get_per_block_integer_scale())
        return None