#pragma once

#include <c10/core/QScheme.h>
#include <c10/core/MemoryFormat.h>
#include <c10/macros/Macros.h>
#include <c10/util/Exception.h>
#include <c10/util/intrusive_ptr.h>
#include <c10/core/ScalarType.h>
#include <c10/core/TensorOptions.h>

#include <ATen/Tensor.h>
#include <ATen/TensorUtils.h>

#include <ATen/core/QuantizerBase.h>

#include <cmath>
#include <memory>
#include <utility>

namespace at {

/**
 * UnknownQuantizer is a placeholder quantizer for functions that implement
 * quantization in a two step process. First a tensor is allocated with an
 * unknown quantizer, and then the quantization kernel decides what the final
 * quantizer will be.
 */
struct TORCH_API UnknownQuantizer : public Quantizer {
  explicit UnknownQuantizer(ScalarType scalar_type)
    : Quantizer(scalar_type) {}

  Tensor quantize(const Tensor& tensor) override;
  Tensor dequantize(const Tensor& qtensor) override;
  Tensor& dequantize_out(Tensor& rtensor, const Tensor& qtensor) override;
  QScheme qscheme() const override;
  bool equalTo(QuantizerPtr other) const override;
};

/**
 * UniformQuantizer is the parent class for all uniform quantizers.
 * These quantization schemes map float values uniformly to
 * quantized values. For example, the affine quantizer is
 * the most commonly used scheme in this category.
 */
struct TORCH_API UniformQuantizer : public Quantizer {
  explicit UniformQuantizer(ScalarType scalar_type) : Quantizer(scalar_type) {}
};

/**
 * NonUniformQuantizer is the parent class for all non-uniform quantizers.
 * These quantization schemes may map float values non-uniformly to quantized
 * values. K-means quantization is a representative example in this category.
 */
struct TORCH_API NonUniformQuantizer : public Quantizer {
  explicit NonUniformQuantizer(ScalarType scalar_type) : Quantizer(scalar_type) {}
};

// There is also StochasticQuantizer which is uniform but not affine

/**
 * AffineQuantizer uses an affine transformation to do quantization.
 *
 * For quantize:
 * Y = clamp(round(X / scale + zero_point), min, max)
 * For dequantize:
 * X = (Y - zero_point) * scale
 */
struct TORCH_API AffineQuantizer : public UniformQuantizer {
  explicit AffineQuantizer(ScalarType scalar_type) : UniformQuantizer(scalar_type) {}
};

// Note that we will not have a Symmetric Quantizer in the backend, to reduce
// complications in the quantized kernel implementation.

/**
 * PerTensorAffineQuantizer stores a scale and a zero_point, which are used for
 * all the values in the Tensor.
 */
struct TORCH_API PerTensorAffineQuantizer : public AffineQuantizer {
  explicit PerTensorAffineQuantizer(ScalarType scalar_type, double scale, int64_t zero_point)
    : AffineQuantizer(scalar_type),
        scale_(scale),
        zero_point_(zero_point) {}

  Tensor quantize(const Tensor& tensor) override;
  Tensor dequantize(const Tensor& qtensor) override;
  Tensor& dequantize_out(Tensor& rtensor, const Tensor& qtensor) override;

  QScheme qscheme() const override {
    return kPerTensorAffine;
  }

  double scale() const {
    return scale_;
  }

  int64_t zero_point() const {
    return zero_point_;
  }

  bool equalTo(QuantizerPtr other) const override {
    if (!other.get() || other->qscheme() != kPerTensorAffine) {
      return false;
    }
    auto* other_per_tensor_affine =
        static_cast<PerTensorAffineQuantizer*>(other.get());
    return scalar_type() == other_per_tensor_affine->scalar_type() &&
        scale() == other_per_tensor_affine->scale() &&
        zero_point() == other_per_tensor_affine->zero_point();
  }

 private:
  const double scale_;
  // We use int64_t for consistency with Python
  const int64_t zero_point_;
};
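// A minimal usage sketch for the per-tensor affine scheme (illustrative only,
// not part of this header's API surface). `at::quantize_per_tensor` is the
// dispatcher-exposed op that constructs a PerTensorAffineQuantizer internally:
//
//   Tensor x = at::rand({2, 3});
//   Tensor qx = at::quantize_per_tensor(x, /*scale=*/0.1, /*zero_point=*/10,
//                                       kQUInt8);
//   Tensor dqx = qx.dequantize();
//
// For example, with scale = 0.1 and zero_point = 10, the float value 1.5 maps
// to clamp(round(1.5 / 0.1 + 10), 0, 255) = 25, and dequantizes back to
// (25 - 10) * 0.1 = 1.5.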
/**
 * PerChannelAffineQuantizer is the same as PerTensorAffineQuantizer
 * except that we have an independent scale and zero_point parameter
 * for each channel.
 *
 * Also note that per channel quantization is mostly applied to output channels
 * of weights, since per-input-channel quantization of weights and per-channel
 * quantization of activations can't be efficiently supported on most
 * processors: they would require each multiplication result within a single
 * dot product to have a different scale.
 */
struct TORCH_API PerChannelAffineQuantizer : public AffineQuantizer {
  explicit PerChannelAffineQuantizer(
      ScalarType scalar_type,
      Tensor scales,
      Tensor zero_points,
      int64_t axis)
      : AffineQuantizer(scalar_type),
        scales_(std::move(scales)),
        zero_points_(std::move(zero_points)),
        axis_(axis) {}

  QScheme qscheme() const override {
    return kPerChannelAffine;
  }

  Tensor scales() const {
    return scales_;
  }

  Tensor zero_points() const {
    return zero_points_;
  }

  int64_t axis() const {
    return axis_;
  }

  Tensor quantize(const Tensor& tensor) override;
  Tensor dequantize(const Tensor& qtensor) override;
  Tensor& dequantize_out(Tensor& rtensor, const Tensor& qtensor) override;

  bool equalTo(QuantizerPtr other) const override {
    if (!other.get() || other->qscheme() != kPerChannelAffine) {
      return false;
    }
    auto* other_per_channel_affine =
        static_cast<PerChannelAffineQuantizer*>(other.get());
    return scalar_type() == other_per_channel_affine->scalar_type() &&
        scales().equal(other_per_channel_affine->scales()) &&
        zero_points().equal(other_per_channel_affine->zero_points()) &&
        axis() == other_per_channel_affine->axis();
  }

 protected:
  Tensor scales_;
  Tensor zero_points_;
  const int64_t axis_;
};

/**
 * PerChannelAffineFloatQParamsQuantizer is the same as PerChannelAffineQuantizer
 * except that it expects both scale and zero point to be floating point values.
 *
 * This quantizer uses the kPerChannelAffineFloatQParams qscheme, which is a
 * variant of kPerChannelAffine.
 *
 * The quantize equation in this case looks like:
 * Xq = (Xf - zero_point) * inv_scale, where inv_scale = 1.0/scale
 *
 * Note: A floating point zero point is useful in cases where 0 doesn't need to
 * be exactly represented in the quantized space. We can get additional
 * precision by using floating point values for the zero point.
 */
struct TORCH_API PerChannelAffineFloatQParamsQuantizer : public PerChannelAffineQuantizer {
  explicit PerChannelAffineFloatQParamsQuantizer(
      ScalarType scalar_type,
      Tensor scales,
      Tensor zero_points,
      int64_t axis)
      : PerChannelAffineQuantizer(scalar_type,
        std::move(scales),
        std::move(zero_points),
        axis) {}

  QScheme qscheme() const override {
    return kPerChannelAffineFloatQParams;
  }

  Tensor quantize(const Tensor& tensor) override;
  Tensor dequantize(const Tensor& qtensor) override;
  Tensor& dequantize_out(Tensor& rtensor, const Tensor& qtensor) override;

  bool equalTo(QuantizerPtr other) const override {
    if (!other.get() || other->qscheme() != kPerChannelAffineFloatQParams) {
      return false;
    }
    auto* other_per_channel_float_qparams =
        static_cast<PerChannelAffineFloatQParamsQuantizer*>(other.get());
    return scalar_type() == other_per_channel_float_qparams->scalar_type() &&
        scales().equal(other_per_channel_float_qparams->scales()) &&
        zero_points().equal(other_per_channel_float_qparams->zero_points()) &&
        axis() == other_per_channel_float_qparams->axis();
  }
};
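// A minimal usage sketch for per-channel quantization along the output-channel
// axis (illustrative only; the shapes below are assumptions of this example).
// `at::quantize_per_channel` is the dispatcher-exposed op that constructs a
// PerChannelAffineQuantizer internally:
//
//   Tensor w = at::rand({4, 8});             // [out_channels, in_channels] weight
//   Tensor scales = at::rand({4}) + 0.01;    // one scale per output channel
//   Tensor zero_points = at::zeros({4}, at::kLong);
//   Tensor qw = at::quantize_per_channel(w, scales, zero_points,
//                                        /*axis=*/0, kQInt8);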
// This is an internal utility function for getting at the QTensorImpl.
// You should only use this for writing low level
// setters/getters for QTensorImpl fields; otherwise, you should use
// the low level setters/getters that were implemented using this.
// This may be called repeatedly, so make sure it's pretty cheap.
TORCH_API QTensorImpl* get_qtensorimpl(const TensorBase& self);

// double and int64_t are used because of the native function API; we only have
// these argument types right now in native functions
TORCH_API QuantizerPtr make_per_tensor_affine_quantizer(
    double scale, int64_t zero_point, ScalarType scalar_type);

TORCH_API QuantizerPtr make_per_channel_affine_quantizer(
    const Tensor& scales,
    const Tensor& zero_points,
    int64_t axis,
    ScalarType scalar_type);

TORCH_API QuantizerPtr make_unknown_quantizer(ScalarType scalar_type);

// Create a Quantized Tensor given arguments for a normal Tensor and a quantizer
TORCH_API Tensor new_qtensor(
    IntArrayRef sizes,
    const TensorOptions& options,
    QuantizerPtr quantizer);

TORCH_API void set_quantizer_(const Tensor& self, ConstQuantizerPtr quantizer);

TORCH_API Tensor from_blob_quantized_per_tensor_affine(
    void* data,
    IntArrayRef sizes,
    IntArrayRef strides,
    std::function<void(void*)> deleter,
    const float scale,
    const int64_t zeroPoint,
    const TensorOptions& options);

TORCH_API Tensor from_blob_quantized_per_tensor_affine(
    void* data,
    IntArrayRef sizes,
    std::function<void(void*)> deleter,
    const float scale,
    const int64_t zeroPoint,
    const TensorOptions& options);

TORCH_API Tensor from_blob_quantized_per_channel_affine(
    void* data,
    IntArrayRef sizes,
    std::function<void(void*)> deleter,
    const Tensor& scales,
    const Tensor& zero_points,
    const int64_t axis,
    const TensorOptions& options);

} // namespace at
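// A minimal sketch of wrapping an existing quantized buffer without a copy via
// from_blob_quantized_per_tensor_affine (illustrative only; the no-op deleter
// is an assumption of this example, valid because `buf` owns the memory and
// outlives the tensor):
//
//   std::vector<uint8_t> buf(16, 25);  // raw quint8 data
//   Tensor qt = at::from_blob_quantized_per_tensor_affine(
//       buf.data(), /*sizes=*/{4, 4}, /*deleter=*/[](void*) {},
//       /*scale=*/0.1f, /*zeroPoint=*/10,
//       at::device(at::kCPU).dtype(at::kQUInt8));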