#pragma once

#include <c10/core/MemoryFormat.h>
#include <c10/core/QScheme.h>
#include <c10/core/ScalarType.h>
#include <c10/core/TensorOptions.h>
#include <c10/macros/Macros.h>
#include <c10/util/Exception.h>
#include <c10/util/intrusive_ptr.h>

#include <ATen/Tensor.h>
#include <ATen/TensorUtils.h>

#include <ATen/core/QuantizerBase.h>

#include <cmath>
#include <memory>
#include <utility>
namespace at {
|
|
|
|
/**
 * UnknownQuantizer is a placeholder quantizer for functions that implement
 * quantization in a two step process. First a tensor is allocated but with
 * unknown quantizer, and then the quantization kernel decides what the final
 * quantizer will be.
 */
struct TORCH_API UnknownQuantizer : public Quantizer {
  explicit UnknownQuantizer(ScalarType scalar_type)
    : Quantizer(scalar_type) {}

  // All overrides are defined out of line (presumably in Quantizer.cpp);
  // calling them on a still-unknown quantizer is expected to be an error or
  // a no-op decided there — confirm against the implementation.
  Tensor quantize(const Tensor& tensor) override;
  Tensor dequantize(const Tensor& qtensor) override;
  Tensor& dequantize_out(Tensor& rtensor, const Tensor& qtensor) override;
  QScheme qscheme() const override;
  bool equalTo(QuantizerPtr other) const override;
};
/**
 * UniformQuantizer is the parent class for all uniform quantizers.
 * These quantization schemes map float values uniformly to
 * the quantized values. For example, the affine quantizer is
 * the most commonly used scheme in this category.
 */
struct TORCH_API UniformQuantizer : public Quantizer {
  explicit UniformQuantizer(ScalarType scalar_type) : Quantizer(scalar_type) {}
};
/**
 * NonUniformQuantizer is the parent class for all non-uniform quantizers.
 * These quantization schemes may map float values non-uniformly to the
 * quantized values. K-means quantization is a representative example in this
 * category.
 */
struct TORCH_API NonUniformQuantizer : public Quantizer {
  explicit NonUniformQuantizer(ScalarType scalar_type) : Quantizer(scalar_type) {}
};
// There is also StochasticQuantizer which is uniform but not affine

/**
 * AffineQuantizer uses an affine transformation to do quantization.
 *
 * For quantize:
 * Y = clamp(round(X / scale + zero_point), min, max)
 * For dequantize:
 * X = (Y - zero_point) * scale
 */
struct TORCH_API AffineQuantizer : public UniformQuantizer {
  explicit AffineQuantizer(ScalarType scalar_type) : UniformQuantizer(scalar_type) {}
};

// Note that we will not have Symmetric Quantizer in backend to reduce
// complications in quantized kernel implementation.
/**
|
|
* PerTensorAffineQuantizer stores a scale and a zero_point, which is used for
|
|
* all the values in the Tensor.
|
|
*/
|
|
struct TORCH_API PerTensorAffineQuantizer : public AffineQuantizer {
|
|
explicit PerTensorAffineQuantizer(ScalarType scalar_type, double scale, int64_t zero_point)
|
|
: AffineQuantizer(scalar_type),
|
|
scale_(scale),
|
|
zero_point_(zero_point) {}
|
|
|
|
Tensor quantize(const Tensor& tensor) override;
|
|
Tensor dequantize(const Tensor& qtensor) override;
|
|
Tensor& dequantize_out(Tensor& rtensor, const Tensor& qtensor) override;
|
|
|
|
QScheme qscheme() const override {
|
|
return kPerTensorAffine;
|
|
}
|
|
|
|
double scale() const {
|
|
return scale_;
|
|
}
|
|
|
|
int64_t zero_point() const {
|
|
return zero_point_;
|
|
}
|
|
|
|
bool equalTo(QuantizerPtr other) const override {
|
|
if (!other.get() || other->qscheme() != kPerTensorAffine) {
|
|
return false;
|
|
}
|
|
auto* other_per_tensor_affine =
|
|
static_cast<PerTensorAffineQuantizer*>(other.get());
|
|
return scalar_type() == other_per_tensor_affine->scalar_type() &&
|
|
scale() == other_per_tensor_affine->scale() &&
|
|
zero_point() == other_per_tensor_affine->zero_point();
|
|
}
|
|
|
|
private:
|
|
const double scale_;
|
|
// We use int64_t for consistency with Python
|
|
const int64_t zero_point_;
|
|
};
|
|
|
|
/**
|
|
* PerChannelAffineQuantizer is the same as PerTensorAffineQuantizer
|
|
* except that we have an independent scale and zero_point parameter
|
|
* for each channel.
|
|
*
|
|
* Also note that per channel quantization is mostly applied to output channels
|
|
* of weights since per-input channel of weight quantization or per-channel
|
|
* quantization for activations can't be efficiently supported in most of
|
|
* processors since it requires each multiplication result within a single
|
|
* dot-product to have a different scale.
|
|
*/
|
|
struct TORCH_API PerChannelAffineQuantizer : public AffineQuantizer {
|
|
explicit PerChannelAffineQuantizer(
|
|
ScalarType scalar_type,
|
|
Tensor scales,
|
|
Tensor zero_points,
|
|
int64_t axis)
|
|
: AffineQuantizer(scalar_type),
|
|
scales_(std::move(scales)),
|
|
zero_points_(std::move(zero_points)),
|
|
axis_(axis) {}
|
|
|
|
QScheme qscheme() const override {
|
|
return kPerChannelAffine;
|
|
}
|
|
|
|
Tensor scales() const {
|
|
return scales_;
|
|
}
|
|
|
|
Tensor zero_points() const {
|
|
return zero_points_;
|
|
}
|
|
|
|
int64_t axis() const {
|
|
return axis_;
|
|
}
|
|
|
|
Tensor quantize(const Tensor& tensor) override;
|
|
Tensor dequantize(const Tensor& qtensor) override;
|
|
Tensor& dequantize_out(Tensor& rtensor, const Tensor& qtensor) override;
|
|
|
|
bool equalTo(QuantizerPtr other) const override {
|
|
if (!other.get() || other->qscheme() != kPerChannelAffine) {
|
|
return false;
|
|
}
|
|
auto* other_per_channel_affine =
|
|
static_cast<PerChannelAffineQuantizer*>(other.get());
|
|
return scalar_type() == other_per_channel_affine->scalar_type() &&
|
|
scales().equal(other_per_channel_affine->scales()) &&
|
|
zero_points().equal(other_per_channel_affine->zero_points()) &&
|
|
axis() == other_per_channel_affine->axis();
|
|
}
|
|
|
|
protected:
|
|
Tensor scales_;
|
|
Tensor zero_points_;
|
|
const int64_t axis_;
|
|
};
|
|
|
|
/**
|
|
* PerChannelAffineFloatQParamsQuantizer is the same as PerChannelAffineQuantizer
|
|
* except that it expects both scale and zero point to be floating point values.
|
|
*
|
|
* This quantizer uses the kPerChannelAffineFloatQParams qscheme which is a variant of
|
|
* kPerChannelAffine.
|
|
*
|
|
* The quantize equation in this case looks like -
|
|
* Xq = (Xf - zero_point) * inv_scale, where inv_scale = 1.0/scale
|
|
*
|
|
* Note: Usage of floating point zero point is useful in cases where 0 doesn't need to
|
|
* be exactly represented in the quantized space. We can get additional precision by
|
|
* using floating point values for zero point.
|
|
*/
|
|
struct TORCH_API PerChannelAffineFloatQParamsQuantizer : public PerChannelAffineQuantizer {
|
|
explicit PerChannelAffineFloatQParamsQuantizer(
|
|
ScalarType scalar_type,
|
|
Tensor scales,
|
|
Tensor zero_points,
|
|
int64_t axis)
|
|
: PerChannelAffineQuantizer(scalar_type,
|
|
scales,
|
|
zero_points,
|
|
axis) {}
|
|
|
|
QScheme qscheme() const override {
|
|
return kPerChannelAffineFloatQParams;
|
|
}
|
|
|
|
Tensor quantize(const Tensor& tensor) override;
|
|
Tensor dequantize(const Tensor& qtensor) override;
|
|
Tensor& dequantize_out(Tensor& rtensor, const Tensor& qtensor) override;
|
|
|
|
bool equalTo(QuantizerPtr other) const override {
|
|
if (!other.get() || other->qscheme() != kPerChannelAffineFloatQParams) {
|
|
return false;
|
|
}
|
|
auto* other_per_channel_float_qparams =
|
|
static_cast<PerChannelAffineFloatQParamsQuantizer*>(other.get());
|
|
return scalar_type() == other_per_channel_float_qparams->scalar_type() &&
|
|
scales().equal(other_per_channel_float_qparams->scales()) &&
|
|
zero_points().equal(other_per_channel_float_qparams->zero_points()) &&
|
|
axis() == other_per_channel_float_qparams->axis();
|
|
}
|
|
};
|
|
|
|
// This is an internal utility function for getting at the QTensorImpl,
// You should only use this for writing low level
// setters/getters for QTensorImpl fields; otherwise, you should use
// the low level setters/getters that were implemented using this.
// This may be called repeatedly, so make sure it's pretty cheap.
TORCH_API QTensorImpl* get_qtensorimpl(const TensorBase& self);

// double and int64_t are because of the native function API, we only have these
// argument types right now in native functions
TORCH_API QuantizerPtr
make_per_tensor_affine_quantizer(
    double scale, int64_t zero_point, ScalarType scalar_type);

// Factory for a PerChannelAffineQuantizer over `axis` with the given
// per-channel scales and zero_points tensors.
TORCH_API QuantizerPtr make_per_channel_affine_quantizer(
    const Tensor& scales,
    const Tensor& zero_points,
    int64_t axis,
    ScalarType scalar_type);

// Factory for the UnknownQuantizer placeholder (see its class comment).
TORCH_API QuantizerPtr make_unknown_quantizer(ScalarType scalar_type);

// Create a Quantized Tensor given arguments for normal Tensor and a quantizer
TORCH_API Tensor new_qtensor(
    IntArrayRef sizes,
    const TensorOptions& options,
    QuantizerPtr quantizer);

// Replace the quantizer attached to `self` in place.
TORCH_API void set_quantizer_(const Tensor& self, ConstQuantizerPtr quantizer);

// Wrap an existing caller-provided, already-quantized buffer as a per-tensor
// affine quantized Tensor with explicit strides. NOTE(review): presumably
// `deleter` is invoked when the tensor's storage is released, per the usual
// from_blob convention — confirm against the implementation.
TORCH_API Tensor from_blob_quantized_per_tensor_affine(
    void* data,
    IntArrayRef sizes,
    IntArrayRef strides,
    std::function<void(void*)> deleter,
    const float scale,
    const int64_t zeroPoint,
    const TensorOptions& options);

// Overload of the above without explicit strides.
TORCH_API Tensor from_blob_quantized_per_tensor_affine(
    void* data,
    IntArrayRef sizes,
    std::function<void(void*)> deleter,
    const float scale,
    const int64_t zeroPoint,
    const TensorOptions& options);

// Per-channel variant: wraps a caller-provided quantized buffer using
// per-channel scales and zero_points along `axis`.
TORCH_API Tensor from_blob_quantized_per_channel_affine(
    void* data,
    IntArrayRef sizes,
    std::function<void(void*)> deleter,
    const Tensor& scales,
    const Tensor& zero_points,
    const int64_t axis,
    const TensorOptions& options);
} // namespace at
|