godot/thirdparty/astcenc/astcenc_compute_variance.cpp

// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

#if !defined(ASTCENC_DECOMPRESS_ONLY)

/**
 * @brief Functions to calculate variance per component in a NxN footprint.
 *
 * We need N to be parametric, so the routine below uses summed area tables in order to execute in
 * O(1) time independent of how big N is.
 *
 * The addition uses a Brent-Kung-based parallel prefix adder. This uses the prefix tree to first
 * perform a binary reduction, and then distributes the results. This method means that there is no
 * serial dependency between a given element and the next one, and also significantly improves
 * numerical stability allowing us to use floats rather than doubles.
 */

#include "astcenc_internal.h"

#include <cassert>

/**
 * @brief Generate a prefix-sum array using the Brent-Kung algorithm.
 *
 * This will take an input array of the form:
 *     v0, v1, v2, ...
 * ... and modify in-place to turn it into a prefix-sum array of the form:
 *     v0, v0+v1, v0+v1+v2, ...
 *
 * @param d      The array to prefix-sum.
 * @param items  The number of items in the array.
 * @param stride The item spacing in the array; i.e. dense arrays should use 1.
 */
static void brent_kung_prefix_sum(
	vfloat4* d,
	size_t items,
	int stride
) {
	if (items < 2)
		return;

	size_t lc_stride = 2;
	size_t log2_stride = 1;

	// The reduction-tree loop
	do {
		size_t step = lc_stride >> 1;
		size_t start = lc_stride - 1;
		size_t iters = items >> log2_stride;

		vfloat4 *da = d + (start * stride);
		ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride);
		size_t ofs_stride = stride << log2_stride;

		while (iters)
		{
			*da = *da + da[ofs];
			da += ofs_stride;
			iters--;
		}

		log2_stride += 1;
		lc_stride <<= 1;
	} while (lc_stride <= items);

	// The expansion-tree loop
	do {
		log2_stride -= 1;
		lc_stride >>= 1;

		size_t step = lc_stride >> 1;
		size_t start = step + lc_stride - 1;
		size_t iters = (items - step) >> log2_stride;

		vfloat4 *da = d + (start * stride);
		ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride);
		size_t ofs_stride = stride << log2_stride;

		while (iters)
		{
			*da = *da + da[ofs];
			da += ofs_stride;
			iters--;
		}
	} while (lc_stride > 2);
}

/* See header for documentation. */
void compute_pixel_region_variance(
	astcenc_contexti& ctx,
	const pixel_region_args& arg
) {
	// Unpack the memory structure into local variables
	const astcenc_image* img = arg.img;
	astcenc_swizzle swz = arg.swz;
	bool have_z = arg.have_z;

	int size_x = arg.size_x;
	int size_y = arg.size_y;
	int size_z = arg.size_z;

	int offset_x = arg.offset_x;
	int offset_y = arg.offset_y;
	int offset_z = arg.offset_z;

	int alpha_kernel_radius = arg.alpha_kernel_radius;

	float*   input_alpha_averages = ctx.input_alpha_averages;
	vfloat4* work_memory = arg.work_memory;

	// Compute memory sizes and dimensions that we need
	int kernel_radius = alpha_kernel_radius;
	int kerneldim = 2 * kernel_radius + 1;
	int kernel_radius_xy = kernel_radius;
	int kernel_radius_z = have_z ? kernel_radius : 0;

	int padsize_x = size_x + kerneldim;
	int padsize_y = size_y + kerneldim;
	int padsize_z = size_z + (have_z ? kerneldim : 0);
	int sizeprod = padsize_x * padsize_y * padsize_z;

	int zd_start = have_z ? 1 : 0;

	vfloat4 *varbuf1 = work_memory;
	vfloat4 *varbuf2 = work_memory + sizeprod;

	// Scaling factors to apply to Y and Z for accesses into the work buffers
	int yst = padsize_x;
	int zst = padsize_x * padsize_y;

	// Scaling factors to apply to Y and Z for accesses into result buffers
	int ydt = img->dim_x;
	int zdt = img->dim_x * img->dim_y;

	// Macros to act as accessor functions for the work-memory
	#define VARBUF1(z, y, x) varbuf1[z * zst + y * yst + x]
	#define VARBUF2(z, y, x) varbuf2[z * zst + y * yst + x]

	// Load N and N^2 values into the work buffers
	if (img->data_type == ASTCENC_TYPE_U8)
	{
		// Swizzle data structure 4 = ZERO, 5 = ONE
		uint8_t data[6];
		data[ASTCENC_SWZ_0] = 0;
		data[ASTCENC_SWZ_1] = 255;

		for (int z = zd_start; z < padsize_z; z++)
		{
			int z_src = (z - zd_start) + offset_z - kernel_radius_z;
			z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
			uint8_t* data8 = static_cast<uint8_t*>(img->data[z_src]);

			for (int y = 1; y < padsize_y; y++)
			{
				int y_src = (y - 1) + offset_y - kernel_radius_xy;
				y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));

				for (int x = 1; x < padsize_x; x++)
				{
					int x_src = (x - 1) + offset_x - kernel_radius_xy;
					x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));

					data[0] = data8[(4 * img->dim_x * y_src) + (4 * x_src    )];
					data[1] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
					data[2] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
					data[3] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 3)];

					uint8_t r = data[swz.r];
					uint8_t g = data[swz.g];
					uint8_t b = data[swz.b];
					uint8_t a = data[swz.a];

					vfloat4 d = vfloat4 (r * (1.0f / 255.0f),
					                     g * (1.0f / 255.0f),
					                     b * (1.0f / 255.0f),
					                     a * (1.0f / 255.0f));

					VARBUF1(z, y, x) = d;
					VARBUF2(z, y, x) = d * d;
				}
			}
		}
	}
	else if (img->data_type == ASTCENC_TYPE_F16)
	{
		// Swizzle data structure 4 = ZERO, 5 = ONE (in FP16)
		uint16_t data[6];
		data[ASTCENC_SWZ_0] = 0;
		data[ASTCENC_SWZ_1] = 0x3C00;

		for (int z = zd_start; z < padsize_z; z++)
		{
			int z_src = (z - zd_start) + offset_z - kernel_radius_z;
			z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
			uint16_t* data16 = static_cast<uint16_t*>(img->data[z_src]);

			for (int y = 1; y < padsize_y; y++)
			{
				int y_src = (y - 1) + offset_y - kernel_radius_xy;
				y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));

				for (int x = 1; x < padsize_x; x++)
				{
					int x_src = (x - 1) + offset_x - kernel_radius_xy;
					x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));

					data[0] = data16[(4 * img->dim_x * y_src) + (4 * x_src    )];
					data[1] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
					data[2] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
					data[3] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 3)];

					vint4 di(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
					vfloat4 d = float16_to_float(di);

					VARBUF1(z, y, x) = d;
					VARBUF2(z, y, x) = d * d;
				}
			}
		}
	}
	else // if (img->data_type == ASTCENC_TYPE_F32)
	{
		assert(img->data_type == ASTCENC_TYPE_F32);

		// Swizzle data structure 4 = ZERO, 5 = ONE (in FP16)
		float data[6];
		data[ASTCENC_SWZ_0] = 0.0f;
		data[ASTCENC_SWZ_1] = 1.0f;

		for (int z = zd_start; z < padsize_z; z++)
		{
			int z_src = (z - zd_start) + offset_z - kernel_radius_z;
			z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
			float* data32 = static_cast<float*>(img->data[z_src]);

			for (int y = 1; y < padsize_y; y++)
			{
				int y_src = (y - 1) + offset_y - kernel_radius_xy;
				y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));

				for (int x = 1; x < padsize_x; x++)
				{
					int x_src = (x - 1) + offset_x - kernel_radius_xy;
					x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));

					data[0] = data32[(4 * img->dim_x * y_src) + (4 * x_src    )];
					data[1] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
					data[2] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
					data[3] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 3)];

					float r = data[swz.r];
					float g = data[swz.g];
					float b = data[swz.b];
					float a = data[swz.a];

					vfloat4 d(r, g, b, a);

					VARBUF1(z, y, x) = d;
					VARBUF2(z, y, x) = d * d;
				}
			}
		}
	}

	// Pad with an extra layer of 0s; this forms the edge of the SAT tables
	vfloat4 vbz = vfloat4::zero();
	for (int z = 0; z < padsize_z; z++)
	{
		for (int y = 0; y < padsize_y; y++)
		{
			VARBUF1(z, y, 0) = vbz;
			VARBUF2(z, y, 0) = vbz;
		}

		for (int x = 0; x < padsize_x; x++)
		{
			VARBUF1(z, 0, x) = vbz;
			VARBUF2(z, 0, x) = vbz;
		}
	}

	if (have_z)
	{
		for (int y = 0; y < padsize_y; y++)
		{
			for (int x = 0; x < padsize_x; x++)
			{
				VARBUF1(0, y, x) = vbz;
				VARBUF2(0, y, x) = vbz;
			}
		}
	}

	// Generate summed-area tables for N and N^2; this is done in-place, using
	// a Brent-Kung parallel-prefix based algorithm to minimize precision loss
	for (int z = zd_start; z < padsize_z; z++)
	{
		for (int y = 1; y < padsize_y; y++)
		{
			brent_kung_prefix_sum(&(VARBUF1(z, y, 1)), padsize_x - 1, 1);
			brent_kung_prefix_sum(&(VARBUF2(z, y, 1)), padsize_x - 1, 1);
		}
	}

	for (int z = zd_start; z < padsize_z; z++)
	{
		for (int x = 1; x < padsize_x; x++)
		{
			brent_kung_prefix_sum(&(VARBUF1(z, 1, x)), padsize_y - 1, yst);
			brent_kung_prefix_sum(&(VARBUF2(z, 1, x)), padsize_y - 1, yst);
		}
	}

	if (have_z)
	{
		for (int y = 1; y < padsize_y; y++)
		{
			for (int x = 1; x < padsize_x; x++)
			{
				brent_kung_prefix_sum(&(VARBUF1(1, y, x)), padsize_z - 1, zst);
				brent_kung_prefix_sum(&(VARBUF2(1, y, x)), padsize_z - 1, zst);
			}
		}
	}

	// Compute a few constants used in the variance-calculation.
	float alpha_kdim = static_cast<float>(2 * alpha_kernel_radius + 1);
	float alpha_rsamples;

	if (have_z)
	{
		alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim * alpha_kdim);
	}
	else
	{
		alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim);
	}

	// Use the summed-area tables to compute variance for each neighborhood
	if (have_z)
	{
		for (int z = 0; z < size_z; z++)
		{
			int z_src = z + kernel_radius_z;
			int z_dst = z + offset_z;
			int z_low  = z_src - alpha_kernel_radius;
			int z_high = z_src + alpha_kernel_radius + 1;

			for (int y = 0; y < size_y; y++)
			{
				int y_src = y + kernel_radius_xy;
				int y_dst = y + offset_y;
				int y_low  = y_src - alpha_kernel_radius;
				int y_high = y_src + alpha_kernel_radius + 1;

				for (int x = 0; x < size_x; x++)
				{
					int x_src = x + kernel_radius_xy;
					int x_dst = x + offset_x;
					int x_low  = x_src - alpha_kernel_radius;
					int x_high = x_src + alpha_kernel_radius + 1;

					// Summed-area table lookups for alpha average
					float vasum = (  VARBUF1(z_high, y_low,  x_low).lane<3>()
					               - VARBUF1(z_high, y_low,  x_high).lane<3>()
					               - VARBUF1(z_high, y_high, x_low).lane<3>()
					               + VARBUF1(z_high, y_high, x_high).lane<3>()) -
					              (  VARBUF1(z_low,  y_low,  x_low).lane<3>()
					               - VARBUF1(z_low,  y_low,  x_high).lane<3>()
					               - VARBUF1(z_low,  y_high, x_low).lane<3>()
					               + VARBUF1(z_low,  y_high, x_high).lane<3>());

					int out_index = z_dst * zdt + y_dst * ydt + x_dst;
					input_alpha_averages[out_index] = (vasum * alpha_rsamples);
				}
			}
		}
	}
	else
	{
		for (int y = 0; y < size_y; y++)
		{
			int y_src = y + kernel_radius_xy;
			int y_dst = y + offset_y;
			int y_low  = y_src - alpha_kernel_radius;
			int y_high = y_src + alpha_kernel_radius + 1;

			for (int x = 0; x < size_x; x++)
			{
				int x_src = x + kernel_radius_xy;
				int x_dst = x + offset_x;
				int x_low  = x_src - alpha_kernel_radius;
				int x_high = x_src + alpha_kernel_radius + 1;

				// Summed-area table lookups for alpha average
				float vasum = VARBUF1(0, y_low,  x_low).lane<3>()
				            - VARBUF1(0, y_low,  x_high).lane<3>()
				            - VARBUF1(0, y_high, x_low).lane<3>()
				            + VARBUF1(0, y_high, x_high).lane<3>();

				int out_index = y_dst * ydt + x_dst;
				input_alpha_averages[out_index] = (vasum * alpha_rsamples);
			}
		}
	}
}

/* See header for documentation. */
unsigned int init_compute_averages(
	const astcenc_image& img,
	unsigned int alpha_kernel_radius,
	const astcenc_swizzle& swz,
	avg_args& ag
) {
	unsigned int size_x = img.dim_x;
	unsigned int size_y = img.dim_y;
	unsigned int size_z = img.dim_z;

	// Compute maximum block size and from that the working memory buffer size
	unsigned int kernel_radius = alpha_kernel_radius;
	unsigned int kerneldim = 2 * kernel_radius + 1;

	bool have_z = (size_z > 1);
	unsigned int max_blk_size_xy = have_z ? 16 : 32;
	unsigned int max_blk_size_z = astc::min(size_z, have_z ? 16u : 1u);

	unsigned int max_padsize_xy = max_blk_size_xy + kerneldim;
	unsigned int max_padsize_z = max_blk_size_z + (have_z ? kerneldim : 0);

	// Perform block-wise averages calculations across the image
	// Initialize fields which are not populated until later
	ag.arg.size_x = 0;
	ag.arg.size_y = 0;
	ag.arg.size_z = 0;
	ag.arg.offset_x = 0;
	ag.arg.offset_y = 0;
	ag.arg.offset_z = 0;
	ag.arg.work_memory = nullptr;

	ag.arg.img = &img;
	ag.arg.swz = swz;
	ag.arg.have_z = have_z;
	ag.arg.alpha_kernel_radius = alpha_kernel_radius;

	ag.img_size_x = size_x;
	ag.img_size_y = size_y;
	ag.img_size_z = size_z;
	ag.blk_size_xy = max_blk_size_xy;
	ag.blk_size_z = max_blk_size_z;
	ag.work_memory_size = 2 * max_padsize_xy * max_padsize_xy * max_padsize_z;

	// The parallel task count
	unsigned int z_tasks = (size_z + max_blk_size_z - 1) / max_blk_size_z;
	unsigned int y_tasks = (size_y + max_blk_size_xy - 1) / max_blk_size_xy;
	return z_tasks * y_tasks;
}

#endif
Add ASTC compression and decompression with Arm astcenc. Co-authored-by: Gordon A Macpherson <gordon.a.macpherson@gmail.com> Co-authored-by: Rémi Verschelde <rverschelde@gmail.com> 2022-12-21 02:54:01 +08:00			`// SPDX-License-Identifier: Apache-2.0`
			`// ----------------------------------------------------------------------------`
			`// Copyright 2011-2022 Arm Limited`
			`//`
			`// Licensed under the Apache License, Version 2.0 (the "License"); you may not`
			`// use this file except in compliance with the License. You may obtain a copy`
			`// of the License at:`
			`//`
			`// http://www.apache.org/licenses/LICENSE-2.0`
			`//`
			`// Unless required by applicable law or agreed to in writing, software`
			`// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT`
			`// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the`
			`// License for the specific language governing permissions and limitations`
			`// under the License.`
			`// ----------------------------------------------------------------------------`

			`#if !defined(ASTCENC_DECOMPRESS_ONLY)`

			`/**`
			`* @brief Functions to calculate variance per component in a NxN footprint.`
			`*`
			`* We need N to be parametric, so the routine below uses summed area tables in order to execute in`
			`* O(1) time independent of how big N is.`
			`*`
			`* The addition uses a Brent-Kung-based parallel prefix adder. This uses the prefix tree to first`
			`* perform a binary reduction, and then distributes the results. This method means that there is no`
			`* serial dependency between a given element and the next one, and also significantly improves`
			`* numerical stability allowing us to use floats rather than doubles.`
			`*/`

			`#include "astcenc_internal.h"`

			`#include <cassert>`

			`/**`
			`* @brief Generate a prefix-sum array using the Brent-Kung algorithm.`
			`*`
			`* This will take an input array of the form:`
			`* v0, v1, v2, ...`
			`* ... and modify in-place to turn it into a prefix-sum array of the form:`
			`* v0, v0+v1, v0+v1+v2, ...`
			`*`
			`* @param d The array to prefix-sum.`
			`* @param items The number of items in the array.`
			`* @param stride The item spacing in the array; i.e. dense arrays should use 1.`
			`*/`
			`static void brent_kung_prefix_sum(`
			`vfloat4* d,`
			`size_t items,`
			`int stride`
			`) {`
			`if (items < 2)`
			`return;`

			`size_t lc_stride = 2;`
			`size_t log2_stride = 1;`

			`// The reduction-tree loop`
			`do {`
			`size_t step = lc_stride >> 1;`
			`size_t start = lc_stride - 1;`
			`size_t iters = items >> log2_stride;`

			`vfloat4 da = d + (start stride);`
			`ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride);`
			`size_t ofs_stride = stride << log2_stride;`

			`while (iters)`
			`{`
			`da = da + da[ofs];`
			`da += ofs_stride;`
			`iters--;`
			`}`

			`log2_stride += 1;`
			`lc_stride <<= 1;`
			`} while (lc_stride <= items);`

			`// The expansion-tree loop`
			`do {`
			`log2_stride -= 1;`
			`lc_stride >>= 1;`

			`size_t step = lc_stride >> 1;`
			`size_t start = step + lc_stride - 1;`
			`size_t iters = (items - step) >> log2_stride;`

			`vfloat4 da = d + (start stride);`
			`ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride);`
			`size_t ofs_stride = stride << log2_stride;`

			`while (iters)`
			`{`
			`da = da + da[ofs];`
			`da += ofs_stride;`
			`iters--;`
			`}`
			`} while (lc_stride > 2);`
			`}`

			`/* See header for documentation. */`
			`void compute_pixel_region_variance(`
			`astcenc_contexti& ctx,`
			`const pixel_region_args& arg`
			`) {`
			`// Unpack the memory structure into local variables`
			`const astcenc_image* img = arg.img;`
			`astcenc_swizzle swz = arg.swz;`
			`bool have_z = arg.have_z;`

			`int size_x = arg.size_x;`
			`int size_y = arg.size_y;`
			`int size_z = arg.size_z;`

			`int offset_x = arg.offset_x;`
			`int offset_y = arg.offset_y;`
			`int offset_z = arg.offset_z;`

			`int alpha_kernel_radius = arg.alpha_kernel_radius;`

			`float* input_alpha_averages = ctx.input_alpha_averages;`
			`vfloat4* work_memory = arg.work_memory;`

			`// Compute memory sizes and dimensions that we need`
			`int kernel_radius = alpha_kernel_radius;`
			`int kerneldim = 2 * kernel_radius + 1;`
			`int kernel_radius_xy = kernel_radius;`
			`int kernel_radius_z = have_z ? kernel_radius : 0;`

			`int padsize_x = size_x + kerneldim;`
			`int padsize_y = size_y + kerneldim;`
			`int padsize_z = size_z + (have_z ? kerneldim : 0);`
			`int sizeprod = padsize_x * padsize_y * padsize_z;`

			`int zd_start = have_z ? 1 : 0;`

			`vfloat4 *varbuf1 = work_memory;`
			`vfloat4 *varbuf2 = work_memory + sizeprod;`

			`// Scaling factors to apply to Y and Z for accesses into the work buffers`
			`int yst = padsize_x;`
			`int zst = padsize_x * padsize_y;`

			`// Scaling factors to apply to Y and Z for accesses into result buffers`
			`int ydt = img->dim_x;`
			`int zdt = img->dim_x * img->dim_y;`

			`// Macros to act as accessor functions for the work-memory`
			`#define VARBUF1(z, y, x) varbuf1[z * zst + y * yst + x]`
			`#define VARBUF2(z, y, x) varbuf2[z * zst + y * yst + x]`

			`// Load N and N^2 values into the work buffers`
			`if (img->data_type == ASTCENC_TYPE_U8)`
			`{`
			`// Swizzle data structure 4 = ZERO, 5 = ONE`
			`uint8_t data[6];`
			`data[ASTCENC_SWZ_0] = 0;`
			`data[ASTCENC_SWZ_1] = 255;`

			`for (int z = zd_start; z < padsize_z; z++)`
			`{`
			`int z_src = (z - zd_start) + offset_z - kernel_radius_z;`
			`z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));`
			`uint8_t* data8 = static_cast<uint8_t*>(img->data[z_src]);`

			`for (int y = 1; y < padsize_y; y++)`
			`{`
			`int y_src = (y - 1) + offset_y - kernel_radius_xy;`
			`y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));`

			`for (int x = 1; x < padsize_x; x++)`
			`{`
			`int x_src = (x - 1) + offset_x - kernel_radius_xy;`
			`x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));`

			`data[0] = data8[(4 * img->dim_x * y_src) + (4 * x_src )];`
			`data[1] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 1)];`
			`data[2] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 2)];`
			`data[3] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 3)];`

			`uint8_t r = data[swz.r];`
			`uint8_t g = data[swz.g];`
			`uint8_t b = data[swz.b];`
			`uint8_t a = data[swz.a];`

			`vfloat4 d = vfloat4 (r * (1.0f / 255.0f),`
			`g * (1.0f / 255.0f),`
			`b * (1.0f / 255.0f),`
			`a * (1.0f / 255.0f));`

			`VARBUF1(z, y, x) = d;`
			`VARBUF2(z, y, x) = d * d;`
			`}`
			`}`
			`}`
			`}`
			`else if (img->data_type == ASTCENC_TYPE_F16)`
			`{`
			`// Swizzle data structure 4 = ZERO, 5 = ONE (in FP16)`
			`uint16_t data[6];`
			`data[ASTCENC_SWZ_0] = 0;`
			`data[ASTCENC_SWZ_1] = 0x3C00;`

			`for (int z = zd_start; z < padsize_z; z++)`
			`{`
			`int z_src = (z - zd_start) + offset_z - kernel_radius_z;`
			`z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));`
			`uint16_t* data16 = static_cast<uint16_t*>(img->data[z_src]);`

			`for (int y = 1; y < padsize_y; y++)`
			`{`
			`int y_src = (y - 1) + offset_y - kernel_radius_xy;`
			`y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));`

			`for (int x = 1; x < padsize_x; x++)`
			`{`
			`int x_src = (x - 1) + offset_x - kernel_radius_xy;`
			`x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));`

			`data[0] = data16[(4 * img->dim_x * y_src) + (4 * x_src )];`
			`data[1] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 1)];`
			`data[2] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 2)];`
			`data[3] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 3)];`

			`vint4 di(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);`
			`vfloat4 d = float16_to_float(di);`

			`VARBUF1(z, y, x) = d;`
			`VARBUF2(z, y, x) = d * d;`
			`}`
			`}`
			`}`
			`}`
			`else // if (img->data_type == ASTCENC_TYPE_F32)`
			`{`
			`assert(img->data_type == ASTCENC_TYPE_F32);`

			`// Swizzle data structure 4 = ZERO, 5 = ONE (in FP16)`
			`float data[6];`
			`data[ASTCENC_SWZ_0] = 0.0f;`
			`data[ASTCENC_SWZ_1] = 1.0f;`

			`for (int z = zd_start; z < padsize_z; z++)`
			`{`
			`int z_src = (z - zd_start) + offset_z - kernel_radius_z;`
			`z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));`
			`float* data32 = static_cast<float*>(img->data[z_src]);`

			`for (int y = 1; y < padsize_y; y++)`
			`{`
			`int y_src = (y - 1) + offset_y - kernel_radius_xy;`
			`y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));`

			`for (int x = 1; x < padsize_x; x++)`
			`{`
			`int x_src = (x - 1) + offset_x - kernel_radius_xy;`
			`x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));`

			`data[0] = data32[(4 * img->dim_x * y_src) + (4 * x_src )];`
			`data[1] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 1)];`
			`data[2] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 2)];`
			`data[3] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 3)];`

			`float r = data[swz.r];`
			`float g = data[swz.g];`
			`float b = data[swz.b];`
			`float a = data[swz.a];`

			`vfloat4 d(r, g, b, a);`

			`VARBUF1(z, y, x) = d;`
			`VARBUF2(z, y, x) = d * d;`
			`}`
			`}`
			`}`
			`}`

			`// Pad with an extra layer of 0s; this forms the edge of the SAT tables`
			`vfloat4 vbz = vfloat4::zero();`
			`for (int z = 0; z < padsize_z; z++)`
			`{`
			`for (int y = 0; y < padsize_y; y++)`
			`{`
			`VARBUF1(z, y, 0) = vbz;`
			`VARBUF2(z, y, 0) = vbz;`
			`}`

			`for (int x = 0; x < padsize_x; x++)`
			`{`
			`VARBUF1(z, 0, x) = vbz;`
			`VARBUF2(z, 0, x) = vbz;`
			`}`
			`}`

			`if (have_z)`
			`{`
			`for (int y = 0; y < padsize_y; y++)`
			`{`
			`for (int x = 0; x < padsize_x; x++)`
			`{`
			`VARBUF1(0, y, x) = vbz;`
			`VARBUF2(0, y, x) = vbz;`
			`}`
			`}`
			`}`

			`// Generate summed-area tables for N and N^2; this is done in-place, using`
			`// a Brent-Kung parallel-prefix based algorithm to minimize precision loss`
			`for (int z = zd_start; z < padsize_z; z++)`
			`{`
			`for (int y = 1; y < padsize_y; y++)`
			`{`
			`brent_kung_prefix_sum(&(VARBUF1(z, y, 1)), padsize_x - 1, 1);`
			`brent_kung_prefix_sum(&(VARBUF2(z, y, 1)), padsize_x - 1, 1);`
			`}`
			`}`

			`for (int z = zd_start; z < padsize_z; z++)`
			`{`
			`for (int x = 1; x < padsize_x; x++)`
			`{`
			`brent_kung_prefix_sum(&(VARBUF1(z, 1, x)), padsize_y - 1, yst);`
			`brent_kung_prefix_sum(&(VARBUF2(z, 1, x)), padsize_y - 1, yst);`
			`}`
			`}`

			`if (have_z)`
			`{`
			`for (int y = 1; y < padsize_y; y++)`
			`{`
			`for (int x = 1; x < padsize_x; x++)`
			`{`
			`brent_kung_prefix_sum(&(VARBUF1(1, y, x)), padsize_z - 1, zst);`
			`brent_kung_prefix_sum(&(VARBUF2(1, y, x)), padsize_z - 1, zst);`
			`}`
			`}`
			`}`

			`// Compute a few constants used in the variance-calculation.`
			`float alpha_kdim = static_cast<float>(2 * alpha_kernel_radius + 1);`
			`float alpha_rsamples;`

			`if (have_z)`
			`{`
			`alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim * alpha_kdim);`
			`}`
			`else`
			`{`
			`alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim);`
			`}`

			`// Use the summed-area tables to compute variance for each neighborhood`
			`if (have_z)`
			`{`
			`for (int z = 0; z < size_z; z++)`
			`{`
			`int z_src = z + kernel_radius_z;`
			`int z_dst = z + offset_z;`
			`int z_low = z_src - alpha_kernel_radius;`
			`int z_high = z_src + alpha_kernel_radius + 1;`

			`for (int y = 0; y < size_y; y++)`
			`{`
			`int y_src = y + kernel_radius_xy;`
			`int y_dst = y + offset_y;`
			`int y_low = y_src - alpha_kernel_radius;`
			`int y_high = y_src + alpha_kernel_radius + 1;`

			`for (int x = 0; x < size_x; x++)`
			`{`
			`int x_src = x + kernel_radius_xy;`
			`int x_dst = x + offset_x;`
			`int x_low = x_src - alpha_kernel_radius;`
			`int x_high = x_src + alpha_kernel_radius + 1;`

			`// Summed-area table lookups for alpha average`
			`float vasum = ( VARBUF1(z_high, y_low, x_low).lane<3>()`
			`- VARBUF1(z_high, y_low, x_high).lane<3>()`
			`- VARBUF1(z_high, y_high, x_low).lane<3>()`
			`+ VARBUF1(z_high, y_high, x_high).lane<3>()) -`
			`( VARBUF1(z_low, y_low, x_low).lane<3>()`
			`- VARBUF1(z_low, y_low, x_high).lane<3>()`
			`- VARBUF1(z_low, y_high, x_low).lane<3>()`
			`+ VARBUF1(z_low, y_high, x_high).lane<3>());`

			`int out_index = z_dst * zdt + y_dst * ydt + x_dst;`
			`input_alpha_averages[out_index] = (vasum * alpha_rsamples);`
			`}`
			`}`
			`}`
			`}`
			`else`
			`{`
			`for (int y = 0; y < size_y; y++)`
			`{`
			`int y_src = y + kernel_radius_xy;`
			`int y_dst = y + offset_y;`
			`int y_low = y_src - alpha_kernel_radius;`
			`int y_high = y_src + alpha_kernel_radius + 1;`

			`for (int x = 0; x < size_x; x++)`
			`{`
			`int x_src = x + kernel_radius_xy;`
			`int x_dst = x + offset_x;`
			`int x_low = x_src - alpha_kernel_radius;`
			`int x_high = x_src + alpha_kernel_radius + 1;`

			`// Summed-area table lookups for alpha average`
			`float vasum = VARBUF1(0, y_low, x_low).lane<3>()`
			`- VARBUF1(0, y_low, x_high).lane<3>()`
			`- VARBUF1(0, y_high, x_low).lane<3>()`
			`+ VARBUF1(0, y_high, x_high).lane<3>();`

			`int out_index = y_dst * ydt + x_dst;`
			`input_alpha_averages[out_index] = (vasum * alpha_rsamples);`
			`}`
			`}`
			`}`
			`}`

			`/* See header for documentation. */`
			`unsigned int init_compute_averages(`
			`const astcenc_image& img,`
			`unsigned int alpha_kernel_radius,`
			`const astcenc_swizzle& swz,`
			`avg_args& ag`
			`) {`
			`unsigned int size_x = img.dim_x;`
			`unsigned int size_y = img.dim_y;`
			`unsigned int size_z = img.dim_z;`

			`// Compute maximum block size and from that the working memory buffer size`
			`unsigned int kernel_radius = alpha_kernel_radius;`
			`unsigned int kerneldim = 2 * kernel_radius + 1;`

			`bool have_z = (size_z > 1);`
			`unsigned int max_blk_size_xy = have_z ? 16 : 32;`
			`unsigned int max_blk_size_z = astc::min(size_z, have_z ? 16u : 1u);`

			`unsigned int max_padsize_xy = max_blk_size_xy + kerneldim;`
			`unsigned int max_padsize_z = max_blk_size_z + (have_z ? kerneldim : 0);`

			`// Perform block-wise averages calculations across the image`
			`// Initialize fields which are not populated until later`
			`ag.arg.size_x = 0;`
			`ag.arg.size_y = 0;`
			`ag.arg.size_z = 0;`
			`ag.arg.offset_x = 0;`
			`ag.arg.offset_y = 0;`
			`ag.arg.offset_z = 0;`
			`ag.arg.work_memory = nullptr;`

			`ag.arg.img = &img;`
			`ag.arg.swz = swz;`
			`ag.arg.have_z = have_z;`
			`ag.arg.alpha_kernel_radius = alpha_kernel_radius;`

			`ag.img_size_x = size_x;`
			`ag.img_size_y = size_y;`
			`ag.img_size_z = size_z;`
			`ag.blk_size_xy = max_blk_size_xy;`
			`ag.blk_size_z = max_blk_size_z;`
			`ag.work_memory_size = 2 * max_padsize_xy * max_padsize_xy * max_padsize_z;`

			`// The parallel task count`
			`unsigned int z_tasks = (size_z + max_blk_size_z - 1) / max_blk_size_z;`
			`unsigned int y_tasks = (size_y + max_blk_size_xy - 1) / max_blk_size_xy;`
			`return z_tasks * y_tasks;`
			`}`

			`#endif`