godot/thirdparty/astcenc/astcenc_decompress_symbolic.cpp

// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

/**
 * @brief Functions to decompress a symbolic block.
 */

#include "astcenc_internal.h"

#include <stdio.h>
#include <assert.h>

/**
 * @brief Compute the integer linear interpolation of two color endpoints.
 *
 * @param decode_mode   The ASTC profile (linear or sRGB)
 * @param color0        The endpoint0 color.
 * @param color1        The endpoint1 color.
 * @param weights        The interpolation weight (between 0 and 64).
 *
 * @return The interpolated color.
 */
static vint4 lerp_color_int(
	astcenc_profile decode_mode,
	vint4 color0,
	vint4 color1,
	vint4 weights
) {
	vint4 weight1 = weights;
	vint4 weight0 = vint4(64) - weight1;

	if (decode_mode == ASTCENC_PRF_LDR_SRGB)
	{
		color0 = asr<8>(color0);
		color1 = asr<8>(color1);
	}

	vint4 color = (color0 * weight0) + (color1 * weight1) + vint4(32);
	color = asr<6>(color);

	if (decode_mode == ASTCENC_PRF_LDR_SRGB)
	{
		color = color * vint4(257);
	}

	return color;
}


/**
 * @brief Convert integer color value into a float value for the decoder.
 *
 * @param data       The integer color value post-interpolation.
 * @param lns_mask   If set treat lane as HDR (LNS) else LDR (unorm16).
 *
 * @return The float color value.
 */
static inline vfloat4 decode_texel(
	vint4 data,
	vmask4 lns_mask
) {
	vint4 color_lns = vint4::zero();
	vint4 color_unorm = vint4::zero();

	if (any(lns_mask))
	{
		color_lns = lns_to_sf16(data);
	}

	if (!all(lns_mask))
	{
		color_unorm = unorm16_to_sf16(data);
	}

	// Pick components and then convert to FP16
	vint4 datai = select(color_unorm, color_lns, lns_mask);
	return float16_to_float(datai);
}

/* See header for documentation. */
void unpack_weights(
	const block_size_descriptor& bsd,
	const symbolic_compressed_block& scb,
	const decimation_info& di,
	bool is_dual_plane,
	int weights_plane1[BLOCK_MAX_TEXELS],
	int weights_plane2[BLOCK_MAX_TEXELS]
) {
	// Safe to overshoot as all arrays are allocated to full size
	if (!is_dual_plane)
	{
		// Build full 64-entry weight lookup table
		vint4 tab0(reinterpret_cast<const int*>(scb.weights +  0));
		vint4 tab1(reinterpret_cast<const int*>(scb.weights + 16));
		vint4 tab2(reinterpret_cast<const int*>(scb.weights + 32));
		vint4 tab3(reinterpret_cast<const int*>(scb.weights + 48));

		vint tab0p, tab1p, tab2p, tab3p;
		vtable_prepare(tab0, tab1, tab2, tab3, tab0p, tab1p, tab2p, tab3p);

		for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)
		{
			vint summed_value(8);
			vint weight_count(di.texel_weight_count + i);
			int max_weight_count = hmax(weight_count).lane<0>();

			promise(max_weight_count > 0);
			for (int j = 0; j < max_weight_count; j++)
			{
				vint texel_weights(di.texel_weights_tr[j] + i);
				vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i);

				summed_value += vtable_8bt_32bi(tab0p, tab1p, tab2p, tab3p, texel_weights) * texel_weights_int;
			}

			store(lsr<4>(summed_value), weights_plane1 + i);
		}
	}
	else
	{
		// Build a 32-entry weight lookup table per plane
		// Plane 1
		vint4 tab0_plane1(reinterpret_cast<const int*>(scb.weights +  0));
		vint4 tab1_plane1(reinterpret_cast<const int*>(scb.weights + 16));
		vint tab0_plane1p, tab1_plane1p;
		vtable_prepare(tab0_plane1, tab1_plane1, tab0_plane1p, tab1_plane1p);

		// Plane 2
		vint4 tab0_plane2(reinterpret_cast<const int*>(scb.weights + 32));
		vint4 tab1_plane2(reinterpret_cast<const int*>(scb.weights + 48));
		vint tab0_plane2p, tab1_plane2p;
		vtable_prepare(tab0_plane2, tab1_plane2, tab0_plane2p, tab1_plane2p);

		for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)
		{
			vint sum_plane1(8);
			vint sum_plane2(8);

			vint weight_count(di.texel_weight_count + i);
			int max_weight_count = hmax(weight_count).lane<0>();

			promise(max_weight_count > 0);
			for (int j = 0; j < max_weight_count; j++)
			{
				vint texel_weights(di.texel_weights_tr[j] + i);
				vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i);

				sum_plane1 += vtable_8bt_32bi(tab0_plane1p, tab1_plane1p, texel_weights) * texel_weights_int;
				sum_plane2 += vtable_8bt_32bi(tab0_plane2p, tab1_plane2p, texel_weights) * texel_weights_int;
			}

			store(lsr<4>(sum_plane1), weights_plane1 + i);
			store(lsr<4>(sum_plane2), weights_plane2 + i);
		}
	}
}

/**
 * @brief Return an FP32 NaN value for use in error colors.
 *
 * This NaN encoding will turn into 0xFFFF when converted to an FP16 NaN.
 *
 * @return The float color value.
 */
static float error_color_nan()
{
	if32 v;
	v.u = 0xFFFFE000U;
	return v.f;
}

/* See header for documentation. */
void decompress_symbolic_block(
	astcenc_profile decode_mode,
	const block_size_descriptor& bsd,
	int xpos,
	int ypos,
	int zpos,
	const symbolic_compressed_block& scb,
	image_block& blk
) {
	blk.xpos = xpos;
	blk.ypos = ypos;
	blk.zpos = zpos;

	blk.data_min = vfloat4::zero();
	blk.data_mean = vfloat4::zero();
	blk.data_max = vfloat4::zero();
	blk.grayscale = false;

	// If we detected an error-block, blow up immediately.
	if (scb.block_type == SYM_BTYPE_ERROR)
	{
		for (unsigned int i = 0; i < bsd.texel_count; i++)
		{
			blk.data_r[i] = error_color_nan();
			blk.data_g[i] = error_color_nan();
			blk.data_b[i] = error_color_nan();
			blk.data_a[i] = error_color_nan();
			blk.rgb_lns[i] = 0;
			blk.alpha_lns[i] = 0;
		}

		return;
	}

	if ((scb.block_type == SYM_BTYPE_CONST_F16) ||
	    (scb.block_type == SYM_BTYPE_CONST_U16))
	{
		vfloat4 color;
		uint8_t use_lns = 0;

		// UNORM16 constant color block
		if (scb.block_type == SYM_BTYPE_CONST_U16)
		{
			vint4 colori(scb.constant_color);

			// For sRGB decoding a real decoder would just use the top 8 bits for color conversion.
			// We don't color convert, so rescale the top 8 bits into the full 16 bit dynamic range.
			if (decode_mode == ASTCENC_PRF_LDR_SRGB)
			{
				colori = asr<8>(colori) * 257;
			}

			vint4 colorf16 = unorm16_to_sf16(colori);
			color = float16_to_float(colorf16);
		}
		// FLOAT16 constant color block
		else
		{
			switch (decode_mode)
			{
			case ASTCENC_PRF_LDR_SRGB:
			case ASTCENC_PRF_LDR:
				color = vfloat4(error_color_nan());
				break;
			case ASTCENC_PRF_HDR_RGB_LDR_A:
			case ASTCENC_PRF_HDR:
				// Constant-color block; unpack from FP16 to FP32.
				color = float16_to_float(vint4(scb.constant_color));
				use_lns = 1;
				break;
			}
		}

		for (unsigned int i = 0; i < bsd.texel_count; i++)
		{
			blk.data_r[i] = color.lane<0>();
			blk.data_g[i] = color.lane<1>();
			blk.data_b[i] = color.lane<2>();
			blk.data_a[i] = color.lane<3>();
			blk.rgb_lns[i] = use_lns;
			blk.alpha_lns[i] = use_lns;
		}

		return;
	}

	// Get the appropriate partition-table entry
	int partition_count = scb.partition_count;
	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);

	// Get the appropriate block descriptors
	const auto& bm = bsd.get_block_mode(scb.block_mode);
	const auto& di = bsd.get_decimation_info(bm.decimation_mode);

	bool is_dual_plane = static_cast<bool>(bm.is_dual_plane);

	// Unquantize and undecimate the weights
	int plane1_weights[BLOCK_MAX_TEXELS];
	int plane2_weights[BLOCK_MAX_TEXELS];
	unpack_weights(bsd, scb, di, is_dual_plane, plane1_weights, plane2_weights);

	// Now that we have endpoint colors and weights, we can unpack texel colors
	int plane2_component = scb.plane2_component;
	vmask4 plane2_mask = vint4::lane_id() == vint4(plane2_component);

	for (int i = 0; i < partition_count; i++)
	{
		// Decode the color endpoints for this partition
		vint4 ep0;
		vint4 ep1;
		bool rgb_lns;
		bool a_lns;

		unpack_color_endpoints(decode_mode,
		                       scb.color_formats[i],
		                       scb.color_values[i],
		                       rgb_lns, a_lns,
		                       ep0, ep1);

		vmask4 lns_mask(rgb_lns, rgb_lns, rgb_lns, a_lns);

		int texel_count = pi.partition_texel_count[i];
		for (int j = 0; j < texel_count; j++)
		{
			int tix = pi.texels_of_partition[i][j];
			vint4 weight = select(vint4(plane1_weights[tix]), vint4(plane2_weights[tix]), plane2_mask);
			vint4 color = lerp_color_int(decode_mode, ep0, ep1, weight);
			vfloat4 colorf = decode_texel(color, lns_mask);

			blk.data_r[tix] = colorf.lane<0>();
			blk.data_g[tix] = colorf.lane<1>();
			blk.data_b[tix] = colorf.lane<2>();
			blk.data_a[tix] = colorf.lane<3>();
		}
	}
}

#if !defined(ASTCENC_DECOMPRESS_ONLY)

/* See header for documentation. */
float compute_symbolic_block_difference_2plane(
	const astcenc_config& config,
	const block_size_descriptor& bsd,
	const symbolic_compressed_block& scb,
	const image_block& blk
) {
	// If we detected an error-block, blow up immediately.
	if (scb.block_type == SYM_BTYPE_ERROR)
	{
		return ERROR_CALC_DEFAULT;
	}

	assert(scb.block_mode >= 0);
	assert(scb.partition_count == 1);
	assert(bsd.get_block_mode(scb.block_mode).is_dual_plane == 1);

	// Get the appropriate block descriptor
	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);

	// Unquantize and undecimate the weights
	int plane1_weights[BLOCK_MAX_TEXELS];
	int plane2_weights[BLOCK_MAX_TEXELS];
	unpack_weights(bsd, scb, di, true, plane1_weights, plane2_weights);

	vmask4 plane2_mask = vint4::lane_id() == vint4(scb.plane2_component);

	vfloat4 summa = vfloat4::zero();

	// Decode the color endpoints for this partition
	vint4 ep0;
	vint4 ep1;
	bool rgb_lns;
	bool a_lns;

	unpack_color_endpoints(config.profile,
	                       scb.color_formats[0],
	                       scb.color_values[0],
	                       rgb_lns, a_lns,
	                       ep0, ep1);

	// Unpack and compute error for each texel in the partition
	unsigned int texel_count = bsd.texel_count;
	for (unsigned int i = 0; i < texel_count; i++)
	{
		vint4 weight = select(vint4(plane1_weights[i]), vint4(plane2_weights[i]), plane2_mask);
		vint4 colori = lerp_color_int(config.profile, ep0, ep1, weight);

		vfloat4 color = int_to_float(colori);
		vfloat4 oldColor = blk.texel(i);

		// Compare error using a perceptual decode metric for RGBM textures
		if (config.flags & ASTCENC_FLG_MAP_RGBM)
		{
			// Fail encodings that result in zero weight M pixels. Note that this can cause
			// "interesting" artifacts if we reject all useful encodings - we typically get max
			// brightness encodings instead which look just as bad. We recommend users apply a
			// bias to their stored M value, limiting the lower value to 16 or 32 to avoid
			// getting small M values post-quantization, but we can't prove it would never
			// happen, especially at low bit rates ...
			if (color.lane<3>() == 0.0f)
			{
				return -ERROR_CALC_DEFAULT;
			}

			// Compute error based on decoded RGBM color
			color = vfloat4(
				color.lane<0>() * color.lane<3>() * config.rgbm_m_scale,
				color.lane<1>() * color.lane<3>() * config.rgbm_m_scale,
				color.lane<2>() * color.lane<3>() * config.rgbm_m_scale,
				1.0f
			);

			oldColor = vfloat4(
				oldColor.lane<0>() * oldColor.lane<3>() * config.rgbm_m_scale,
				oldColor.lane<1>() * oldColor.lane<3>() * config.rgbm_m_scale,
				oldColor.lane<2>() * oldColor.lane<3>() * config.rgbm_m_scale,
				1.0f
			);
		}

		vfloat4 error = oldColor - color;
		error = min(abs(error), 1e15f);
		error = error * error;

		summa += min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT);
	}

	return summa.lane<0>();
}

/* See header for documentation. */
float compute_symbolic_block_difference_1plane(
	const astcenc_config& config,
	const block_size_descriptor& bsd,
	const symbolic_compressed_block& scb,
	const image_block& blk
) {
	assert(bsd.get_block_mode(scb.block_mode).is_dual_plane == 0);

	// If we detected an error-block, blow up immediately.
	if (scb.block_type == SYM_BTYPE_ERROR)
	{
		return ERROR_CALC_DEFAULT;
	}

	assert(scb.block_mode >= 0);

	// Get the appropriate partition-table entry
	unsigned int partition_count = scb.partition_count;
	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);

	// Get the appropriate block descriptor
	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);

	// Unquantize and undecimate the weights
	int plane1_weights[BLOCK_MAX_TEXELS];
	unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);

	vfloat4 summa = vfloat4::zero();
	for (unsigned int i = 0; i < partition_count; i++)
	{
		// Decode the color endpoints for this partition
		vint4 ep0;
		vint4 ep1;
		bool rgb_lns;
		bool a_lns;

		unpack_color_endpoints(config.profile,
		                       scb.color_formats[i],
		                       scb.color_values[i],
		                       rgb_lns, a_lns,
		                       ep0, ep1);

		// Unpack and compute error for each texel in the partition
		unsigned int texel_count = pi.partition_texel_count[i];
		for (unsigned int j = 0; j < texel_count; j++)
		{
			unsigned int tix = pi.texels_of_partition[i][j];
			vint4 colori = lerp_color_int(config.profile, ep0, ep1,
			                              vint4(plane1_weights[tix]));

			vfloat4 color = int_to_float(colori);
			vfloat4 oldColor = blk.texel(tix);

			// Compare error using a perceptual decode metric for RGBM textures
			if (config.flags & ASTCENC_FLG_MAP_RGBM)
			{
				// Fail encodings that result in zero weight M pixels. Note that this can cause
				// "interesting" artifacts if we reject all useful encodings - we typically get max
				// brightness encodings instead which look just as bad. We recommend users apply a
				// bias to their stored M value, limiting the lower value to 16 or 32 to avoid
				// getting small M values post-quantization, but we can't prove it would never
				// happen, especially at low bit rates ...
				if (color.lane<3>() == 0.0f)
				{
					return -ERROR_CALC_DEFAULT;
				}

				// Compute error based on decoded RGBM color
				color = vfloat4(
					color.lane<0>() * color.lane<3>() * config.rgbm_m_scale,
					color.lane<1>() * color.lane<3>() * config.rgbm_m_scale,
					color.lane<2>() * color.lane<3>() * config.rgbm_m_scale,
					1.0f
				);

				oldColor = vfloat4(
					oldColor.lane<0>() * oldColor.lane<3>() * config.rgbm_m_scale,
					oldColor.lane<1>() * oldColor.lane<3>() * config.rgbm_m_scale,
					oldColor.lane<2>() * oldColor.lane<3>() * config.rgbm_m_scale,
					1.0f
				);
			}

			vfloat4 error = oldColor - color;
			error = min(abs(error), 1e15f);
			error = error * error;

			summa += min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT);
		}
	}

	return summa.lane<0>();
}

/* See header for documentation. */
float compute_symbolic_block_difference_1plane_1partition(
	const astcenc_config& config,
	const block_size_descriptor& bsd,
	const symbolic_compressed_block& scb,
	const image_block& blk
) {
	// If we detected an error-block, blow up immediately.
	if (scb.block_type == SYM_BTYPE_ERROR)
	{
		return ERROR_CALC_DEFAULT;
	}

	assert(scb.block_mode >= 0);
	assert(bsd.get_partition_info(scb.partition_count, scb.partition_index).partition_count == 1);

	// Get the appropriate block descriptor
	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);

	// Unquantize and undecimate the weights
	alignas(ASTCENC_VECALIGN) int plane1_weights[BLOCK_MAX_TEXELS];
	unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);

	// Decode the color endpoints for this partition
	vint4 ep0;
	vint4 ep1;
	bool rgb_lns;
	bool a_lns;

	unpack_color_endpoints(config.profile,
	                       scb.color_formats[0],
	                       scb.color_values[0],
	                       rgb_lns, a_lns,
	                       ep0, ep1);


	// Pre-shift sRGB so things round correctly
	if (config.profile == ASTCENC_PRF_LDR_SRGB)
	{
		ep0 = asr<8>(ep0);
		ep1 = asr<8>(ep1);
	}

	// Unpack and compute error for each texel in the partition
	vfloatacc summav = vfloatacc::zero();

	vint lane_id = vint::lane_id();
	vint srgb_scale(config.profile == ASTCENC_PRF_LDR_SRGB ? 257 : 1);

	unsigned int texel_count = bsd.texel_count;
	for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
	{
		// Compute EP1 contribution
		vint weight1 = vint::loada(plane1_weights + i);
		vint ep1_r = vint(ep1.lane<0>()) * weight1;
		vint ep1_g = vint(ep1.lane<1>()) * weight1;
		vint ep1_b = vint(ep1.lane<2>()) * weight1;
		vint ep1_a = vint(ep1.lane<3>()) * weight1;

		// Compute EP0 contribution
		vint weight0 = vint(64) - weight1;
		vint ep0_r = vint(ep0.lane<0>()) * weight0;
		vint ep0_g = vint(ep0.lane<1>()) * weight0;
		vint ep0_b = vint(ep0.lane<2>()) * weight0;
		vint ep0_a = vint(ep0.lane<3>()) * weight0;

		// Shift so things round correctly
		vint colori_r = asr<6>(ep0_r + ep1_r + vint(32)) * srgb_scale;
		vint colori_g = asr<6>(ep0_g + ep1_g + vint(32)) * srgb_scale;
		vint colori_b = asr<6>(ep0_b + ep1_b + vint(32)) * srgb_scale;
		vint colori_a = asr<6>(ep0_a + ep1_a + vint(32)) * srgb_scale;

		// Compute color diff
		vfloat color_r = int_to_float(colori_r);
		vfloat color_g = int_to_float(colori_g);
		vfloat color_b = int_to_float(colori_b);
		vfloat color_a = int_to_float(colori_a);

		vfloat color_orig_r = loada(blk.data_r + i);
		vfloat color_orig_g = loada(blk.data_g + i);
		vfloat color_orig_b = loada(blk.data_b + i);
		vfloat color_orig_a = loada(blk.data_a + i);

		vfloat color_error_r = min(abs(color_orig_r - color_r), vfloat(1e15f));
		vfloat color_error_g = min(abs(color_orig_g - color_g), vfloat(1e15f));
		vfloat color_error_b = min(abs(color_orig_b - color_b), vfloat(1e15f));
		vfloat color_error_a = min(abs(color_orig_a - color_a), vfloat(1e15f));

		// Compute squared error metric
		color_error_r = color_error_r * color_error_r;
		color_error_g = color_error_g * color_error_g;
		color_error_b = color_error_b * color_error_b;
		color_error_a = color_error_a * color_error_a;

		vfloat metric = color_error_r * blk.channel_weight.lane<0>()
		              + color_error_g * blk.channel_weight.lane<1>()
		              + color_error_b * blk.channel_weight.lane<2>()
		              + color_error_a * blk.channel_weight.lane<3>();

		// Mask off bad lanes
		vmask mask = lane_id < vint(texel_count);
		lane_id += vint(ASTCENC_SIMD_WIDTH);
		haccumulate(summav, metric, mask);
	}

	return hadd_s(summav);
}

#endif
Add ASTC compression and decompression with Arm astcenc. Co-authored-by: Gordon A Macpherson <gordon.a.macpherson@gmail.com> Co-authored-by: Rémi Verschelde <rverschelde@gmail.com> 2022-12-21 02:54:01 +08:00			`// SPDX-License-Identifier: Apache-2.0`
			`// ----------------------------------------------------------------------------`
			`// Copyright 2011-2023 Arm Limited`
			`//`
			`// Licensed under the Apache License, Version 2.0 (the "License"); you may not`
			`// use this file except in compliance with the License. You may obtain a copy`
			`// of the License at:`
			`//`
			`// http://www.apache.org/licenses/LICENSE-2.0`
			`//`
			`// Unless required by applicable law or agreed to in writing, software`
			`// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT`
			`// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the`
			`// License for the specific language governing permissions and limitations`
			`// under the License.`
			`// ----------------------------------------------------------------------------`

			`/**`
			`* @brief Functions to decompress a symbolic block.`
			`*/`

			`#include "astcenc_internal.h"`

			`#include <stdio.h>`
			`#include <assert.h>`

			`/**`
			`* @brief Compute the integer linear interpolation of two color endpoints.`
			`*`
			`* @param decode_mode The ASTC profile (linear or sRGB)`
			`* @param color0 The endpoint0 color.`
			`* @param color1 The endpoint1 color.`
			`* @param weights The interpolation weight (between 0 and 64).`
			`*`
			`* @return The interpolated color.`
			`*/`
			`static vint4 lerp_color_int(`
			`astcenc_profile decode_mode,`
			`vint4 color0,`
			`vint4 color1,`
			`vint4 weights`
			`) {`
			`vint4 weight1 = weights;`
			`vint4 weight0 = vint4(64) - weight1;`

			`if (decode_mode == ASTCENC_PRF_LDR_SRGB)`
			`{`
			`color0 = asr<8>(color0);`
			`color1 = asr<8>(color1);`
			`}`

			`vint4 color = (color0 * weight0) + (color1 * weight1) + vint4(32);`
			`color = asr<6>(color);`

			`if (decode_mode == ASTCENC_PRF_LDR_SRGB)`
			`{`
			`color = color * vint4(257);`
			`}`

			`return color;`
			`}`


			`/**`
			`* @brief Convert integer color value into a float value for the decoder.`
			`*`
			`* @param data The integer color value post-interpolation.`
			`* @param lns_mask If set treat lane as HDR (LNS) else LDR (unorm16).`
			`*`
			`* @return The float color value.`
			`*/`
			`static inline vfloat4 decode_texel(`
			`vint4 data,`
			`vmask4 lns_mask`
			`) {`
			`vint4 color_lns = vint4::zero();`
			`vint4 color_unorm = vint4::zero();`

			`if (any(lns_mask))`
			`{`
			`color_lns = lns_to_sf16(data);`
			`}`

			`if (!all(lns_mask))`
			`{`
			`color_unorm = unorm16_to_sf16(data);`
			`}`

			`// Pick components and then convert to FP16`
			`vint4 datai = select(color_unorm, color_lns, lns_mask);`
			`return float16_to_float(datai);`
			`}`

			`/* See header for documentation. */`
			`void unpack_weights(`
			`const block_size_descriptor& bsd,`
			`const symbolic_compressed_block& scb,`
			`const decimation_info& di,`
			`bool is_dual_plane,`
			`int weights_plane1[BLOCK_MAX_TEXELS],`
			`int weights_plane2[BLOCK_MAX_TEXELS]`
			`) {`
			`// Safe to overshoot as all arrays are allocated to full size`
			`if (!is_dual_plane)`
			`{`
			`// Build full 64-entry weight lookup table`
			`vint4 tab0(reinterpret_cast<const int*>(scb.weights + 0));`
			`vint4 tab1(reinterpret_cast<const int*>(scb.weights + 16));`
			`vint4 tab2(reinterpret_cast<const int*>(scb.weights + 32));`
			`vint4 tab3(reinterpret_cast<const int*>(scb.weights + 48));`

			`vint tab0p, tab1p, tab2p, tab3p;`
			`vtable_prepare(tab0, tab1, tab2, tab3, tab0p, tab1p, tab2p, tab3p);`

			`for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)`
			`{`
			`vint summed_value(8);`
			`vint weight_count(di.texel_weight_count + i);`
			`int max_weight_count = hmax(weight_count).lane<0>();`

			`promise(max_weight_count > 0);`
			`for (int j = 0; j < max_weight_count; j++)`
			`{`
			`vint texel_weights(di.texel_weights_tr[j] + i);`
			`vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i);`

			`summed_value += vtable_8bt_32bi(tab0p, tab1p, tab2p, tab3p, texel_weights) * texel_weights_int;`
			`}`

			`store(lsr<4>(summed_value), weights_plane1 + i);`
			`}`
			`}`
			`else`
			`{`
			`// Build a 32-entry weight lookup table per plane`
			`// Plane 1`
			`vint4 tab0_plane1(reinterpret_cast<const int*>(scb.weights + 0));`
			`vint4 tab1_plane1(reinterpret_cast<const int*>(scb.weights + 16));`
			`vint tab0_plane1p, tab1_plane1p;`
			`vtable_prepare(tab0_plane1, tab1_plane1, tab0_plane1p, tab1_plane1p);`

			`// Plane 2`
			`vint4 tab0_plane2(reinterpret_cast<const int*>(scb.weights + 32));`
			`vint4 tab1_plane2(reinterpret_cast<const int*>(scb.weights + 48));`
			`vint tab0_plane2p, tab1_plane2p;`
			`vtable_prepare(tab0_plane2, tab1_plane2, tab0_plane2p, tab1_plane2p);`

			`for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)`
			`{`
			`vint sum_plane1(8);`
			`vint sum_plane2(8);`

			`vint weight_count(di.texel_weight_count + i);`
			`int max_weight_count = hmax(weight_count).lane<0>();`

			`promise(max_weight_count > 0);`
			`for (int j = 0; j < max_weight_count; j++)`
			`{`
			`vint texel_weights(di.texel_weights_tr[j] + i);`
			`vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i);`

			`sum_plane1 += vtable_8bt_32bi(tab0_plane1p, tab1_plane1p, texel_weights) * texel_weights_int;`
			`sum_plane2 += vtable_8bt_32bi(tab0_plane2p, tab1_plane2p, texel_weights) * texel_weights_int;`
			`}`

			`store(lsr<4>(sum_plane1), weights_plane1 + i);`
			`store(lsr<4>(sum_plane2), weights_plane2 + i);`
			`}`
			`}`
			`}`

			`/**`
			`* @brief Return an FP32 NaN value for use in error colors.`
			`*`
			`* This NaN encoding will turn into 0xFFFF when converted to an FP16 NaN.`
			`*`
			`* @return The float color value.`
			`*/`
			`static float error_color_nan()`
			`{`
			`if32 v;`
			`v.u = 0xFFFFE000U;`
			`return v.f;`
			`}`

			`/* See header for documentation. */`
			`void decompress_symbolic_block(`
			`astcenc_profile decode_mode,`
			`const block_size_descriptor& bsd,`
			`int xpos,`
			`int ypos,`
			`int zpos,`
			`const symbolic_compressed_block& scb,`
			`image_block& blk`
			`) {`
			`blk.xpos = xpos;`
			`blk.ypos = ypos;`
			`blk.zpos = zpos;`

			`blk.data_min = vfloat4::zero();`
			`blk.data_mean = vfloat4::zero();`
			`blk.data_max = vfloat4::zero();`
			`blk.grayscale = false;`

			`// If we detected an error-block, blow up immediately.`
			`if (scb.block_type == SYM_BTYPE_ERROR)`
			`{`
			`for (unsigned int i = 0; i < bsd.texel_count; i++)`
			`{`
			`blk.data_r[i] = error_color_nan();`
			`blk.data_g[i] = error_color_nan();`
			`blk.data_b[i] = error_color_nan();`
			`blk.data_a[i] = error_color_nan();`
			`blk.rgb_lns[i] = 0;`
			`blk.alpha_lns[i] = 0;`
			`}`

			`return;`
			`}`

			`if ((scb.block_type == SYM_BTYPE_CONST_F16) \|\|`
			`(scb.block_type == SYM_BTYPE_CONST_U16))`
			`{`
			`vfloat4 color;`
			`uint8_t use_lns = 0;`

			`// UNORM16 constant color block`
			`if (scb.block_type == SYM_BTYPE_CONST_U16)`
			`{`
			`vint4 colori(scb.constant_color);`

			`// For sRGB decoding a real decoder would just use the top 8 bits for color conversion.`
			`// We don't color convert, so rescale the top 8 bits into the full 16 bit dynamic range.`
			`if (decode_mode == ASTCENC_PRF_LDR_SRGB)`
			`{`
			`colori = asr<8>(colori) * 257;`
			`}`

			`vint4 colorf16 = unorm16_to_sf16(colori);`
			`color = float16_to_float(colorf16);`
			`}`
			`// FLOAT16 constant color block`
			`else`
			`{`
			`switch (decode_mode)`
			`{`
			`case ASTCENC_PRF_LDR_SRGB:`
			`case ASTCENC_PRF_LDR:`
			`color = vfloat4(error_color_nan());`
			`break;`
			`case ASTCENC_PRF_HDR_RGB_LDR_A:`
			`case ASTCENC_PRF_HDR:`
			`// Constant-color block; unpack from FP16 to FP32.`
			`color = float16_to_float(vint4(scb.constant_color));`
			`use_lns = 1;`
			`break;`
			`}`
			`}`

			`for (unsigned int i = 0; i < bsd.texel_count; i++)`
			`{`
			`blk.data_r[i] = color.lane<0>();`
			`blk.data_g[i] = color.lane<1>();`
			`blk.data_b[i] = color.lane<2>();`
			`blk.data_a[i] = color.lane<3>();`
			`blk.rgb_lns[i] = use_lns;`
			`blk.alpha_lns[i] = use_lns;`
			`}`

			`return;`
			`}`

			`// Get the appropriate partition-table entry`
			`int partition_count = scb.partition_count;`
			`const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);`

			`// Get the appropriate block descriptors`
			`const auto& bm = bsd.get_block_mode(scb.block_mode);`
			`const auto& di = bsd.get_decimation_info(bm.decimation_mode);`

			`bool is_dual_plane = static_cast<bool>(bm.is_dual_plane);`

			`// Unquantize and undecimate the weights`
			`int plane1_weights[BLOCK_MAX_TEXELS];`
			`int plane2_weights[BLOCK_MAX_TEXELS];`
			`unpack_weights(bsd, scb, di, is_dual_plane, plane1_weights, plane2_weights);`

			`// Now that we have endpoint colors and weights, we can unpack texel colors`
			`int plane2_component = scb.plane2_component;`
			`vmask4 plane2_mask = vint4::lane_id() == vint4(plane2_component);`

			`for (int i = 0; i < partition_count; i++)`
			`{`
			`// Decode the color endpoints for this partition`
			`vint4 ep0;`
			`vint4 ep1;`
			`bool rgb_lns;`
			`bool a_lns;`

			`unpack_color_endpoints(decode_mode,`
			`scb.color_formats[i],`
			`scb.color_values[i],`
			`rgb_lns, a_lns,`
			`ep0, ep1);`

			`vmask4 lns_mask(rgb_lns, rgb_lns, rgb_lns, a_lns);`

			`int texel_count = pi.partition_texel_count[i];`
			`for (int j = 0; j < texel_count; j++)`
			`{`
			`int tix = pi.texels_of_partition[i][j];`
			`vint4 weight = select(vint4(plane1_weights[tix]), vint4(plane2_weights[tix]), plane2_mask);`
			`vint4 color = lerp_color_int(decode_mode, ep0, ep1, weight);`
			`vfloat4 colorf = decode_texel(color, lns_mask);`

			`blk.data_r[tix] = colorf.lane<0>();`
			`blk.data_g[tix] = colorf.lane<1>();`
			`blk.data_b[tix] = colorf.lane<2>();`
			`blk.data_a[tix] = colorf.lane<3>();`
			`}`
			`}`
			`}`

			`#if !defined(ASTCENC_DECOMPRESS_ONLY)`

			`/* See header for documentation. */`
			`float compute_symbolic_block_difference_2plane(`
			`const astcenc_config& config,`
			`const block_size_descriptor& bsd,`
			`const symbolic_compressed_block& scb,`
			`const image_block& blk`
			`) {`
			`// If we detected an error-block, blow up immediately.`
			`if (scb.block_type == SYM_BTYPE_ERROR)`
			`{`
			`return ERROR_CALC_DEFAULT;`
			`}`

			`assert(scb.block_mode >= 0);`
			`assert(scb.partition_count == 1);`
			`assert(bsd.get_block_mode(scb.block_mode).is_dual_plane == 1);`

			`// Get the appropriate block descriptor`
			`const block_mode& bm = bsd.get_block_mode(scb.block_mode);`
			`const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);`

			`// Unquantize and undecimate the weights`
			`int plane1_weights[BLOCK_MAX_TEXELS];`
			`int plane2_weights[BLOCK_MAX_TEXELS];`
			`unpack_weights(bsd, scb, di, true, plane1_weights, plane2_weights);`

			`vmask4 plane2_mask = vint4::lane_id() == vint4(scb.plane2_component);`

			`vfloat4 summa = vfloat4::zero();`

			`// Decode the color endpoints for this partition`
			`vint4 ep0;`
			`vint4 ep1;`
			`bool rgb_lns;`
			`bool a_lns;`

			`unpack_color_endpoints(config.profile,`
			`scb.color_formats[0],`
			`scb.color_values[0],`
			`rgb_lns, a_lns,`
			`ep0, ep1);`

			`// Unpack and compute error for each texel in the partition`
			`unsigned int texel_count = bsd.texel_count;`
			`for (unsigned int i = 0; i < texel_count; i++)`
			`{`
			`vint4 weight = select(vint4(plane1_weights[i]), vint4(plane2_weights[i]), plane2_mask);`
			`vint4 colori = lerp_color_int(config.profile, ep0, ep1, weight);`

			`vfloat4 color = int_to_float(colori);`
			`vfloat4 oldColor = blk.texel(i);`

			`// Compare error using a perceptual decode metric for RGBM textures`
			`if (config.flags & ASTCENC_FLG_MAP_RGBM)`
			`{`
			`// Fail encodings that result in zero weight M pixels. Note that this can cause`
			`// "interesting" artifacts if we reject all useful encodings - we typically get max`
			`// brightness encodings instead which look just as bad. We recommend users apply a`
			`// bias to their stored M value, limiting the lower value to 16 or 32 to avoid`
			`// getting small M values post-quantization, but we can't prove it would never`
			`// happen, especially at low bit rates ...`
			`if (color.lane<3>() == 0.0f)`
			`{`
			`return -ERROR_CALC_DEFAULT;`
			`}`

			`// Compute error based on decoded RGBM color`
			`color = vfloat4(`
			`color.lane<0>() * color.lane<3>() * config.rgbm_m_scale,`
			`color.lane<1>() * color.lane<3>() * config.rgbm_m_scale,`
			`color.lane<2>() * color.lane<3>() * config.rgbm_m_scale,`
			`1.0f`
			`);`

			`oldColor = vfloat4(`
			`oldColor.lane<0>() * oldColor.lane<3>() * config.rgbm_m_scale,`
			`oldColor.lane<1>() * oldColor.lane<3>() * config.rgbm_m_scale,`
			`oldColor.lane<2>() * oldColor.lane<3>() * config.rgbm_m_scale,`
			`1.0f`
			`);`
			`}`

			`vfloat4 error = oldColor - color;`
			`error = min(abs(error), 1e15f);`
			`error = error * error;`

			`summa += min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT);`
			`}`

			`return summa.lane<0>();`
			`}`

			`/* See header for documentation. */`
			`float compute_symbolic_block_difference_1plane(`
			`const astcenc_config& config,`
			`const block_size_descriptor& bsd,`
			`const symbolic_compressed_block& scb,`
			`const image_block& blk`
			`) {`
			`assert(bsd.get_block_mode(scb.block_mode).is_dual_plane == 0);`

			`// If we detected an error-block, blow up immediately.`
			`if (scb.block_type == SYM_BTYPE_ERROR)`
			`{`
			`return ERROR_CALC_DEFAULT;`
			`}`

			`assert(scb.block_mode >= 0);`

			`// Get the appropriate partition-table entry`
			`unsigned int partition_count = scb.partition_count;`
			`const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);`

			`// Get the appropriate block descriptor`
			`const block_mode& bm = bsd.get_block_mode(scb.block_mode);`
			`const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);`

			`// Unquantize and undecimate the weights`
			`int plane1_weights[BLOCK_MAX_TEXELS];`
			`unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);`

			`vfloat4 summa = vfloat4::zero();`
			`for (unsigned int i = 0; i < partition_count; i++)`
			`{`
			`// Decode the color endpoints for this partition`
			`vint4 ep0;`
			`vint4 ep1;`
			`bool rgb_lns;`
			`bool a_lns;`

			`unpack_color_endpoints(config.profile,`
			`scb.color_formats[i],`
			`scb.color_values[i],`
			`rgb_lns, a_lns,`
			`ep0, ep1);`

			`// Unpack and compute error for each texel in the partition`
			`unsigned int texel_count = pi.partition_texel_count[i];`
			`for (unsigned int j = 0; j < texel_count; j++)`
			`{`
			`unsigned int tix = pi.texels_of_partition[i][j];`
			`vint4 colori = lerp_color_int(config.profile, ep0, ep1,`
			`vint4(plane1_weights[tix]));`

			`vfloat4 color = int_to_float(colori);`
			`vfloat4 oldColor = blk.texel(tix);`

			`// Compare error using a perceptual decode metric for RGBM textures`
			`if (config.flags & ASTCENC_FLG_MAP_RGBM)`
			`{`
			`// Fail encodings that result in zero weight M pixels. Note that this can cause`
			`// "interesting" artifacts if we reject all useful encodings - we typically get max`
			`// brightness encodings instead which look just as bad. We recommend users apply a`
			`// bias to their stored M value, limiting the lower value to 16 or 32 to avoid`
			`// getting small M values post-quantization, but we can't prove it would never`
			`// happen, especially at low bit rates ...`
			`if (color.lane<3>() == 0.0f)`
			`{`
			`return -ERROR_CALC_DEFAULT;`
			`}`

			`// Compute error based on decoded RGBM color`
			`color = vfloat4(`
			`color.lane<0>() * color.lane<3>() * config.rgbm_m_scale,`
			`color.lane<1>() * color.lane<3>() * config.rgbm_m_scale,`
			`color.lane<2>() * color.lane<3>() * config.rgbm_m_scale,`
			`1.0f`
			`);`

			`oldColor = vfloat4(`
			`oldColor.lane<0>() * oldColor.lane<3>() * config.rgbm_m_scale,`
			`oldColor.lane<1>() * oldColor.lane<3>() * config.rgbm_m_scale,`
			`oldColor.lane<2>() * oldColor.lane<3>() * config.rgbm_m_scale,`
			`1.0f`
			`);`
			`}`

			`vfloat4 error = oldColor - color;`
			`error = min(abs(error), 1e15f);`
			`error = error * error;`

			`summa += min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT);`
			`}`
			`}`

			`return summa.lane<0>();`
			`}`

			`/* See header for documentation. */`
			`float compute_symbolic_block_difference_1plane_1partition(`
			`const astcenc_config& config,`
			`const block_size_descriptor& bsd,`
			`const symbolic_compressed_block& scb,`
			`const image_block& blk`
			`) {`
			`// If we detected an error-block, blow up immediately.`
			`if (scb.block_type == SYM_BTYPE_ERROR)`
			`{`
			`return ERROR_CALC_DEFAULT;`
			`}`

			`assert(scb.block_mode >= 0);`
			`assert(bsd.get_partition_info(scb.partition_count, scb.partition_index).partition_count == 1);`

			`// Get the appropriate block descriptor`
			`const block_mode& bm = bsd.get_block_mode(scb.block_mode);`
			`const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);`

			`// Unquantize and undecimate the weights`
			`alignas(ASTCENC_VECALIGN) int plane1_weights[BLOCK_MAX_TEXELS];`
			`unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);`

			`// Decode the color endpoints for this partition`
			`vint4 ep0;`
			`vint4 ep1;`
			`bool rgb_lns;`
			`bool a_lns;`

			`unpack_color_endpoints(config.profile,`
			`scb.color_formats[0],`
			`scb.color_values[0],`
			`rgb_lns, a_lns,`
			`ep0, ep1);`


			`// Pre-shift sRGB so things round correctly`
			`if (config.profile == ASTCENC_PRF_LDR_SRGB)`
			`{`
			`ep0 = asr<8>(ep0);`
			`ep1 = asr<8>(ep1);`
			`}`

			`// Unpack and compute error for each texel in the partition`
			`vfloatacc summav = vfloatacc::zero();`

			`vint lane_id = vint::lane_id();`
			`vint srgb_scale(config.profile == ASTCENC_PRF_LDR_SRGB ? 257 : 1);`

			`unsigned int texel_count = bsd.texel_count;`
			`for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)`
			`{`
			`// Compute EP1 contribution`
			`vint weight1 = vint::loada(plane1_weights + i);`
			`vint ep1_r = vint(ep1.lane<0>()) * weight1;`
			`vint ep1_g = vint(ep1.lane<1>()) * weight1;`
			`vint ep1_b = vint(ep1.lane<2>()) * weight1;`
			`vint ep1_a = vint(ep1.lane<3>()) * weight1;`

			`// Compute EP0 contribution`
			`vint weight0 = vint(64) - weight1;`
			`vint ep0_r = vint(ep0.lane<0>()) * weight0;`
			`vint ep0_g = vint(ep0.lane<1>()) * weight0;`
			`vint ep0_b = vint(ep0.lane<2>()) * weight0;`
			`vint ep0_a = vint(ep0.lane<3>()) * weight0;`

			`// Shift so things round correctly`
			`vint colori_r = asr<6>(ep0_r + ep1_r + vint(32)) * srgb_scale;`
			`vint colori_g = asr<6>(ep0_g + ep1_g + vint(32)) * srgb_scale;`
			`vint colori_b = asr<6>(ep0_b + ep1_b + vint(32)) * srgb_scale;`
			`vint colori_a = asr<6>(ep0_a + ep1_a + vint(32)) * srgb_scale;`

			`// Compute color diff`
			`vfloat color_r = int_to_float(colori_r);`
			`vfloat color_g = int_to_float(colori_g);`
			`vfloat color_b = int_to_float(colori_b);`
			`vfloat color_a = int_to_float(colori_a);`

			`vfloat color_orig_r = loada(blk.data_r + i);`
			`vfloat color_orig_g = loada(blk.data_g + i);`
			`vfloat color_orig_b = loada(blk.data_b + i);`
			`vfloat color_orig_a = loada(blk.data_a + i);`

			`vfloat color_error_r = min(abs(color_orig_r - color_r), vfloat(1e15f));`
			`vfloat color_error_g = min(abs(color_orig_g - color_g), vfloat(1e15f));`
			`vfloat color_error_b = min(abs(color_orig_b - color_b), vfloat(1e15f));`
			`vfloat color_error_a = min(abs(color_orig_a - color_a), vfloat(1e15f));`

			`// Compute squared error metric`
			`color_error_r = color_error_r * color_error_r;`
			`color_error_g = color_error_g * color_error_g;`
			`color_error_b = color_error_b * color_error_b;`
			`color_error_a = color_error_a * color_error_a;`

			`vfloat metric = color_error_r * blk.channel_weight.lane<0>()`
			`+ color_error_g * blk.channel_weight.lane<1>()`
			`+ color_error_b * blk.channel_weight.lane<2>()`
			`+ color_error_a * blk.channel_weight.lane<3>();`

			`// Mask off bad lanes`
			`vmask mask = lane_id < vint(texel_count);`
			`lane_id += vint(ASTCENC_SIMD_WIDTH);`
			`haccumulate(summav, metric, mask);`
			`}`

			`return hadd_s(summav);`
			`}`

			`#endif`