godot/thirdparty/basis_universal/encoder/basisu_ssim.cpp

// basisu_ssim.cpp
// Copyright (C) 2019 Binomial LLC. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "basisu_ssim.h"

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

namespace basisu
{
	float gauss(int x, int y, float sigma_sqr)
	{
		float pow = expf(-((x * x + y * y) / (2.0f * sigma_sqr)));
		float g = (1.0f / (sqrtf((float)(2.0f * M_PI * sigma_sqr)))) * pow;
		return g;
	}
		
	// size_x/y should be odd
	void compute_gaussian_kernel(float *pDst, int size_x, int size_y, float sigma_sqr, uint32_t flags)
	{
		assert(size_x & size_y & 1);

		if (!(size_x | size_y))
			return;

		int mid_x = size_x / 2;
		int mid_y = size_y / 2;

		double sum = 0;
		for (int x = 0; x < size_x; x++)
		{
			for (int y = 0; y < size_y; y++)
			{
				float g;
				if ((x > mid_x) && (y < mid_y))
					g = pDst[(size_x - x - 1) + y * size_x];
				else if ((x < mid_x) && (y > mid_y))
					g = pDst[x + (size_y - y - 1) * size_x];
				else if ((x > mid_x) && (y > mid_y))
					g = pDst[(size_x - x - 1) + (size_y - y - 1) * size_x];
				else
					g = gauss(x - mid_x, y - mid_y, sigma_sqr);

				pDst[x + y * size_x] = g;
				sum += g;
			}
		}

		if (flags & cComputeGaussianFlagNormalizeCenterToOne)
		{
			sum = pDst[mid_x + mid_y * size_x];
		}

		if (flags & (cComputeGaussianFlagNormalizeCenterToOne | cComputeGaussianFlagNormalize))
		{
			double one_over_sum = 1.0f / sum;
			for (int i = 0; i < size_x * size_y; i++)
				pDst[i] = static_cast<float>(pDst[i] * one_over_sum);

			if (flags & cComputeGaussianFlagNormalizeCenterToOne)
				pDst[mid_x + mid_y * size_x] = 1.0f;
		}

		if (flags & cComputeGaussianFlagPrint)
		{
			printf("{\n");
			for (int y = 0; y < size_y; y++)
			{
				printf("  ");
				for (int x = 0; x < size_x; x++)
				{
					printf("%f, ", pDst[x + y * size_x]);
				}
				printf("\n");
			}
			printf("}");
		}
	}

	void gaussian_filter(imagef &dst, const imagef &orig_img, uint32_t odd_filter_width, float sigma_sqr, bool wrapping, uint32_t width_divisor, uint32_t height_divisor)
	{
		assert(odd_filter_width && (odd_filter_width & 1));
		odd_filter_width |= 1;

		vector2D<float> kernel(odd_filter_width, odd_filter_width);
		compute_gaussian_kernel(kernel.get_ptr(), odd_filter_width, odd_filter_width, sigma_sqr, cComputeGaussianFlagNormalize);

		const int dst_width = orig_img.get_width() / width_divisor;
		const int dst_height = orig_img.get_height() / height_divisor;

		const int H = odd_filter_width / 2;
		const int L = -H;

		dst.crop(dst_width, dst_height);

//#pragma omp parallel for
		for (int oy = 0; oy < dst_height; oy++)
		{
			for (int ox = 0; ox < dst_width; ox++)
			{
				vec4F c(0.0f);

				for (int yd = L; yd <= H; yd++)
				{
					int y = oy * height_divisor + (height_divisor >> 1) + yd;

					for (int xd = L; xd <= H; xd++)
					{
						int x = ox * width_divisor + (width_divisor >> 1) + xd;

						const vec4F &p = orig_img.get_clamped_or_wrapped(x, y, wrapping, wrapping);

						float w = kernel(xd + H, yd + H);
						c[0] += p[0] * w;
						c[1] += p[1] * w;
						c[2] += p[2] * w;
						c[3] += p[3] * w;
					}
				}

				dst(ox, oy).set(c[0], c[1], c[2], c[3]);
			}
		}
	}

	void pow_image(const imagef &src, imagef &dst, const vec4F &power)
	{
		dst.resize(src);

//#pragma omp parallel for
		for (int y = 0; y < (int)dst.get_height(); y++)
		{
			for (uint32_t x = 0; x < dst.get_width(); x++)
			{
				const vec4F &p = src(x, y);

				if ((power[0] == 2.0f) && (power[1] == 2.0f) && (power[2] == 2.0f) && (power[3] == 2.0f))
					dst(x, y).set(p[0] * p[0], p[1] * p[1], p[2] * p[2], p[3] * p[3]);
				else
					dst(x, y).set(powf(p[0], power[0]), powf(p[1], power[1]), powf(p[2], power[2]), powf(p[3], power[3]));
			}
		}
	}

	void mul_image(const imagef &src, imagef &dst, const vec4F &mul)
	{
		dst.resize(src);

//#pragma omp parallel for
		for (int y = 0; y < (int)dst.get_height(); y++)
		{
			for (uint32_t x = 0; x < dst.get_width(); x++)
			{
				const vec4F &p = src(x, y);
				dst(x, y).set(p[0] * mul[0], p[1] * mul[1], p[2] * mul[2], p[3] * mul[3]);
			}
		}
	}

	void scale_image(const imagef &src, imagef &dst, const vec4F &scale, const vec4F &shift)
	{
		dst.resize(src);

//#pragma omp parallel for
		for (int y = 0; y < (int)dst.get_height(); y++)
		{
			for (uint32_t x = 0; x < dst.get_width(); x++)
			{
				const vec4F &p = src(x, y);

				vec4F d;

				for (uint32_t c = 0; c < 4; c++)
					d[c] = scale[c] * p[c] + shift[c];

				dst(x, y).set(d[0], d[1], d[2], d[3]);
			}
		}
	}

	void add_weighted_image(const imagef &src1, const vec4F &alpha, const imagef &src2, const vec4F &beta, const vec4F &gamma, imagef &dst)
	{
		dst.resize(src1);

//#pragma omp parallel for
		for (int y = 0; y < (int)dst.get_height(); y++)
		{
			for (uint32_t x = 0; x < dst.get_width(); x++)
			{
				const vec4F &s1 = src1(x, y);
				const vec4F &s2 = src2(x, y);

				dst(x, y).set(
					s1[0] * alpha[0] + s2[0] * beta[0] + gamma[0],
					s1[1] * alpha[1] + s2[1] * beta[1] + gamma[1],
					s1[2] * alpha[2] + s2[2] * beta[2] + gamma[2],
					s1[3] * alpha[3] + s2[3] * beta[3] + gamma[3]);
			}
		}
	}

	void add_image(const imagef &src1, const imagef &src2, imagef &dst)
	{
		dst.resize(src1);

//#pragma omp parallel for
		for (int y = 0; y < (int)dst.get_height(); y++)
		{
			for (uint32_t x = 0; x < dst.get_width(); x++)
			{
				const vec4F &s1 = src1(x, y);
				const vec4F &s2 = src2(x, y);

				dst(x, y).set(s1[0] + s2[0], s1[1] + s2[1], s1[2] + s2[2], s1[3] + s2[3]);
			}
		}
	}

	void adds_image(const imagef &src, const vec4F &value, imagef &dst)
	{
		dst.resize(src);

//#pragma omp parallel for
		for (int y = 0; y < (int)dst.get_height(); y++)
		{
			for (uint32_t x = 0; x < dst.get_width(); x++)
			{
				const vec4F &p = src(x, y);

				dst(x, y).set(p[0] + value[0], p[1] + value[1], p[2] + value[2], p[3] + value[3]);
			}
		}
	}

	void mul_image(const imagef &src1, const imagef &src2, imagef &dst, const vec4F &scale)
	{
		dst.resize(src1);

//#pragma omp parallel for
		for (int y = 0; y < (int)dst.get_height(); y++)
		{
			for (uint32_t x = 0; x < dst.get_width(); x++)
			{
				const vec4F &s1 = src1(x, y);
				const vec4F &s2 = src2(x, y);

				vec4F d;

				for (uint32_t c = 0; c < 4; c++)
				{
					float v1 = s1[c];
					float v2 = s2[c];
					d[c] = v1 * v2 * scale[c];
				}

				dst(x, y) = d;
			}
		}
	}

	void div_image(const imagef &src1, const imagef &src2, imagef &dst, const vec4F &scale)
	{
		dst.resize(src1);

//#pragma omp parallel for
		for (int y = 0; y < (int)dst.get_height(); y++)
		{
			for (uint32_t x = 0; x < dst.get_width(); x++)
			{
				const vec4F &s1 = src1(x, y);
				const vec4F &s2 = src2(x, y);

				vec4F d;

				for (uint32_t c = 0; c < 4; c++)
				{
					float v = s2[c];
					if (v == 0.0f)
						d[c] = 0.0f;
					else
						d[c] = (s1[c] * scale[c]) / v;
				}

				dst(x, y) = d;
			}
		}
	}

	vec4F avg_image(const imagef &src)
	{
		vec4F avg(0.0f);

		for (uint32_t y = 0; y < src.get_height(); y++)
		{
			for (uint32_t x = 0; x < src.get_width(); x++)
			{
				const vec4F &s = src(x, y);

				avg += vec4F(s[0], s[1], s[2], s[3]);
			}
		}

		avg /= static_cast<float>(src.get_total_pixels());

		return avg;
	}
		
	// Reference: https://ece.uwaterloo.ca/~z70wang/research/ssim/index.html
	vec4F compute_ssim(const imagef &a, const imagef &b)
	{
		imagef axb, a_sq, b_sq, mu1, mu2, mu1_sq, mu2_sq, mu1_mu2, s1_sq, s2_sq, s12, smap, t1, t2, t3;

		const float C1 = 6.50250f, C2 = 58.52250f;
				
		pow_image(a, a_sq, vec4F(2));
		pow_image(b, b_sq, vec4F(2));
		mul_image(a, b, axb, vec4F(1.0f));

		gaussian_filter(mu1, a, 11, 1.5f * 1.5f);
		gaussian_filter(mu2, b, 11, 1.5f * 1.5f);

		pow_image(mu1, mu1_sq, vec4F(2));
		pow_image(mu2, mu2_sq, vec4F(2));
		mul_image(mu1, mu2, mu1_mu2, vec4F(1.0f));

		gaussian_filter(s1_sq, a_sq, 11, 1.5f * 1.5f);
		add_weighted_image(s1_sq, vec4F(1), mu1_sq, vec4F(-1), vec4F(0), s1_sq);

		gaussian_filter(s2_sq, b_sq, 11, 1.5f * 1.5f);
		add_weighted_image(s2_sq, vec4F(1), mu2_sq, vec4F(-1), vec4F(0), s2_sq);

		gaussian_filter(s12, axb, 11, 1.5f * 1.5f);
		add_weighted_image(s12, vec4F(1), mu1_mu2, vec4F(-1), vec4F(0), s12);

		scale_image(mu1_mu2, t1, vec4F(2), vec4F(0));
		adds_image(t1, vec4F(C1), t1);

		scale_image(s12, t2, vec4F(2), vec4F(0));
		adds_image(t2, vec4F(C2), t2);

		mul_image(t1, t2, t3, vec4F(1));

		add_image(mu1_sq, mu2_sq, t1);
		adds_image(t1, vec4F(C1), t1);

		add_image(s1_sq, s2_sq, t2);
		adds_image(t2, vec4F(C2), t2);

		mul_image(t1, t2, t1, vec4F(1));

		div_image(t3, t1, smap, vec4F(1));

		return avg_image(smap);
	}

	vec4F compute_ssim(const image &a, const image &b, bool luma, bool luma_601)
	{
		image ta(a), tb(b);

		if ((ta.get_width() != tb.get_width()) || (ta.get_height() != tb.get_height()))
		{
			debug_printf("compute_ssim: Cropping input images to equal dimensions\n");

			const uint32_t w = minimum(a.get_width(), b.get_width());
			const uint32_t h = minimum(a.get_height(), b.get_height());
			ta.crop(w, h);
			tb.crop(w, h);
		}

		if (!ta.get_width() || !ta.get_height())
		{
			assert(0);
			return vec4F(0);
		}

		if (luma)
		{
			for (uint32_t y = 0; y < ta.get_height(); y++)
			{
				for (uint32_t x = 0; x < ta.get_width(); x++)
				{
					ta(x, y).set(ta(x, y).get_luma(luma_601), ta(x, y).a);
					tb(x, y).set(tb(x, y).get_luma(luma_601), tb(x, y).a);
				}
			}
		}

		imagef fta, ftb;

		fta.set(ta);
		ftb.set(tb);

		return compute_ssim(fta, ftb);
	}

} // namespace basisu
Rewritten StreamTexture for better code reuse, added basis universal support 2019-09-27 10:16:44 +08:00			`// basisu_ssim.cpp`
			`// Copyright (C) 2019 Binomial LLC. All Rights Reserved.`
			`//`
			`// Licensed under the Apache License, Version 2.0 (the "License");`
			`// you may not use this file except in compliance with the License.`
			`// You may obtain a copy of the License at`
			`//`
			`// http://www.apache.org/licenses/LICENSE-2.0`
			`//`
			`// Unless required by applicable law or agreed to in writing, software`
			`// distributed under the License is distributed on an "AS IS" BASIS,`
			`// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`// See the License for the specific language governing permissions and`
			`// limitations under the License.`
			`#include "basisu_ssim.h"`

			`#ifndef M_PI`
			`#define M_PI 3.14159265358979323846`
			`#endif`

			`namespace basisu`
			`{`
			`float gauss(int x, int y, float sigma_sqr)`
			`{`
			`float pow = expf(-((x * x + y * y) / (2.0f * sigma_sqr)));`
			`float g = (1.0f / (sqrtf((float)(2.0f * M_PI * sigma_sqr)))) * pow;`
			`return g;`
			`}`

			`// size_x/y should be odd`
			`void compute_gaussian_kernel(float *pDst, int size_x, int size_y, float sigma_sqr, uint32_t flags)`
			`{`
			`assert(size_x & size_y & 1);`

			`if (!(size_x \| size_y))`
			`return;`

			`int mid_x = size_x / 2;`
			`int mid_y = size_y / 2;`

			`double sum = 0;`
			`for (int x = 0; x < size_x; x++)`
			`{`
			`for (int y = 0; y < size_y; y++)`
			`{`
			`float g;`
			`if ((x > mid_x) && (y < mid_y))`
			`g = pDst[(size_x - x - 1) + y * size_x];`
			`else if ((x < mid_x) && (y > mid_y))`
			`g = pDst[x + (size_y - y - 1) * size_x];`
			`else if ((x > mid_x) && (y > mid_y))`
			`g = pDst[(size_x - x - 1) + (size_y - y - 1) * size_x];`
			`else`
			`g = gauss(x - mid_x, y - mid_y, sigma_sqr);`

			`pDst[x + y * size_x] = g;`
			`sum += g;`
			`}`
			`}`

			`if (flags & cComputeGaussianFlagNormalizeCenterToOne)`
			`{`
			`sum = pDst[mid_x + mid_y * size_x];`
			`}`

			`if (flags & (cComputeGaussianFlagNormalizeCenterToOne \| cComputeGaussianFlagNormalize))`
			`{`
			`double one_over_sum = 1.0f / sum;`
			`for (int i = 0; i < size_x * size_y; i++)`
			`pDst[i] = static_cast<float>(pDst[i] * one_over_sum);`

			`if (flags & cComputeGaussianFlagNormalizeCenterToOne)`
			`pDst[mid_x + mid_y * size_x] = 1.0f;`
			`}`

			`if (flags & cComputeGaussianFlagPrint)`
			`{`
			`printf("{\n");`
			`for (int y = 0; y < size_y; y++)`
			`{`
			`printf(" ");`
			`for (int x = 0; x < size_x; x++)`
			`{`
			`printf("%f, ", pDst[x + y * size_x]);`
			`}`
			`printf("\n");`
			`}`
			`printf("}");`
			`}`
			`}`

			`void gaussian_filter(imagef &dst, const imagef &orig_img, uint32_t odd_filter_width, float sigma_sqr, bool wrapping, uint32_t width_divisor, uint32_t height_divisor)`
			`{`
			`assert(odd_filter_width && (odd_filter_width & 1));`
			`odd_filter_width \|= 1;`

			`vector2D<float> kernel(odd_filter_width, odd_filter_width);`
			`compute_gaussian_kernel(kernel.get_ptr(), odd_filter_width, odd_filter_width, sigma_sqr, cComputeGaussianFlagNormalize);`

			`const int dst_width = orig_img.get_width() / width_divisor;`
			`const int dst_height = orig_img.get_height() / height_divisor;`

			`const int H = odd_filter_width / 2;`
			`const int L = -H;`

			`dst.crop(dst_width, dst_height);`

			`//#pragma omp parallel for`
			`for (int oy = 0; oy < dst_height; oy++)`
			`{`
			`for (int ox = 0; ox < dst_width; ox++)`
			`{`
			`vec4F c(0.0f);`

			`for (int yd = L; yd <= H; yd++)`
			`{`
			`int y = oy * height_divisor + (height_divisor >> 1) + yd;`

			`for (int xd = L; xd <= H; xd++)`
			`{`
			`int x = ox * width_divisor + (width_divisor >> 1) + xd;`

			`const vec4F &p = orig_img.get_clamped_or_wrapped(x, y, wrapping, wrapping);`

			`float w = kernel(xd + H, yd + H);`
			`c[0] += p[0] * w;`
			`c[1] += p[1] * w;`
			`c[2] += p[2] * w;`
			`c[3] += p[3] * w;`
			`}`
			`}`

			`dst(ox, oy).set(c[0], c[1], c[2], c[3]);`
			`}`
			`}`
			`}`

			`void pow_image(const imagef &src, imagef &dst, const vec4F &power)`
			`{`
			`dst.resize(src);`

			`//#pragma omp parallel for`
			`for (int y = 0; y < (int)dst.get_height(); y++)`
			`{`
			`for (uint32_t x = 0; x < dst.get_width(); x++)`
			`{`
			`const vec4F &p = src(x, y);`

			`if ((power[0] == 2.0f) && (power[1] == 2.0f) && (power[2] == 2.0f) && (power[3] == 2.0f))`
			`dst(x, y).set(p[0] * p[0], p[1] * p[1], p[2] * p[2], p[3] * p[3]);`
			`else`
			`dst(x, y).set(powf(p[0], power[0]), powf(p[1], power[1]), powf(p[2], power[2]), powf(p[3], power[3]));`
			`}`
			`}`
			`}`

			`void mul_image(const imagef &src, imagef &dst, const vec4F &mul)`
			`{`
			`dst.resize(src);`

			`//#pragma omp parallel for`
			`for (int y = 0; y < (int)dst.get_height(); y++)`
			`{`
			`for (uint32_t x = 0; x < dst.get_width(); x++)`
			`{`
			`const vec4F &p = src(x, y);`
			`dst(x, y).set(p[0] * mul[0], p[1] * mul[1], p[2] * mul[2], p[3] * mul[3]);`
			`}`
			`}`
			`}`

			`void scale_image(const imagef &src, imagef &dst, const vec4F &scale, const vec4F &shift)`
			`{`
			`dst.resize(src);`

			`//#pragma omp parallel for`
			`for (int y = 0; y < (int)dst.get_height(); y++)`
			`{`
			`for (uint32_t x = 0; x < dst.get_width(); x++)`
			`{`
			`const vec4F &p = src(x, y);`

			`vec4F d;`

			`for (uint32_t c = 0; c < 4; c++)`
			`d[c] = scale[c] * p[c] + shift[c];`

			`dst(x, y).set(d[0], d[1], d[2], d[3]);`
			`}`
			`}`
			`}`

			`void add_weighted_image(const imagef &src1, const vec4F &alpha, const imagef &src2, const vec4F &beta, const vec4F &gamma, imagef &dst)`
			`{`
			`dst.resize(src1);`

			`//#pragma omp parallel for`
			`for (int y = 0; y < (int)dst.get_height(); y++)`
			`{`
			`for (uint32_t x = 0; x < dst.get_width(); x++)`
			`{`
			`const vec4F &s1 = src1(x, y);`
			`const vec4F &s2 = src2(x, y);`

			`dst(x, y).set(`
			`s1[0] * alpha[0] + s2[0] * beta[0] + gamma[0],`
			`s1[1] * alpha[1] + s2[1] * beta[1] + gamma[1],`
			`s1[2] * alpha[2] + s2[2] * beta[2] + gamma[2],`
			`s1[3] * alpha[3] + s2[3] * beta[3] + gamma[3]);`
			`}`
			`}`
			`}`

			`void add_image(const imagef &src1, const imagef &src2, imagef &dst)`
			`{`
			`dst.resize(src1);`

			`//#pragma omp parallel for`
			`for (int y = 0; y < (int)dst.get_height(); y++)`
			`{`
			`for (uint32_t x = 0; x < dst.get_width(); x++)`
			`{`
			`const vec4F &s1 = src1(x, y);`
			`const vec4F &s2 = src2(x, y);`

			`dst(x, y).set(s1[0] + s2[0], s1[1] + s2[1], s1[2] + s2[2], s1[3] + s2[3]);`
			`}`
			`}`
			`}`

			`void adds_image(const imagef &src, const vec4F &value, imagef &dst)`
			`{`
			`dst.resize(src);`

			`//#pragma omp parallel for`
			`for (int y = 0; y < (int)dst.get_height(); y++)`
			`{`
			`for (uint32_t x = 0; x < dst.get_width(); x++)`
			`{`
			`const vec4F &p = src(x, y);`

			`dst(x, y).set(p[0] + value[0], p[1] + value[1], p[2] + value[2], p[3] + value[3]);`
			`}`
			`}`
			`}`

			`void mul_image(const imagef &src1, const imagef &src2, imagef &dst, const vec4F &scale)`
			`{`
			`dst.resize(src1);`

			`//#pragma omp parallel for`
			`for (int y = 0; y < (int)dst.get_height(); y++)`
			`{`
			`for (uint32_t x = 0; x < dst.get_width(); x++)`
			`{`
			`const vec4F &s1 = src1(x, y);`
			`const vec4F &s2 = src2(x, y);`

			`vec4F d;`

			`for (uint32_t c = 0; c < 4; c++)`
			`{`
			`float v1 = s1[c];`
			`float v2 = s2[c];`
			`d[c] = v1 * v2 * scale[c];`
			`}`

			`dst(x, y) = d;`
			`}`
			`}`
			`}`

			`void div_image(const imagef &src1, const imagef &src2, imagef &dst, const vec4F &scale)`
			`{`
			`dst.resize(src1);`

			`//#pragma omp parallel for`
			`for (int y = 0; y < (int)dst.get_height(); y++)`
			`{`
			`for (uint32_t x = 0; x < dst.get_width(); x++)`
			`{`
			`const vec4F &s1 = src1(x, y);`
			`const vec4F &s2 = src2(x, y);`

			`vec4F d;`

			`for (uint32_t c = 0; c < 4; c++)`
			`{`
			`float v = s2[c];`
			`if (v == 0.0f)`
			`d[c] = 0.0f;`
			`else`
			`d[c] = (s1[c] * scale[c]) / v;`
			`}`

			`dst(x, y) = d;`
			`}`
			`}`
			`}`

			`vec4F avg_image(const imagef &src)`
			`{`
			`vec4F avg(0.0f);`

			`for (uint32_t y = 0; y < src.get_height(); y++)`
			`{`
			`for (uint32_t x = 0; x < src.get_width(); x++)`
			`{`
			`const vec4F &s = src(x, y);`

			`avg += vec4F(s[0], s[1], s[2], s[3]);`
			`}`
			`}`

			`avg /= static_cast<float>(src.get_total_pixels());`

			`return avg;`
			`}`

			`// Reference: https://ece.uwaterloo.ca/~z70wang/research/ssim/index.html`
			`vec4F compute_ssim(const imagef &a, const imagef &b)`
			`{`
			`imagef axb, a_sq, b_sq, mu1, mu2, mu1_sq, mu2_sq, mu1_mu2, s1_sq, s2_sq, s12, smap, t1, t2, t3;`

			`const float C1 = 6.50250f, C2 = 58.52250f;`

			`pow_image(a, a_sq, vec4F(2));`
			`pow_image(b, b_sq, vec4F(2));`
			`mul_image(a, b, axb, vec4F(1.0f));`

			`gaussian_filter(mu1, a, 11, 1.5f * 1.5f);`
			`gaussian_filter(mu2, b, 11, 1.5f * 1.5f);`

			`pow_image(mu1, mu1_sq, vec4F(2));`
			`pow_image(mu2, mu2_sq, vec4F(2));`
			`mul_image(mu1, mu2, mu1_mu2, vec4F(1.0f));`

			`gaussian_filter(s1_sq, a_sq, 11, 1.5f * 1.5f);`
			`add_weighted_image(s1_sq, vec4F(1), mu1_sq, vec4F(-1), vec4F(0), s1_sq);`

			`gaussian_filter(s2_sq, b_sq, 11, 1.5f * 1.5f);`
			`add_weighted_image(s2_sq, vec4F(1), mu2_sq, vec4F(-1), vec4F(0), s2_sq);`

			`gaussian_filter(s12, axb, 11, 1.5f * 1.5f);`
			`add_weighted_image(s12, vec4F(1), mu1_mu2, vec4F(-1), vec4F(0), s12);`

			`scale_image(mu1_mu2, t1, vec4F(2), vec4F(0));`
			`adds_image(t1, vec4F(C1), t1);`

			`scale_image(s12, t2, vec4F(2), vec4F(0));`
			`adds_image(t2, vec4F(C2), t2);`

			`mul_image(t1, t2, t3, vec4F(1));`

			`add_image(mu1_sq, mu2_sq, t1);`
			`adds_image(t1, vec4F(C1), t1);`

			`add_image(s1_sq, s2_sq, t2);`
			`adds_image(t2, vec4F(C2), t2);`

			`mul_image(t1, t2, t1, vec4F(1));`

			`div_image(t3, t1, smap, vec4F(1));`

			`return avg_image(smap);`
			`}`

			`vec4F compute_ssim(const image &a, const image &b, bool luma, bool luma_601)`
			`{`
			`image ta(a), tb(b);`

			`if ((ta.get_width() != tb.get_width()) \|\| (ta.get_height() != tb.get_height()))`
			`{`
			`debug_printf("compute_ssim: Cropping input images to equal dimensions\n");`

			`const uint32_t w = minimum(a.get_width(), b.get_width());`
			`const uint32_t h = minimum(a.get_height(), b.get_height());`
			`ta.crop(w, h);`
			`tb.crop(w, h);`
			`}`

			`if (!ta.get_width() \|\| !ta.get_height())`
			`{`
			`assert(0);`
			`return vec4F(0);`
			`}`

			`if (luma)`
			`{`
			`for (uint32_t y = 0; y < ta.get_height(); y++)`
			`{`
			`for (uint32_t x = 0; x < ta.get_width(); x++)`
			`{`
			`ta(x, y).set(ta(x, y).get_luma(luma_601), ta(x, y).a);`
			`tb(x, y).set(tb(x, y).get_luma(luma_601), tb(x, y).a);`
			`}`
			`}`
			`}`

			`imagef fta, ftb;`

			`fta.set(ta);`
			`ftb.set(tb);`

			`return compute_ssim(fta, ftb);`
			`}`

			`} // namespace basisu`