// godot/thirdparty/cvtt/ConvectionKernels_IndexSelector.h

#pragma once
#ifndef __CVTT_INDEXSELECTOR_H__
#define __CVTT_INDEXSELECTOR_H__

#include "ConvectionKernels_ParallelMath.h"

namespace cvtt
{
    namespace Internal
    {
        // 17-entry lookup table, declared here and defined in another translation unit.
        // IndexSelector::ReconstructLDR_BC7 and IndexSelector::ReconstructLDRPrecise index it
        // with the selector's range and multiply the entry by the pixel index to derive the
        // interpolation weight.
        // NOTE(review): the entry values are not visible from this header - confirm against the definition.
        extern const ParallelMath::UInt16 g_weightReciprocals[17];

        // Picks interpolation indexes for pixels along the line between two endpoints and
        // reconstructs pixel values from an index.
        //
        // TVectorSize is the number of channels stored per endpoint and read per pixel.
        // Init must be called before any Reconstruct*/SelectIndexLDR call, since those read
        // the state (m_endPoint, m_origin, m_axis, m_range, m_maxValue) that Init writes.
        template<int TVectorSize>
        class IndexSelector
        {
        public:
            typedef ParallelMath::Float MFloat;
            typedef ParallelMath::UInt16 MUInt16;
            typedef ParallelMath::UInt15 MUInt15;
            typedef ParallelMath::SInt16 MSInt16;
            typedef ParallelMath::AInt16 MAInt16;
            typedef ParallelMath::SInt32 MSInt32;
            typedef ParallelMath::UInt31 MUInt31;

            // Initializes the selector from two sets of endpoints.
            //
            // channelWeights:         per-channel weights; entries [0, TVectorSize) are read.
            // interpolationEndPoints: endpoints stored (cast to MAInt16) for reconstruction.
            // colorSpaceEndpoints:    endpoints used to build the projection origin and axis
            //                         for index selection.
            // range:                  number of index values; the largest selectable index is range - 1.
            //                         It is also used as the subscript into g_weightReciprocals
            //                         by the Reconstruct* methods.
            template<class TInterpolationEPType, class TColorEPType>
            void Init(const float *channelWeights, const TInterpolationEPType interpolationEndPoints[2][TVectorSize], const TColorEPType colorSpaceEndpoints[2][TVectorSize], int range)
            {
                // In BC6H, the interpolation endpoints are higher-precision than the endpoints in color space.
                // We need to select indexes using the color-space endpoints.

                // Record whether every channel carries the same weight as channel 0.
                m_isUniform = true;
                for (int ch = 1; ch < TVectorSize; ch++)
                {
                    if (channelWeights[ch] != channelWeights[0])
                        m_isUniform = false;
                }

                // To work with channel weights, we need something where:
                // pxDiff = px - ep[0]
                // epDiff = ep[1] - ep[0]
                //
                // weightedEPDiff = epDiff * channelWeights
                // normalizedWeightedAxis = weightedEPDiff / len(weightedEPDiff)
                // normalizedIndex = dot(pxDiff * channelWeights, normalizedWeightedAxis) / len(weightedEPDiff)
                // index = normalizedIndex * maxValue
                //
                // Equivalent to:
                // axis = channelWeights * maxValue * epDiff * channelWeights / lenSquared(epDiff * channelWeights)
                // index = dot(axis, pxDiff)

                // Keep the interpolation endpoints for later reconstruction.
                for (int ep = 0; ep < 2; ep++)
                {
                    for (int ch = 0; ch < TVectorSize; ch++)
                        m_endPoint[ep][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(interpolationEndPoints[ep][ch]);
                }

                m_range = range;
                m_maxValue = static_cast<float>(range - 1);

                // Weighted endpoint difference per channel; endpoint 0 becomes the projection origin.
                MFloat epDiffWeighted[TVectorSize];
                for (int ch = 0; ch < TVectorSize; ch++)
                {
                    m_origin[ch] = ParallelMath::ToFloat(colorSpaceEndpoints[0][ch]);
                    MFloat opposingOriginCh = ParallelMath::ToFloat(colorSpaceEndpoints[1][ch]);
                    epDiffWeighted[ch] = (opposingOriginCh - m_origin[ch]) * channelWeights[ch];
                }

                MFloat lenSquared = epDiffWeighted[0] * epDiffWeighted[0];
                for (int ch = 1; ch < TVectorSize; ch++)
                    lenSquared = lenSquared + epDiffWeighted[ch] * epDiffWeighted[ch];

                // Adjust lenSquared in place before it is used as a divisor below
                // (relevant when both endpoints coincide and the squared length is zero).
                ParallelMath::MakeSafeDenominator(lenSquared);

                MFloat maxValueDividedByLengthSquared = ParallelMath::MakeFloat(m_maxValue) / lenSquared;

                // The weight is applied a second time here, matching the "Equivalent to" form above.
                for (int ch = 0; ch < TVectorSize; ch++)
                    m_axis[ch] = epDiffWeighted[ch] * channelWeights[ch] * maxValueDividedByLengthSquared;
            }

            // Convenience overload for the case where the same MUInt15 endpoints serve as both
            // the interpolation endpoints and the color-space endpoints.
            //
            // TSigned is not referenced by the body; it is kept so that existing call sites that
            // spell it explicitly keep compiling.
            template<bool TSigned>
            void Init(const float channelWeights[TVectorSize], const MUInt15 endPoints[2][TVectorSize], int range)
            {
                // The generic Init performs the MAInt16 conversion itself, so no local
                // pre-converted copy of the endpoints is needed here.
                Init<MUInt15, MUInt15>(channelWeights, endPoints, endPoints, range);
            }

            // Reconstructs the first numRealChannels channels of a pixel from an index.
            //
            // weight    = (g_weightReciprocals[m_range] * index + 256) >> 9
            // pixel[ch] = ((64 - weight) * ep0[ch] + weight * ep1[ch] + 32) >> 6
            //
            // pixel must have room for numRealChannels entries; numRealChannels must not exceed
            // TVectorSize because m_endPoint is indexed by it.
            void ReconstructLDR_BC7(const MUInt15 &index, MUInt15* pixel, int numRealChannels)
            {
                MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 256, 9));

                for (int ch = 0; ch < numRealChannels; ch++)
                {
                    MUInt15 ep0f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply((ParallelMath::MakeUInt15(64) - weight), ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[0][ch])));
                    MUInt15 ep1f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(weight, ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[1][ch])));
                    pixel[ch] = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ep0f + ep1f + ParallelMath::MakeUInt15(32), 6));
                }
            }

            // Same as ReconstructLDR_BC7 but with a finer-grained weight.
            //
            // weight    = (g_weightReciprocals[m_range] * index + 64) >> 7
            // pixel[ch] = ((256 - weight) * ep0[ch] + weight * ep1[ch] + 128) >> 8
            //
            // pixel must have room for numRealChannels entries; numRealChannels must not exceed
            // TVectorSize because m_endPoint is indexed by it.
            void ReconstructLDRPrecise(const MUInt15 &index, MUInt15* pixel, int numRealChannels)
            {
                MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 64, 7));

                for (int ch = 0; ch < numRealChannels; ch++)
                {
                    MUInt15 ep0f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply((ParallelMath::MakeUInt15(256) - weight), ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[0][ch])));
                    MUInt15 ep1f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(weight, ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[1][ch])));
                    pixel[ch] = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ep0f + ep1f + ParallelMath::MakeUInt15(128), 8));
                }
            }

            // Reconstructs all TVectorSize channels using the BC7 weighting.
            void ReconstructLDR_BC7(const MUInt15 &index, MUInt15* pixel)
            {
                ReconstructLDR_BC7(index, pixel, TVectorSize);
            }

            // Reconstructs all TVectorSize channels using the precise weighting.
            void ReconstructLDRPrecise(const MUInt15 &index, MUInt15* pixel)
            {
                ReconstructLDRPrecise(index, pixel, TVectorSize);
            }

            // Returns the index for a pixel: dot(pixel - m_origin, m_axis), clamped to
            // [0, m_maxValue], then rounded and converted to MUInt15.
            //
            // pixel: TVectorSize channel values.
            // rtn:   passed through to ParallelMath::RoundAndConvertToU15.
            MUInt15 SelectIndexLDR(const MFloat* pixel, const ParallelMath::RoundTowardNearestForScope* rtn) const
            {
                MFloat dist = (pixel[0] - m_origin[0]) * m_axis[0];
                for (int ch = 1; ch < TVectorSize; ch++)
                    dist = dist + (pixel[ch] - m_origin[ch]) * m_axis[ch];

                return ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(dist, 0.0f, m_maxValue), rtn);
            }

        protected:
            // Interpolation endpoints captured by Init; [0] and [1] are the two ends of the line.
            MAInt16 m_endPoint[2][TVectorSize];

        private:
            MFloat m_origin[TVectorSize];   // Color-space endpoint 0, as float.
            MFloat m_axis[TVectorSize];     // Projection axis scaled so that dot(axis, px - origin) is an index.
            int m_range;                    // Number of index values, as passed to Init.
            float m_maxValue;               // range - 1, the upper clamp for selected indexes.
            bool m_isUniform;               // True when all channel weights are equal; written by Init, not read in this class.
        };
    }
}

#endif
Faster CVTT by reducing quality. Make BC6 and BC7 CVTT faster while still having better quality than DXT5. 2022-01-18 20:39:55 +08:00			`#pragma once`
			`#ifndef __CVTT_INDEXSELECTOR_H__`
			`#define __CVTT_INDEXSELECTOR_H__`

			`#include "ConvectionKernels_ParallelMath.h"`

			`namespace cvtt`
			`{`
			`namespace Internal`
			`{`
			`extern const ParallelMath::UInt16 g_weightReciprocals[17];`

			`template<int TVectorSize>`
			`class IndexSelector`
			`{`
			`public:`
			`typedef ParallelMath::Float MFloat;`
			`typedef ParallelMath::UInt16 MUInt16;`
			`typedef ParallelMath::UInt15 MUInt15;`
			`typedef ParallelMath::SInt16 MSInt16;`
			`typedef ParallelMath::AInt16 MAInt16;`
			`typedef ParallelMath::SInt32 MSInt32;`
			`typedef ParallelMath::UInt31 MUInt31;`


			`template<class TInterpolationEPType, class TColorEPType>`
			`void Init(const float *channelWeights, const TInterpolationEPType interpolationEndPoints[2][TVectorSize], const TColorEPType colorSpaceEndpoints[2][TVectorSize], int range)`
			`{`
			`// In BC6H, the interpolation endpoints are higher-precision than the endpoints in color space.`
			`// We need to select indexes using the color-space endpoints.`

			`m_isUniform = true;`
			`for (int ch = 1; ch < TVectorSize; ch++)`
			`{`
			`if (channelWeights[ch] != channelWeights[0])`
			`m_isUniform = false;`
			`}`

			`// To work with channel weights, we need something where:`
			`// pxDiff = px - ep[0]`
			`// epDiff = ep[1] - ep[0]`
			`//`
			`// weightedEPDiff = epDiff * channelWeights`
			`// normalizedWeightedAxis = weightedEPDiff / len(weightedEPDiff)`
			`// normalizedIndex = dot(pxDiff * channelWeights, normalizedWeightedAxis) / len(weightedEPDiff)`
			`// index = normalizedIndex * maxValue`
			`//`
			`// Equivalent to:`
			`// axis = channelWeights * maxValue * epDiff * channelWeights / lenSquared(epDiff * channelWeights)`
			`// index = dot(axis, pxDiff)`

			`for (int ep = 0; ep < 2; ep++)`
			`for (int ch = 0; ch < TVectorSize; ch++)`
			`m_endPoint[ep][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(interpolationEndPoints[ep][ch]);`

			`m_range = range;`
			`m_maxValue = static_cast<float>(range - 1);`

			`MFloat epDiffWeighted[TVectorSize];`
			`for (int ch = 0; ch < TVectorSize; ch++)`
			`{`
			`m_origin[ch] = ParallelMath::ToFloat(colorSpaceEndpoints[0][ch]);`
			`MFloat opposingOriginCh = ParallelMath::ToFloat(colorSpaceEndpoints[1][ch]);`
			`epDiffWeighted[ch] = (opposingOriginCh - m_origin[ch]) * channelWeights[ch];`
			`}`

			`MFloat lenSquared = epDiffWeighted[0] * epDiffWeighted[0];`
			`for (int ch = 1; ch < TVectorSize; ch++)`
			`lenSquared = lenSquared + epDiffWeighted[ch] * epDiffWeighted[ch];`

			`ParallelMath::MakeSafeDenominator(lenSquared);`

			`MFloat maxValueDividedByLengthSquared = ParallelMath::MakeFloat(m_maxValue) / lenSquared;`

			`for (int ch = 0; ch < TVectorSize; ch++)`
			`m_axis[ch] = epDiffWeighted[ch] * channelWeights[ch] * maxValueDividedByLengthSquared;`
			`}`

			`template<bool TSigned>`
			`void Init(const float channelWeights[TVectorSize], const MUInt15 endPoints[2][TVectorSize], int range)`
			`{`
			`MAInt16 converted[2][TVectorSize];`
			`for (int epi = 0; epi < 2; epi++)`
			`for (int ch = 0; ch < TVectorSize; ch++)`
			`converted[epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(endPoints[epi][ch]);`

			`Init<MUInt15, MUInt15>(channelWeights, endPoints, endPoints, range);`
			`}`

			`void ReconstructLDR_BC7(const MUInt15 &index, MUInt15* pixel, int numRealChannels)`
			`{`
			`MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 256, 9));`

			`for (int ch = 0; ch < numRealChannels; ch++)`
			`{`
			`MUInt15 ep0f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply((ParallelMath::MakeUInt15(64) - weight), ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[0][ch])));`
			`MUInt15 ep1f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(weight, ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[1][ch])));`
			`pixel[ch] = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ep0f + ep1f + ParallelMath::MakeUInt15(32), 6));`
			`}`
			`}`

			`void ReconstructLDRPrecise(const MUInt15 &index, MUInt15* pixel, int numRealChannels)`
			`{`
			`MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 64, 7));`

			`for (int ch = 0; ch < numRealChannels; ch++)`
			`{`
			`MUInt15 ep0f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply((ParallelMath::MakeUInt15(256) - weight), ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[0][ch])));`
			`MUInt15 ep1f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(weight, ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[1][ch])));`
			`pixel[ch] = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ep0f + ep1f + ParallelMath::MakeUInt15(128), 8));`
			`}`
			`}`

			`void ReconstructLDR_BC7(const MUInt15 &index, MUInt15* pixel)`
			`{`
			`ReconstructLDR_BC7(index, pixel, TVectorSize);`
			`}`

			`void ReconstructLDRPrecise(const MUInt15 &index, MUInt15* pixel)`
			`{`
			`ReconstructLDRPrecise(index, pixel, TVectorSize);`
			`}`

			`MUInt15 SelectIndexLDR(const MFloat* pixel, const ParallelMath::RoundTowardNearestForScope* rtn) const`
			`{`
			`MFloat dist = (pixel[0] - m_origin[0]) * m_axis[0];`
			`for (int ch = 1; ch < TVectorSize; ch++)`
			`dist = dist + (pixel[ch] - m_origin[ch]) * m_axis[ch];`

			`return ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(dist, 0.0f, m_maxValue), rtn);`
			`}`

			`protected:`
			`MAInt16 m_endPoint[2][TVectorSize];`

			`private:`
			`MFloat m_origin[TVectorSize];`
			`MFloat m_axis[TVectorSize];`
			`int m_range;`
			`float m_maxValue;`
			`bool m_isUniform;`
			`};`
			`}`
			`}`

			`#endif`