mirror of
https://git.postgresql.org/git/postgresql.git
synced 2025-01-12 18:34:36 +08:00
Improve the accuracy of floating point statistical aggregates.
When computing statistical aggregates like variance, the common schoolbook algorithm which computes the sum of the squares of the values and subtracts the square of the mean can lead to a large loss of precision when using floating point arithmetic, because the difference between the two terms is often very small relative to the terms themselves. To avoid this, re-work these aggregates to use the Youngs-Cramer algorithm, which is a proven, numerically stable algorithm that directly aggregates the sum of the squares of the differences of the values from the mean in a single pass over the data. While at it, improve the test coverage to test the aggregate combine functions used during parallel aggregation. Per report and suggested algorithm from Erich Schubert. Patch by me, reviewed by Madeleine Thompson. Discussion: https://postgr.es/m/153313051300.1397.9594490737341194671@wrigleys.postgresql.org
This commit is contained in:
parent
38921d1416
commit
e954a727f0
File diff suppressed because it is too large
Load Diff
@@ -198,6 +198,50 @@ select avg('NaN'::numeric) from generate_series(1,3);
|
||||
NaN
|
||||
(1 row)
|
||||
|
||||
-- verify correct results for infinite inputs
|
||||
SELECT avg(x::float8), var_pop(x::float8)
|
||||
FROM (VALUES ('1'), ('infinity')) v(x);
|
||||
avg | var_pop
|
||||
----------+---------
|
||||
Infinity | NaN
|
||||
(1 row)
|
||||
|
||||
SELECT avg(x::float8), var_pop(x::float8)
|
||||
FROM (VALUES ('infinity'), ('1')) v(x);
|
||||
avg | var_pop
|
||||
----------+---------
|
||||
Infinity | NaN
|
||||
(1 row)
|
||||
|
||||
SELECT avg(x::float8), var_pop(x::float8)
|
||||
FROM (VALUES ('infinity'), ('infinity')) v(x);
|
||||
avg | var_pop
|
||||
----------+---------
|
||||
Infinity | NaN
|
||||
(1 row)
|
||||
|
||||
SELECT avg(x::float8), var_pop(x::float8)
|
||||
FROM (VALUES ('-infinity'), ('infinity')) v(x);
|
||||
avg | var_pop
|
||||
-----+---------
|
||||
NaN | NaN
|
||||
(1 row)
|
||||
|
||||
-- test accuracy with a large input offset
|
||||
SELECT avg(x::float8), var_pop(x::float8)
|
||||
FROM (VALUES (100000003), (100000004), (100000006), (100000007)) v(x);
|
||||
avg | var_pop
|
||||
-----------+---------
|
||||
100000005 | 2.5
|
||||
(1 row)
|
||||
|
||||
SELECT avg(x::float8), var_pop(x::float8)
|
||||
FROM (VALUES (7000000000005), (7000000000007)) v(x);
|
||||
avg | var_pop
|
||||
---------------+---------
|
||||
7000000000006 | 1
|
||||
(1 row)
|
||||
|
||||
-- SQL2003 binary aggregates
|
||||
SELECT regr_count(b, a) FROM aggtest;
|
||||
regr_count
|
||||
@@ -253,6 +297,90 @@ SELECT corr(b, a) FROM aggtest;
|
||||
0.139634516517873
|
||||
(1 row)
|
||||
|
||||
-- test accum and combine functions directly
|
||||
CREATE TABLE regr_test (x float8, y float8);
|
||||
INSERT INTO regr_test VALUES (10,150),(20,250),(30,350),(80,540),(100,200);
|
||||
SELECT count(*), sum(x), regr_sxx(y,x), sum(y),regr_syy(y,x), regr_sxy(y,x)
|
||||
FROM regr_test WHERE x IN (10,20,30,80);
|
||||
count | sum | regr_sxx | sum | regr_syy | regr_sxy
|
||||
-------+-----+----------+------+----------+----------
|
||||
4 | 140 | 2900 | 1290 | 83075 | 15050
|
||||
(1 row)
|
||||
|
||||
SELECT count(*), sum(x), regr_sxx(y,x), sum(y),regr_syy(y,x), regr_sxy(y,x)
|
||||
FROM regr_test;
|
||||
count | sum | regr_sxx | sum | regr_syy | regr_sxy
|
||||
-------+-----+----------+------+----------+----------
|
||||
5 | 240 | 6280 | 1490 | 95080 | 8680
|
||||
(1 row)
|
||||
|
||||
SELECT float8_accum('{4,140,2900}'::float8[], 100);
|
||||
float8_accum
|
||||
--------------
|
||||
{5,240,6280}
|
||||
(1 row)
|
||||
|
||||
SELECT float8_regr_accum('{4,140,2900,1290,83075,15050}'::float8[], 200, 100);
|
||||
float8_regr_accum
|
||||
------------------------------
|
||||
{5,240,6280,1490,95080,8680}
|
||||
(1 row)
|
||||
|
||||
SELECT count(*), sum(x), regr_sxx(y,x), sum(y),regr_syy(y,x), regr_sxy(y,x)
|
||||
FROM regr_test WHERE x IN (10,20,30);
|
||||
count | sum | regr_sxx | sum | regr_syy | regr_sxy
|
||||
-------+-----+----------+-----+----------+----------
|
||||
3 | 60 | 200 | 750 | 20000 | 2000
|
||||
(1 row)
|
||||
|
||||
SELECT count(*), sum(x), regr_sxx(y,x), sum(y),regr_syy(y,x), regr_sxy(y,x)
|
||||
FROM regr_test WHERE x IN (80,100);
|
||||
count | sum | regr_sxx | sum | regr_syy | regr_sxy
|
||||
-------+-----+----------+-----+----------+----------
|
||||
2 | 180 | 200 | 740 | 57800 | -3400
|
||||
(1 row)
|
||||
|
||||
SELECT float8_combine('{3,60,200}'::float8[], '{0,0,0}'::float8[]);
|
||||
float8_combine
|
||||
----------------
|
||||
{3,60,200}
|
||||
(1 row)
|
||||
|
||||
SELECT float8_combine('{0,0,0}'::float8[], '{2,180,200}'::float8[]);
|
||||
float8_combine
|
||||
----------------
|
||||
{2,180,200}
|
||||
(1 row)
|
||||
|
||||
SELECT float8_combine('{3,60,200}'::float8[], '{2,180,200}'::float8[]);
|
||||
float8_combine
|
||||
----------------
|
||||
{5,240,6280}
|
||||
(1 row)
|
||||
|
||||
SELECT float8_regr_combine('{3,60,200,750,20000,2000}'::float8[],
|
||||
'{0,0,0,0,0,0}'::float8[]);
|
||||
float8_regr_combine
|
||||
---------------------------
|
||||
{3,60,200,750,20000,2000}
|
||||
(1 row)
|
||||
|
||||
SELECT float8_regr_combine('{0,0,0,0,0,0}'::float8[],
|
||||
'{2,180,200,740,57800,-3400}'::float8[]);
|
||||
float8_regr_combine
|
||||
-----------------------------
|
||||
{2,180,200,740,57800,-3400}
|
||||
(1 row)
|
||||
|
||||
SELECT float8_regr_combine('{3,60,200,750,20000,2000}'::float8[],
|
||||
'{2,180,200,740,57800,-3400}'::float8[]);
|
||||
float8_regr_combine
|
||||
------------------------------
|
||||
{5,240,6280,1490,95080,8680}
|
||||
(1 row)
|
||||
|
||||
DROP TABLE regr_test;
|
||||
-- test count, distinct
|
||||
SELECT count(four) AS cnt_1000 FROM onek;
|
||||
cnt_1000
|
||||
----------
|
||||
|
@@ -51,6 +51,22 @@ select avg(null::float8) from generate_series(1,3);
|
||||
select sum('NaN'::numeric) from generate_series(1,3);
|
||||
select avg('NaN'::numeric) from generate_series(1,3);
|
||||
|
||||
-- verify correct results for infinite inputs
|
||||
SELECT avg(x::float8), var_pop(x::float8)
|
||||
FROM (VALUES ('1'), ('infinity')) v(x);
|
||||
SELECT avg(x::float8), var_pop(x::float8)
|
||||
FROM (VALUES ('infinity'), ('1')) v(x);
|
||||
SELECT avg(x::float8), var_pop(x::float8)
|
||||
FROM (VALUES ('infinity'), ('infinity')) v(x);
|
||||
SELECT avg(x::float8), var_pop(x::float8)
|
||||
FROM (VALUES ('-infinity'), ('infinity')) v(x);
|
||||
|
||||
-- test accuracy with a large input offset
|
||||
SELECT avg(x::float8), var_pop(x::float8)
|
||||
FROM (VALUES (100000003), (100000004), (100000006), (100000007)) v(x);
|
||||
SELECT avg(x::float8), var_pop(x::float8)
|
||||
FROM (VALUES (7000000000005), (7000000000007)) v(x);
|
||||
|
||||
-- SQL2003 binary aggregates
|
||||
SELECT regr_count(b, a) FROM aggtest;
|
||||
SELECT regr_sxx(b, a) FROM aggtest;
|
||||
@@ -62,6 +78,31 @@ SELECT regr_slope(b, a), regr_intercept(b, a) FROM aggtest;
|
||||
SELECT covar_pop(b, a), covar_samp(b, a) FROM aggtest;
|
||||
SELECT corr(b, a) FROM aggtest;
|
||||
|
||||
-- test accum and combine functions directly
|
||||
CREATE TABLE regr_test (x float8, y float8);
|
||||
INSERT INTO regr_test VALUES (10,150),(20,250),(30,350),(80,540),(100,200);
|
||||
SELECT count(*), sum(x), regr_sxx(y,x), sum(y),regr_syy(y,x), regr_sxy(y,x)
|
||||
FROM regr_test WHERE x IN (10,20,30,80);
|
||||
SELECT count(*), sum(x), regr_sxx(y,x), sum(y),regr_syy(y,x), regr_sxy(y,x)
|
||||
FROM regr_test;
|
||||
SELECT float8_accum('{4,140,2900}'::float8[], 100);
|
||||
SELECT float8_regr_accum('{4,140,2900,1290,83075,15050}'::float8[], 200, 100);
|
||||
SELECT count(*), sum(x), regr_sxx(y,x), sum(y),regr_syy(y,x), regr_sxy(y,x)
|
||||
FROM regr_test WHERE x IN (10,20,30);
|
||||
SELECT count(*), sum(x), regr_sxx(y,x), sum(y),regr_syy(y,x), regr_sxy(y,x)
|
||||
FROM regr_test WHERE x IN (80,100);
|
||||
SELECT float8_combine('{3,60,200}'::float8[], '{0,0,0}'::float8[]);
|
||||
SELECT float8_combine('{0,0,0}'::float8[], '{2,180,200}'::float8[]);
|
||||
SELECT float8_combine('{3,60,200}'::float8[], '{2,180,200}'::float8[]);
|
||||
SELECT float8_regr_combine('{3,60,200,750,20000,2000}'::float8[],
|
||||
'{0,0,0,0,0,0}'::float8[]);
|
||||
SELECT float8_regr_combine('{0,0,0,0,0,0}'::float8[],
|
||||
'{2,180,200,740,57800,-3400}'::float8[]);
|
||||
SELECT float8_regr_combine('{3,60,200,750,20000,2000}'::float8[],
|
||||
'{2,180,200,740,57800,-3400}'::float8[]);
|
||||
DROP TABLE regr_test;
|
||||
|
||||
-- test count, distinct
|
||||
SELECT count(four) AS cnt_1000 FROM onek;
|
||||
SELECT count(DISTINCT four) AS cnt_4 FROM onek;
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user