diff --git a/src/backend/utils/adt/float.c b/src/backend/utils/adt/float.c index df35557b73..d1e12c14be 100644 --- a/src/backend/utils/adt/float.c +++ b/src/backend/utils/adt/float.c @@ -2401,13 +2401,39 @@ setseed(PG_FUNCTION_ARGS) * float8_stddev_samp - produce final result for float STDDEV_SAMP() * float8_stddev_pop - produce final result for float STDDEV_POP() * + * The naive schoolbook implementation of these aggregates works by + * accumulating sum(X) and sum(X^2). However, this approach suffers from + * large rounding errors in the final computation of quantities like the + * population variance (N*sum(X^2) - sum(X)^2) / N^2, since each of the + * intermediate terms is potentially very large, while the difference is often + * quite small. + * + * Instead we use the Youngs-Cramer algorithm [1] which works by accumulating + * Sx=sum(X) and Sxx=sum((X-Sx/N)^2), using a numerically stable algorithm to + * incrementally update those quantities. The final computations of each of + * the aggregate values is then trivial and gives more accurate results (for + * example, the population variance is just Sxx/N). This algorithm is also + * fairly easy to generalize to allow parallel execution without loss of + * precision (see, for example, [2]). For more details, and a comparison of + * this with other algorithms, see [3]. + * * The transition datatype for all these aggregates is a 3-element array - * of float8, holding the values N, sum(X), sum(X*X) in that order. + * of float8, holding the values N, Sx, Sxx in that order. * * Note that we represent N as a float to avoid having to build a special * datatype. Given a reasonable floating-point implementation, there should * be no accuracy loss unless N exceeds 2 ^ 52 or so (by which time the * user will have doubtless lost interest anyway...) + * + * [1] Some Results Relevant to Choice of Sum and Sum-of-Product Algorithms, + * E. A. Youngs and E. M. Cramer, Technometrics Vol 13, No 3, August 1971. + * + * [2] Updating Formulae and a Pairwise Algorithm for Computing Sample + * Variances, T. F. Chan, G. H. Golub & R. J. LeVeque, COMPSTAT 1982. + * + * [3] Numerically Stable Parallel Computation of (Co-)Variance, Erich + * Schubert and Michael Gertz, Proceedings of the 30th International + * Conference on Scientific and Statistical Database Management, 2018. */ static float8 * @@ -2441,28 +2467,61 @@ float8_combine(PG_FUNCTION_ARGS) ArrayType *transarray2 = PG_GETARG_ARRAYTYPE_P(1); float8 *transvalues1; float8 *transvalues2; - - if (!AggCheckCallContext(fcinfo, NULL)) - elog(ERROR, "aggregate function called in non-aggregate context"); + float8 N1, + Sx1, + Sxx1, + N2, + Sx2, + Sxx2, + tmp, + N, + Sx, + Sxx; transvalues1 = check_float8_array(transarray1, "float8_combine", 3); transvalues2 = check_float8_array(transarray2, "float8_combine", 3); - transvalues1[0] = transvalues1[0] + transvalues2[0]; - transvalues1[1] = float8_pl(transvalues1[1], transvalues2[1]); - transvalues1[2] = float8_pl(transvalues1[2], transvalues2[2]); + N1 = transvalues1[0]; + Sx1 = transvalues1[1]; + Sxx1 = transvalues1[2]; - PG_RETURN_ARRAYTYPE_P(transarray1); -} + N2 = transvalues2[0]; + Sx2 = transvalues2[1]; + Sxx2 = transvalues2[2]; -Datum -float8_accum(PG_FUNCTION_ARGS) -{ - ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); - float8 newval = PG_GETARG_FLOAT8(1); - float8 *transvalues; - - transvalues = check_float8_array(transarray, "float8_accum", 3); + /*-------------------- + * The transition values combine using a generalization of the + * Youngs-Cramer algorithm as follows: + * + * N = N1 + N2 + * Sx = Sx1 + Sx2 + * Sxx = Sxx1 + Sxx2 + N1 * N2 * (Sx1/N1 - Sx2/N2)^2 / N; + * + * It's worth handling the special cases N1 = 0 and N2 = 0 separately + * since those cases are trivial, and we then don't need to worry about + * division-by-zero errors in the general case. + *-------------------- + */ + if (N1 == 0.0) + { + N = N2; + Sx = Sx2; + Sxx = Sxx2; + } + else if (N2 == 0.0) + { + N = N1; + Sx = Sx1; + Sxx = Sxx1; + } + else + { + N = N1 + N2; + Sx = float8_pl(Sx1, Sx2); + tmp = Sx1 / N1 - Sx2 / N2; + Sxx = Sxx1 + Sxx2 + N1 * N2 * tmp * tmp / N; + check_float8_val(Sxx, isinf(Sxx1) || isinf(Sxx2), true); + } /* * If we're invoked as an aggregate, we can cheat and modify our first @@ -2471,9 +2530,83 @@ float8_accum(PG_FUNCTION_ARGS) */ if (AggCheckCallContext(fcinfo, NULL)) { - transvalues[0] = transvalues[0] + 1.0; - transvalues[1] = float8_pl(transvalues[1], newval); - transvalues[2] = float8_pl(transvalues[2], float8_mul(newval, newval)); + transvalues1[0] = N; + transvalues1[1] = Sx; + transvalues1[2] = Sxx; + + PG_RETURN_ARRAYTYPE_P(transarray1); + } + else + { + Datum transdatums[3]; + ArrayType *result; + + transdatums[0] = Float8GetDatumFast(N); + transdatums[1] = Float8GetDatumFast(Sx); + transdatums[2] = Float8GetDatumFast(Sxx); + + result = construct_array(transdatums, 3, + FLOAT8OID, + sizeof(float8), FLOAT8PASSBYVAL, 'd'); + + PG_RETURN_ARRAYTYPE_P(result); + } +} + +Datum +float8_accum(PG_FUNCTION_ARGS) +{ + ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); + float8 newval = PG_GETARG_FLOAT8(1); + float8 *transvalues; + float8 N, + Sx, + Sxx, + tmp; + + transvalues = check_float8_array(transarray, "float8_accum", 3); + N = transvalues[0]; + Sx = transvalues[1]; + Sxx = transvalues[2]; + + /* + * Use the Youngs-Cramer algorithm to incorporate the new value into the + * transition values. + */ + N += 1.0; + Sx += newval; + if (transvalues[0] > 0.0) + { + tmp = newval * N - Sx; + Sxx += tmp * tmp / (N * transvalues[0]); + + /* + * Overflow check. We only report an overflow error when finite + * inputs lead to infinite results. Note also that Sxx should be NaN + * if any of the inputs are infinite, so we intentionally prevent Sxx + * from becoming infinite. + */ + if (isinf(Sx) || isinf(Sxx)) + { + if (!isinf(transvalues[1]) && !isinf(newval)) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("value out of range: overflow"))); + + Sxx = get_float8_nan(); + } + } + + /* + * If we're invoked as an aggregate, we can cheat and modify our first + * parameter in-place to reduce palloc overhead. Otherwise we construct a + * new array with the updated transition data and return it. + */ + if (AggCheckCallContext(fcinfo, NULL)) + { + transvalues[0] = N; + transvalues[1] = Sx; + transvalues[2] = Sxx; PG_RETURN_ARRAYTYPE_P(transarray); } @@ -2482,9 +2615,9 @@ float8_accum(PG_FUNCTION_ARGS) Datum transdatums[3]; ArrayType *result; - transvalues[0] = transvalues[0] + 1.0; - transvalues[1] = float8_pl(transvalues[1], newval); - transvalues[2] = float8_pl(transvalues[2], float8_mul(newval, newval)); + transdatums[0] = Float8GetDatumFast(N); + transdatums[1] = Float8GetDatumFast(Sx); + transdatums[2] = Float8GetDatumFast(Sxx); result = construct_array(transdatums, 3, FLOAT8OID, @@ -2502,8 +2635,43 @@ float4_accum(PG_FUNCTION_ARGS) /* do computations as float8 */ float8 newval = PG_GETARG_FLOAT4(1); float8 *transvalues; + float8 N, + Sx, + Sxx, + tmp; transvalues = check_float8_array(transarray, "float4_accum", 3); + N = transvalues[0]; + Sx = transvalues[1]; + Sxx = transvalues[2]; + + /* + * Use the Youngs-Cramer algorithm to incorporate the new value into the + * transition values. + */ + N += 1.0; + Sx += newval; + if (transvalues[0] > 0.0) + { + tmp = newval * N - Sx; + Sxx += tmp * tmp / (N * transvalues[0]); + + /* + * Overflow check. We only report an overflow error when finite + * inputs lead to infinite results. Note also that Sxx should be NaN + * if any of the inputs are infinite, so we intentionally prevent Sxx + * from becoming infinite. + */ + if (isinf(Sx) || isinf(Sxx)) + { + if (!isinf(transvalues[1]) && !isinf(newval)) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("value out of range: overflow"))); + + Sxx = get_float8_nan(); + } + } /* * If we're invoked as an aggregate, we can cheat and modify our first @@ -2512,9 +2680,9 @@ float4_accum(PG_FUNCTION_ARGS) */ if (AggCheckCallContext(fcinfo, NULL)) { - transvalues[0] = transvalues[0] + 1.0; - transvalues[1] = float8_pl(transvalues[1], newval); - transvalues[2] = float8_pl(transvalues[2], float8_mul(newval, newval)); + transvalues[0] = N; + transvalues[1] = Sx; + transvalues[2] = Sxx; PG_RETURN_ARRAYTYPE_P(transarray); } @@ -2523,9 +2691,9 @@ float4_accum(PG_FUNCTION_ARGS) Datum transdatums[3]; ArrayType *result; - transvalues[0] = transvalues[0] + 1.0; - transvalues[1] = float8_pl(transvalues[1], newval); - transvalues[2] = float8_pl(transvalues[2], float8_mul(newval, newval)); + transdatums[0] = Float8GetDatumFast(N); + transdatums[1] = Float8GetDatumFast(Sx); + transdatums[2] = Float8GetDatumFast(Sxx); result = construct_array(transdatums, 3, FLOAT8OID, @@ -2541,18 +2709,18 @@ float8_avg(PG_FUNCTION_ARGS) ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); float8 *transvalues; float8 N, - sumX; + Sx; transvalues = check_float8_array(transarray, "float8_avg", 3); N = transvalues[0]; - sumX = transvalues[1]; - /* ignore sumX2 */ + Sx = transvalues[1]; + /* ignore Sxx */ /* SQL defines AVG of no values to be NULL */ if (N == 0.0) PG_RETURN_NULL(); - PG_RETURN_FLOAT8(sumX / N); + PG_RETURN_FLOAT8(Sx / N); } Datum @@ -2561,27 +2729,20 @@ float8_var_pop(PG_FUNCTION_ARGS) ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); float8 *transvalues; float8 N, - sumX, - sumX2, - numerator; + Sxx; transvalues = check_float8_array(transarray, "float8_var_pop", 3); N = transvalues[0]; - sumX = transvalues[1]; - sumX2 = transvalues[2]; + /* ignore Sx */ + Sxx = transvalues[2]; /* Population variance is undefined when N is 0, so return NULL */ if (N == 0.0) PG_RETURN_NULL(); - numerator = N * sumX2 - sumX * sumX; - check_float8_val(numerator, isinf(sumX2) || isinf(sumX), true); + /* Note that Sxx is guaranteed to be non-negative */ - /* Watch out for roundoff error producing a negative numerator */ - if (numerator <= 0.0) - PG_RETURN_FLOAT8(0.0); - - PG_RETURN_FLOAT8(numerator / (N * N)); + PG_RETURN_FLOAT8(Sxx / N); } Datum @@ -2590,27 +2751,20 @@ float8_var_samp(PG_FUNCTION_ARGS) ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); float8 *transvalues; float8 N, - sumX, - sumX2, - numerator; + Sxx; transvalues = check_float8_array(transarray, "float8_var_samp", 3); N = transvalues[0]; - sumX = transvalues[1]; - sumX2 = transvalues[2]; + /* ignore Sx */ + Sxx = transvalues[2]; /* Sample variance is undefined when N is 0 or 1, so return NULL */ if (N <= 1.0) PG_RETURN_NULL(); - numerator = N * sumX2 - sumX * sumX; - check_float8_val(numerator, isinf(sumX2) || isinf(sumX), true); + /* Note that Sxx is guaranteed to be non-negative */ - /* Watch out for roundoff error producing a negative numerator */ - if (numerator <= 0.0) - PG_RETURN_FLOAT8(0.0); - - PG_RETURN_FLOAT8(numerator / (N * (N - 1.0))); + PG_RETURN_FLOAT8(Sxx / (N - 1.0)); } Datum @@ -2619,27 +2773,20 @@ float8_stddev_pop(PG_FUNCTION_ARGS) ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); float8 *transvalues; float8 N, - sumX, - sumX2, - numerator; + Sxx; transvalues = check_float8_array(transarray, "float8_stddev_pop", 3); N = transvalues[0]; - sumX = transvalues[1]; - sumX2 = transvalues[2]; + /* ignore Sx */ + Sxx = transvalues[2]; /* Population stddev is undefined when N is 0, so return NULL */ if (N == 0.0) PG_RETURN_NULL(); - numerator = N * sumX2 - sumX * sumX; - check_float8_val(numerator, isinf(sumX2) || isinf(sumX), true); + /* Note that Sxx is guaranteed to be non-negative */ - /* Watch out for roundoff error producing a negative numerator */ - if (numerator <= 0.0) - PG_RETURN_FLOAT8(0.0); - - PG_RETURN_FLOAT8(sqrt(numerator / (N * N))); + PG_RETURN_FLOAT8(sqrt(Sxx / N)); } Datum @@ -2648,27 +2795,20 @@ float8_stddev_samp(PG_FUNCTION_ARGS) ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); float8 *transvalues; float8 N, - sumX, - sumX2, - numerator; + Sxx; transvalues = check_float8_array(transarray, "float8_stddev_samp", 3); N = transvalues[0]; - sumX = transvalues[1]; - sumX2 = transvalues[2]; + /* ignore Sx */ + Sxx = transvalues[2]; /* Sample stddev is undefined when N is 0 or 1, so return NULL */ if (N <= 1.0) PG_RETURN_NULL(); - numerator = N * sumX2 - sumX * sumX; - check_float8_val(numerator, isinf(sumX2) || isinf(sumX), true); + /* Note that Sxx is guaranteed to be non-negative */ - /* Watch out for roundoff error producing a negative numerator */ - if (numerator <= 0.0) - PG_RETURN_FLOAT8(0.0); - - PG_RETURN_FLOAT8(sqrt(numerator / (N * (N - 1.0)))); + PG_RETURN_FLOAT8(sqrt(Sxx / (N - 1.0))); } /* @@ -2676,9 +2816,14 @@ float8_stddev_samp(PG_FUNCTION_ARGS) * SQL2003 BINARY AGGREGATES * ========================= * + * As with the preceding aggregates, we use the Youngs-Cramer algorithm to + * reduce rounding errors in the aggregate final functions. + * * The transition datatype for all these aggregates is a 6-element array of - * float8, holding the values N, sum(X), sum(X*X), sum(Y), sum(Y*Y), sum(X*Y) - * in that order. Note that Y is the first argument to the aggregates! + * float8, holding the values N, Sx=sum(X), Sxx=sum((X-Sx/N)^2), Sy=sum(Y), + * Syy=sum((Y-Sy/N)^2), Sxy=sum((X-Sx/N)*(Y-Sy/N)) in that order. + * + * Note that Y is the first argument to all these aggregates! * * It might seem attractive to optimize this by having multiple accumulator * functions that only calculate the sums actually needed. But on most @@ -2695,32 +2840,66 @@ float8_regr_accum(PG_FUNCTION_ARGS) float8 newvalX = PG_GETARG_FLOAT8(2); float8 *transvalues; float8 N, - sumX, - sumX2, - sumY, - sumY2, - sumXY; + Sx, + Sxx, + Sy, + Syy, + Sxy, + tmpX, + tmpY, + scale; transvalues = check_float8_array(transarray, "float8_regr_accum", 6); N = transvalues[0]; - sumX = transvalues[1]; - sumX2 = transvalues[2]; - sumY = transvalues[3]; - sumY2 = transvalues[4]; - sumXY = transvalues[5]; + Sx = transvalues[1]; + Sxx = transvalues[2]; + Sy = transvalues[3]; + Syy = transvalues[4]; + Sxy = transvalues[5]; + /* + * Use the Youngs-Cramer algorithm to incorporate the new values into the + * transition values. + */ N += 1.0; - sumX += newvalX; - check_float8_val(sumX, isinf(transvalues[1]) || isinf(newvalX), true); - sumX2 += newvalX * newvalX; - check_float8_val(sumX2, isinf(transvalues[2]) || isinf(newvalX), true); - sumY += newvalY; - check_float8_val(sumY, isinf(transvalues[3]) || isinf(newvalY), true); - sumY2 += newvalY * newvalY; - check_float8_val(sumY2, isinf(transvalues[4]) || isinf(newvalY), true); - sumXY += newvalX * newvalY; - check_float8_val(sumXY, isinf(transvalues[5]) || isinf(newvalX) || - isinf(newvalY), true); + Sx += newvalX; + Sy += newvalY; + if (transvalues[0] > 0.0) + { + tmpX = newvalX * N - Sx; + tmpY = newvalY * N - Sy; + scale = 1.0 / (N * transvalues[0]); + Sxx += tmpX * tmpX * scale; + Syy += tmpY * tmpY * scale; + Sxy += tmpX * tmpY * scale; + + /* + * Overflow check. We only report an overflow error when finite + * inputs lead to infinite results. Note also that Sxx, Syy and Sxy + * should be NaN if any of the relevant inputs are infinite, so we + * intentionally prevent them from becoming infinite. + */ + if (isinf(Sx) || isinf(Sxx) || isinf(Sy) || isinf(Syy) || isinf(Sxy)) + { + if (((isinf(Sx) || isinf(Sxx)) && + !isinf(transvalues[1]) && !isinf(newvalX)) || + ((isinf(Sy) || isinf(Syy)) && + !isinf(transvalues[3]) && !isinf(newvalY)) || + (isinf(Sxy) && + !isinf(transvalues[1]) && !isinf(newvalX) && + !isinf(transvalues[3]) && !isinf(newvalY))) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("value out of range: overflow"))); + + if (isinf(Sxx)) + Sxx = get_float8_nan(); + if (isinf(Syy)) + Syy = get_float8_nan(); + if (isinf(Sxy)) + Sxy = get_float8_nan(); + } + } /* * If we're invoked as an aggregate, we can cheat and modify our first @@ -2730,11 +2909,11 @@ float8_regr_accum(PG_FUNCTION_ARGS) if (AggCheckCallContext(fcinfo, NULL)) { transvalues[0] = N; - transvalues[1] = sumX; - transvalues[2] = sumX2; - transvalues[3] = sumY; - transvalues[4] = sumY2; - transvalues[5] = sumXY; + transvalues[1] = Sx; + transvalues[2] = Sxx; + transvalues[3] = Sy; + transvalues[4] = Syy; + transvalues[5] = Sxy; PG_RETURN_ARRAYTYPE_P(transarray); } @@ -2744,11 +2923,11 @@ float8_regr_accum(PG_FUNCTION_ARGS) ArrayType *result; transdatums[0] = Float8GetDatumFast(N); - transdatums[1] = Float8GetDatumFast(sumX); - transdatums[2] = Float8GetDatumFast(sumX2); - transdatums[3] = Float8GetDatumFast(sumY); - transdatums[4] = Float8GetDatumFast(sumY2); - transdatums[5] = Float8GetDatumFast(sumXY); + transdatums[1] = Float8GetDatumFast(Sx); + transdatums[2] = Float8GetDatumFast(Sxx); + transdatums[3] = Float8GetDatumFast(Sy); + transdatums[4] = Float8GetDatumFast(Syy); + transdatums[5] = Float8GetDatumFast(Sxy); result = construct_array(transdatums, 6, FLOAT8OID, @@ -2773,21 +2952,127 @@ float8_regr_combine(PG_FUNCTION_ARGS) ArrayType *transarray2 = PG_GETARG_ARRAYTYPE_P(1); float8 *transvalues1; float8 *transvalues2; - - if (!AggCheckCallContext(fcinfo, NULL)) - elog(ERROR, "aggregate function called in non-aggregate context"); + float8 N1, + Sx1, + Sxx1, + Sy1, + Syy1, + Sxy1, + N2, + Sx2, + Sxx2, + Sy2, + Syy2, + Sxy2, + tmp1, + tmp2, + N, + Sx, + Sxx, + Sy, + Syy, + Sxy; transvalues1 = check_float8_array(transarray1, "float8_regr_combine", 6); transvalues2 = check_float8_array(transarray2, "float8_regr_combine", 6); - transvalues1[0] = transvalues1[0] + transvalues2[0]; - transvalues1[1] = float8_pl(transvalues1[1], transvalues2[1]); - transvalues1[2] = float8_pl(transvalues1[2], transvalues2[2]); - transvalues1[3] = float8_pl(transvalues1[3], transvalues2[3]); - transvalues1[4] = float8_pl(transvalues1[4], transvalues2[4]); - transvalues1[5] = float8_pl(transvalues1[5], transvalues2[5]); + N1 = transvalues1[0]; + Sx1 = transvalues1[1]; + Sxx1 = transvalues1[2]; + Sy1 = transvalues1[3]; + Syy1 = transvalues1[4]; + Sxy1 = transvalues1[5]; - PG_RETURN_ARRAYTYPE_P(transarray1); + N2 = transvalues2[0]; + Sx2 = transvalues2[1]; + Sxx2 = transvalues2[2]; + Sy2 = transvalues2[3]; + Syy2 = transvalues2[4]; + Sxy2 = transvalues2[5]; + + /*-------------------- + * The transition values combine using a generalization of the + * Youngs-Cramer algorithm as follows: + * + * N = N1 + N2 + * Sx = Sx1 + Sx2 + * Sxx = Sxx1 + Sxx2 + N1 * N2 * (Sx1/N1 - Sx2/N2)^2 / N + * Sy = Sy1 + Sy2 + * Syy = Syy1 + Syy2 + N1 * N2 * (Sy1/N1 - Sy2/N2)^2 / N + * Sxy = Sxy1 + Sxy2 + N1 * N2 * (Sx1/N1 - Sx2/N2) * (Sy1/N1 - Sy2/N2) / N + * + * It's worth handling the special cases N1 = 0 and N2 = 0 separately + * since those cases are trivial, and we then don't need to worry about + * division-by-zero errors in the general case. + *-------------------- + */ + if (N1 == 0.0) + { + N = N2; + Sx = Sx2; + Sxx = Sxx2; + Sy = Sy2; + Syy = Syy2; + Sxy = Sxy2; + } + else if (N2 == 0.0) + { + N = N1; + Sx = Sx1; + Sxx = Sxx1; + Sy = Sy1; + Syy = Syy1; + Sxy = Sxy1; + } + else + { + N = N1 + N2; + Sx = float8_pl(Sx1, Sx2); + tmp1 = Sx1 / N1 - Sx2 / N2; + Sxx = Sxx1 + Sxx2 + N1 * N2 * tmp1 * tmp1 / N; + check_float8_val(Sxx, isinf(Sxx1) || isinf(Sxx2), true); + Sy = float8_pl(Sy1, Sy2); + tmp2 = Sy1 / N1 - Sy2 / N2; + Syy = Syy1 + Syy2 + N1 * N2 * tmp2 * tmp2 / N; + check_float8_val(Syy, isinf(Syy1) || isinf(Syy2), true); + Sxy = Sxy1 + Sxy2 + N1 * N2 * tmp1 * tmp2 / N; + check_float8_val(Sxy, isinf(Sxy1) || isinf(Sxy2), true); + } + + /* + * If we're invoked as an aggregate, we can cheat and modify our first + * parameter in-place to reduce palloc overhead. Otherwise we construct a + * new array with the updated transition data and return it. + */ + if (AggCheckCallContext(fcinfo, NULL)) + { + transvalues1[0] = N; + transvalues1[1] = Sx; + transvalues1[2] = Sxx; + transvalues1[3] = Sy; + transvalues1[4] = Syy; + transvalues1[5] = Sxy; + + PG_RETURN_ARRAYTYPE_P(transarray1); + } + else + { + Datum transdatums[6]; + ArrayType *result; + + transdatums[0] = Float8GetDatumFast(N); + transdatums[1] = Float8GetDatumFast(Sx); + transdatums[2] = Float8GetDatumFast(Sxx); + transdatums[3] = Float8GetDatumFast(Sy); + transdatums[4] = Float8GetDatumFast(Syy); + transdatums[5] = Float8GetDatumFast(Sxy); + + result = construct_array(transdatums, 6, + FLOAT8OID, + sizeof(float8), FLOAT8PASSBYVAL, 'd'); + + PG_RETURN_ARRAYTYPE_P(result); + } } @@ -2797,27 +3082,19 @@ float8_regr_sxx(PG_FUNCTION_ARGS) ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); float8 *transvalues; float8 N, - sumX, - sumX2, - numerator; + Sxx; transvalues = check_float8_array(transarray, "float8_regr_sxx", 6); N = transvalues[0]; - sumX = transvalues[1]; - sumX2 = transvalues[2]; + Sxx = transvalues[2]; /* if N is 0 we should return NULL */ if (N < 1.0) PG_RETURN_NULL(); - numerator = N * sumX2 - sumX * sumX; - check_float8_val(numerator, isinf(sumX2) || isinf(sumX), true); + /* Note that Sxx is guaranteed to be non-negative */ - /* Watch out for roundoff error producing a negative numerator */ - if (numerator <= 0.0) - PG_RETURN_FLOAT8(0.0); - - PG_RETURN_FLOAT8(numerator / N); + PG_RETURN_FLOAT8(Sxx); } Datum @@ -2826,27 +3103,19 @@ float8_regr_syy(PG_FUNCTION_ARGS) ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); float8 *transvalues; float8 N, - sumY, - sumY2, - numerator; + Syy; transvalues = check_float8_array(transarray, "float8_regr_syy", 6); N = transvalues[0]; - sumY = transvalues[3]; - sumY2 = transvalues[4]; + Syy = transvalues[4]; /* if N is 0 we should return NULL */ if (N < 1.0) PG_RETURN_NULL(); - numerator = N * sumY2 - sumY * sumY; - check_float8_val(numerator, isinf(sumY2) || isinf(sumY), true); + /* Note that Syy is guaranteed to be non-negative */ - /* Watch out for roundoff error producing a negative numerator */ - if (numerator <= 0.0) - PG_RETURN_FLOAT8(0.0); - - PG_RETURN_FLOAT8(numerator / N); + PG_RETURN_FLOAT8(Syy); } Datum @@ -2855,28 +3124,19 @@ float8_regr_sxy(PG_FUNCTION_ARGS) ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); float8 *transvalues; float8 N, - sumX, - sumY, - sumXY, - numerator; + Sxy; transvalues = check_float8_array(transarray, "float8_regr_sxy", 6); N = transvalues[0]; - sumX = transvalues[1]; - sumY = transvalues[3]; - sumXY = transvalues[5]; + Sxy = transvalues[5]; /* if N is 0 we should return NULL */ if (N < 1.0) PG_RETURN_NULL(); - numerator = N * sumXY - sumX * sumY; - check_float8_val(numerator, isinf(sumXY) || isinf(sumX) || - isinf(sumY), true); - /* A negative result is valid here */ - PG_RETURN_FLOAT8(numerator / N); + PG_RETURN_FLOAT8(Sxy); } Datum @@ -2885,17 +3145,17 @@ float8_regr_avgx(PG_FUNCTION_ARGS) ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); float8 *transvalues; float8 N, - sumX; + Sx; transvalues = check_float8_array(transarray, "float8_regr_avgx", 6); N = transvalues[0]; - sumX = transvalues[1]; + Sx = transvalues[1]; /* if N is 0 we should return NULL */ if (N < 1.0) PG_RETURN_NULL(); - PG_RETURN_FLOAT8(sumX / N); + PG_RETURN_FLOAT8(Sx / N); } Datum @@ -2904,17 +3164,17 @@ float8_regr_avgy(PG_FUNCTION_ARGS) ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); float8 *transvalues; float8 N, - sumY; + Sy; transvalues = check_float8_array(transarray, "float8_regr_avgy", 6); N = transvalues[0]; - sumY = transvalues[3]; + Sy = transvalues[3]; /* if N is 0 we should return NULL */ if (N < 1.0) PG_RETURN_NULL(); - PG_RETURN_FLOAT8(sumY / N); + PG_RETURN_FLOAT8(Sy / N); } Datum @@ -2923,26 +3183,17 @@ float8_covar_pop(PG_FUNCTION_ARGS) ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); float8 *transvalues; float8 N, - sumX, - sumY, - sumXY, - numerator; + Sxy; transvalues = check_float8_array(transarray, "float8_covar_pop", 6); N = transvalues[0]; - sumX = transvalues[1]; - sumY = transvalues[3]; - sumXY = transvalues[5]; + Sxy = transvalues[5]; /* if N is 0 we should return NULL */ if (N < 1.0) PG_RETURN_NULL(); - numerator = N * sumXY - sumX * sumY; - check_float8_val(numerator, isinf(sumXY) || isinf(sumX) || - isinf(sumY), true); - - PG_RETURN_FLOAT8(numerator / (N * N)); + PG_RETURN_FLOAT8(Sxy / N); } Datum @@ -2951,26 +3202,17 @@ float8_covar_samp(PG_FUNCTION_ARGS) ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); float8 *transvalues; float8 N, - sumX, - sumY, - sumXY, - numerator; + Sxy; transvalues = check_float8_array(transarray, "float8_covar_samp", 6); N = transvalues[0]; - sumX = transvalues[1]; - sumY = transvalues[3]; - sumXY = transvalues[5]; + Sxy = transvalues[5]; /* if N is <= 1 we should return NULL */ if (N < 2.0) PG_RETURN_NULL(); - numerator = N * sumXY - sumX * sumY; - check_float8_val(numerator, isinf(sumXY) || isinf(sumX) || - isinf(sumY), true); - - PG_RETURN_FLOAT8(numerator / (N * (N - 1.0))); + PG_RETURN_FLOAT8(Sxy / (N - 1.0)); } Datum @@ -2979,38 +3221,27 @@ float8_corr(PG_FUNCTION_ARGS) ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); float8 *transvalues; float8 N, - sumX, - sumX2, - sumY, - sumY2, - sumXY, - numeratorX, - numeratorY, - numeratorXY; + Sxx, + Syy, + Sxy; transvalues = check_float8_array(transarray, "float8_corr", 6); N = transvalues[0]; - sumX = transvalues[1]; - sumX2 = transvalues[2]; - sumY = transvalues[3]; - sumY2 = transvalues[4]; - sumXY = transvalues[5]; + Sxx = transvalues[2]; + Syy = transvalues[4]; + Sxy = transvalues[5]; /* if N is 0 we should return NULL */ if (N < 1.0) PG_RETURN_NULL(); - numeratorX = N * sumX2 - sumX * sumX; - check_float8_val(numeratorX, isinf(sumX2) || isinf(sumX), true); - numeratorY = N * sumY2 - sumY * sumY; - check_float8_val(numeratorY, isinf(sumY2) || isinf(sumY), true); - numeratorXY = N * sumXY - sumX * sumY; - check_float8_val(numeratorXY, isinf(sumXY) || isinf(sumX) || - isinf(sumY), true); - if (numeratorX <= 0 || numeratorY <= 0) + /* Note that Sxx and Syy are guaranteed to be non-negative */ + + /* per spec, return NULL for horizontal and vertical lines */ + if (Sxx == 0 || Syy == 0) PG_RETURN_NULL(); - PG_RETURN_FLOAT8(numeratorXY / sqrt(numeratorX * numeratorY)); + PG_RETURN_FLOAT8(Sxy / sqrt(Sxx * Syy)); } Datum @@ -3019,42 +3250,31 @@ float8_regr_r2(PG_FUNCTION_ARGS) ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); float8 *transvalues; float8 N, - sumX, - sumX2, - sumY, - sumY2, - sumXY, - numeratorX, - numeratorY, - numeratorXY; + Sxx, + Syy, + Sxy; transvalues = check_float8_array(transarray, "float8_regr_r2", 6); N = transvalues[0]; - sumX = transvalues[1]; - sumX2 = transvalues[2]; - sumY = transvalues[3]; - sumY2 = transvalues[4]; - sumXY = transvalues[5]; + Sxx = transvalues[2]; + Syy = transvalues[4]; + Sxy = transvalues[5]; /* if N is 0 we should return NULL */ if (N < 1.0) PG_RETURN_NULL(); - numeratorX = N * sumX2 - sumX * sumX; - check_float8_val(numeratorX, isinf(sumX2) || isinf(sumX), true); - numeratorY = N * sumY2 - sumY * sumY; - check_float8_val(numeratorY, isinf(sumY2) || isinf(sumY), true); - numeratorXY = N * sumXY - sumX * sumY; - check_float8_val(numeratorXY, isinf(sumXY) || isinf(sumX) || - isinf(sumY), true); - if (numeratorX <= 0) + /* Note that Sxx and Syy are guaranteed to be non-negative */ + + /* per spec, return NULL for a vertical line */ + if (Sxx == 0) PG_RETURN_NULL(); - /* per spec, horizontal line produces 1.0 */ - if (numeratorY <= 0) + + /* per spec, return 1.0 for a horizontal line */ + if (Syy == 0) PG_RETURN_FLOAT8(1.0); - PG_RETURN_FLOAT8((numeratorXY * numeratorXY) / - (numeratorX * numeratorY)); + PG_RETURN_FLOAT8((Sxy * Sxy) / (Sxx * Syy)); } Datum @@ -3063,33 +3283,25 @@ float8_regr_slope(PG_FUNCTION_ARGS) ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); float8 *transvalues; float8 N, - sumX, - sumX2, - sumY, - sumXY, - numeratorX, - numeratorXY; + Sxx, + Sxy; transvalues = check_float8_array(transarray, "float8_regr_slope", 6); N = transvalues[0]; - sumX = transvalues[1]; - sumX2 = transvalues[2]; - sumY = transvalues[3]; - sumXY = transvalues[5]; + Sxx = transvalues[2]; + Sxy = transvalues[5]; /* if N is 0 we should return NULL */ if (N < 1.0) PG_RETURN_NULL(); - numeratorX = N * sumX2 - sumX * sumX; - check_float8_val(numeratorX, isinf(sumX2) || isinf(sumX), true); - numeratorXY = N * sumXY - sumX * sumY; - check_float8_val(numeratorXY, isinf(sumXY) || isinf(sumX) || - isinf(sumY), true); - if (numeratorX <= 0) + /* Note that Sxx is guaranteed to be non-negative */ + + /* per spec, return NULL for a vertical line */ + if (Sxx == 0) PG_RETURN_NULL(); - PG_RETURN_FLOAT8(numeratorXY / numeratorX); + PG_RETURN_FLOAT8(Sxy / Sxx); } Datum @@ -3098,33 +3310,29 @@ float8_regr_intercept(PG_FUNCTION_ARGS) ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); float8 *transvalues; float8 N, - sumX, - sumX2, - sumY, - sumXY, - numeratorX, - numeratorXXY; + Sx, + Sxx, + Sy, + Sxy; transvalues = check_float8_array(transarray, "float8_regr_intercept", 6); N = transvalues[0]; - sumX = transvalues[1]; - sumX2 = transvalues[2]; - sumY = transvalues[3]; - sumXY = transvalues[5]; + Sx = transvalues[1]; + Sxx = transvalues[2]; + Sy = transvalues[3]; + Sxy = transvalues[5]; /* if N is 0 we should return NULL */ if (N < 1.0) PG_RETURN_NULL(); - numeratorX = N * sumX2 - sumX * sumX; - check_float8_val(numeratorX, isinf(sumX2) || isinf(sumX), true); - numeratorXXY = sumY * sumX2 - sumX * sumXY; - check_float8_val(numeratorXXY, isinf(sumY) || isinf(sumX2) || - isinf(sumX) || isinf(sumXY), true); - if (numeratorX <= 0) + /* Note that Sxx is guaranteed to be non-negative */ + + /* per spec, return NULL for a vertical line */ + if (Sxx == 0) PG_RETURN_NULL(); - PG_RETURN_FLOAT8(numeratorXXY / numeratorX); + PG_RETURN_FLOAT8((Sy - Sx * Sxy / Sxx) / N); } diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out index 5e21622754..717e965f30 100644 --- a/src/test/regress/expected/aggregates.out +++ b/src/test/regress/expected/aggregates.out @@ -198,6 +198,50 @@ select avg('NaN'::numeric) from generate_series(1,3); NaN (1 row) +-- verify correct results for infinite inputs +SELECT avg(x::float8), var_pop(x::float8) +FROM (VALUES ('1'), ('infinity')) v(x); + avg | var_pop +----------+--------- + Infinity | NaN +(1 row) + +SELECT avg(x::float8), var_pop(x::float8) +FROM (VALUES ('infinity'), ('1')) v(x); + avg | var_pop +----------+--------- + Infinity | NaN +(1 row) + +SELECT avg(x::float8), var_pop(x::float8) +FROM (VALUES ('infinity'), ('infinity')) v(x); + avg | var_pop +----------+--------- + Infinity | NaN +(1 row) + +SELECT avg(x::float8), var_pop(x::float8) +FROM (VALUES ('-infinity'), ('infinity')) v(x); + avg | var_pop +-----+--------- + NaN | NaN +(1 row) + +-- test accuracy with a large input offset +SELECT avg(x::float8), var_pop(x::float8) +FROM (VALUES (100000003), (100000004), (100000006), (100000007)) v(x); + avg | var_pop +-----------+--------- + 100000005 | 2.5 +(1 row) + +SELECT avg(x::float8), var_pop(x::float8) +FROM (VALUES (7000000000005), (7000000000007)) v(x); + avg | var_pop +---------------+--------- + 7000000000006 | 1 +(1 row) + -- SQL2003 binary aggregates SELECT regr_count(b, a) FROM aggtest; regr_count @@ -253,6 +297,90 @@ SELECT corr(b, a) FROM aggtest; 0.139634516517873 (1 row) +-- test accum and combine functions directly +CREATE TABLE regr_test (x float8, y float8); +INSERT INTO regr_test VALUES (10,150),(20,250),(30,350),(80,540),(100,200); +SELECT count(*), sum(x), regr_sxx(y,x), sum(y),regr_syy(y,x), regr_sxy(y,x) +FROM regr_test WHERE x IN (10,20,30,80); + count | sum | regr_sxx | sum | regr_syy | regr_sxy +-------+-----+----------+------+----------+---------- + 4 | 140 | 2900 | 1290 | 83075 | 15050 +(1 row) + +SELECT count(*), sum(x), regr_sxx(y,x), sum(y),regr_syy(y,x), regr_sxy(y,x) +FROM regr_test; + count | sum | regr_sxx | sum | regr_syy | regr_sxy +-------+-----+----------+------+----------+---------- + 5 | 240 | 6280 | 1490 | 95080 | 8680 +(1 row) + +SELECT float8_accum('{4,140,2900}'::float8[], 100); + float8_accum +-------------- + {5,240,6280} +(1 row) + +SELECT float8_regr_accum('{4,140,2900,1290,83075,15050}'::float8[], 200, 100); + float8_regr_accum +------------------------------ + {5,240,6280,1490,95080,8680} +(1 row) + +SELECT count(*), sum(x), regr_sxx(y,x), sum(y),regr_syy(y,x), regr_sxy(y,x) +FROM regr_test WHERE x IN (10,20,30); + count | sum | regr_sxx | sum | regr_syy | regr_sxy +-------+-----+----------+-----+----------+---------- + 3 | 60 | 200 | 750 | 20000 | 2000 +(1 row) + +SELECT count(*), sum(x), regr_sxx(y,x), sum(y),regr_syy(y,x), regr_sxy(y,x) +FROM regr_test WHERE x IN (80,100); + count | sum | regr_sxx | sum | regr_syy | regr_sxy +-------+-----+----------+-----+----------+---------- + 2 | 180 | 200 | 740 | 57800 | -3400 +(1 row) + +SELECT float8_combine('{3,60,200}'::float8[], '{0,0,0}'::float8[]); + float8_combine +---------------- + {3,60,200} +(1 row) + +SELECT float8_combine('{0,0,0}'::float8[], '{2,180,200}'::float8[]); + float8_combine +---------------- + {2,180,200} +(1 row) + +SELECT float8_combine('{3,60,200}'::float8[], '{2,180,200}'::float8[]); + float8_combine +---------------- + {5,240,6280} +(1 row) + +SELECT float8_regr_combine('{3,60,200,750,20000,2000}'::float8[], + '{0,0,0,0,0,0}'::float8[]); + float8_regr_combine +--------------------------- + {3,60,200,750,20000,2000} +(1 row) + +SELECT float8_regr_combine('{0,0,0,0,0,0}'::float8[], + '{2,180,200,740,57800,-3400}'::float8[]); + float8_regr_combine +----------------------------- + {2,180,200,740,57800,-3400} +(1 row) + +SELECT float8_regr_combine('{3,60,200,750,20000,2000}'::float8[], + '{2,180,200,740,57800,-3400}'::float8[]); + float8_regr_combine +------------------------------ + {5,240,6280,1490,95080,8680} +(1 row) + +DROP TABLE regr_test; +-- test count, distinct SELECT count(four) AS cnt_1000 FROM onek; cnt_1000 ---------- diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql index 7e77467ecd..83a5492fdd 100644 --- a/src/test/regress/sql/aggregates.sql +++ b/src/test/regress/sql/aggregates.sql @@ -51,6 +51,22 @@ select avg(null::float8) from generate_series(1,3); select sum('NaN'::numeric) from generate_series(1,3); select avg('NaN'::numeric) from generate_series(1,3); +-- verify correct results for infinite inputs +SELECT avg(x::float8), var_pop(x::float8) +FROM (VALUES ('1'), ('infinity')) v(x); +SELECT avg(x::float8), var_pop(x::float8) +FROM (VALUES ('infinity'), ('1')) v(x); +SELECT avg(x::float8), var_pop(x::float8) +FROM (VALUES ('infinity'), ('infinity')) v(x); +SELECT avg(x::float8), var_pop(x::float8) +FROM (VALUES ('-infinity'), ('infinity')) v(x); + +-- test accuracy with a large input offset +SELECT avg(x::float8), var_pop(x::float8) +FROM (VALUES (100000003), (100000004), (100000006), (100000007)) v(x); +SELECT avg(x::float8), var_pop(x::float8) +FROM (VALUES (7000000000005), (7000000000007)) v(x); + -- SQL2003 binary aggregates SELECT regr_count(b, a) FROM aggtest; SELECT regr_sxx(b, a) FROM aggtest; @@ -62,6 +78,31 @@ SELECT regr_slope(b, a), regr_intercept(b, a) FROM aggtest; SELECT covar_pop(b, a), covar_samp(b, a) FROM aggtest; SELECT corr(b, a) FROM aggtest; +-- test accum and combine functions directly +CREATE TABLE regr_test (x float8, y float8); +INSERT INTO regr_test VALUES (10,150),(20,250),(30,350),(80,540),(100,200); +SELECT count(*), sum(x), regr_sxx(y,x), sum(y),regr_syy(y,x), regr_sxy(y,x) +FROM regr_test WHERE x IN (10,20,30,80); +SELECT count(*), sum(x), regr_sxx(y,x), sum(y),regr_syy(y,x), regr_sxy(y,x) +FROM regr_test; +SELECT float8_accum('{4,140,2900}'::float8[], 100); +SELECT float8_regr_accum('{4,140,2900,1290,83075,15050}'::float8[], 200, 100); +SELECT count(*), sum(x), regr_sxx(y,x), sum(y),regr_syy(y,x), regr_sxy(y,x) +FROM regr_test WHERE x IN (10,20,30); +SELECT count(*), sum(x), regr_sxx(y,x), sum(y),regr_syy(y,x), regr_sxy(y,x) +FROM regr_test WHERE x IN (80,100); +SELECT float8_combine('{3,60,200}'::float8[], '{0,0,0}'::float8[]); +SELECT float8_combine('{0,0,0}'::float8[], '{2,180,200}'::float8[]); +SELECT float8_combine('{3,60,200}'::float8[], '{2,180,200}'::float8[]); +SELECT float8_regr_combine('{3,60,200,750,20000,2000}'::float8[], + '{0,0,0,0,0,0}'::float8[]); +SELECT float8_regr_combine('{0,0,0,0,0,0}'::float8[], + '{2,180,200,740,57800,-3400}'::float8[]); +SELECT float8_regr_combine('{3,60,200,750,20000,2000}'::float8[], + '{2,180,200,740,57800,-3400}'::float8[]); +DROP TABLE regr_test; + +-- test count, distinct SELECT count(four) AS cnt_1000 FROM onek; SELECT count(DISTINCT four) AS cnt_4 FROM onek;