From 4767bc8ff2edc1258cf4d8a83155d4cedd724231 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 16 Feb 2012 17:33:28 -0500 Subject: [PATCH] Improve statistics estimation to make some use of DISTINCT in sub-queries. Formerly, we just punted when trying to estimate stats for variables coming out of sub-queries using DISTINCT, on the grounds that whatever stats we might have for underlying table columns would be inapplicable. But if the sub-query has only one DISTINCT column, we can consider its output variable as being unique, which is useful information all by itself. The scope of this improvement is pretty narrow, but it costs nearly nothing, so we might as well do it. Per discussion with Andres Freund. This patch differs from the draft I submitted yesterday in updating various comments about vardata.isunique (to reflect its extended meaning) and in tweaking the interaction with security_barrier views. There does not seem to be a reason why we can't use this sort of knowledge even when the sub-query is such a view. --- src/backend/utils/adt/selfuncs.c | 94 ++++++++++++++++++++------------ src/include/utils/selfuncs.h | 2 +- 2 files changed, 60 insertions(+), 36 deletions(-) diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 6d78068476..0a685aac2c 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -110,6 +110,7 @@ #include "optimizer/predtest.h" #include "optimizer/restrictinfo.h" #include "optimizer/var.h" +#include "parser/parse_clause.h" #include "parser/parse_coerce.h" #include "parser/parsetree.h" #include "utils/builtins.h" @@ -255,10 +256,11 @@ var_eq_const(VariableStatData *vardata, Oid operator, return 0.0; /* - * If we matched the var to a unique index, assume there is exactly one - * match regardless of anything else. (This is slightly bogus, since the - * index's equality operator might be different from ours, but it's more - * likely to be right than ignoring the information.) + * If we matched the var to a unique index or DISTINCT clause, assume + * there is exactly one match regardless of anything else. (This is + * slightly bogus, since the index or clause's equality operator might be + * different from ours, but it's much more likely to be right than + * ignoring the information.) */ if (vardata->isunique && vardata->rel && vardata->rel->tuples >= 1.0) return 1.0 / vardata->rel->tuples; @@ -389,10 +391,11 @@ var_eq_non_const(VariableStatData *vardata, Oid operator, bool isdefault; /* - * If we matched the var to a unique index, assume there is exactly one - * match regardless of anything else. (This is slightly bogus, since the - * index's equality operator might be different from ours, but it's more - * likely to be right than ignoring the information.) + * If we matched the var to a unique index or DISTINCT clause, assume + * there is exactly one match regardless of anything else. (This is + * slightly bogus, since the index or clause's equality operator might be + * different from ours, but it's much more likely to be right than + * ignoring the information.) */ if (vardata->isunique && vardata->rel && vardata->rel->tuples >= 1.0) return 1.0 / vardata->rel->tuples; @@ -4128,10 +4131,11 @@ get_join_variables(PlannerInfo *root, List *args, SpecialJoinInfo *sjinfo, * atttype, atttypmod: type data to pass to get_attstatsslot(). This is * commonly the same as the exposed type of the variable argument, * but can be different in binary-compatible-type cases. - * isunique: TRUE if we were able to match the var to a unique index, - * implying its values are unique for this query. (Caution: this - * should be trusted for statistical purposes only, since we do not - * check indimmediate.) + * isunique: TRUE if we were able to match the var to a unique index or a + * single-column DISTINCT clause, implying its values are unique for + * this query. (Caution: this should be trusted for statistical + * purposes only, since we do not check indimmediate nor verify that + * the exact same definition of equality applies.) * * Caller is responsible for doing ReleaseVariableStats() before exiting. */ @@ -4357,32 +4361,21 @@ examine_simple_variable(PlannerInfo *root, Var *var, { /* * Plain subquery (not one that was converted to an appendrel). - * - * Punt if subquery uses set operations, GROUP BY, or DISTINCT --- any - * of these will mash underlying columns' stats beyond recognition. - * (Set ops are particularly nasty; if we forged ahead, we would - * return stats relevant to only the leftmost subselect...) */ Query *subquery = rte->subquery; RelOptInfo *rel; TargetEntry *ste; - if (subquery->setOperations || - subquery->groupClause || - subquery->distinctClause) - return; - /* - * If the sub-query originated from a view with the security_barrier - * attribute, we treat it as a black-box from outside of the view. - * This is probably a harsher restriction than necessary; it's - * certainly OK for the selectivity estimator (which is a C function, - * and therefore omnipotent anyway) to look at the statistics. But - * many selectivity estimators will happily *invoke the operator - * function* to try to work out a good estimate - and that's not OK. - * So for now, we do this. + * Punt if subquery uses set operations or GROUP BY, as these will + * mash underlying columns' stats beyond recognition. (Set ops are + * particularly nasty; if we forged ahead, we would return stats + * relevant to only the leftmost subselect...) DISTINCT is also + * problematic, but we check that later because there is a possibility + * of learning something even with it. */ - if (rte->security_barrier) + if (subquery->setOperations || + subquery->groupClause) return; /* @@ -4415,6 +4408,37 @@ examine_simple_variable(PlannerInfo *root, Var *var, rte->eref->aliasname, var->varattno); var = (Var *) ste->expr; + /* + * If subquery uses DISTINCT, we can't make use of any stats for the + * variable ... but, if it's the only DISTINCT column, we are entitled + * to consider it unique. We do the test this way so that it works + * for cases involving DISTINCT ON. + */ + if (subquery->distinctClause) + { + if (list_length(subquery->distinctClause) == 1 && + targetIsInSortList(ste, InvalidOid, subquery->distinctClause)) + vardata->isunique = true; + /* cannot go further */ + return; + } + + /* + * If the sub-query originated from a view with the security_barrier + * attribute, we must not look at the variable's statistics, though + * it seems all right to notice the existence of a DISTINCT clause. + * So stop here. + * + * This is probably a harsher restriction than necessary; it's + * certainly OK for the selectivity estimator (which is a C function, + * and therefore omnipotent anyway) to look at the statistics. But + * many selectivity estimators will happily *invoke the operator + * function* to try to work out a good estimate - and that's not OK. + * So for now, don't dig down for stats. + */ + if (rte->security_barrier) + return; + /* Can only handle a simple Var of subquery's query level */ if (var && IsA(var, Var) && var->varlevelsup == 0) @@ -4513,10 +4537,10 @@ get_variable_numdistinct(VariableStatData *vardata, bool *isdefault) } /* - * If there is a unique index for the variable, assume it is unique no - * matter what pg_statistic says; the statistics could be out of date, or - * we might have found a partial unique index that proves the var is - * unique for this query. + * If there is a unique index or DISTINCT clause for the variable, assume + * it is unique no matter what pg_statistic says; the statistics could be + * out of date, or we might have found a partial unique index that proves + * the var is unique for this query. */ if (vardata->isunique) stadistinct = -1.0; diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h index 78eda1b503..bffc2d80ef 100644 --- a/src/include/utils/selfuncs.h +++ b/src/include/utils/selfuncs.h @@ -74,7 +74,7 @@ typedef struct VariableStatData Oid vartype; /* exposed type of expression */ Oid atttype; /* type to pass to get_attstatsslot */ int32 atttypmod; /* typmod to pass to get_attstatsslot */ - bool isunique; /* true if matched to a unique index */ + bool isunique; /* matches unique index or DISTINCT clause */ } VariableStatData; #define ReleaseVariableStats(vardata) \