From 8787bc8ef39f962f76064c25971ca28a870fb4e4 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 13 Feb 2004 06:39:49 +0000 Subject: [PATCH] After further thought about support for gathering stats on functional indexes, it seems like we ought to put another layer of indirection between the compute_stats functions and the actual data storage. This would allow us to compute the values on-the-fly, for example. --- src/backend/commands/analyze.c | 101 ++++++++++++++++++++------------- src/include/commands/vacuum.h | 32 +++++++---- 2 files changed, 81 insertions(+), 52 deletions(-) diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index eb8716b488..aba4255595 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/analyze.c,v 1.68 2004/02/12 23:41:02 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/commands/analyze.c,v 1.69 2004/02/13 06:39:49 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -52,6 +52,7 @@ static double init_selection_state(int n); static double select_next_random_record(double t, int n, double *stateptr); static int compare_rows(const void *a, const void *b); static void update_attstats(Oid relid, int natts, VacAttrStats **vacattrstats); +static Datum std_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull); static bool std_typanalyze(VacAttrStats *stats); @@ -259,12 +260,14 @@ analyze_rel(Oid relid, VacuumStmt *vacstmt) old_context = MemoryContextSwitchTo(col_context); for (i = 0; i < attr_cnt; i++) { - (*vacattrstats[i]->compute_stats) (vacattrstats[i], - vacattrstats[i]->tupattnum, - onerel->rd_att, - totalrows, - rows, - numrows); + VacAttrStats *stats = vacattrstats[i]; + + stats->rows = rows; + stats->tupDesc = onerel->rd_att; + (*stats->compute_stats) (stats, + std_fetch_func, + numrows, + totalrows); MemoryContextResetAndDeleteChildren(col_context); } MemoryContextSwitchTo(old_context); @@ -861,6 +864,22 @@ update_attstats(Oid relid, int natts, VacAttrStats **vacattrstats) heap_close(sd, RowExclusiveLock); } +/* + * Standard fetch function for use by compute_stats subroutines. + * + * This exists to provide some insulation between compute_stats routines + * and the actual storage of the sample data. + */ +static Datum +std_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull) +{ + int attnum = stats->tupattnum; + HeapTuple tuple = stats->rows[rownum]; + TupleDesc tupDesc = stats->tupDesc; + + return heap_getattr(tuple, attnum, tupDesc, isNull); +} + /*========================================================================== * @@ -915,12 +934,14 @@ static SortFunctionKind datumCmpFnKind; static int *datumCmpTupnoLink; -static void compute_minimal_stats(VacAttrStats *stats, int attnum, - TupleDesc tupDesc, double totalrows, - HeapTuple *rows, int numrows); -static void compute_scalar_stats(VacAttrStats *stats, int attnum, - TupleDesc tupDesc, double totalrows, - HeapTuple *rows, int numrows); +static void compute_minimal_stats(VacAttrStatsP stats, + AnalyzeAttrFetchFunc fetchfunc, + int samplerows, + double totalrows); +static void compute_scalar_stats(VacAttrStatsP stats, + AnalyzeAttrFetchFunc fetchfunc, + int samplerows, + double totalrows); static int compare_scalars(const void *a, const void *b); static int compare_mcvs(const void *a, const void *b); @@ -1024,9 +1045,10 @@ std_typanalyze(VacAttrStats *stats) * depend mainly on the length of the list we are willing to keep. */ static void -compute_minimal_stats(VacAttrStats *stats, int attnum, - TupleDesc tupDesc, double totalrows, - HeapTuple *rows, int numrows) +compute_minimal_stats(VacAttrStatsP stats, + AnalyzeAttrFetchFunc fetchfunc, + int samplerows, + double totalrows) { int i; int null_cnt = 0; @@ -1061,9 +1083,8 @@ compute_minimal_stats(VacAttrStats *stats, int attnum, fmgr_info(mystats->eqfunc, &f_cmpeq); - for (i = 0; i < numrows; i++) + for (i = 0; i < samplerows; i++) { - HeapTuple tuple = rows[i]; Datum value; bool isnull; bool match; @@ -1072,7 +1093,7 @@ compute_minimal_stats(VacAttrStats *stats, int attnum, vacuum_delay_point(); - value = heap_getattr(tuple, attnum, tupDesc, &isnull); + value = fetchfunc(stats, i, &isnull); /* Check for null/nonnull */ if (isnull) @@ -1166,7 +1187,7 @@ compute_minimal_stats(VacAttrStats *stats, int attnum, stats->stats_valid = true; /* Do the simple null-frac and width stats */ - stats->stanullfrac = (double) null_cnt / (double) numrows; + stats->stanullfrac = (double) null_cnt / (double) samplerows; if (is_varwidth) stats->stawidth = total_width / (double) nonnull_cnt; else @@ -1222,10 +1243,10 @@ compute_minimal_stats(VacAttrStats *stats, int attnum, denom, stadistinct; - numer = (double) numrows *(double) d; + numer = (double) samplerows *(double) d; - denom = (double) (numrows - f1) + - (double) f1 *(double) numrows / totalrows; + denom = (double) (samplerows - f1) + + (double) f1 *(double) samplerows / totalrows; stadistinct = numer / denom; /* Clamp to sane range in case of roundoff error */ @@ -1270,7 +1291,7 @@ compute_minimal_stats(VacAttrStats *stats, int attnum, if (ndistinct < 0) ndistinct = -ndistinct * totalrows; /* estimate # of occurrences in sample of a typical value */ - avgcount = (double) numrows / ndistinct; + avgcount = (double) samplerows / ndistinct; /* set minimum threshold count to store a value */ mincount = avgcount * 1.25; if (mincount < 2) @@ -1303,7 +1324,7 @@ compute_minimal_stats(VacAttrStats *stats, int attnum, mcv_values[i] = datumCopy(track[i].value, stats->attr->attbyval, stats->attr->attlen); - mcv_freqs[i] = (double) track[i].count / (double) numrows; + mcv_freqs[i] = (double) track[i].count / (double) samplerows; } MemoryContextSwitchTo(old_context); @@ -1333,9 +1354,10 @@ compute_minimal_stats(VacAttrStats *stats, int attnum, * data values into order. */ static void -compute_scalar_stats(VacAttrStats *stats, int attnum, - TupleDesc tupDesc, double totalrows, - HeapTuple *rows, int numrows) +compute_scalar_stats(VacAttrStatsP stats, + AnalyzeAttrFetchFunc fetchfunc, + int samplerows, + double totalrows) { int i; int null_cnt = 0; @@ -1359,23 +1381,22 @@ compute_scalar_stats(VacAttrStats *stats, int attnum, int num_bins = stats->attr->attstattarget; StdAnalyzeData *mystats = (StdAnalyzeData *) stats->extra_data; - values = (ScalarItem *) palloc(numrows * sizeof(ScalarItem)); - tupnoLink = (int *) palloc(numrows * sizeof(int)); + values = (ScalarItem *) palloc(samplerows * sizeof(ScalarItem)); + tupnoLink = (int *) palloc(samplerows * sizeof(int)); track = (ScalarMCVItem *) palloc(num_mcv * sizeof(ScalarMCVItem)); SelectSortFunction(mystats->ltopr, &cmpFn, &cmpFnKind); fmgr_info(cmpFn, &f_cmpfn); /* Initial scan to find sortable values */ - for (i = 0; i < numrows; i++) + for (i = 0; i < samplerows; i++) { - HeapTuple tuple = rows[i]; Datum value; bool isnull; vacuum_delay_point(); - value = heap_getattr(tuple, attnum, tupDesc, &isnull); + value = fetchfunc(stats, i, &isnull); /* Check for null/nonnull */ if (isnull) @@ -1505,7 +1526,7 @@ compute_scalar_stats(VacAttrStats *stats, int attnum, stats->stats_valid = true; /* Do the simple null-frac and width stats */ - stats->stanullfrac = (double) null_cnt / (double) numrows; + stats->stanullfrac = (double) null_cnt / (double) samplerows; if (is_varwidth) stats->stawidth = total_width / (double) nonnull_cnt; else @@ -1546,10 +1567,10 @@ compute_scalar_stats(VacAttrStats *stats, int attnum, denom, stadistinct; - numer = (double) numrows *(double) d; + numer = (double) samplerows *(double) d; - denom = (double) (numrows - f1) + - (double) f1 *(double) numrows / totalrows; + denom = (double) (samplerows - f1) + + (double) f1 *(double) samplerows / totalrows; stadistinct = numer / denom; /* Clamp to sane range in case of roundoff error */ @@ -1599,13 +1620,13 @@ compute_scalar_stats(VacAttrStats *stats, int attnum, if (ndistinct < 0) ndistinct = -ndistinct * totalrows; /* estimate # of occurrences in sample of a typical value */ - avgcount = (double) numrows / ndistinct; + avgcount = (double) samplerows / ndistinct; /* set minimum threshold count to store a value */ mincount = avgcount * 1.25; if (mincount < 2) mincount = 2; /* don't let threshold exceed 1/K, however */ - maxmincount = (double) numrows / (double) num_bins; + maxmincount = (double) samplerows / (double) num_bins; if (mincount > maxmincount) mincount = maxmincount; if (num_mcv > track_cnt) @@ -1636,7 +1657,7 @@ compute_scalar_stats(VacAttrStats *stats, int attnum, mcv_values[i] = datumCopy(values[track[i].first].value, stats->attr->attbyval, stats->attr->attlen); - mcv_freqs[i] = (double) track[i].count / (double) numrows; + mcv_freqs[i] = (double) track[i].count / (double) samplerows; } MemoryContextSwitchTo(old_context); diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h index b80f068356..8c58f1ac38 100644 --- a/src/include/commands/vacuum.h +++ b/src/include/commands/vacuum.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/commands/vacuum.h,v 1.49 2004/02/12 23:41:04 tgl Exp $ + * $PostgreSQL: pgsql/src/include/commands/vacuum.h,v 1.50 2004/02/13 06:39:49 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -40,18 +40,18 @@ * and must return TRUE to continue analysis, FALSE to skip analysis of this * column. In the TRUE case it must set the compute_stats and minrows fields, * and can optionally set extra_data to pass additional info to compute_stats. + * minrows is its request for the minimum number of sample rows to be gathered + * (but note this request might not be honored, eg if there are fewer rows + * than that in the table). * * The compute_stats routine will be called after sample rows have been * gathered. Aside from this struct, it is passed: - * attnum: attribute number within the supplied tuples - * tupDesc: tuple descriptor for the supplied tuples + * fetchfunc: a function for accessing the column values from the + * sample rows + * samplerows: the number of sample tuples * totalrows: estimated total number of rows in relation - * rows: an array of the sample tuples - * numrows: the number of sample tuples - * Note that the passed attnum and tupDesc could possibly be different from - * what one would expect by looking at the pg_attribute row. It is important - * to use these values for extracting attribute values from the given rows - * (and not for any other purpose). + * The fetchfunc may be called with rownum running from 0 to samplerows-1. + * It returns a Datum and an isNull flag. * * compute_stats should set stats_valid TRUE if it is able to compute * any useful statistics. If it does, the remainder of the struct holds @@ -60,6 +60,11 @@ * be CurrentMemoryContext when compute_stats is called. *---------- */ +typedef struct VacAttrStats *VacAttrStatsP; + +typedef Datum (*AnalyzeAttrFetchFunc) (VacAttrStatsP stats, int rownum, + bool *isNull); + typedef struct VacAttrStats { /* @@ -74,9 +79,10 @@ typedef struct VacAttrStats * These fields must be filled in by the typanalyze routine, * unless it returns FALSE. */ - void (*compute_stats) (struct VacAttrStats *stats, int attnum, - TupleDesc tupDesc, double totalrows, - HeapTuple *rows, int numrows); + void (*compute_stats) (VacAttrStatsP stats, + AnalyzeAttrFetchFunc fetchfunc, + int samplerows, + double totalrows); int minrows; /* Minimum # of rows wanted for stats */ void *extra_data; /* for extra type-specific data */ @@ -100,6 +106,8 @@ typedef struct VacAttrStats * be looked at by type-specific functions. */ int tupattnum; /* attribute number within tuples */ + HeapTuple *rows; /* access info for fetch function */ + TupleDesc tupDesc; } VacAttrStats;