Phase 2 of hashed-aggregation project. nodeAgg.c now knows how to do
hashed aggregation, but there's not yet planner support for it.
2024-12-21 08:29:39 +08:00 · 2002-11-06 22:31:24 +00:00 · 2002-11-06 22:31:24 +00:00 · 2103b7baa2
commit 2103b7baa2
parent fc9814d17e
12 changed files with 696 additions and 266 deletions
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
--- a/src/backend/executor/nodeGroup.c
+++ b/src/backend/executor/nodeGroup.c
@ -15,7 +15,7 @@
 *	  locate group boundaries.
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/executor/nodeGroup.c,v 1.48 2002/11/06 00:00:43 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/executor/nodeGroup.c,v 1.49 2002/11/06 22:31:23 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -151,9 +151,8 @@ ExecInitGroup(Group *node, EState *estate, Plan *parent)
 	 */
 	grpstate = makeNode(GroupState);
 	node->grpstate = grpstate;
-	grpstate->grp_useFirstTuple = FALSE;
-	grpstate->grp_done = FALSE;
 	grpstate->grp_firstTuple = NULL;
+	grpstate->grp_done = FALSE;

 	/*
 	 * create expression context
@ -236,7 +235,6 @@ ExecReScanGroup(Group *node, ExprContext *exprCtxt, Plan *parent)
 {
 	GroupState *grpstate = node->grpstate;

-	grpstate->grp_useFirstTuple = FALSE;
 	grpstate->grp_done = FALSE;
 	if (grpstate->grp_firstTuple != NULL)
 	{
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@ -7,7 +7,8 @@
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
- *	$Id: nodeHash.c,v 1.66 2002/09/04 20:31:18 momjian Exp $
+ * IDENTIFICATION
+ *	  $Header: /cvsroot/pgsql/src/backend/executor/nodeHash.c,v 1.67 2002/11/06 22:31:23 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -31,8 +32,6 @@
 #include "utils/lsyscache.h"


-static uint32 hashFunc(Datum key, int typLen, bool byVal);
-
 /* ----------------------------------------------------------------
 *		ExecHash
 *
@ -532,7 +531,7 @@ ExecHashGetBucket(HashJoinTable hashtable,

 	/*
 	 * We reset the eval context each time to reclaim any memory leaked in
-	 * the hashkey expression or hashFunc itself.
+	 * the hashkey expression or ComputeHashFunc itself.
 	 */
 	ResetExprContext(econtext);

@ -550,7 +549,7 @@ ExecHashGetBucket(HashJoinTable hashtable,
 		bucketno = 0;
 	else
 	{
-		bucketno = hashFunc(keyval,
+		bucketno = ComputeHashFunc(keyval,
 								   (int) hashtable->typLen,
 								   hashtable->typByVal)
 			% (uint32) hashtable->totalbuckets;
@ -622,16 +621,16 @@ ExecScanHashBucket(HashJoinState *hjstate,
 }

 /* ----------------------------------------------------------------
- *		hashFunc
+ *		ComputeHashFunc
 *
- *		the hash function for hash joins
+ *		the hash function for hash joins (also used for hash aggregation)
 *
 *		XXX this probably ought to be replaced with datatype-specific
 *		hash functions, such as those already implemented for hash indexes.
 * ----------------------------------------------------------------
 */
-static uint32
-hashFunc(Datum key, int typLen, bool byVal)
+uint32
+ComputeHashFunc(Datum key, int typLen, bool byVal)
 {
 	unsigned char *k;

@ -681,7 +680,7 @@ hashFunc(Datum key, int typLen, bool byVal)
 		}
 		else
 		{
-			elog(ERROR, "hashFunc: Invalid typLen %d", typLen);
+			elog(ERROR, "ComputeHashFunc: Invalid typLen %d", typLen);
 			k = NULL;			/* keep compiler quiet */
 		}
 	}
--- a/src/backend/nodes/copyfuncs.c
+++ b/src/backend/nodes/copyfuncs.c
@ -15,7 +15,7 @@
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/nodes/copyfuncs.c,v 1.215 2002/11/06 00:00:43 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/nodes/copyfuncs.c,v 1.216 2002/11/06 22:31:23 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -524,6 +524,7 @@ _copyAgg(Agg *from)
 		memcpy(newnode->grpColIdx, from->grpColIdx,
 			   from->numCols * sizeof(AttrNumber));
 	}
+	newnode->numGroups = from->numGroups;

 	return newnode;
 }
--- a/src/backend/nodes/outfuncs.c
+++ b/src/backend/nodes/outfuncs.c
@ -5,7 +5,7 @@
 * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- *	$Header: /cvsroot/pgsql/src/backend/nodes/outfuncs.c,v 1.177 2002/11/06 00:00:44 tgl Exp $
+ *	$Header: /cvsroot/pgsql/src/backend/nodes/outfuncs.c,v 1.178 2002/11/06 22:31:24 tgl Exp $
 *
 * NOTES
 *	  Every (plan) node in POSTGRES has an associated "out" routine which
@ -597,8 +597,8 @@ _outAgg(StringInfo str, Agg *node)
 {
 	appendStringInfo(str, " AGG ");
 	_outPlanInfo(str, (Plan *) node);
-	appendStringInfo(str, " :aggstrategy %d :numCols %d ",
-					 (int) node->aggstrategy, node->numCols);
+	appendStringInfo(str, " :aggstrategy %d :numCols %d :numGroups %ld ",
+					 (int) node->aggstrategy, node->numCols, node->numGroups);
 }

 static void
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@ -10,7 +10,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/optimizer/plan/createplan.c,v 1.120 2002/11/06 00:00:44 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/optimizer/plan/createplan.c,v 1.121 2002/11/06 22:31:24 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -1675,6 +1675,7 @@ make_agg(List *tlist, List *qual, AggStrategy aggstrategy,
 		plan->plan_rows *= 0.1;
 		if (plan->plan_rows < 1)
 			plan->plan_rows = 1;
+		node->numGroups = (long) plan->plan_rows;
 	}

 	plan->state = (EState *) NULL;
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/optimizer/plan/planner.c,v 1.126 2002/11/06 00:00:44 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/optimizer/plan/planner.c,v 1.127 2002/11/06 22:31:24 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -931,6 +931,7 @@ grouping_planner(Query *parse, double tuple_fraction)
 		AttrNumber *groupColIdx = NULL;
 		Path	   *cheapest_path;
 		Path	   *sorted_path;
+		bool		use_hashed_grouping = false;

 		/* Preprocess targetlist in case we are inside an INSERT/UPDATE. */
 		tlist = preprocess_targetlist(tlist,
@ -1209,6 +1210,29 @@ grouping_planner(Query *parse, double tuple_fraction)
 		group_pathkeys = canonicalize_pathkeys(parse, group_pathkeys);
 		sort_pathkeys = canonicalize_pathkeys(parse, sort_pathkeys);

+		/*
+		 * Consider whether we might want to use hashed grouping.
+		 */
+		if (parse->groupClause)
+		{
+			/*
+			 * Executor doesn't support hashed aggregation with DISTINCT
+			 * aggregates.  (Doing so would imply storing *all* the input
+			 * values in the hash table, which seems like a certain loser.)
+			 */
+			if (parse->hasAggs &&
+				(contain_distinct_agg_clause((Node *) tlist) ||
+				 contain_distinct_agg_clause(parse->havingQual)))
+				use_hashed_grouping = false;
+			else
+			{
+#if 0							/* much more to do here */
+				/* TEMPORARY HOTWIRE FOR TESTING */
+				use_hashed_grouping = true;
+#endif
+			}
+		}
+
 		/*
 		 * Select the best path and create a plan to execute it.
 		 *
@ -1279,22 +1303,30 @@ grouping_planner(Query *parse, double tuple_fraction)
 		}

 		/*
-		 * If any aggregate is present, insert the Agg node, plus an explicit
-		 * sort if necessary.
+		 * Insert AGG or GROUP node if needed, plus an explicit sort step
+		 * if necessary.
 		 *
 		 * HAVING clause, if any, becomes qual of the Agg node
 		 */
-		if (parse->hasAggs)
+		if (use_hashed_grouping)
 		{
+			/* Hashed aggregate plan --- no sort needed */
+			result_plan = (Plan *) make_agg(tlist,
+											(List *) parse->havingQual,
+											AGG_HASHED,
+											length(parse->groupClause),
+											groupColIdx,
+											result_plan);
+			/* Hashed aggregation produces randomly-ordered results */
+			current_pathkeys = NIL;
+		}
+		else if (parse->hasAggs)
+		{
+			/* Plain aggregate plan --- sort if needed */
 			AggStrategy aggstrategy;

 			if (parse->groupClause)
 			{
-				aggstrategy = AGG_SORTED;
-				/*
-				 * Add an explicit sort if we couldn't make the path come out
-				 * the way the AGG node needs it.
-				 */
 				if (!pathkeys_contained_in(group_pathkeys, current_pathkeys))
 				{
 					result_plan = make_groupsortplan(parse,
@ -1303,9 +1335,18 @@ grouping_planner(Query *parse, double tuple_fraction)
 													 result_plan);
 					current_pathkeys = group_pathkeys;
 				}
+				aggstrategy = AGG_SORTED;
+				/*
+				 * The AGG node will not change the sort ordering of its
+				 * groups, so current_pathkeys describes the result too.
+				 */
 			}
 			else
+			{
 				aggstrategy = AGG_PLAIN;
+				/* Result will be only one row anyway; no sort order */
+				current_pathkeys = NIL;
+			}

 			result_plan = (Plan *) make_agg(tlist,
 											(List *) parse->havingQual,
@ -1313,10 +1354,6 @@ grouping_planner(Query *parse, double tuple_fraction)
 											length(parse->groupClause),
 											groupColIdx,
 											result_plan);
-			/*
-			 * Note: plain or grouped Agg does not affect any existing
-			 * sort order of the tuples
-			 */
 		}
 		else
 		{
--- a/src/backend/optimizer/util/clauses.c
+++ b/src/backend/optimizer/util/clauses.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/optimizer/util/clauses.c,v 1.109 2002/09/11 14:48:54 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/optimizer/util/clauses.c,v 1.110 2002/11/06 22:31:24 tgl Exp $
 *
 * HISTORY
 *	  AUTHOR			DATE			MAJOR EVENT
@ -46,6 +46,7 @@ typedef struct
 } check_subplans_for_ungrouped_vars_context;

 static bool contain_agg_clause_walker(Node *node, void *context);
+static bool contain_distinct_agg_clause_walker(Node *node, void *context);
 static bool pull_agg_clause_walker(Node *node, List **listptr);
 static bool expression_returns_set_walker(Node *node, void *context);
 static bool contain_subplans_walker(Node *node, void *context);
@ -410,6 +411,32 @@ contain_agg_clause_walker(Node *node, void *context)
 	return expression_tree_walker(node, contain_agg_clause_walker, context);
 }

+/*
+ * contain_distinct_agg_clause
+ *	  Recursively search for DISTINCT Aggref nodes within a clause.
+ *
+ *	  Returns true if any DISTINCT aggregate found.
+ */
+bool
+contain_distinct_agg_clause(Node *clause)
+{
+	return contain_distinct_agg_clause_walker(clause, NULL);
+}
+
+static bool
+contain_distinct_agg_clause_walker(Node *node, void *context)
+{
+	if (node == NULL)
+		return false;
+	if (IsA(node, Aggref))
+	{
+		if (((Aggref *) node)->aggdistinct)
+			return true;		/* abort the tree traversal and return
+								 * true */
+	}
+	return expression_tree_walker(node, contain_distinct_agg_clause_walker, context);
+}
+
 /*
 * pull_agg_clause
 *	  Recursively pulls all Aggref nodes from an expression tree.
--- a/src/include/executor/nodeHash.h
+++ b/src/include/executor/nodeHash.h
@ -7,7 +7,7 @@
 * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $Id: nodeHash.h,v 1.24 2002/06/20 20:29:49 momjian Exp $
+ * $Id: nodeHash.h,v 1.25 2002/11/06 22:31:24 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -36,5 +36,6 @@ extern void ExecChooseHashTableSize(double ntuples, int tupwidth,
 						int *virtualbuckets,
 						int *physicalbuckets,
 						int *numbatches);
+extern uint32 ComputeHashFunc(Datum key, int typLen, bool byVal);

 #endif   /* NODEHASH_H */
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@ -7,7 +7,7 @@
 * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $Id: execnodes.h,v 1.76 2002/11/06 00:00:44 tgl Exp $
+ * $Id: execnodes.h,v 1.77 2002/11/06 22:31:24 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -661,12 +661,18 @@ typedef struct MaterialState
 *
 *	csstate.css_ScanTupleSlot refers to output of underlying plan.
 *
- *	Note: the associated ExprContext contains ecxt_aggvalues and ecxt_aggnulls
- *	arrays, which hold the computed agg values for the current input group
- *	during evaluation of an Agg node's output tuple(s).
+ *	Note: csstate.cstate.cs_ExprContext contains ecxt_aggvalues and
+ *	ecxt_aggnulls arrays, which hold the computed agg values for the current
+ *	input group during evaluation of an Agg node's output tuple(s).  We
+ *	create a second ExprContext, tmpcontext, in which to evaluate input
+ *	expressions and run the aggregate transition functions.
 * -------------------------
 */
-typedef struct AggStatePerAggData *AggStatePerAgg;		/* private in nodeAgg.c */
+/* these structs are private in nodeAgg.c: */
+typedef struct AggStatePerAggData *AggStatePerAgg;
+typedef struct AggStatePerGroupData *AggStatePerGroup;
+typedef struct AggHashEntryData *AggHashEntry;
+typedef struct AggHashTableData *AggHashTable;

 typedef struct AggState
 {
@ -674,13 +680,18 @@ typedef struct AggState
 	List	   *aggs;			/* all Aggref nodes in targetlist & quals */
 	int			numaggs;		/* length of list (could be zero!) */
 	FmgrInfo   *eqfunctions;	/* per-grouping-field equality fns */
-	HeapTuple	grp_firstTuple;	/* copy of first tuple of current group */
-	AggStatePerAgg peragg;		/* per-Aggref working state */
-	MemoryContext tup_cxt;		/* context for per-output-tuple
-								 * expressions */
-	MemoryContext agg_cxt[2];	/* pair of expression eval memory contexts */
-	int			which_cxt;		/* 0 or 1, indicates current agg_cxt */
+	AggStatePerAgg peragg;		/* per-Aggref information */
+	MemoryContext aggcontext;	/* memory context for long-lived data */
+	ExprContext *tmpcontext;	/* econtext for input expressions */
 	bool		agg_done;		/* indicates completion of Agg scan */
+	/* these fields are used in AGG_PLAIN and AGG_SORTED modes: */
+	AggStatePerGroup pergroup;	/* per-Aggref-per-group working state */
+	HeapTuple	grp_firstTuple;	/* copy of first tuple of current group */
+	/* these fields are used in AGG_HASHED mode: */
+	AggHashTable hashtable;		/* hash table with one entry per group */
+	bool		table_filled;	/* hash table filled yet? */
+	AggHashEntry next_hash_entry; /* next entry in current chain */
+	int			next_hash_bucket; /* next chain */
 } AggState;

 /* ---------------------
@ -691,9 +702,8 @@ typedef struct GroupState
 {
 	CommonScanState csstate;	/* its first field is NodeTag */
 	FmgrInfo   *eqfunctions;	/* per-field lookup data for equality fns */
-	bool		grp_useFirstTuple;		/* first tuple not processed yet */
-	bool		grp_done;
 	HeapTuple	grp_firstTuple;	/* copy of first tuple of current group */
+	bool		grp_done;		/* indicates completion of Group scan */
 } GroupState;

 /* ----------------
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@ -7,7 +7,7 @@
 * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $Id: plannodes.h,v 1.59 2002/11/06 00:00:44 tgl Exp $
+ * $Id: plannodes.h,v 1.60 2002/11/06 22:31:24 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -349,6 +349,7 @@ typedef struct Agg
 	AggStrategy	aggstrategy;
 	int			numCols;		/* number of grouping columns */
 	AttrNumber *grpColIdx;		/* their indexes in the target list */
+	long		numGroups;		/* estimated number of groups in input */
 	AggState   *aggstate;
 } Agg;

--- a/src/include/optimizer/clauses.h
+++ b/src/include/optimizer/clauses.h
@ -7,7 +7,7 @@
 * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $Id: clauses.h,v 1.54 2002/09/11 14:48:55 tgl Exp $
+ * $Id: clauses.h,v 1.55 2002/11/06 22:31:24 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -40,6 +40,7 @@ extern Expr *make_ands_explicit(List *andclauses);
 extern List *make_ands_implicit(Expr *clause);

 extern bool contain_agg_clause(Node *clause);
+extern bool contain_distinct_agg_clause(Node *clause);
 extern List *pull_agg_clause(Node *clause);

 extern bool expression_returns_set(Node *clause);