Implement choice between hash-based and sort-based grouping for doing

DISTINCT processing on the output of an IN sub-select.
2025-03-13 19:57:53 +08:00 · 2003-01-22 00:07:00 +00:00 · 2003-01-22 00:07:00 +00:00 · e2114817c7
commit e2114817c7
parent a4482f4c4c
2 changed files with 105 additions and 12 deletions
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@ -10,12 +10,13 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/optimizer/plan/createplan.c,v 1.132 2003/01/20 18:54:52 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/optimizer/plan/createplan.c,v 1.133 2003/01/22 00:07:00 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
 #include "postgres.h"

+#include <limits.h>

 #include "nodes/makefuncs.h"
 #include "nodes/nodeFuncs.h"
@ -418,6 +419,7 @@ create_unique_plan(Query *root, UniquePath *best_path)
 	Plan	   *plan;
 	Plan	   *subplan;
 	List	   *sub_targetlist;
+	List	   *my_tlist;
 	List	   *l;

 	subplan = create_plan(root, best_path->subpath);
@ -474,21 +476,39 @@ create_unique_plan(Query *root, UniquePath *best_path)
 			subplan->targetlist = newtlist;
 	}

+	my_tlist = new_unsorted_tlist(subplan->targetlist);
+
 	if (best_path->use_hash)
 	{
-		elog(ERROR, "create_unique_plan: hash case not implemented yet");
-		plan = NULL;
+		int		numGroupCols = length(my_tlist);
+		long	numGroups;
+		AttrNumber *groupColIdx;
+		int		i;
+
+		numGroups = (long) Min(best_path->rows, (double) LONG_MAX);
+
+		groupColIdx = (AttrNumber *) palloc(numGroupCols * sizeof(AttrNumber));
+		for (i = 0; i < numGroupCols; i++)
+			groupColIdx[i] = i+1;
+
+		plan = (Plan *) make_agg(root,
+								 my_tlist,
+								 NIL,
+								 AGG_HASHED,
+								 numGroupCols,
+								 groupColIdx,
+								 numGroups,
+								 0,
+								 subplan);
 	}
 	else
 	{
-		List	   *sort_tlist;
 		List	   *sortList;

-		sort_tlist = new_unsorted_tlist(subplan->targetlist);
-		sortList = addAllTargetsToSortList(NIL, sort_tlist);
-		plan = (Plan *) make_sort_from_sortclauses(root, sort_tlist,
+		sortList = addAllTargetsToSortList(NIL, my_tlist);
+		plan = (Plan *) make_sort_from_sortclauses(root, my_tlist,
 												   subplan, sortList);
-		plan = (Plan *) make_unique(sort_tlist, plan, sortList);
+		plan = (Plan *) make_unique(my_tlist, plan, sortList);
 	}

 	plan->plan_rows = best_path->rows;
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/optimizer/util/pathnode.c,v 1.84 2003/01/20 18:54:56 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/optimizer/util/pathnode.c,v 1.85 2003/01/22 00:07:00 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -16,14 +16,22 @@

 #include <math.h>

+#include "catalog/pg_operator.h"
 #include "executor/executor.h"
+#include "miscadmin.h"
 #include "nodes/plannodes.h"
 #include "optimizer/cost.h"
 #include "optimizer/pathnode.h"
 #include "optimizer/paths.h"
 #include "optimizer/restrictinfo.h"
+#include "parser/parse_expr.h"
+#include "parser/parse_oper.h"
 #include "utils/memutils.h"
 #include "utils/selfuncs.h"
+#include "utils/syscache.h"
+
+
+static bool hash_safe_tlist(List *tlist);


 /*****************************************************************************
@ -506,6 +514,7 @@ create_unique_path(Query *root, RelOptInfo *rel, Path *subpath)
 {
 	UniquePath *pathnode;
 	Path		sort_path;		/* dummy for result of cost_sort */
+	Path		agg_path;		/* dummy for result of cost_agg */
 	MemoryContext oldcontext;
 	List	   *sub_targetlist;
 	List	   *l;
@ -587,16 +596,80 @@ create_unique_path(Query *root, RelOptInfo *rel, Path *subpath)
 	 */
 	sort_path.total_cost += cpu_operator_cost * rel->rows * numCols;

-	pathnode->use_hash = false;	/* for now */
+	/*
+	 * Is it safe to use a hashed implementation?  If so, estimate and
+	 * compare costs.  We only try this if we know the targetlist for
+	 * sure (else we can't be sure about the datatypes involved).
+	 */
+	pathnode->use_hash = false;
+	if (enable_hashagg && sub_targetlist && hash_safe_tlist(sub_targetlist))
+	{
+		/*
+		 * Estimate the overhead per hashtable entry at 64 bytes (same
+		 * as in planner.c).
+		 */
+		int		hashentrysize = rel->width + 64;

-	pathnode->path.startup_cost = sort_path.startup_cost;
-	pathnode->path.total_cost = sort_path.total_cost;
+		if (hashentrysize * pathnode->rows <= SortMem * 1024L)
+		{
+			cost_agg(&agg_path, root,
+					 AGG_HASHED, 0,
+					 numCols, pathnode->rows,
+					 subpath->startup_cost,
+					 subpath->total_cost,
+					 rel->rows);
+			if (agg_path.total_cost < sort_path.total_cost)
+				pathnode->use_hash = true;
+		}
+	}
+
+	if (pathnode->use_hash)
+	{
+		pathnode->path.startup_cost = agg_path.startup_cost;
+		pathnode->path.total_cost = agg_path.total_cost;
+	}
+	else
+	{
+		pathnode->path.startup_cost = sort_path.startup_cost;
+		pathnode->path.total_cost = sort_path.total_cost;
+	}

 	rel->cheapest_unique_path = (Path *) pathnode;

 	return pathnode;
 }

+/*
+ * hash_safe_tlist - can datatypes of given tlist be hashed?
+ *
+ * We assume hashed aggregation will work if the datatype's equality operator
+ * is marked hashjoinable.
+ *
+ * XXX this probably should be somewhere else.  See also hash_safe_grouping
+ * in plan/planner.c.
+ */
+static bool
+hash_safe_tlist(List *tlist)
+{
+	List	   *tl;
+
+	foreach(tl, tlist)
+	{
+		Node	   *expr = (Node *) lfirst(tl);
+		Operator	optup;
+		bool		oprcanhash;
+
+		optup = equality_oper(exprType(expr), true);
+		if (!optup)
+			return false;
+		oprcanhash = ((Form_pg_operator) GETSTRUCT(optup))->oprcanhash;
+		ReleaseSysCache(optup);
+		if (!oprcanhash)
+			return false;
+	}
+	return true;
+}
+
 /*
 * create_subqueryscan_path
 *	  Creates a path corresponding to a sequential scan of a subquery,