Improve estimation of IN/NOT IN by assuming array elements are distinct.

In constructs such as "x IN (1,2,3,4)" and "x <> ALL(ARRAY[1,2,3,4])", we formerly always used a general-purpose assumption that the probability of success is independent for each comparison of "x" to an array element. But in real-world usage of these constructs, that's a pretty poor assumption; it's much saner to assume that the array elements are distinct and so the match probabilities are disjoint. Apply that assumption if the operator appears to behave as equality (for ANY) or inequality (for ALL). But fall back to the normal independent-probabilities calculation if this yields an impossible result, ie probability > 1 or < 0. We could protect ourselves against bad estimates even more by explicitly checking for equal array elements, but that is expensive and doesn't seem worthwhile: doing it would amount to optimizing for poorly-written queries at the expense of well-written ones. Daniele Varrazzo and Tom Lane, after a suggestion by Ants Aasma
2025-01-06 15:24:56 +08:00 · 2012-03-07 22:59:49 -05:00 · 2012-03-07 22:59:49 -05:00 · 66a7e6bae9
commit 66a7e6bae9
parent 1ed7f0e6b9
1 changed files with 71 additions and 3 deletions
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@ -1712,6 +1712,7 @@ scalararraysel(PlannerInfo *root,
 	RegProcedure oprsel;
 	FmgrInfo	oprselproc;
 	Selectivity s1;
+	Selectivity s1disjoint;

 	/* First, deconstruct the expression */
 	Assert(list_length(clause->args) == 2);
@ -1768,6 +1769,19 @@ scalararraysel(PlannerInfo *root,
 		return (Selectivity) 0.5;
 	fmgr_info(oprsel, &oprselproc);

+	/*
+	 * In the array-containment check above, we must only believe that an
+	 * operator is equality or inequality if it is the default btree equality
+	 * operator (or its negator) for the element type, since those are the
+	 * operators that array containment will use.  But in what follows, we can
+	 * be a little laxer, and also believe that any operators using eqsel() or
+	 * neqsel() as selectivity estimator act like equality or inequality.
+	 */
+	if (oprsel == F_EQSEL || oprsel == F_EQJOINSEL)
+		isEquality = true;
+	else if (oprsel == F_NEQSEL || oprsel == F_NEQJOINSEL)
+		isInequality = true;
+
 	/*
 	 * We consider three cases:
 	 *
@ -1802,7 +1816,23 @@ scalararraysel(PlannerInfo *root,
 						  ARR_ELEMTYPE(arrayval),
 						  elmlen, elmbyval, elmalign,
 						  &elem_values, &elem_nulls, &num_elems);
-		s1 = useOr ? 0.0 : 1.0;
+
+		/*
+		 * For generic operators, we assume the probability of success is
+		 * independent for each array element.  But for "= ANY" or "<> ALL",
+		 * if the array elements are distinct (which'd typically be the case)
+		 * then the probabilities are disjoint, and we should just sum them.
+		 *
+		 * If we were being really tense we would try to confirm that the
+		 * elements are all distinct, but that would be expensive and it
+		 * doesn't seem to be worth the cycles; it would amount to penalizing
+		 * well-written queries in favor of poorly-written ones.  However, we
+		 * do protect ourselves a little bit by checking whether the
+		 * disjointness assumption leads to an impossible (out of range)
+		 * probability; if so, we fall back to the normal calculation.
+		 */
+		s1 = s1disjoint = (useOr ? 0.0 : 1.0);
+
 		for (i = 0; i < num_elems; i++)
 		{
 			List	   *args;
@ -1829,11 +1859,25 @@ scalararraysel(PlannerInfo *root,
 												  ObjectIdGetDatum(operator),
 												  PointerGetDatum(args),
 												  Int32GetDatum(varRelid)));
+
 			if (useOr)
+			{
 				s1 = s1 + s2 - s1 * s2;
+				if (isEquality)
+					s1disjoint += s2;
+			}
 			else
+			{
 				s1 = s1 * s2;
+				if (isInequality)
+					s1disjoint += s2 - 1.0;
+			}
 		}
+
+		/* accept disjoint-probability estimate if in range */
+		if ((useOr ? isEquality : isInequality) &&
+			s1disjoint >= 0.0 && s1disjoint <= 1.0)
+			s1 = s1disjoint;
 	}
 	else if (rightop && IsA(rightop, ArrayExpr) &&
 			 !((ArrayExpr *) rightop)->multidims)
@ -1845,7 +1889,16 @@ scalararraysel(PlannerInfo *root,

 		get_typlenbyval(arrayexpr->element_typeid,
 						&elmlen, &elmbyval);
-		s1 = useOr ? 0.0 : 1.0;
+
+		/*
+		 * We use the assumption of disjoint probabilities here too, although
+		 * the odds of equal array elements are rather higher if the elements
+		 * are not all constants (which they won't be, else constant folding
+		 * would have reduced the ArrayExpr to a Const).  In this path it's
+		 * critical to have the sanity check on the s1disjoint estimate.
+		 */
+		s1 = s1disjoint = (useOr ? 0.0 : 1.0);
+
 		foreach(l, arrayexpr->elements)
 		{
 			Node	   *elem = (Node *) lfirst(l);
@ -1871,11 +1924,25 @@ scalararraysel(PlannerInfo *root,
 												  ObjectIdGetDatum(operator),
 												  PointerGetDatum(args),
 												  Int32GetDatum(varRelid)));
+
 			if (useOr)
+			{
 				s1 = s1 + s2 - s1 * s2;
+				if (isEquality)
+					s1disjoint += s2;
+			}
 			else
+			{
 				s1 = s1 * s2;
+				if (isInequality)
+					s1disjoint += s2 - 1.0;
+			}
 		}
+
+		/* accept disjoint-probability estimate if in range */
+		if ((useOr ? isEquality : isInequality) &&
+			s1disjoint >= 0.0 && s1disjoint <= 1.0)
+			s1 = s1disjoint;
 	}
 	else
 	{
@ -1911,7 +1978,8 @@ scalararraysel(PlannerInfo *root,

 		/*
 		 * Arbitrarily assume 10 elements in the eventual array value (see
-		 * also estimate_array_length)
+		 * also estimate_array_length).  We don't risk an assumption of
+		 * disjoint probabilities here.
 		 */
 		for (i = 0; i < 10; i++)
 		{