From e3ffd05b02468b1a53de31a322cedf195576a625 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 13 Apr 2012 15:32:34 -0400 Subject: [PATCH] Weaken the planner's tests for relevant joinclauses. We should be willing to cross-join two small relations if that allows us to use an inner indexscan on a large relation (that is, the potential indexqual for the large table requires both smaller relations). This worked in simple cases but fell apart as soon as there was a join clause to a fourth relation, because the existence of any two-relation join clause caused the planner to not consider clauseless joins between other base relations. The added regression test shows an example case adapted from a recent complaint from Benoit Delbosc. Adjust have_relevant_joinclause, have_relevant_eclass_joinclause, and has_relevant_eclass_joinclause to consider that a join clause mentioning three or more relations is sufficient grounds for joining any subset of those relations, even if we have to do so via a cartesian join. Since such clauses are relatively uncommon, this shouldn't affect planning speed on typical queries; in fact it should help a bit, because the latter two functions in particular get significantly simpler. Although this is arguably a bug fix, I'm not going to risk back-patching it, since it might have currently-unforeseen consequences. --- src/backend/optimizer/path/equivclass.c | 96 +++++-------------------- src/backend/optimizer/path/joinrels.c | 14 ++-- src/backend/optimizer/util/joininfo.c | 24 +++++-- src/test/regress/expected/join.out | 51 +++++++++++++ src/test/regress/sql/join.sql | 18 +++++ 5 files changed, 110 insertions(+), 93 deletions(-) diff --git a/src/backend/optimizer/path/equivclass.c b/src/backend/optimizer/path/equivclass.c index c115c2148d..5cdfec542c 100644 --- a/src/backend/optimizer/path/equivclass.c +++ b/src/backend/optimizer/path/equivclass.c @@ -2035,7 +2035,7 @@ get_parent_relid(PlannerInfo *root, RelOptInfo *rel) /* * have_relevant_eclass_joinclause * Detect whether there is an EquivalenceClass that could produce - * a joinclause between the two given relations. + * a joinclause involving the two given relations. * * This is essentially a very cut-down version of * generate_join_implied_equalities(). Note it's OK to occasionally say "yes" @@ -2051,9 +2051,6 @@ have_relevant_eclass_joinclause(PlannerInfo *root, foreach(lc1, root->eq_classes) { EquivalenceClass *ec = (EquivalenceClass *) lfirst(lc1); - bool has_rel1; - bool has_rel2; - ListCell *lc2; /* * Won't generate joinclauses if single-member (this test covers the @@ -2063,9 +2060,18 @@ have_relevant_eclass_joinclause(PlannerInfo *root, continue; /* + * We do not need to examine the individual members of the EC, because + * all that we care about is whether each rel overlaps the relids of + * at least one member, and a test on ec_relids is sufficient to prove + * that. (As with have_relevant_joinclause(), it is not necessary + * that the EC be able to form a joinclause relating exactly the two + * given rels, only that it be able to form a joinclause mentioning + * both, and this will surely be true if both of them overlap + * ec_relids.) + * * Note we don't test ec_broken; if we did, we'd need a separate code - * path to look through ec_sources. Checking the members anyway is OK - * as a possibly-overoptimistic heuristic. + * path to look through ec_sources. Checking the membership anyway is + * OK as a possibly-overoptimistic heuristic. * * We don't test ec_has_const either, even though a const eclass won't * generate real join clauses. This is because if we had "WHERE a.x = @@ -2073,35 +2079,8 @@ have_relevant_eclass_joinclause(PlannerInfo *root, * since the join result is likely to be small even though it'll end * up being an unqualified nestloop. */ - - /* Needn't scan if it couldn't contain members from each rel */ - if (!bms_overlap(rel1->relids, ec->ec_relids) || - !bms_overlap(rel2->relids, ec->ec_relids)) - continue; - - /* Scan the EC to see if it has member(s) in each rel */ - has_rel1 = has_rel2 = false; - foreach(lc2, ec->ec_members) - { - EquivalenceMember *cur_em = (EquivalenceMember *) lfirst(lc2); - - if (cur_em->em_is_const || cur_em->em_is_child) - continue; /* ignore consts and children here */ - if (bms_is_subset(cur_em->em_relids, rel1->relids)) - { - has_rel1 = true; - if (has_rel2) - break; - } - if (bms_is_subset(cur_em->em_relids, rel2->relids)) - { - has_rel2 = true; - if (has_rel1) - break; - } - } - - if (has_rel1 && has_rel2) + if (bms_overlap(rel1->relids, ec->ec_relids) && + bms_overlap(rel2->relids, ec->ec_relids)) return true; } @@ -2112,7 +2091,7 @@ have_relevant_eclass_joinclause(PlannerInfo *root, /* * has_relevant_eclass_joinclause * Detect whether there is an EquivalenceClass that could produce - * a joinclause between the given relation and anything else. + * a joinclause involving the given relation and anything else. * * This is the same as have_relevant_eclass_joinclause with the other rel * implicitly defined as "everything else in the query". @@ -2125,9 +2104,6 @@ has_relevant_eclass_joinclause(PlannerInfo *root, RelOptInfo *rel1) foreach(lc1, root->eq_classes) { EquivalenceClass *ec = (EquivalenceClass *) lfirst(lc1); - bool has_rel1; - bool has_rel2; - ListCell *lc2; /* * Won't generate joinclauses if single-member (this test covers the @@ -2137,45 +2113,11 @@ has_relevant_eclass_joinclause(PlannerInfo *root, RelOptInfo *rel1) continue; /* - * Note we don't test ec_broken; if we did, we'd need a separate code - * path to look through ec_sources. Checking the members anyway is OK - * as a possibly-overoptimistic heuristic. - * - * We don't test ec_has_const either, even though a const eclass won't - * generate real join clauses. This is because if we had "WHERE a.x = - * b.y and a.x = 42", it is worth considering a join between a and b, - * since the join result is likely to be small even though it'll end - * up being an unqualified nestloop. + * Per the comment in have_relevant_eclass_joinclause, it's sufficient + * to find an EC that mentions both this rel and some other rel. */ - - /* Needn't scan if it couldn't contain members from each rel */ - if (!bms_overlap(rel1->relids, ec->ec_relids) || - bms_is_subset(ec->ec_relids, rel1->relids)) - continue; - - /* Scan the EC to see if it has member(s) in each rel */ - has_rel1 = has_rel2 = false; - foreach(lc2, ec->ec_members) - { - EquivalenceMember *cur_em = (EquivalenceMember *) lfirst(lc2); - - if (cur_em->em_is_const || cur_em->em_is_child) - continue; /* ignore consts and children here */ - if (bms_is_subset(cur_em->em_relids, rel1->relids)) - { - has_rel1 = true; - if (has_rel2) - break; - } - if (!bms_overlap(cur_em->em_relids, rel1->relids)) - { - has_rel2 = true; - if (has_rel1) - break; - } - } - - if (has_rel1 && has_rel2) + if (bms_overlap(rel1->relids, ec->ec_relids) && + !bms_is_subset(ec->ec_relids, rel1->relids)) return true; } diff --git a/src/backend/optimizer/path/joinrels.c b/src/backend/optimizer/path/joinrels.c index 4a35d8d3a4..2ad0b969d2 100644 --- a/src/backend/optimizer/path/joinrels.c +++ b/src/backend/optimizer/path/joinrels.c @@ -88,13 +88,9 @@ join_search_one_level(PlannerInfo *root, int level) has_join_restriction(root, old_rel)) { /* - * Note that if all available join clauses for this rel require - * more than one other rel, we will fail to make any joins against - * it here. In most cases that's OK; it'll be considered by - * "bushy plan" join code in a higher-level pass where we have - * those other rels collected into a join rel. - * - * See also the last-ditch case below. + * There are relevant join clauses or join order restrictions, + * so consider joins between this rel and (only) those rels it is + * linked to by a clause or restriction. */ make_rels_by_clause_joins(root, old_rel, @@ -160,8 +156,8 @@ join_search_one_level(PlannerInfo *root, int level) { /* * OK, we can build a rel of the right level from this - * pair of rels. Do so if there is at least one usable - * join clause or a relevant join restriction. + * pair of rels. Do so if there is at least one relevant + * join clause or join order restriction. */ if (have_relevant_joinclause(root, old_rel, new_rel) || have_join_order_restriction(root, old_rel, new_rel)) diff --git a/src/backend/optimizer/util/joininfo.c b/src/backend/optimizer/util/joininfo.c index b582ccdc00..20d57c5140 100644 --- a/src/backend/optimizer/util/joininfo.c +++ b/src/backend/optimizer/util/joininfo.c @@ -21,34 +21,46 @@ /* * have_relevant_joinclause - * Detect whether there is a joinclause that can be used to join + * Detect whether there is a joinclause that involves * the two given relations. + * + * Note: the joinclause does not have to be evaluatable with only these two + * relations. This is intentional. For example consider + * SELECT * FROM a, b, c WHERE a.x = (b.y + c.z) + * If a is much larger than the other tables, it may be worthwhile to + * cross-join b and c and then use an inner indexscan on a.x. Therefore + * we should consider this joinclause as reason to join b to c, even though + * it can't be applied at that join step. */ bool have_relevant_joinclause(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2) { bool result = false; - Relids join_relids; List *joininfo; + Relids other_relids; ListCell *l; - join_relids = bms_union(rel1->relids, rel2->relids); - /* * We could scan either relation's joininfo list; may as well use the * shorter one. */ if (list_length(rel1->joininfo) <= list_length(rel2->joininfo)) + { joininfo = rel1->joininfo; + other_relids = rel2->relids; + } else + { joininfo = rel2->joininfo; + other_relids = rel1->relids; + } foreach(l, joininfo) { RestrictInfo *rinfo = (RestrictInfo *) lfirst(l); - if (bms_is_subset(rinfo->required_relids, join_relids)) + if (bms_overlap(other_relids, rinfo->required_relids)) { result = true; break; @@ -62,8 +74,6 @@ have_relevant_joinclause(PlannerInfo *root, if (!result && rel1->has_eclass_joins && rel2->has_eclass_joins) result = have_relevant_eclass_joinclause(root, rel1, rel2); - bms_free(join_relids); - return result; } diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out index c0c7283333..e5dceff765 100644 --- a/src/test/regress/expected/join.out +++ b/src/test/regress/expected/join.out @@ -2666,6 +2666,57 @@ select * from int4_tbl a full join int4_tbl b on false; -2147483647 | (10 rows) +-- +-- test for ability to use a cartesian join when necessary +-- +explain (costs off) +select * from + tenk1 join int4_tbl on f1 = twothousand, + int4(sin(1)) q1, + int4(sin(0)) q2 +where q1 = thousand or q2 = thousand; + QUERY PLAN +----------------------------------------------------------------------------- + Hash Join + Hash Cond: (tenk1.twothousand = int4_tbl.f1) + -> Nested Loop + Join Filter: ((q1.q1 = tenk1.thousand) OR (q2.q2 = tenk1.thousand)) + -> Nested Loop + -> Function Scan on q1 + -> Function Scan on q2 + -> Bitmap Heap Scan on tenk1 + Recheck Cond: ((q1.q1 = thousand) OR (q2.q2 = thousand)) + -> BitmapOr + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (q1.q1 = thousand) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (q2.q2 = thousand) + -> Hash + -> Seq Scan on int4_tbl +(16 rows) + +explain (costs off) +select * from + tenk1 join int4_tbl on f1 = twothousand, + int4(sin(1)) q1, + int4(sin(0)) q2 +where thousand = (q1 + q2); + QUERY PLAN +-------------------------------------------------------------- + Hash Join + Hash Cond: (tenk1.twothousand = int4_tbl.f1) + -> Nested Loop + -> Nested Loop + -> Function Scan on q1 + -> Function Scan on q2 + -> Bitmap Heap Scan on tenk1 + Recheck Cond: (thousand = (q1.q1 + q2.q2)) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = (q1.q1 + q2.q2)) + -> Hash + -> Seq Scan on int4_tbl +(12 rows) + -- -- test join removal -- diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql index 2d53cf1725..bc9b7812cf 100644 --- a/src/test/regress/sql/join.sql +++ b/src/test/regress/sql/join.sql @@ -689,6 +689,24 @@ order by 1,2; select * from int4_tbl a full join int4_tbl b on true; select * from int4_tbl a full join int4_tbl b on false; +-- +-- test for ability to use a cartesian join when necessary +-- + +explain (costs off) +select * from + tenk1 join int4_tbl on f1 = twothousand, + int4(sin(1)) q1, + int4(sin(0)) q2 +where q1 = thousand or q2 = thousand; + +explain (costs off) +select * from + tenk1 join int4_tbl on f1 = twothousand, + int4(sin(1)) q1, + int4(sin(0)) q2 +where thousand = (q1 + q2); + -- -- test join removal --