From 3c569049b7b502bb4952483d19ce622ff0af5fd6 Mon Sep 17 00:00:00 2001 From: David Rowley Date: Mon, 9 Jan 2023 17:15:08 +1300 Subject: [PATCH] Allow left join removals and unique joins on partitioned tables This allows left join removals and unique joins to work with partitioned tables. The planner just lacked sufficient proofs that a given join would not cause any row duplication. Unique indexes currently serve as that proof, so have get_relation_info() populate the indexlist for partitioned tables too. Author: Arne Roland Reviewed-by: Alvaro Herrera, Zhihong Yu, Amit Langote, David Rowley Discussion: https://postgr.es/m/c3b2408b7a39433b8230bbcd02e9f302@index.de --- src/backend/optimizer/util/plancat.c | 264 +++++++++++-------- src/backend/utils/adt/selfuncs.c | 4 + src/include/nodes/pathnodes.h | 10 +- src/test/regress/expected/join.out | 10 + src/test/regress/expected/partition_join.out | 4 +- src/test/regress/sql/join.sql | 7 + src/test/regress/sql/partition_join.sql | 4 +- 7 files changed, 179 insertions(+), 124 deletions(-) diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 9f158f2421b..d58c4a10782 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -109,7 +109,9 @@ static void set_baserel_partition_constraint(Relation relation, * If inhparent is true, all we need to do is set up the attr arrays: * the RelOptInfo actually represents the appendrel formed by an inheritance * tree, and so the parent rel's physical size and index information isn't - * important for it. + * important for it, however, for partitioned tables, we do populate the + * indexlist as the planner uses unique indexes as unique proofs for certain + * optimizations. */ void get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, @@ -175,10 +177,14 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, /* * Make list of indexes. Ignore indexes on system catalogs if told to. - * Don't bother with indexes for an inheritance parent, either. + * Don't bother with indexes from traditional inheritance parents. For + * partitioned tables, we need a list of at least unique indexes as these + * serve as unique proofs for certain planner optimizations. However, + * let's not discriminate here and just record all partitioned indexes + * whether they're unique indexes or not. */ - if (inhparent || - (IgnoreSystemIndexes && IsSystemRelation(relation))) + if ((inhparent && relation->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) + || (IgnoreSystemIndexes && IsSystemRelation(relation))) hasindex = false; else hasindex = relation->rd_rel->relhasindex; @@ -231,16 +237,6 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, continue; } - /* - * Ignore partitioned indexes, since they are not usable for - * queries. - */ - if (indexRelation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) - { - index_close(indexRelation, NoLock); - continue; - } - /* * If the index is valid, but cannot yet be used, ignore it; but * mark the plan we are generating as transient. See @@ -285,105 +281,129 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, info->relam = indexRelation->rd_rel->relam; - /* We copy just the fields we need, not all of rd_indam */ - amroutine = indexRelation->rd_indam; - info->amcanorderbyop = amroutine->amcanorderbyop; - info->amoptionalkey = amroutine->amoptionalkey; - info->amsearcharray = amroutine->amsearcharray; - info->amsearchnulls = amroutine->amsearchnulls; - info->amcanparallel = amroutine->amcanparallel; - info->amhasgettuple = (amroutine->amgettuple != NULL); - info->amhasgetbitmap = amroutine->amgetbitmap != NULL && - relation->rd_tableam->scan_bitmap_next_block != NULL; - info->amcanmarkpos = (amroutine->ammarkpos != NULL && - amroutine->amrestrpos != NULL); - info->amcostestimate = amroutine->amcostestimate; - Assert(info->amcostestimate != NULL); - - /* Fetch index opclass options */ - info->opclassoptions = RelationGetIndexAttOptions(indexRelation, true); - /* - * Fetch the ordering information for the index, if any. + * We don't have an AM for partitioned indexes, so we'll just + * NULLify the AM related fields for those. */ - if (info->relam == BTREE_AM_OID) + if (indexRelation->rd_rel->relkind != RELKIND_PARTITIONED_INDEX) { + /* We copy just the fields we need, not all of rd_indam */ + amroutine = indexRelation->rd_indam; + info->amcanorderbyop = amroutine->amcanorderbyop; + info->amoptionalkey = amroutine->amoptionalkey; + info->amsearcharray = amroutine->amsearcharray; + info->amsearchnulls = amroutine->amsearchnulls; + info->amcanparallel = amroutine->amcanparallel; + info->amhasgettuple = (amroutine->amgettuple != NULL); + info->amhasgetbitmap = amroutine->amgetbitmap != NULL && + relation->rd_tableam->scan_bitmap_next_block != NULL; + info->amcanmarkpos = (amroutine->ammarkpos != NULL && + amroutine->amrestrpos != NULL); + info->amcostestimate = amroutine->amcostestimate; + Assert(info->amcostestimate != NULL); + + /* Fetch index opclass options */ + info->opclassoptions = RelationGetIndexAttOptions(indexRelation, true); + /* - * If it's a btree index, we can use its opfamily OIDs - * directly as the sort ordering opfamily OIDs. + * Fetch the ordering information for the index, if any. */ - Assert(amroutine->amcanorder); - - info->sortopfamily = info->opfamily; - info->reverse_sort = (bool *) palloc(sizeof(bool) * nkeycolumns); - info->nulls_first = (bool *) palloc(sizeof(bool) * nkeycolumns); - - for (i = 0; i < nkeycolumns; i++) + if (info->relam == BTREE_AM_OID) { - int16 opt = indexRelation->rd_indoption[i]; + /* + * If it's a btree index, we can use its opfamily OIDs + * directly as the sort ordering opfamily OIDs. + */ + Assert(amroutine->amcanorder); - info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0; - info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0; + info->sortopfamily = info->opfamily; + info->reverse_sort = (bool *) palloc(sizeof(bool) * nkeycolumns); + info->nulls_first = (bool *) palloc(sizeof(bool) * nkeycolumns); + + for (i = 0; i < nkeycolumns; i++) + { + int16 opt = indexRelation->rd_indoption[i]; + + info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0; + info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0; + } } - } - else if (amroutine->amcanorder) - { - /* - * Otherwise, identify the corresponding btree opfamilies by - * trying to map this index's "<" operators into btree. Since - * "<" uniquely defines the behavior of a sort order, this is - * a sufficient test. - * - * XXX This method is rather slow and also requires the - * undesirable assumption that the other index AM numbers its - * strategies the same as btree. It'd be better to have a way - * to explicitly declare the corresponding btree opfamily for - * each opfamily of the other index type. But given the lack - * of current or foreseeable amcanorder index types, it's not - * worth expending more effort on now. - */ - info->sortopfamily = (Oid *) palloc(sizeof(Oid) * nkeycolumns); - info->reverse_sort = (bool *) palloc(sizeof(bool) * nkeycolumns); - info->nulls_first = (bool *) palloc(sizeof(bool) * nkeycolumns); - - for (i = 0; i < nkeycolumns; i++) + else if (amroutine->amcanorder) { - int16 opt = indexRelation->rd_indoption[i]; - Oid ltopr; - Oid btopfamily; - Oid btopcintype; - int16 btstrategy; + /* + * Otherwise, identify the corresponding btree opfamilies + * by trying to map this index's "<" operators into btree. + * Since "<" uniquely defines the behavior of a sort + * order, this is a sufficient test. + * + * XXX This method is rather slow and also requires the + * undesirable assumption that the other index AM numbers + * its strategies the same as btree. It'd be better to + * have a way to explicitly declare the corresponding + * btree opfamily for each opfamily of the other index + * type. But given the lack of current or foreseeable + * amcanorder index types, it's not worth expending more + * effort on now. + */ + info->sortopfamily = (Oid *) palloc(sizeof(Oid) * nkeycolumns); + info->reverse_sort = (bool *) palloc(sizeof(bool) * nkeycolumns); + info->nulls_first = (bool *) palloc(sizeof(bool) * nkeycolumns); - info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0; - info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0; + for (i = 0; i < nkeycolumns; i++) + { + int16 opt = indexRelation->rd_indoption[i]; + Oid ltopr; + Oid btopfamily; + Oid btopcintype; + int16 btstrategy; - ltopr = get_opfamily_member(info->opfamily[i], - info->opcintype[i], - info->opcintype[i], - BTLessStrategyNumber); - if (OidIsValid(ltopr) && - get_ordering_op_properties(ltopr, - &btopfamily, - &btopcintype, - &btstrategy) && - btopcintype == info->opcintype[i] && - btstrategy == BTLessStrategyNumber) - { - /* Successful mapping */ - info->sortopfamily[i] = btopfamily; - } - else - { - /* Fail ... quietly treat index as unordered */ - info->sortopfamily = NULL; - info->reverse_sort = NULL; - info->nulls_first = NULL; - break; + info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0; + info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0; + + ltopr = get_opfamily_member(info->opfamily[i], + info->opcintype[i], + info->opcintype[i], + BTLessStrategyNumber); + if (OidIsValid(ltopr) && + get_ordering_op_properties(ltopr, + &btopfamily, + &btopcintype, + &btstrategy) && + btopcintype == info->opcintype[i] && + btstrategy == BTLessStrategyNumber) + { + /* Successful mapping */ + info->sortopfamily[i] = btopfamily; + } + else + { + /* Fail ... quietly treat index as unordered */ + info->sortopfamily = NULL; + info->reverse_sort = NULL; + info->nulls_first = NULL; + break; + } } } + else + { + info->sortopfamily = NULL; + info->reverse_sort = NULL; + info->nulls_first = NULL; + } } else { + info->amcanorderbyop = false; + info->amoptionalkey = false; + info->amsearcharray = false; + info->amsearchnulls = false; + info->amcanparallel = false; + info->amhasgettuple = false; + info->amhasgetbitmap = false; + info->amcanmarkpos = false; + info->amcostestimate = NULL; + info->sortopfamily = NULL; info->reverse_sort = NULL; info->nulls_first = NULL; @@ -416,31 +436,45 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, * the number-of-tuples estimate to equal the parent table; if it * is partial then we have to use the same methods as we would for * a table, except we can be sure that the index is not larger - * than the table. + * than the table. We must ignore partitioned indexes here as as + * there are not physical indexes. */ - if (info->indpred == NIL) + if (indexRelation->rd_rel->relkind != RELKIND_PARTITIONED_INDEX) { - info->pages = RelationGetNumberOfBlocks(indexRelation); - info->tuples = rel->tuples; - } - else - { - double allvisfrac; /* dummy */ - - estimate_rel_size(indexRelation, NULL, - &info->pages, &info->tuples, &allvisfrac); - if (info->tuples > rel->tuples) + if (info->indpred == NIL) + { + info->pages = RelationGetNumberOfBlocks(indexRelation); info->tuples = rel->tuples; - } + } + else + { + double allvisfrac; /* dummy */ - if (info->relam == BTREE_AM_OID) - { - /* For btrees, get tree height while we have the index open */ - info->tree_height = _bt_getrootheight(indexRelation); + estimate_rel_size(indexRelation, NULL, + &info->pages, &info->tuples, &allvisfrac); + if (info->tuples > rel->tuples) + info->tuples = rel->tuples; + } + + if (info->relam == BTREE_AM_OID) + { + /* + * For btrees, get tree height while we have the index + * open + */ + info->tree_height = _bt_getrootheight(indexRelation); + } + else + { + /* For other index types, just set it to "unknown" for now */ + info->tree_height = -1; + } } else { - /* For other index types, just set it to "unknown" for now */ + /* Zero these out for partitioned indexes */ + info->pages = 0; + info->tuples = 0.0; info->tree_height = -1; } diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index f50e58adbd6..57de51f0db2 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -5994,6 +5994,10 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata, rte = root->simple_rte_array[rel->relid]; Assert(rte->rtekind == RTE_RELATION); + /* ignore partitioned tables. Any indexes here are not real indexes */ + if (rte->relkind == RELKIND_PARTITIONED_TABLE) + return false; + /* Search through the indexes to see if any match our problem */ foreach(lc, rel->indexlist) { diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index 1827e506479..c20b7298a3d 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -653,7 +653,7 @@ typedef struct PartitionSchemeData *PartitionScheme; * lateral_referencers - relids of rels that reference this one laterally * (includes both direct and indirect lateral references) * indexlist - list of IndexOptInfo nodes for relation's indexes - * (always NIL if it's not a table) + * (always NIL if it's not a table or partitioned table) * pages - number of disk pages in relation (zero if not a table) * tuples - number of tuples in relation (not considering restrictions) * allvisfrac - fraction of disk pages that are marked all-visible @@ -1097,11 +1097,11 @@ struct IndexOptInfo Oid *opfamily pg_node_attr(array_size(nkeycolumns)); /* OIDs of opclass declared input data types */ Oid *opcintype pg_node_attr(array_size(nkeycolumns)); - /* OIDs of btree opfamilies, if orderable */ + /* OIDs of btree opfamilies, if orderable. NULL if partitioned index */ Oid *sortopfamily pg_node_attr(array_size(nkeycolumns)); - /* is sort order descending? */ + /* is sort order descending? or NULL if partitioned index */ bool *reverse_sort pg_node_attr(array_size(nkeycolumns)); - /* do NULLs come first in the sort order? */ + /* do NULLs come first in the sort order? or NULL if partitioned index */ bool *nulls_first pg_node_attr(array_size(nkeycolumns)); /* opclass-specific options for columns */ bytea **opclassoptions pg_node_attr(read_write_ignore); @@ -1139,7 +1139,7 @@ struct IndexOptInfo /* * Remaining fields are copied from the index AM's API struct - * (IndexAmRoutine). + * (IndexAmRoutine). These fields are not set for partitioned indexes. */ bool amcanorderbyop; bool amoptionalkey; diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out index 3ddea3b6837..c2b85d27950 100644 --- a/src/test/regress/expected/join.out +++ b/src/test/regress/expected/join.out @@ -4860,6 +4860,16 @@ select 1 from (select a.id FROM a left join b on a.b_id = b.id) q, Filter: (a.id = i) (4 rows) +CREATE TEMP TABLE parted_b (id int PRIMARY KEY) partition by range(id); +CREATE TEMP TABLE parted_b1 partition of parted_b for values from (0) to (10); +-- test join removals on a partitioned table +explain (costs off) +select a.* from a left join parted_b pb on a.b_id = pb.id; + QUERY PLAN +--------------- + Seq Scan on a +(1 row) + rollback; create temp table parent (k int primary key, pd int); create temp table child (k int unique, cd int); diff --git a/src/test/regress/expected/partition_join.out b/src/test/regress/expected/partition_join.out index c59caf1cb3d..c649c4aeaae 100644 --- a/src/test/regress/expected/partition_join.out +++ b/src/test/regress/expected/partition_join.out @@ -4874,7 +4874,7 @@ ANALYZE fract_t; SET max_parallel_workers_per_gather = 0; SET enable_partitionwise_join = on; EXPLAIN (COSTS OFF) -SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10; +SELECT x.id, y.id FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY x.id ASC LIMIT 10; QUERY PLAN ----------------------------------------------------------------------- Limit @@ -4891,7 +4891,7 @@ SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10; (11 rows) EXPLAIN (COSTS OFF) -SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id DESC LIMIT 10; +SELECT x.id, y.id FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY x.id DESC LIMIT 10; QUERY PLAN -------------------------------------------------------------------------------- Limit diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql index 9fc6ef43768..027927354c0 100644 --- a/src/test/regress/sql/join.sql +++ b/src/test/regress/sql/join.sql @@ -1709,6 +1709,13 @@ explain (costs off) select 1 from (select a.id FROM a left join b on a.b_id = b.id) q, lateral generate_series(1, q.id) gs(i) where q.id = gs.i; +CREATE TEMP TABLE parted_b (id int PRIMARY KEY) partition by range(id); +CREATE TEMP TABLE parted_b1 partition of parted_b for values from (0) to (10); + +-- test join removals on a partitioned table +explain (costs off) +select a.* from a left join parted_b pb on a.b_id = pb.id; + rollback; create temp table parent (k int primary key, pd int); diff --git a/src/test/regress/sql/partition_join.sql b/src/test/regress/sql/partition_join.sql index 67f506361f8..9e16f1ca550 100644 --- a/src/test/regress/sql/partition_join.sql +++ b/src/test/regress/sql/partition_join.sql @@ -1157,10 +1157,10 @@ SET max_parallel_workers_per_gather = 0; SET enable_partitionwise_join = on; EXPLAIN (COSTS OFF) -SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10; +SELECT x.id, y.id FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY x.id ASC LIMIT 10; EXPLAIN (COSTS OFF) -SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id DESC LIMIT 10; +SELECT x.id, y.id FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY x.id DESC LIMIT 10; -- cleanup DROP TABLE fract_t;