mirror of
https://git.postgresql.org/git/postgresql.git
synced 2024-12-09 08:10:09 +08:00
Fix tuple routing in cases where tuple descriptors don't match.
The previous coding failed to work correctly when we have a multi-level partitioned hierarchy where tables at successive levels have different attribute numbers for the partition key attributes. To fix, have each PartitionDispatch object store a standalone TupleTableSlot initialized with the TupleDesc of the corresponding partitioned table, along with a TupleConversionMap to map tuples from the its parent's rowtype to own rowtype. After tuple routing chooses a leaf partition, we must use the leaf partition's tuple descriptor, not the root table's. To that end, a dedicated TupleTableSlot for tuple routing is now allocated in EState. Amit Langote
This commit is contained in:
parent
12bd7dd317
commit
2ac3ef7a01
@ -923,13 +923,19 @@ RelationGetPartitionQual(Relation rel, bool recurse)
|
||||
return generate_partition_qual(rel, recurse);
|
||||
}
|
||||
|
||||
/* Turn an array of OIDs with N elements into a list */
|
||||
#define OID_ARRAY_TO_LIST(arr, N, list) \
|
||||
/*
|
||||
* Append OIDs of rel's partitions to the list 'partoids' and for each OID,
|
||||
* append pointer rel to the list 'parents'.
|
||||
*/
|
||||
#define APPEND_REL_PARTITION_OIDS(rel, partoids, parents) \
|
||||
do\
|
||||
{\
|
||||
int i;\
|
||||
for (i = 0; i < (N); i++)\
|
||||
(list) = lappend_oid((list), (arr)[i]);\
|
||||
for (i = 0; i < (rel)->rd_partdesc->nparts; i++)\
|
||||
{\
|
||||
(partoids) = lappend_oid((partoids), (rel)->rd_partdesc->oids[i]);\
|
||||
(parents) = lappend((parents), (rel));\
|
||||
}\
|
||||
} while(0)
|
||||
|
||||
/*
|
||||
@ -944,11 +950,13 @@ PartitionDispatch *
|
||||
RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
|
||||
int *num_parted, List **leaf_part_oids)
|
||||
{
|
||||
PartitionDesc rootpartdesc = RelationGetPartitionDesc(rel);
|
||||
PartitionDispatchData **pd;
|
||||
List *all_parts = NIL,
|
||||
*parted_rels;
|
||||
ListCell *lc;
|
||||
*all_parents = NIL,
|
||||
*parted_rels,
|
||||
*parted_rel_parents;
|
||||
ListCell *lc1,
|
||||
*lc2;
|
||||
int i,
|
||||
k,
|
||||
offset;
|
||||
@ -965,10 +973,13 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
|
||||
*/
|
||||
*num_parted = 1;
|
||||
parted_rels = list_make1(rel);
|
||||
OID_ARRAY_TO_LIST(rootpartdesc->oids, rootpartdesc->nparts, all_parts);
|
||||
foreach(lc, all_parts)
|
||||
/* Root partitioned table has no parent, so NULL for parent */
|
||||
parted_rel_parents = list_make1(NULL);
|
||||
APPEND_REL_PARTITION_OIDS(rel, all_parts, all_parents);
|
||||
forboth(lc1, all_parts, lc2, all_parents)
|
||||
{
|
||||
Relation partrel = heap_open(lfirst_oid(lc), lockmode);
|
||||
Relation partrel = heap_open(lfirst_oid(lc1), lockmode);
|
||||
Relation parent = lfirst(lc2);
|
||||
PartitionDesc partdesc = RelationGetPartitionDesc(partrel);
|
||||
|
||||
/*
|
||||
@ -979,7 +990,8 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
|
||||
{
|
||||
(*num_parted)++;
|
||||
parted_rels = lappend(parted_rels, partrel);
|
||||
OID_ARRAY_TO_LIST(partdesc->oids, partdesc->nparts, all_parts);
|
||||
parted_rel_parents = lappend(parted_rel_parents, parent);
|
||||
APPEND_REL_PARTITION_OIDS(partrel, all_parts, all_parents);
|
||||
}
|
||||
else
|
||||
heap_close(partrel, NoLock);
|
||||
@ -1004,10 +1016,12 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
|
||||
sizeof(PartitionDispatchData *));
|
||||
*leaf_part_oids = NIL;
|
||||
i = k = offset = 0;
|
||||
foreach(lc, parted_rels)
|
||||
forboth(lc1, parted_rels, lc2, parted_rel_parents)
|
||||
{
|
||||
Relation partrel = lfirst(lc);
|
||||
Relation partrel = lfirst(lc1);
|
||||
Relation parent = lfirst(lc2);
|
||||
PartitionKey partkey = RelationGetPartitionKey(partrel);
|
||||
TupleDesc tupdesc = RelationGetDescr(partrel);
|
||||
PartitionDesc partdesc = RelationGetPartitionDesc(partrel);
|
||||
int j,
|
||||
m;
|
||||
@ -1017,6 +1031,27 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
|
||||
pd[i]->key = partkey;
|
||||
pd[i]->keystate = NIL;
|
||||
pd[i]->partdesc = partdesc;
|
||||
if (parent != NULL)
|
||||
{
|
||||
/*
|
||||
* For every partitioned table other than root, we must store
|
||||
* a tuple table slot initialized with its tuple descriptor and
|
||||
* a tuple conversion map to convert a tuple from its parent's
|
||||
* rowtype to its own. That is to make sure that we are looking
|
||||
* at the correct row using the correct tuple descriptor when
|
||||
* computing its partition key for tuple routing.
|
||||
*/
|
||||
pd[i]->tupslot = MakeSingleTupleTableSlot(tupdesc);
|
||||
pd[i]->tupmap = convert_tuples_by_name(RelationGetDescr(parent),
|
||||
tupdesc,
|
||||
gettext_noop("could not convert row type"));
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Not required for the root partitioned table */
|
||||
pd[i]->tupslot = NULL;
|
||||
pd[i]->tupmap = NULL;
|
||||
}
|
||||
pd[i]->indexes = (int *) palloc(partdesc->nparts * sizeof(int));
|
||||
|
||||
/*
|
||||
@ -1610,6 +1645,8 @@ get_partition_for_tuple(PartitionDispatch *pd,
|
||||
{
|
||||
PartitionKey key = parent->key;
|
||||
PartitionDesc partdesc = parent->partdesc;
|
||||
TupleTableSlot *myslot = parent->tupslot;
|
||||
TupleConversionMap *map = parent->tupmap;
|
||||
|
||||
/* Quick exit */
|
||||
if (partdesc->nparts == 0)
|
||||
@ -1618,6 +1655,17 @@ get_partition_for_tuple(PartitionDispatch *pd,
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (myslot != NULL)
|
||||
{
|
||||
HeapTuple tuple = ExecFetchSlotTuple(slot);
|
||||
|
||||
ExecClearTuple(myslot);
|
||||
Assert(map != NULL);
|
||||
tuple = do_convert_tuple(tuple, map);
|
||||
ExecStoreTuple(tuple, myslot, InvalidBuffer, true);
|
||||
slot = myslot;
|
||||
}
|
||||
|
||||
/* Extract partition key from tuple */
|
||||
FormPartitionKeyDatum(parent, slot, estate, values, isnull);
|
||||
|
||||
|
@ -2435,6 +2435,15 @@ CopyFrom(CopyState cstate)
|
||||
/* Triggers might need a slot as well */
|
||||
estate->es_trig_tuple_slot = ExecInitExtraTupleSlot(estate);
|
||||
|
||||
/*
|
||||
* Initialize a dedicated slot to manipulate tuples of any given
|
||||
* partition's rowtype.
|
||||
*/
|
||||
if (cstate->partition_dispatch_info)
|
||||
estate->es_partition_tuple_slot = ExecInitExtraTupleSlot(estate);
|
||||
else
|
||||
estate->es_partition_tuple_slot = NULL;
|
||||
|
||||
/*
|
||||
* It's more efficient to prepare a bunch of tuples for insertion, and
|
||||
* insert them in one heap_multi_insert() call, than call heap_insert()
|
||||
@ -2484,7 +2493,8 @@ CopyFrom(CopyState cstate)
|
||||
|
||||
for (;;)
|
||||
{
|
||||
TupleTableSlot *slot;
|
||||
TupleTableSlot *slot,
|
||||
*oldslot = NULL;
|
||||
bool skip_tuple;
|
||||
Oid loaded_oid = InvalidOid;
|
||||
|
||||
@ -2571,7 +2581,19 @@ CopyFrom(CopyState cstate)
|
||||
map = cstate->partition_tupconv_maps[leaf_part_index];
|
||||
if (map)
|
||||
{
|
||||
Relation partrel = resultRelInfo->ri_RelationDesc;
|
||||
|
||||
tuple = do_convert_tuple(tuple, map);
|
||||
|
||||
/*
|
||||
* We must use the partition's tuple descriptor from this
|
||||
* point on. Use a dedicated slot from this point on until
|
||||
* we're finished dealing with the partition.
|
||||
*/
|
||||
oldslot = slot;
|
||||
slot = estate->es_partition_tuple_slot;
|
||||
Assert(slot != NULL);
|
||||
ExecSetSlotDescriptor(slot, RelationGetDescr(partrel));
|
||||
ExecStoreTuple(tuple, slot, InvalidBuffer, true);
|
||||
}
|
||||
|
||||
@ -2667,6 +2689,10 @@ CopyFrom(CopyState cstate)
|
||||
{
|
||||
resultRelInfo = saved_resultRelInfo;
|
||||
estate->es_result_relation_info = resultRelInfo;
|
||||
|
||||
/* Switch back to the slot corresponding to the root table */
|
||||
Assert(oldslot != NULL);
|
||||
slot = oldslot;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -2714,13 +2740,14 @@ CopyFrom(CopyState cstate)
|
||||
* Remember cstate->partition_dispatch_info[0] corresponds to the root
|
||||
* partitioned table, which we must not try to close, because it is
|
||||
* the main target table of COPY that will be closed eventually by
|
||||
* DoCopy().
|
||||
* DoCopy(). Also, tupslot is NULL for the root partitioned table.
|
||||
*/
|
||||
for (i = 1; i < cstate->num_dispatch; i++)
|
||||
{
|
||||
PartitionDispatch pd = cstate->partition_dispatch_info[i];
|
||||
|
||||
heap_close(pd->reldesc, NoLock);
|
||||
ExecDropSingleTupleTableSlot(pd->tupslot);
|
||||
}
|
||||
for (i = 0; i < cstate->num_partitions; i++)
|
||||
{
|
||||
|
@ -262,6 +262,7 @@ ExecInsert(ModifyTableState *mtstate,
|
||||
Relation resultRelationDesc;
|
||||
Oid newId;
|
||||
List *recheckIndexes = NIL;
|
||||
TupleTableSlot *oldslot = NULL;
|
||||
|
||||
/*
|
||||
* get the heap tuple out of the tuple table slot, making sure we have a
|
||||
@ -318,7 +319,19 @@ ExecInsert(ModifyTableState *mtstate,
|
||||
map = mtstate->mt_partition_tupconv_maps[leaf_part_index];
|
||||
if (map)
|
||||
{
|
||||
Relation partrel = resultRelInfo->ri_RelationDesc;
|
||||
|
||||
tuple = do_convert_tuple(tuple, map);
|
||||
|
||||
/*
|
||||
* We must use the partition's tuple descriptor from this
|
||||
* point on, until we're finished dealing with the partition.
|
||||
* Use the dedicated slot for that.
|
||||
*/
|
||||
oldslot = slot;
|
||||
slot = estate->es_partition_tuple_slot;
|
||||
Assert(slot != NULL);
|
||||
ExecSetSlotDescriptor(slot, RelationGetDescr(partrel));
|
||||
ExecStoreTuple(tuple, slot, InvalidBuffer, true);
|
||||
}
|
||||
}
|
||||
@ -566,6 +579,10 @@ ExecInsert(ModifyTableState *mtstate,
|
||||
{
|
||||
resultRelInfo = saved_resultRelInfo;
|
||||
estate->es_result_relation_info = resultRelInfo;
|
||||
|
||||
/* Switch back to the slot corresponding to the root table */
|
||||
Assert(oldslot != NULL);
|
||||
slot = oldslot;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1734,7 +1751,15 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
|
||||
mtstate->mt_partitions = partitions;
|
||||
mtstate->mt_num_partitions = num_partitions;
|
||||
mtstate->mt_partition_tupconv_maps = partition_tupconv_maps;
|
||||
|
||||
/*
|
||||
* Initialize a dedicated slot to manipulate tuples of any given
|
||||
* partition's rowtype.
|
||||
*/
|
||||
estate->es_partition_tuple_slot = ExecInitExtraTupleSlot(estate);
|
||||
}
|
||||
else
|
||||
estate->es_partition_tuple_slot = NULL;
|
||||
|
||||
/*
|
||||
* Initialize any WITH CHECK OPTION constraints if needed.
|
||||
@ -2058,12 +2083,14 @@ ExecEndModifyTable(ModifyTableState *node)
|
||||
* Remember node->mt_partition_dispatch_info[0] corresponds to the root
|
||||
* partitioned table, which we must not try to close, because it is the
|
||||
* main target table of the query that will be closed by ExecEndPlan().
|
||||
* Also, tupslot is NULL for the root partitioned table.
|
||||
*/
|
||||
for (i = 1; i < node->mt_num_dispatch; i++)
|
||||
{
|
||||
PartitionDispatch pd = node->mt_partition_dispatch_info[i];
|
||||
|
||||
heap_close(pd->reldesc, NoLock);
|
||||
ExecDropSingleTupleTableSlot(pd->tupslot);
|
||||
}
|
||||
for (i = 0; i < node->mt_num_partitions; i++)
|
||||
{
|
||||
|
@ -47,6 +47,11 @@ typedef struct PartitionDescData *PartitionDesc;
|
||||
* key Partition key information of the table
|
||||
* keystate Execution state required for expressions in the partition key
|
||||
* partdesc Partition descriptor of the table
|
||||
* tupslot A standalone TupleTableSlot initialized with this table's tuple
|
||||
* descriptor
|
||||
* tupmap TupleConversionMap to convert from the parent's rowtype to
|
||||
* this table's rowtype (when extracting the partition key of a
|
||||
* tuple just before routing it through this table)
|
||||
* indexes Array with partdesc->nparts members (for details on what
|
||||
* individual members represent, see how they are set in
|
||||
* RelationGetPartitionDispatchInfo())
|
||||
@ -58,6 +63,8 @@ typedef struct PartitionDispatchData
|
||||
PartitionKey key;
|
||||
List *keystate; /* list of ExprState */
|
||||
PartitionDesc partdesc;
|
||||
TupleTableSlot *tupslot;
|
||||
TupleConversionMap *tupmap;
|
||||
int *indexes;
|
||||
} PartitionDispatchData;
|
||||
|
||||
|
@ -384,6 +384,9 @@ typedef struct EState
|
||||
TupleTableSlot *es_trig_oldtup_slot; /* for TriggerEnabled */
|
||||
TupleTableSlot *es_trig_newtup_slot; /* for TriggerEnabled */
|
||||
|
||||
/* Slot used to manipulate a tuple after it is routed to a partition */
|
||||
TupleTableSlot *es_partition_tuple_slot;
|
||||
|
||||
/* Parameter info: */
|
||||
ParamListInfo es_param_list_info; /* values of external params */
|
||||
ParamExecData *es_param_exec_vals; /* values of internal params */
|
||||
|
@ -300,3 +300,40 @@ drop cascades to table part_null
|
||||
drop cascades to table part_ee_ff
|
||||
drop cascades to table part_ee_ff1
|
||||
drop cascades to table part_ee_ff2
|
||||
-- more tests for certain multi-level partitioning scenarios
|
||||
create table p (a int, b int) partition by range (a, b);
|
||||
create table p1 (b int, a int not null) partition by range (b);
|
||||
create table p11 (like p1);
|
||||
alter table p11 drop a;
|
||||
alter table p11 add a int;
|
||||
alter table p11 drop a;
|
||||
alter table p11 add a int not null;
|
||||
-- attnum for key attribute 'a' is different in p, p1, and p11
|
||||
select attrelid::regclass, attname, attnum
|
||||
from pg_attribute
|
||||
where attname = 'a'
|
||||
and (attrelid = 'p'::regclass
|
||||
or attrelid = 'p1'::regclass
|
||||
or attrelid = 'p11'::regclass);
|
||||
attrelid | attname | attnum
|
||||
----------+---------+--------
|
||||
p | a | 1
|
||||
p1 | a | 2
|
||||
p11 | a | 4
|
||||
(3 rows)
|
||||
|
||||
alter table p1 attach partition p11 for values from (2) to (5);
|
||||
alter table p attach partition p1 for values from (1, 2) to (1, 10);
|
||||
-- check that "(1, 2)" is correctly routed to p11.
|
||||
insert into p values (1, 2);
|
||||
select tableoid::regclass, * from p;
|
||||
tableoid | a | b
|
||||
----------+---+---
|
||||
p11 | 1 | 2
|
||||
(1 row)
|
||||
|
||||
-- cleanup
|
||||
drop table p cascade;
|
||||
NOTICE: drop cascades to 2 other objects
|
||||
DETAIL: drop cascades to table p1
|
||||
drop cascades to table p11
|
||||
|
@ -170,3 +170,29 @@ select tableoid::regclass, * from list_parted;
|
||||
-- cleanup
|
||||
drop table range_parted cascade;
|
||||
drop table list_parted cascade;
|
||||
|
||||
-- more tests for certain multi-level partitioning scenarios
|
||||
create table p (a int, b int) partition by range (a, b);
|
||||
create table p1 (b int, a int not null) partition by range (b);
|
||||
create table p11 (like p1);
|
||||
alter table p11 drop a;
|
||||
alter table p11 add a int;
|
||||
alter table p11 drop a;
|
||||
alter table p11 add a int not null;
|
||||
-- attnum for key attribute 'a' is different in p, p1, and p11
|
||||
select attrelid::regclass, attname, attnum
|
||||
from pg_attribute
|
||||
where attname = 'a'
|
||||
and (attrelid = 'p'::regclass
|
||||
or attrelid = 'p1'::regclass
|
||||
or attrelid = 'p11'::regclass);
|
||||
|
||||
alter table p1 attach partition p11 for values from (2) to (5);
|
||||
alter table p attach partition p1 for values from (1, 2) to (1, 10);
|
||||
|
||||
-- check that "(1, 2)" is correctly routed to p11.
|
||||
insert into p values (1, 2);
|
||||
select tableoid::regclass, * from p;
|
||||
|
||||
-- cleanup
|
||||
drop table p cascade;
|
||||
|
Loading…
Reference in New Issue
Block a user