mirror of
https://git.postgresql.org/git/postgresql.git
synced 2025-01-24 18:55:04 +08:00
64d0b8b05f
version of crosstab. This fixes a major deficiency in real-world use of the original version. Easiest to understand with an illustration: Data: ------------------------------------------------------------------- select * from cth; id | rowid | rowdt | attribute | val ----+-------+---------------------+----------------+--------------- 1 | test1 | 2003-03-01 00:00:00 | temperature | 42 2 | test1 | 2003-03-01 00:00:00 | test_result | PASS 3 | test1 | 2003-03-01 00:00:00 | volts | 2.6987 4 | test2 | 2003-03-02 00:00:00 | temperature | 53 5 | test2 | 2003-03-02 00:00:00 | test_result | FAIL 6 | test2 | 2003-03-02 00:00:00 | test_startdate | 01 March 2003 7 | test2 | 2003-03-02 00:00:00 | volts | 3.1234 (7 rows) Original crosstab: ------------------------------------------------------------------- SELECT * FROM crosstab( 'SELECT rowid, attribute, val FROM cth ORDER BY 1,2',4) AS c(rowid text, temperature text, test_result text, test_startdate text, volts text); rowid | temperature | test_result | test_startdate | volts -------+-------------+-------------+----------------+-------- test1 | 42 | PASS | 2.6987 | test2 | 53 | FAIL | 01 March 2003 | 3.1234 (2 rows) Hashed crosstab: ------------------------------------------------------------------- SELECT * FROM crosstab( 'SELECT rowid, attribute, val FROM cth ORDER BY 1', 'SELECT DISTINCT attribute FROM cth ORDER BY 1') AS c(rowid text, temperature int4, test_result text, test_startdate timestamp, volts float8); rowid | temperature | test_result | test_startdate | volts -------+-------------+-------------+---------------------+-------- test1 | 42 | PASS | | 2.6987 test2 | 53 | FAIL | 2003-03-01 00:00:00 | 3.1234 (2 rows) Notice that the original crosstab slides data over to the left in the result tuple when it encounters missing data. 
In order to work around this you have to make your source SQL do all sorts of contortions (cartesian join of distinct rowid with distinct attribute; left join that back to the real source data). The new version avoids this by building a hash table using a second distinct attribute query. The new version also allows for "extra" columns (see the README) and allows the result columns to be coerced into differing datatypes if they are suitable (as shown above). In testing a "real-world" data set (69 distinct rowids, 27 distinct categories/attributes, multiple missing data points) I saw about a 5-fold improvement in execution time (from about 2200 ms old, to 440 ms new). I left the original version intact because: 1) backward compatibility, 2) it is probably slightly faster if you know that you have no missing attributes. README and regression test adjustments included. If there are no objections, please apply. Joe Conway
291 lines
13 KiB
Plaintext
291 lines
13 KiB
Plaintext
--
|
|
-- first, define the functions. Turn off echoing so that expected file
|
|
-- does not depend on contents of tablefunc.sql.
|
|
--
|
|
\set ECHO none
|
|
--
|
|
-- normal_rand()
|
|
-- no easy way to do this for regression testing
|
|
--
|
|
SELECT avg(normal_rand)::int FROM normal_rand(100, 250, 0.2, EXTRACT(SECONDS FROM CURRENT_TIME(0))::int);
|
|
avg
|
|
-----
|
|
250
|
|
(1 row)
|
|
|
|
--
|
|
-- crosstab()
|
|
--
|
|
CREATE TABLE ct(id int, rowclass text, rowid text, attribute text, val text);
|
|
\copy ct from 'data/ct.data'
|
|
SELECT * FROM crosstab2('SELECT rowid, attribute, val FROM ct where rowclass = ''group1'' and (attribute = ''att2'' or attribute = ''att3'') ORDER BY 1,2;');
|
|
row_name | category_1 | category_2
|
|
----------+------------+------------
|
|
test1 | val2 | val3
|
|
test2 | val6 | val7
|
|
(2 rows)
|
|
|
|
SELECT * FROM crosstab3('SELECT rowid, attribute, val FROM ct where rowclass = ''group1'' and (attribute = ''att2'' or attribute = ''att3'') ORDER BY 1,2;');
|
|
row_name | category_1 | category_2 | category_3
|
|
----------+------------+------------+------------
|
|
test1 | val2 | val3 |
|
|
test2 | val6 | val7 |
|
|
(2 rows)
|
|
|
|
SELECT * FROM crosstab4('SELECT rowid, attribute, val FROM ct where rowclass = ''group1'' and (attribute = ''att2'' or attribute = ''att3'') ORDER BY 1,2;');
|
|
row_name | category_1 | category_2 | category_3 | category_4
|
|
----------+------------+------------+------------+------------
|
|
test1 | val2 | val3 | |
|
|
test2 | val6 | val7 | |
|
|
(2 rows)
|
|
|
|
SELECT * FROM crosstab2('SELECT rowid, attribute, val FROM ct where rowclass = ''group1'' ORDER BY 1,2;');
|
|
row_name | category_1 | category_2
|
|
----------+------------+------------
|
|
test1 | val1 | val2
|
|
test2 | val5 | val6
|
|
(2 rows)
|
|
|
|
SELECT * FROM crosstab3('SELECT rowid, attribute, val FROM ct where rowclass = ''group1'' ORDER BY 1,2;');
|
|
row_name | category_1 | category_2 | category_3
|
|
----------+------------+------------+------------
|
|
test1 | val1 | val2 | val3
|
|
test2 | val5 | val6 | val7
|
|
(2 rows)
|
|
|
|
SELECT * FROM crosstab4('SELECT rowid, attribute, val FROM ct where rowclass = ''group1'' ORDER BY 1,2;');
|
|
row_name | category_1 | category_2 | category_3 | category_4
|
|
----------+------------+------------+------------+------------
|
|
test1 | val1 | val2 | val3 | val4
|
|
test2 | val5 | val6 | val7 | val8
|
|
(2 rows)
|
|
|
|
SELECT * FROM crosstab2('SELECT rowid, attribute, val FROM ct where rowclass = ''group2'' and (attribute = ''att1'' or attribute = ''att2'') ORDER BY 1,2;');
|
|
row_name | category_1 | category_2
|
|
----------+------------+------------
|
|
test3 | val1 | val2
|
|
test4 | val4 | val5
|
|
(2 rows)
|
|
|
|
SELECT * FROM crosstab3('SELECT rowid, attribute, val FROM ct where rowclass = ''group2'' and (attribute = ''att1'' or attribute = ''att2'') ORDER BY 1,2;');
|
|
row_name | category_1 | category_2 | category_3
|
|
----------+------------+------------+------------
|
|
test3 | val1 | val2 |
|
|
test4 | val4 | val5 |
|
|
(2 rows)
|
|
|
|
SELECT * FROM crosstab4('SELECT rowid, attribute, val FROM ct where rowclass = ''group2'' and (attribute = ''att1'' or attribute = ''att2'') ORDER BY 1,2;');
|
|
row_name | category_1 | category_2 | category_3 | category_4
|
|
----------+------------+------------+------------+------------
|
|
test3 | val1 | val2 | |
|
|
test4 | val4 | val5 | |
|
|
(2 rows)
|
|
|
|
SELECT * FROM crosstab2('SELECT rowid, attribute, val FROM ct where rowclass = ''group2'' ORDER BY 1,2;');
|
|
row_name | category_1 | category_2
|
|
----------+------------+------------
|
|
test3 | val1 | val2
|
|
test4 | val4 | val5
|
|
(2 rows)
|
|
|
|
SELECT * FROM crosstab3('SELECT rowid, attribute, val FROM ct where rowclass = ''group2'' ORDER BY 1,2;');
|
|
row_name | category_1 | category_2 | category_3
|
|
----------+------------+------------+------------
|
|
test3 | val1 | val2 | val3
|
|
test4 | val4 | val5 | val6
|
|
(2 rows)
|
|
|
|
SELECT * FROM crosstab4('SELECT rowid, attribute, val FROM ct where rowclass = ''group2'' ORDER BY 1,2;');
|
|
row_name | category_1 | category_2 | category_3 | category_4
|
|
----------+------------+------------+------------+------------
|
|
test3 | val1 | val2 | val3 |
|
|
test4 | val4 | val5 | val6 |
|
|
(2 rows)
|
|
|
|
SELECT * FROM crosstab('SELECT rowid, attribute, val FROM ct where rowclass = ''group1'' ORDER BY 1,2;', 2) AS c(rowid text, att1 text, att2 text);
|
|
rowid | att1 | att2
|
|
-------+------+------
|
|
test1 | val1 | val2
|
|
test2 | val5 | val6
|
|
(2 rows)
|
|
|
|
SELECT * FROM crosstab('SELECT rowid, attribute, val FROM ct where rowclass = ''group1'' ORDER BY 1,2;', 3) AS c(rowid text, att1 text, att2 text, att3 text);
|
|
rowid | att1 | att2 | att3
|
|
-------+------+------+------
|
|
test1 | val1 | val2 | val3
|
|
test2 | val5 | val6 | val7
|
|
(2 rows)
|
|
|
|
SELECT * FROM crosstab('SELECT rowid, attribute, val FROM ct where rowclass = ''group1'' ORDER BY 1,2;', 4) AS c(rowid text, att1 text, att2 text, att3 text, att4 text);
|
|
rowid | att1 | att2 | att3 | att4
|
|
-------+------+------+------+------
|
|
test1 | val1 | val2 | val3 | val4
|
|
test2 | val5 | val6 | val7 | val8
|
|
(2 rows)
|
|
|
|
--
|
|
-- hash based crosstab
|
|
--
|
|
create table cth(id serial, rowid text, rowdt timestamp, attribute text, val text);
|
|
NOTICE: CREATE TABLE will create implicit sequence 'cth_id_seq' for SERIAL column 'cth.id'
|
|
insert into cth values(DEFAULT,'test1','01 March 2003','temperature','42');
|
|
insert into cth values(DEFAULT,'test1','01 March 2003','test_result','PASS');
|
|
-- the next line is intentionally left commented and is therefore a "missing" attribute
|
|
-- insert into cth values(DEFAULT,'test1','01 March 2003','test_startdate','28 February 2003');
|
|
insert into cth values(DEFAULT,'test1','01 March 2003','volts','2.6987');
|
|
insert into cth values(DEFAULT,'test2','02 March 2003','temperature','53');
|
|
insert into cth values(DEFAULT,'test2','02 March 2003','test_result','FAIL');
|
|
insert into cth values(DEFAULT,'test2','02 March 2003','test_startdate','01 March 2003');
|
|
insert into cth values(DEFAULT,'test2','02 March 2003','volts','3.1234');
|
|
-- return attributes as plain text
|
|
SELECT * FROM crosstab(
|
|
'SELECT rowid, rowdt, attribute, val FROM cth ORDER BY 1',
|
|
'SELECT DISTINCT attribute FROM cth ORDER BY 1')
|
|
AS c(rowid text, rowdt timestamp, temperature text, test_result text, test_startdate text, volts text);
|
|
rowid | rowdt | temperature | test_result | test_startdate | volts
|
|
-------+--------------------------+-------------+-------------+----------------+--------
|
|
test1 | Sat Mar 01 00:00:00 2003 | 42 | PASS | | 2.6987
|
|
test2 | Sun Mar 02 00:00:00 2003 | 53 | FAIL | 01 March 2003 | 3.1234
|
|
(2 rows)
|
|
|
|
-- this time without rowdt
|
|
SELECT * FROM crosstab(
|
|
'SELECT rowid, attribute, val FROM cth ORDER BY 1',
|
|
'SELECT DISTINCT attribute FROM cth ORDER BY 1')
|
|
AS c(rowid text, temperature text, test_result text, test_startdate text, volts text);
|
|
rowid | temperature | test_result | test_startdate | volts
|
|
-------+-------------+-------------+----------------+--------
|
|
test1 | 42 | PASS | | 2.6987
|
|
test2 | 53 | FAIL | 01 March 2003 | 3.1234
|
|
(2 rows)
|
|
|
|
-- convert attributes to specific datatypes
|
|
SELECT * FROM crosstab(
|
|
'SELECT rowid, rowdt, attribute, val FROM cth ORDER BY 1',
|
|
'SELECT DISTINCT attribute FROM cth ORDER BY 1')
|
|
AS c(rowid text, rowdt timestamp, temperature int4, test_result text, test_startdate timestamp, volts float8);
|
|
rowid | rowdt | temperature | test_result | test_startdate | volts
|
|
-------+--------------------------+-------------+-------------+--------------------------+--------
|
|
test1 | Sat Mar 01 00:00:00 2003 | 42 | PASS | | 2.6987
|
|
test2 | Sun Mar 02 00:00:00 2003 | 53 | FAIL | Sat Mar 01 00:00:00 2003 | 3.1234
|
|
(2 rows)
|
|
|
|
-- source query and category query out of sync
|
|
SELECT * FROM crosstab(
|
|
'SELECT rowid, rowdt, attribute, val FROM cth ORDER BY 1',
|
|
'SELECT DISTINCT attribute FROM cth WHERE attribute IN (''temperature'',''test_result'',''test_startdate'') ORDER BY 1')
|
|
AS c(rowid text, rowdt timestamp, temperature int4, test_result text, test_startdate timestamp);
|
|
rowid | rowdt | temperature | test_result | test_startdate
|
|
-------+--------------------------+-------------+-------------+--------------------------
|
|
test1 | Sat Mar 01 00:00:00 2003 | 42 | PASS |
|
|
test2 | Sun Mar 02 00:00:00 2003 | 53 | FAIL | Sat Mar 01 00:00:00 2003
|
|
(2 rows)
|
|
|
|
-- if category query generates no rows, get expected error
|
|
SELECT * FROM crosstab(
|
|
'SELECT rowid, rowdt, attribute, val FROM cth ORDER BY 1',
|
|
'SELECT DISTINCT attribute FROM cth WHERE attribute = ''a'' ORDER BY 1')
|
|
AS c(rowid text, rowdt timestamp, temperature int4, test_result text, test_startdate timestamp, volts float8);
|
|
ERROR: load_categories_hash: provided categories SQL must return 1 column of at least one row
|
|
-- if category query generates more than one column, get expected error
|
|
SELECT * FROM crosstab(
|
|
'SELECT rowid, rowdt, attribute, val FROM cth ORDER BY 1',
|
|
'SELECT DISTINCT rowdt, attribute FROM cth ORDER BY 2')
|
|
AS c(rowid text, rowdt timestamp, temperature int4, test_result text, test_startdate timestamp, volts float8);
|
|
ERROR: load_categories_hash: provided categories SQL must return 1 column of at least one row
|
|
--
|
|
-- connectby
|
|
--
|
|
-- test connectby with text based hierarchy
|
|
CREATE TABLE connectby_text(keyid text, parent_keyid text);
|
|
\copy connectby_text from 'data/connectby_text.data'
|
|
-- with branch
|
|
SELECT * FROM connectby('connectby_text', 'keyid', 'parent_keyid', 'row2', 0, '~') AS t(keyid text, parent_keyid text, level int, branch text);
|
|
keyid | parent_keyid | level | branch
|
|
-------+--------------+-------+---------------------
|
|
row2 | | 0 | row2
|
|
row4 | row2 | 1 | row2~row4
|
|
row6 | row4 | 2 | row2~row4~row6
|
|
row8 | row6 | 3 | row2~row4~row6~row8
|
|
row5 | row2 | 1 | row2~row5
|
|
row9 | row5 | 2 | row2~row5~row9
|
|
(6 rows)
|
|
|
|
-- without branch
|
|
SELECT * FROM connectby('connectby_text', 'keyid', 'parent_keyid', 'row2', 0) AS t(keyid text, parent_keyid text, level int);
|
|
keyid | parent_keyid | level
|
|
-------+--------------+-------
|
|
row2 | | 0
|
|
row4 | row2 | 1
|
|
row6 | row4 | 2
|
|
row8 | row6 | 3
|
|
row5 | row2 | 1
|
|
row9 | row5 | 2
|
|
(6 rows)
|
|
|
|
-- test connectby with int based hierarchy
|
|
CREATE TABLE connectby_int(keyid int, parent_keyid int);
|
|
\copy connectby_int from 'data/connectby_int.data'
|
|
-- with branch
|
|
SELECT * FROM connectby('connectby_int', 'keyid', 'parent_keyid', '2', 0, '~') AS t(keyid int, parent_keyid int, level int, branch text);
|
|
keyid | parent_keyid | level | branch
|
|
-------+--------------+-------+---------
|
|
2 | | 0 | 2
|
|
4 | 2 | 1 | 2~4
|
|
6 | 4 | 2 | 2~4~6
|
|
8 | 6 | 3 | 2~4~6~8
|
|
5 | 2 | 1 | 2~5
|
|
9 | 5 | 2 | 2~5~9
|
|
(6 rows)
|
|
|
|
-- without branch
|
|
SELECT * FROM connectby('connectby_int', 'keyid', 'parent_keyid', '2', 0) AS t(keyid int, parent_keyid int, level int);
|
|
keyid | parent_keyid | level
|
|
-------+--------------+-------
|
|
2 | | 0
|
|
4 | 2 | 1
|
|
6 | 4 | 2
|
|
8 | 6 | 3
|
|
5 | 2 | 1
|
|
9 | 5 | 2
|
|
(6 rows)
|
|
|
|
-- recursion detection
|
|
INSERT INTO connectby_int VALUES(10,9);
|
|
INSERT INTO connectby_int VALUES(11,10);
|
|
INSERT INTO connectby_int VALUES(9,11);
|
|
-- should fail due to infinite recursion
|
|
SELECT * FROM connectby('connectby_int', 'keyid', 'parent_keyid', '2', 0, '~') AS t(keyid int, parent_keyid int, level int, branch text);
|
|
ERROR: infinite recursion detected
|
|
-- infinite recursion failure avoided by depth limit
|
|
SELECT * FROM connectby('connectby_int', 'keyid', 'parent_keyid', '2', 4, '~') AS t(keyid int, parent_keyid int, level int, branch text);
|
|
keyid | parent_keyid | level | branch
|
|
-------+--------------+-------+-------------
|
|
2 | | 0 | 2
|
|
4 | 2 | 1 | 2~4
|
|
6 | 4 | 2 | 2~4~6
|
|
8 | 6 | 3 | 2~4~6~8
|
|
5 | 2 | 1 | 2~5
|
|
9 | 5 | 2 | 2~5~9
|
|
10 | 9 | 3 | 2~5~9~10
|
|
11 | 10 | 4 | 2~5~9~10~11
|
|
(8 rows)
|
|
|
|
-- test for falsely detected recursion
|
|
DROP TABLE connectby_int;
|
|
CREATE TABLE connectby_int(keyid int, parent_keyid int);
|
|
INSERT INTO connectby_int VALUES(11,NULL);
|
|
INSERT INTO connectby_int VALUES(10,11);
|
|
INSERT INTO connectby_int VALUES(111,11);
|
|
INSERT INTO connectby_int VALUES(1,111);
|
|
-- this should not fail due to recursion detection
|
|
SELECT * FROM connectby('connectby_int', 'keyid', 'parent_keyid', '11', 0, '-') AS t(keyid int, parent_keyid int, level int, branch text);
|
|
keyid | parent_keyid | level | branch
|
|
-------+--------------+-------+----------
|
|
11 | | 0 | 11
|
|
10 | 11 | 1 | 11-10
|
|
111 | 11 | 1 | 11-111
|
|
1 | 111 | 2 | 11-111-1
|
|
(4 rows)
|
|
|