2011-02-14 09:06:41 +08:00
CREATE EXTENSION tablefunc;
2002-09-12 08:14:40 +08:00
--
-- normal_rand()
-- no easy way to do this for regression testing
--
2003-09-14 05:44:50 +08:00
SELECT avg(normal_rand)::int FROM normal_rand(100, 250, 0.2);
2002-09-12 08:14:40 +08:00
avg
-----
250
(1 row)
--
-- crosstab()
--
2002-11-23 09:54:09 +08:00
CREATE TABLE ct(id int, rowclass text, rowid text, attribute text, val text);
2002-09-12 08:14:40 +08:00
\copy ct from 'data/ct.data'
2002-11-23 09:54:09 +08:00
SELECT * FROM crosstab2('SELECT rowid, attribute, val FROM ct where rowclass = ''group1'' and (attribute = ''att2'' or attribute = ''att3'') ORDER BY 1,2;');
2002-09-12 08:14:40 +08:00
row_name | category_1 | category_2
----------+------------+------------
test1 | val2 | val3
test2 | val6 | val7
2007-11-10 13:00:41 +08:00
| val10 | val11
(3 rows)
2002-09-12 08:14:40 +08:00
2002-11-23 09:54:09 +08:00
SELECT * FROM crosstab3('SELECT rowid, attribute, val FROM ct where rowclass = ''group1'' and (attribute = ''att2'' or attribute = ''att3'') ORDER BY 1,2;');
2002-09-12 08:14:40 +08:00
row_name | category_1 | category_2 | category_3
----------+------------+------------+------------
test1 | val2 | val3 |
test2 | val6 | val7 |
2007-11-10 13:00:41 +08:00
| val10 | val11 |
(3 rows)
2002-09-12 08:14:40 +08:00
2002-11-23 09:54:09 +08:00
SELECT * FROM crosstab4('SELECT rowid, attribute, val FROM ct where rowclass = ''group1'' and (attribute = ''att2'' or attribute = ''att3'') ORDER BY 1,2;');
2002-09-12 08:14:40 +08:00
row_name | category_1 | category_2 | category_3 | category_4
----------+------------+------------+------------+------------
test1 | val2 | val3 | |
test2 | val6 | val7 | |
2007-11-10 13:00:41 +08:00
| val10 | val11 | |
(3 rows)
2002-09-12 08:14:40 +08:00
2002-11-23 09:54:09 +08:00
SELECT * FROM crosstab2('SELECT rowid, attribute, val FROM ct where rowclass = ''group1'' ORDER BY 1,2;');
2002-09-12 08:14:40 +08:00
row_name | category_1 | category_2
----------+------------+------------
test1 | val1 | val2
test2 | val5 | val6
2007-11-10 13:00:41 +08:00
| val9 | val10
(3 rows)
2002-09-12 08:14:40 +08:00
2002-11-23 09:54:09 +08:00
SELECT * FROM crosstab3('SELECT rowid, attribute, val FROM ct where rowclass = ''group1'' ORDER BY 1,2;');
2002-09-12 08:14:40 +08:00
row_name | category_1 | category_2 | category_3
----------+------------+------------+------------
test1 | val1 | val2 | val3
test2 | val5 | val6 | val7
2007-11-10 13:00:41 +08:00
| val9 | val10 | val11
(3 rows)
2002-09-12 08:14:40 +08:00
2002-11-23 09:54:09 +08:00
SELECT * FROM crosstab4('SELECT rowid, attribute, val FROM ct where rowclass = ''group1'' ORDER BY 1,2;');
2002-09-12 08:14:40 +08:00
row_name | category_1 | category_2 | category_3 | category_4
----------+------------+------------+------------+------------
test1 | val1 | val2 | val3 | val4
test2 | val5 | val6 | val7 | val8
2007-11-10 13:00:41 +08:00
| val9 | val10 | val11 | val12
(3 rows)
2002-09-12 08:14:40 +08:00
2002-11-23 09:54:09 +08:00
SELECT * FROM crosstab2('SELECT rowid, attribute, val FROM ct where rowclass = ''group2'' and (attribute = ''att1'' or attribute = ''att2'') ORDER BY 1,2;');
2002-09-12 08:14:40 +08:00
row_name | category_1 | category_2
----------+------------+------------
test3 | val1 | val2
test4 | val4 | val5
(2 rows)
2002-11-23 09:54:09 +08:00
SELECT * FROM crosstab3('SELECT rowid, attribute, val FROM ct where rowclass = ''group2'' and (attribute = ''att1'' or attribute = ''att2'') ORDER BY 1,2;');
2002-09-12 08:14:40 +08:00
row_name | category_1 | category_2 | category_3
----------+------------+------------+------------
test3 | val1 | val2 |
test4 | val4 | val5 |
(2 rows)
2002-11-23 09:54:09 +08:00
SELECT * FROM crosstab4('SELECT rowid, attribute, val FROM ct where rowclass = ''group2'' and (attribute = ''att1'' or attribute = ''att2'') ORDER BY 1,2;');
2002-09-12 08:14:40 +08:00
row_name | category_1 | category_2 | category_3 | category_4
----------+------------+------------+------------+------------
test3 | val1 | val2 | |
test4 | val4 | val5 | |
(2 rows)
2002-11-23 09:54:09 +08:00
SELECT * FROM crosstab2('SELECT rowid, attribute, val FROM ct where rowclass = ''group2'' ORDER BY 1,2;');
2002-09-12 08:14:40 +08:00
row_name | category_1 | category_2
----------+------------+------------
test3 | val1 | val2
test4 | val4 | val5
(2 rows)
2002-11-23 09:54:09 +08:00
SELECT * FROM crosstab3('SELECT rowid, attribute, val FROM ct where rowclass = ''group2'' ORDER BY 1,2;');
2002-09-12 08:14:40 +08:00
row_name | category_1 | category_2 | category_3
----------+------------+------------+------------
test3 | val1 | val2 | val3
test4 | val4 | val5 | val6
(2 rows)
2002-11-23 09:54:09 +08:00
SELECT * FROM crosstab4('SELECT rowid, attribute, val FROM ct where rowclass = ''group2'' ORDER BY 1,2;');
2002-09-12 08:14:40 +08:00
row_name | category_1 | category_2 | category_3 | category_4
----------+------------+------------+------------+------------
test3 | val1 | val2 | val3 |
test4 | val4 | val5 | val6 |
(2 rows)
2005-05-31 07:09:07 +08:00
SELECT * FROM crosstab('SELECT rowid, attribute, val FROM ct where rowclass = ''group1'' ORDER BY 1,2;') AS c(rowid text, att1 text, att2 text);
2007-11-10 13:00:41 +08:00
rowid | att1 | att2
-------+------+-------
2002-09-12 08:14:40 +08:00
test1 | val1 | val2
test2 | val5 | val6
2007-11-10 13:00:41 +08:00
| val9 | val10
(3 rows)
2002-09-12 08:14:40 +08:00
2005-05-31 07:09:07 +08:00
SELECT * FROM crosstab('SELECT rowid, attribute, val FROM ct where rowclass = ''group1'' ORDER BY 1,2;') AS c(rowid text, att1 text, att2 text, att3 text);
2007-11-10 13:00:41 +08:00
rowid | att1 | att2 | att3
-------+------+-------+-------
test1 | val1 | val2 | val3
test2 | val5 | val6 | val7
| val9 | val10 | val11
(3 rows)
2002-09-12 08:14:40 +08:00
2005-05-31 07:09:07 +08:00
SELECT * FROM crosstab('SELECT rowid, attribute, val FROM ct where rowclass = ''group1'' ORDER BY 1,2;') AS c(rowid text, att1 text, att2 text, att3 text, att4 text);
2007-11-10 13:00:41 +08:00
rowid | att1 | att2 | att3 | att4
-------+------+-------+-------+-------
test1 | val1 | val2 | val3 | val4
test2 | val5 | val6 | val7 | val8
| val9 | val10 | val11 | val12
(3 rows)
2002-09-12 08:14:40 +08:00
2005-05-31 07:09:07 +08:00
-- check it works with OUT parameters, too
CREATE FUNCTION crosstab_out(text,
OUT rowid text, OUT att1 text, OUT att2 text, OUT att3 text)
RETURNS setof record
AS '$libdir/tablefunc','crosstab'
2006-02-28 00:09:50 +08:00
LANGUAGE C STABLE STRICT;
2005-05-31 07:09:07 +08:00
SELECT * FROM crosstab_out('SELECT rowid, attribute, val FROM ct where rowclass = ''group1'' ORDER BY 1,2;');
2007-11-10 13:00:41 +08:00
rowid | att1 | att2 | att3
-------+------+-------+-------
test1 | val1 | val2 | val3
test2 | val5 | val6 | val7
| val9 | val10 | val11
(3 rows)
2005-05-31 07:09:07 +08:00
Attached is an update to contrib/tablefunc. It implements a new hashed
version of crosstab. This fixes a major deficiency in real-world use of
the original version. Easiest to undestand with an illustration:
Data:
-------------------------------------------------------------------
select * from cth;
id | rowid | rowdt | attribute | val
----+-------+---------------------+----------------+---------------
1 | test1 | 2003-03-01 00:00:00 | temperature | 42
2 | test1 | 2003-03-01 00:00:00 | test_result | PASS
3 | test1 | 2003-03-01 00:00:00 | volts | 2.6987
4 | test2 | 2003-03-02 00:00:00 | temperature | 53
5 | test2 | 2003-03-02 00:00:00 | test_result | FAIL
6 | test2 | 2003-03-02 00:00:00 | test_startdate | 01 March 2003
7 | test2 | 2003-03-02 00:00:00 | volts | 3.1234
(7 rows)
Original crosstab:
-------------------------------------------------------------------
SELECT * FROM crosstab(
'SELECT rowid, attribute, val FROM cth ORDER BY 1,2',4)
AS c(rowid text, temperature text, test_result text, test_startdate
text, volts text);
rowid | temperature | test_result | test_startdate | volts
-------+-------------+-------------+----------------+--------
test1 | 42 | PASS | 2.6987 |
test2 | 53 | FAIL | 01 March 2003 | 3.1234
(2 rows)
Hashed crosstab:
-------------------------------------------------------------------
SELECT * FROM crosstab(
'SELECT rowid, attribute, val FROM cth ORDER BY 1',
'SELECT DISTINCT attribute FROM cth ORDER BY 1')
AS c(rowid text, temperature int4, test_result text, test_startdate
timestamp, volts float8);
rowid | temperature | test_result | test_startdate | volts
-------+-------------+-------------+---------------------+--------
test1 | 42 | PASS | | 2.6987
test2 | 53 | FAIL | 2003-03-01 00:00:00 | 3.1234
(2 rows)
Notice that the original crosstab slides data over to the left in the
result tuple when it encounters missing data. In order to work around
this you have to be make your source sql do all sorts of contortions
(cartesian join of distinct rowid with distinct attribute; left join
that back to the real source data). The new version avoids this by
building a hash table using a second distinct attribute query.
The new version also allows for "extra" columns (see the README) and
allows the result columns to be coerced into differing datatypes if they
are suitable (as shown above).
In testing a "real-world" data set (69 distinct rowid's, 27 distinct
categories/attributes, multiple missing data points) I saw about a
5-fold improvement in execution time (from about 2200 ms old, to 440 ms
new).
I left the original version intact because: 1) BC, 2) it is probably
slightly faster if you know that you have no missing attributes.
README and regression test adjustments included. If there are no
objections, please apply.
Joe Conway
2003-03-20 14:46:30 +08:00
--
-- hash based crosstab
--
create table cth(id serial, rowid text, rowdt timestamp, attribute text, val text);
2004-08-11 08:49:35 +08:00
NOTICE: CREATE TABLE will create implicit sequence "cth_id_seq" for serial column "cth.id"
Attached is an update to contrib/tablefunc. It implements a new hashed
version of crosstab. This fixes a major deficiency in real-world use of
the original version. Easiest to undestand with an illustration:
Data:
-------------------------------------------------------------------
select * from cth;
id | rowid | rowdt | attribute | val
----+-------+---------------------+----------------+---------------
1 | test1 | 2003-03-01 00:00:00 | temperature | 42
2 | test1 | 2003-03-01 00:00:00 | test_result | PASS
3 | test1 | 2003-03-01 00:00:00 | volts | 2.6987
4 | test2 | 2003-03-02 00:00:00 | temperature | 53
5 | test2 | 2003-03-02 00:00:00 | test_result | FAIL
6 | test2 | 2003-03-02 00:00:00 | test_startdate | 01 March 2003
7 | test2 | 2003-03-02 00:00:00 | volts | 3.1234
(7 rows)
Original crosstab:
-------------------------------------------------------------------
SELECT * FROM crosstab(
'SELECT rowid, attribute, val FROM cth ORDER BY 1,2',4)
AS c(rowid text, temperature text, test_result text, test_startdate
text, volts text);
rowid | temperature | test_result | test_startdate | volts
-------+-------------+-------------+----------------+--------
test1 | 42 | PASS | 2.6987 |
test2 | 53 | FAIL | 01 March 2003 | 3.1234
(2 rows)
Hashed crosstab:
-------------------------------------------------------------------
SELECT * FROM crosstab(
'SELECT rowid, attribute, val FROM cth ORDER BY 1',
'SELECT DISTINCT attribute FROM cth ORDER BY 1')
AS c(rowid text, temperature int4, test_result text, test_startdate
timestamp, volts float8);
rowid | temperature | test_result | test_startdate | volts
-------+-------------+-------------+---------------------+--------
test1 | 42 | PASS | | 2.6987
test2 | 53 | FAIL | 2003-03-01 00:00:00 | 3.1234
(2 rows)
Notice that the original crosstab slides data over to the left in the
result tuple when it encounters missing data. In order to work around
this you have to be make your source sql do all sorts of contortions
(cartesian join of distinct rowid with distinct attribute; left join
that back to the real source data). The new version avoids this by
building a hash table using a second distinct attribute query.
The new version also allows for "extra" columns (see the README) and
allows the result columns to be coerced into differing datatypes if they
are suitable (as shown above).
In testing a "real-world" data set (69 distinct rowid's, 27 distinct
categories/attributes, multiple missing data points) I saw about a
5-fold improvement in execution time (from about 2200 ms old, to 440 ms
new).
I left the original version intact because: 1) BC, 2) it is probably
slightly faster if you know that you have no missing attributes.
README and regression test adjustments included. If there are no
objections, please apply.
Joe Conway
2003-03-20 14:46:30 +08:00
insert into cth values(DEFAULT,'test1','01 March 2003','temperature','42');
insert into cth values(DEFAULT,'test1','01 March 2003','test_result','PASS');
-- the next line is intentionally left commented and is therefore a "missing" attribute
-- insert into cth values(DEFAULT,'test1','01 March 2003','test_startdate','28 February 2003');
insert into cth values(DEFAULT,'test1','01 March 2003','volts','2.6987');
insert into cth values(DEFAULT,'test2','02 March 2003','temperature','53');
insert into cth values(DEFAULT,'test2','02 March 2003','test_result','FAIL');
insert into cth values(DEFAULT,'test2','02 March 2003','test_startdate','01 March 2003');
insert into cth values(DEFAULT,'test2','02 March 2003','volts','3.1234');
2007-11-10 13:00:41 +08:00
-- next group tests for NULL rowids
insert into cth values(DEFAULT,NULL,'25 October 2007','temperature','57');
insert into cth values(DEFAULT,NULL,'25 October 2007','test_result','PASS');
insert into cth values(DEFAULT,NULL,'25 October 2007','test_startdate','24 October 2007');
insert into cth values(DEFAULT,NULL,'25 October 2007','volts','1.41234');
Attached is an update to contrib/tablefunc. It implements a new hashed
version of crosstab. This fixes a major deficiency in real-world use of
the original version. Easiest to undestand with an illustration:
Data:
-------------------------------------------------------------------
select * from cth;
id | rowid | rowdt | attribute | val
----+-------+---------------------+----------------+---------------
1 | test1 | 2003-03-01 00:00:00 | temperature | 42
2 | test1 | 2003-03-01 00:00:00 | test_result | PASS
3 | test1 | 2003-03-01 00:00:00 | volts | 2.6987
4 | test2 | 2003-03-02 00:00:00 | temperature | 53
5 | test2 | 2003-03-02 00:00:00 | test_result | FAIL
6 | test2 | 2003-03-02 00:00:00 | test_startdate | 01 March 2003
7 | test2 | 2003-03-02 00:00:00 | volts | 3.1234
(7 rows)
Original crosstab:
-------------------------------------------------------------------
SELECT * FROM crosstab(
'SELECT rowid, attribute, val FROM cth ORDER BY 1,2',4)
AS c(rowid text, temperature text, test_result text, test_startdate
text, volts text);
rowid | temperature | test_result | test_startdate | volts
-------+-------------+-------------+----------------+--------
test1 | 42 | PASS | 2.6987 |
test2 | 53 | FAIL | 01 March 2003 | 3.1234
(2 rows)
Hashed crosstab:
-------------------------------------------------------------------
SELECT * FROM crosstab(
'SELECT rowid, attribute, val FROM cth ORDER BY 1',
'SELECT DISTINCT attribute FROM cth ORDER BY 1')
AS c(rowid text, temperature int4, test_result text, test_startdate
timestamp, volts float8);
rowid | temperature | test_result | test_startdate | volts
-------+-------------+-------------+---------------------+--------
test1 | 42 | PASS | | 2.6987
test2 | 53 | FAIL | 2003-03-01 00:00:00 | 3.1234
(2 rows)
Notice that the original crosstab slides data over to the left in the
result tuple when it encounters missing data. In order to work around
this you have to be make your source sql do all sorts of contortions
(cartesian join of distinct rowid with distinct attribute; left join
that back to the real source data). The new version avoids this by
building a hash table using a second distinct attribute query.
The new version also allows for "extra" columns (see the README) and
allows the result columns to be coerced into differing datatypes if they
are suitable (as shown above).
In testing a "real-world" data set (69 distinct rowid's, 27 distinct
categories/attributes, multiple missing data points) I saw about a
5-fold improvement in execution time (from about 2200 ms old, to 440 ms
new).
I left the original version intact because: 1) BC, 2) it is probably
slightly faster if you know that you have no missing attributes.
README and regression test adjustments included. If there are no
objections, please apply.
Joe Conway
2003-03-20 14:46:30 +08:00
-- return attributes as plain text
SELECT * FROM crosstab(
'SELECT rowid, rowdt, attribute, val FROM cth ORDER BY 1',
'SELECT DISTINCT attribute FROM cth ORDER BY 1')
AS c(rowid text, rowdt timestamp, temperature text, test_result text, test_startdate text, volts text);
2007-11-10 13:00:41 +08:00
rowid | rowdt | temperature | test_result | test_startdate | volts
-------+--------------------------+-------------+-------------+-----------------+---------
test1 | Sat Mar 01 00:00:00 2003 | 42 | PASS | | 2.6987
test2 | Sun Mar 02 00:00:00 2003 | 53 | FAIL | 01 March 2003 | 3.1234
| Thu Oct 25 00:00:00 2007 | 57 | PASS | 24 October 2007 | 1.41234
(3 rows)
Attached is an update to contrib/tablefunc. It implements a new hashed
version of crosstab. This fixes a major deficiency in real-world use of
the original version. Easiest to undestand with an illustration:
Data:
-------------------------------------------------------------------
select * from cth;
id | rowid | rowdt | attribute | val
----+-------+---------------------+----------------+---------------
1 | test1 | 2003-03-01 00:00:00 | temperature | 42
2 | test1 | 2003-03-01 00:00:00 | test_result | PASS
3 | test1 | 2003-03-01 00:00:00 | volts | 2.6987
4 | test2 | 2003-03-02 00:00:00 | temperature | 53
5 | test2 | 2003-03-02 00:00:00 | test_result | FAIL
6 | test2 | 2003-03-02 00:00:00 | test_startdate | 01 March 2003
7 | test2 | 2003-03-02 00:00:00 | volts | 3.1234
(7 rows)
Original crosstab:
-------------------------------------------------------------------
SELECT * FROM crosstab(
'SELECT rowid, attribute, val FROM cth ORDER BY 1,2',4)
AS c(rowid text, temperature text, test_result text, test_startdate
text, volts text);
rowid | temperature | test_result | test_startdate | volts
-------+-------------+-------------+----------------+--------
test1 | 42 | PASS | 2.6987 |
test2 | 53 | FAIL | 01 March 2003 | 3.1234
(2 rows)
Hashed crosstab:
-------------------------------------------------------------------
SELECT * FROM crosstab(
'SELECT rowid, attribute, val FROM cth ORDER BY 1',
'SELECT DISTINCT attribute FROM cth ORDER BY 1')
AS c(rowid text, temperature int4, test_result text, test_startdate
timestamp, volts float8);
rowid | temperature | test_result | test_startdate | volts
-------+-------------+-------------+---------------------+--------
test1 | 42 | PASS | | 2.6987
test2 | 53 | FAIL | 2003-03-01 00:00:00 | 3.1234
(2 rows)
Notice that the original crosstab slides data over to the left in the
result tuple when it encounters missing data. In order to work around
this you have to be make your source sql do all sorts of contortions
(cartesian join of distinct rowid with distinct attribute; left join
that back to the real source data). The new version avoids this by
building a hash table using a second distinct attribute query.
The new version also allows for "extra" columns (see the README) and
allows the result columns to be coerced into differing datatypes if they
are suitable (as shown above).
In testing a "real-world" data set (69 distinct rowid's, 27 distinct
categories/attributes, multiple missing data points) I saw about a
5-fold improvement in execution time (from about 2200 ms old, to 440 ms
new).
I left the original version intact because: 1) BC, 2) it is probably
slightly faster if you know that you have no missing attributes.
README and regression test adjustments included. If there are no
objections, please apply.
Joe Conway
2003-03-20 14:46:30 +08:00
-- this time without rowdt
SELECT * FROM crosstab(
'SELECT rowid, attribute, val FROM cth ORDER BY 1',
'SELECT DISTINCT attribute FROM cth ORDER BY 1')
AS c(rowid text, temperature text, test_result text, test_startdate text, volts text);
2007-11-10 13:00:41 +08:00
rowid | temperature | test_result | test_startdate | volts
-------+-------------+-------------+-----------------+---------
test1 | 42 | PASS | | 2.6987
test2 | 53 | FAIL | 01 March 2003 | 3.1234
| 57 | PASS | 24 October 2007 | 1.41234
(3 rows)
Attached is an update to contrib/tablefunc. It implements a new hashed
version of crosstab. This fixes a major deficiency in real-world use of
the original version. Easiest to undestand with an illustration:
Data:
-------------------------------------------------------------------
select * from cth;
id | rowid | rowdt | attribute | val
----+-------+---------------------+----------------+---------------
1 | test1 | 2003-03-01 00:00:00 | temperature | 42
2 | test1 | 2003-03-01 00:00:00 | test_result | PASS
3 | test1 | 2003-03-01 00:00:00 | volts | 2.6987
4 | test2 | 2003-03-02 00:00:00 | temperature | 53
5 | test2 | 2003-03-02 00:00:00 | test_result | FAIL
6 | test2 | 2003-03-02 00:00:00 | test_startdate | 01 March 2003
7 | test2 | 2003-03-02 00:00:00 | volts | 3.1234
(7 rows)
Original crosstab:
-------------------------------------------------------------------
SELECT * FROM crosstab(
'SELECT rowid, attribute, val FROM cth ORDER BY 1,2',4)
AS c(rowid text, temperature text, test_result text, test_startdate
text, volts text);
rowid | temperature | test_result | test_startdate | volts
-------+-------------+-------------+----------------+--------
test1 | 42 | PASS | 2.6987 |
test2 | 53 | FAIL | 01 March 2003 | 3.1234
(2 rows)
Hashed crosstab:
-------------------------------------------------------------------
SELECT * FROM crosstab(
'SELECT rowid, attribute, val FROM cth ORDER BY 1',
'SELECT DISTINCT attribute FROM cth ORDER BY 1')
AS c(rowid text, temperature int4, test_result text, test_startdate
timestamp, volts float8);
rowid | temperature | test_result | test_startdate | volts
-------+-------------+-------------+---------------------+--------
test1 | 42 | PASS | | 2.6987
test2 | 53 | FAIL | 2003-03-01 00:00:00 | 3.1234
(2 rows)
Notice that the original crosstab slides data over to the left in the
result tuple when it encounters missing data. In order to work around
this you have to be make your source sql do all sorts of contortions
(cartesian join of distinct rowid with distinct attribute; left join
that back to the real source data). The new version avoids this by
building a hash table using a second distinct attribute query.
The new version also allows for "extra" columns (see the README) and
allows the result columns to be coerced into differing datatypes if they
are suitable (as shown above).
In testing a "real-world" data set (69 distinct rowid's, 27 distinct
categories/attributes, multiple missing data points) I saw about a
5-fold improvement in execution time (from about 2200 ms old, to 440 ms
new).
I left the original version intact because: 1) BC, 2) it is probably
slightly faster if you know that you have no missing attributes.
README and regression test adjustments included. If there are no
objections, please apply.
Joe Conway
2003-03-20 14:46:30 +08:00
-- convert attributes to specific datatypes
SELECT * FROM crosstab(
'SELECT rowid, rowdt, attribute, val FROM cth ORDER BY 1',
'SELECT DISTINCT attribute FROM cth ORDER BY 1')
AS c(rowid text, rowdt timestamp, temperature int4, test_result text, test_startdate timestamp, volts float8);
2007-11-10 13:00:41 +08:00
rowid | rowdt | temperature | test_result | test_startdate | volts
-------+--------------------------+-------------+-------------+--------------------------+---------
test1 | Sat Mar 01 00:00:00 2003 | 42 | PASS | | 2.6987
test2 | Sun Mar 02 00:00:00 2003 | 53 | FAIL | Sat Mar 01 00:00:00 2003 | 3.1234
| Thu Oct 25 00:00:00 2007 | 57 | PASS | Wed Oct 24 00:00:00 2007 | 1.41234
(3 rows)
Attached is an update to contrib/tablefunc. It implements a new hashed
version of crosstab. This fixes a major deficiency in real-world use of
the original version. Easiest to undestand with an illustration:
Data:
-------------------------------------------------------------------
select * from cth;
id | rowid | rowdt | attribute | val
----+-------+---------------------+----------------+---------------
1 | test1 | 2003-03-01 00:00:00 | temperature | 42
2 | test1 | 2003-03-01 00:00:00 | test_result | PASS
3 | test1 | 2003-03-01 00:00:00 | volts | 2.6987
4 | test2 | 2003-03-02 00:00:00 | temperature | 53
5 | test2 | 2003-03-02 00:00:00 | test_result | FAIL
6 | test2 | 2003-03-02 00:00:00 | test_startdate | 01 March 2003
7 | test2 | 2003-03-02 00:00:00 | volts | 3.1234
(7 rows)
Original crosstab:
-------------------------------------------------------------------
SELECT * FROM crosstab(
'SELECT rowid, attribute, val FROM cth ORDER BY 1,2',4)
AS c(rowid text, temperature text, test_result text, test_startdate
text, volts text);
rowid | temperature | test_result | test_startdate | volts
-------+-------------+-------------+----------------+--------
test1 | 42 | PASS | 2.6987 |
test2 | 53 | FAIL | 01 March 2003 | 3.1234
(2 rows)
Hashed crosstab:
-------------------------------------------------------------------
SELECT * FROM crosstab(
'SELECT rowid, attribute, val FROM cth ORDER BY 1',
'SELECT DISTINCT attribute FROM cth ORDER BY 1')
AS c(rowid text, temperature int4, test_result text, test_startdate
timestamp, volts float8);
rowid | temperature | test_result | test_startdate | volts
-------+-------------+-------------+---------------------+--------
test1 | 42 | PASS | | 2.6987
test2 | 53 | FAIL | 2003-03-01 00:00:00 | 3.1234
(2 rows)
Notice that the original crosstab slides data over to the left in the
result tuple when it encounters missing data. In order to work around
this you have to be make your source sql do all sorts of contortions
(cartesian join of distinct rowid with distinct attribute; left join
that back to the real source data). The new version avoids this by
building a hash table using a second distinct attribute query.
The new version also allows for "extra" columns (see the README) and
allows the result columns to be coerced into differing datatypes if they
are suitable (as shown above).
In testing a "real-world" data set (69 distinct rowid's, 27 distinct
categories/attributes, multiple missing data points) I saw about a
5-fold improvement in execution time (from about 2200 ms old, to 440 ms
new).
I left the original version intact because: 1) BC, 2) it is probably
slightly faster if you know that you have no missing attributes.
README and regression test adjustments included. If there are no
objections, please apply.
Joe Conway
2003-03-20 14:46:30 +08:00
-- source query and category query out of sync
SELECT * FROM crosstab(
'SELECT rowid, rowdt, attribute, val FROM cth ORDER BY 1',
'SELECT DISTINCT attribute FROM cth WHERE attribute IN (''temperature'',''test_result'',''test_startdate'') ORDER BY 1')
AS c(rowid text, rowdt timestamp, temperature int4, test_result text, test_startdate timestamp);
rowid | rowdt | temperature | test_result | test_startdate
-------+--------------------------+-------------+-------------+--------------------------
test1 | Sat Mar 01 00:00:00 2003 | 42 | PASS |
test2 | Sun Mar 02 00:00:00 2003 | 53 | FAIL | Sat Mar 01 00:00:00 2003
2007-11-10 13:00:41 +08:00
| Thu Oct 25 00:00:00 2007 | 57 | PASS | Wed Oct 24 00:00:00 2007
(3 rows)
Attached is an update to contrib/tablefunc. It implements a new hashed
version of crosstab. This fixes a major deficiency in real-world use of
the original version. Easiest to undestand with an illustration:
Data:
-------------------------------------------------------------------
select * from cth;
id | rowid | rowdt | attribute | val
----+-------+---------------------+----------------+---------------
1 | test1 | 2003-03-01 00:00:00 | temperature | 42
2 | test1 | 2003-03-01 00:00:00 | test_result | PASS
3 | test1 | 2003-03-01 00:00:00 | volts | 2.6987
4 | test2 | 2003-03-02 00:00:00 | temperature | 53
5 | test2 | 2003-03-02 00:00:00 | test_result | FAIL
6 | test2 | 2003-03-02 00:00:00 | test_startdate | 01 March 2003
7 | test2 | 2003-03-02 00:00:00 | volts | 3.1234
(7 rows)
Original crosstab:
-------------------------------------------------------------------
SELECT * FROM crosstab(
'SELECT rowid, attribute, val FROM cth ORDER BY 1,2',4)
AS c(rowid text, temperature text, test_result text, test_startdate
text, volts text);
rowid | temperature | test_result | test_startdate | volts
-------+-------------+-------------+----------------+--------
test1 | 42 | PASS | 2.6987 |
test2 | 53 | FAIL | 01 March 2003 | 3.1234
(2 rows)
Hashed crosstab:
-------------------------------------------------------------------
SELECT * FROM crosstab(
'SELECT rowid, attribute, val FROM cth ORDER BY 1',
'SELECT DISTINCT attribute FROM cth ORDER BY 1')
AS c(rowid text, temperature int4, test_result text, test_startdate
timestamp, volts float8);
rowid | temperature | test_result | test_startdate | volts
-------+-------------+-------------+---------------------+--------
test1 | 42 | PASS | | 2.6987
test2 | 53 | FAIL | 2003-03-01 00:00:00 | 3.1234
(2 rows)
Notice that the original crosstab slides data over to the left in the
result tuple when it encounters missing data. In order to work around
this you have to be make your source sql do all sorts of contortions
(cartesian join of distinct rowid with distinct attribute; left join
that back to the real source data). The new version avoids this by
building a hash table using a second distinct attribute query.
The new version also allows for "extra" columns (see the README) and
allows the result columns to be coerced into differing datatypes if they
are suitable (as shown above).
In testing a "real-world" data set (69 distinct rowid's, 27 distinct
categories/attributes, multiple missing data points) I saw about a
5-fold improvement in execution time (from about 2200 ms old, to 440 ms
new).
I left the original version intact because: 1) BC, 2) it is probably
slightly faster if you know that you have no missing attributes.
README and regression test adjustments included. If there are no
objections, please apply.
Joe Conway
2003-03-20 14:46:30 +08:00
-- if category query generates no rows, get expected error
SELECT * FROM crosstab(
'SELECT rowid, rowdt, attribute, val FROM cth ORDER BY 1',
'SELECT DISTINCT attribute FROM cth WHERE attribute = ''a'' ORDER BY 1')
AS c(rowid text, rowdt timestamp, temperature int4, test_result text, test_startdate timestamp, volts float8);
2003-07-25 01:52:50 +08:00
ERROR: provided "categories" SQL must return 1 column of at least one row
Attached is an update to contrib/tablefunc. It implements a new hashed
version of crosstab. This fixes a major deficiency in real-world use of
the original version. Easiest to undestand with an illustration:
Data:
-------------------------------------------------------------------
select * from cth;
id | rowid | rowdt | attribute | val
----+-------+---------------------+----------------+---------------
1 | test1 | 2003-03-01 00:00:00 | temperature | 42
2 | test1 | 2003-03-01 00:00:00 | test_result | PASS
3 | test1 | 2003-03-01 00:00:00 | volts | 2.6987
4 | test2 | 2003-03-02 00:00:00 | temperature | 53
5 | test2 | 2003-03-02 00:00:00 | test_result | FAIL
6 | test2 | 2003-03-02 00:00:00 | test_startdate | 01 March 2003
7 | test2 | 2003-03-02 00:00:00 | volts | 3.1234
(7 rows)
Original crosstab:
-------------------------------------------------------------------
SELECT * FROM crosstab(
'SELECT rowid, attribute, val FROM cth ORDER BY 1,2',4)
AS c(rowid text, temperature text, test_result text, test_startdate
text, volts text);
rowid | temperature | test_result | test_startdate | volts
-------+-------------+-------------+----------------+--------
test1 | 42 | PASS | 2.6987 |
test2 | 53 | FAIL | 01 March 2003 | 3.1234
(2 rows)
Hashed crosstab:
-------------------------------------------------------------------
SELECT * FROM crosstab(
'SELECT rowid, attribute, val FROM cth ORDER BY 1',
'SELECT DISTINCT attribute FROM cth ORDER BY 1')
AS c(rowid text, temperature int4, test_result text, test_startdate
timestamp, volts float8);
rowid | temperature | test_result | test_startdate | volts
-------+-------------+-------------+---------------------+--------
test1 | 42 | PASS | | 2.6987
test2 | 53 | FAIL | 2003-03-01 00:00:00 | 3.1234
(2 rows)
Notice that the original crosstab slides data over to the left in the
result tuple when it encounters missing data. In order to work around
this you have to be make your source sql do all sorts of contortions
(cartesian join of distinct rowid with distinct attribute; left join
that back to the real source data). The new version avoids this by
building a hash table using a second distinct attribute query.
The new version also allows for "extra" columns (see the README) and
allows the result columns to be coerced into differing datatypes if they
are suitable (as shown above).
In testing a "real-world" data set (69 distinct rowid's, 27 distinct
categories/attributes, multiple missing data points) I saw about a
5-fold improvement in execution time (from about 2200 ms old, to 440 ms
new).
I left the original version intact because: 1) BC, 2) it is probably
slightly faster if you know that you have no missing attributes.
README and regression test adjustments included. If there are no
objections, please apply.
Joe Conway
2003-03-20 14:46:30 +08:00
-- if category query generates more than one column, get expected error
SELECT * FROM crosstab(
'SELECT rowid, rowdt, attribute, val FROM cth ORDER BY 1',
'SELECT DISTINCT rowdt, attribute FROM cth ORDER BY 2')
AS c(rowid text, rowdt timestamp, temperature int4, test_result text, test_startdate timestamp, volts float8);
2003-07-25 01:52:50 +08:00
ERROR: provided "categories" SQL must return 1 column of at least one row
2004-08-11 08:49:35 +08:00
-- if source query returns zero rows, get zero rows returned
SELECT * FROM crosstab(
'SELECT rowid, rowdt, attribute, val FROM cth WHERE false ORDER BY 1',
'SELECT DISTINCT attribute FROM cth ORDER BY 1')
AS c(rowid text, rowdt timestamp, temperature text, test_result text, test_startdate text, volts text);
rowid | rowdt | temperature | test_result | test_startdate | volts
-------+-------+-------------+-------------+----------------+-------
(0 rows)
-- if source query returns zero rows, get zero rows returned even if category query generates no rows
SELECT * FROM crosstab(
'SELECT rowid, rowdt, attribute, val FROM cth WHERE false ORDER BY 1',
'SELECT DISTINCT attribute FROM cth WHERE false ORDER BY 1')
AS c(rowid text, rowdt timestamp, temperature text, test_result text, test_startdate text, volts text);
rowid | rowdt | temperature | test_result | test_startdate | volts
-------+-------+-------------+-------------+----------------+-------
(0 rows)
2005-05-31 07:09:07 +08:00
-- check it works with a named result rowtype
create type my_crosstab_result as (
rowid text, rowdt timestamp,
temperature int4, test_result text, test_startdate timestamp, volts float8);
CREATE FUNCTION crosstab_named(text, text)
RETURNS setof my_crosstab_result
AS '$libdir/tablefunc','crosstab_hash'
2006-02-28 00:09:50 +08:00
LANGUAGE C STABLE STRICT;
2005-05-31 07:09:07 +08:00
SELECT * FROM crosstab_named(
'SELECT rowid, rowdt, attribute, val FROM cth ORDER BY 1',
'SELECT DISTINCT attribute FROM cth ORDER BY 1');
2007-11-10 13:00:41 +08:00
rowid | rowdt | temperature | test_result | test_startdate | volts
-------+--------------------------+-------------+-------------+--------------------------+---------
test1 | Sat Mar 01 00:00:00 2003 | 42 | PASS | | 2.6987
test2 | Sun Mar 02 00:00:00 2003 | 53 | FAIL | Sat Mar 01 00:00:00 2003 | 3.1234
| Thu Oct 25 00:00:00 2007 | 57 | PASS | Wed Oct 24 00:00:00 2007 | 1.41234
(3 rows)
2005-05-31 07:09:07 +08:00
-- check it works with OUT parameters
CREATE FUNCTION crosstab_out(text, text,
OUT rowid text, OUT rowdt timestamp,
OUT temperature int4, OUT test_result text,
OUT test_startdate timestamp, OUT volts float8)
RETURNS setof record
AS '$libdir/tablefunc','crosstab_hash'
2006-02-28 00:09:50 +08:00
LANGUAGE C STABLE STRICT;
2005-05-31 07:09:07 +08:00
SELECT * FROM crosstab_out(
'SELECT rowid, rowdt, attribute, val FROM cth ORDER BY 1',
'SELECT DISTINCT attribute FROM cth ORDER BY 1');
2007-11-10 13:00:41 +08:00
rowid | rowdt | temperature | test_result | test_startdate | volts
-------+--------------------------+-------------+-------------+--------------------------+---------
test1 | Sat Mar 01 00:00:00 2003 | 42 | PASS | | 2.6987
test2 | Sun Mar 02 00:00:00 2003 | 53 | FAIL | Sat Mar 01 00:00:00 2003 | 3.1234
| Thu Oct 25 00:00:00 2007 | 57 | PASS | Wed Oct 24 00:00:00 2007 | 1.41234
(3 rows)
2005-05-31 07:09:07 +08:00
Attached is an update to contrib/tablefunc. It implements a new hashed
version of crosstab. This fixes a major deficiency in real-world use of
the original version. Easiest to undestand with an illustration:
Data:
-------------------------------------------------------------------
select * from cth;
id | rowid | rowdt | attribute | val
----+-------+---------------------+----------------+---------------
1 | test1 | 2003-03-01 00:00:00 | temperature | 42
2 | test1 | 2003-03-01 00:00:00 | test_result | PASS
3 | test1 | 2003-03-01 00:00:00 | volts | 2.6987
4 | test2 | 2003-03-02 00:00:00 | temperature | 53
5 | test2 | 2003-03-02 00:00:00 | test_result | FAIL
6 | test2 | 2003-03-02 00:00:00 | test_startdate | 01 March 2003
7 | test2 | 2003-03-02 00:00:00 | volts | 3.1234
(7 rows)
Original crosstab:
-------------------------------------------------------------------
SELECT * FROM crosstab(
'SELECT rowid, attribute, val FROM cth ORDER BY 1,2',4)
AS c(rowid text, temperature text, test_result text, test_startdate
text, volts text);
rowid | temperature | test_result | test_startdate | volts
-------+-------------+-------------+----------------+--------
test1 | 42 | PASS | 2.6987 |
test2 | 53 | FAIL | 01 March 2003 | 3.1234
(2 rows)
Hashed crosstab:
-------------------------------------------------------------------
SELECT * FROM crosstab(
'SELECT rowid, attribute, val FROM cth ORDER BY 1',
'SELECT DISTINCT attribute FROM cth ORDER BY 1')
AS c(rowid text, temperature int4, test_result text, test_startdate
timestamp, volts float8);
rowid | temperature | test_result | test_startdate | volts
-------+-------------+-------------+---------------------+--------
test1 | 42 | PASS | | 2.6987
test2 | 53 | FAIL | 2003-03-01 00:00:00 | 3.1234
(2 rows)
Notice that the original crosstab slides data over to the left in the
result tuple when it encounters missing data. In order to work around
this you have to be make your source sql do all sorts of contortions
(cartesian join of distinct rowid with distinct attribute; left join
that back to the real source data). The new version avoids this by
building a hash table using a second distinct attribute query.
The new version also allows for "extra" columns (see the README) and
allows the result columns to be coerced into differing datatypes if they
are suitable (as shown above).
In testing a "real-world" data set (69 distinct rowid's, 27 distinct
categories/attributes, multiple missing data points) I saw about a
5-fold improvement in execution time (from about 2200 ms old, to 440 ms
new).
I left the original version intact because: 1) BC, 2) it is probably
slightly faster if you know that you have no missing attributes.
README and regression test adjustments included. If there are no
objections, please apply.
Joe Conway
2003-03-20 14:46:30 +08:00
--
-- connectby
--
2002-09-12 08:14:40 +08:00
-- test connectby with text based hierarchy
2003-07-27 11:51:59 +08:00
CREATE TABLE connectby_text(keyid text, parent_keyid text, pos int);
2002-09-12 08:14:40 +08:00
\copy connectby_text from 'data/connectby_text.data'
2003-07-27 11:51:59 +08:00
-- with branch, without orderby
2002-09-12 08:14:40 +08:00
SELECT * FROM connectby('connectby_text', 'keyid', 'parent_keyid', 'row2', 0, '~') AS t(keyid text, parent_keyid text, level int, branch text);
keyid | parent_keyid | level | branch
-------+--------------+-------+---------------------
row2 | | 0 | row2
row4 | row2 | 1 | row2~row4
row6 | row4 | 2 | row2~row4~row6
row8 | row6 | 3 | row2~row4~row6~row8
row5 | row2 | 1 | row2~row5
row9 | row5 | 2 | row2~row5~row9
(6 rows)
2003-07-27 11:51:59 +08:00
-- without branch, without orderby
2002-09-12 08:14:40 +08:00
SELECT * FROM connectby('connectby_text', 'keyid', 'parent_keyid', 'row2', 0) AS t(keyid text, parent_keyid text, level int);
keyid | parent_keyid | level
-------+--------------+-------
row2 | | 0
row4 | row2 | 1
row6 | row4 | 2
row8 | row6 | 3
row5 | row2 | 1
row9 | row5 | 2
(6 rows)
2003-07-27 11:51:59 +08:00
-- with branch, with orderby
SELECT * FROM connectby('connectby_text', 'keyid', 'parent_keyid', 'pos', 'row2', 0, '~') AS t(keyid text, parent_keyid text, level int, branch text, pos int) ORDER BY t.pos;
keyid | parent_keyid | level | branch | pos
-------+--------------+-------+---------------------+-----
row2 | | 0 | row2 | 1
row5 | row2 | 1 | row2~row5 | 2
row9 | row5 | 2 | row2~row5~row9 | 3
row4 | row2 | 1 | row2~row4 | 4
row6 | row4 | 2 | row2~row4~row6 | 5
row8 | row6 | 3 | row2~row4~row6~row8 | 6
(6 rows)
-- without branch, with orderby
SELECT * FROM connectby('connectby_text', 'keyid', 'parent_keyid', 'pos', 'row2', 0) AS t(keyid text, parent_keyid text, level int, pos int) ORDER BY t.pos;
keyid | parent_keyid | level | pos
-------+--------------+-------+-----
row2 | | 0 | 1
row5 | row2 | 1 | 2
row9 | row5 | 2 | 3
row4 | row2 | 1 | 4
row6 | row4 | 2 | 5
row8 | row6 | 3 | 6
(6 rows)
2002-09-12 08:14:40 +08:00
-- test connectby with int based hierarchy
CREATE TABLE connectby_int(keyid int, parent_keyid int);
\copy connectby_int from 'data/connectby_int.data'
-- with branch
SELECT * FROM connectby('connectby_int', 'keyid', 'parent_keyid', '2', 0, '~') AS t(keyid int, parent_keyid int, level int, branch text);
keyid | parent_keyid | level | branch
-------+--------------+-------+---------
2 | | 0 | 2
4 | 2 | 1 | 2~4
6 | 4 | 2 | 2~4~6
8 | 6 | 3 | 2~4~6~8
5 | 2 | 1 | 2~5
9 | 5 | 2 | 2~5~9
(6 rows)
-- without branch
SELECT * FROM connectby('connectby_int', 'keyid', 'parent_keyid', '2', 0) AS t(keyid int, parent_keyid int, level int);
keyid | parent_keyid | level
-------+--------------+-------
2 | | 0
4 | 2 | 1
6 | 4 | 2
8 | 6 | 3
5 | 2 | 1
9 | 5 | 2
(6 rows)
2002-10-04 01:15:36 +08:00
-- recursion detection
INSERT INTO connectby_int VALUES(10,9);
INSERT INTO connectby_int VALUES(11,10);
INSERT INTO connectby_int VALUES(9,11);
-- should fail due to infinite recursion
SELECT * FROM connectby('connectby_int', 'keyid', 'parent_keyid', '2', 0, '~') AS t(keyid int, parent_keyid int, level int, branch text);
ERROR: infinite recursion detected
-- infinite recursion failure avoided by depth limit
SELECT * FROM connectby('connectby_int', 'keyid', 'parent_keyid', '2', 4, '~') AS t(keyid int, parent_keyid int, level int, branch text);
keyid | parent_keyid | level | branch
-------+--------------+-------+-------------
2 | | 0 | 2
4 | 2 | 1 | 2~4
6 | 4 | 2 | 2~4~6
8 | 6 | 3 | 2~4~6~8
5 | 2 | 1 | 2~5
9 | 5 | 2 | 2~5~9
10 | 9 | 3 | 2~5~9~10
11 | 10 | 4 | 2~5~9~10~11
(8 rows)
-- test for falsely detected recursion
DROP TABLE connectby_int;
CREATE TABLE connectby_int(keyid int, parent_keyid int);
INSERT INTO connectby_int VALUES(11,NULL);
INSERT INTO connectby_int VALUES(10,11);
INSERT INTO connectby_int VALUES(111,11);
INSERT INTO connectby_int VALUES(1,111);
-- this should not fail due to recursion detection
SELECT * FROM connectby('connectby_int', 'keyid', 'parent_keyid', '11', 0, '-') AS t(keyid int, parent_keyid int, level int, branch text);
keyid | parent_keyid | level | branch
-------+--------------+-------+----------
11 | | 0 | 11
10 | 11 | 1 | 11-10
111 | 11 | 1 | 11-111
1 | 111 | 2 | 11-111-1
(4 rows)