mirror of
https://git.postgresql.org/git/postgresql.git
synced 2025-01-12 18:34:36 +08:00
Perform conversion from Python unicode to string/bytes object via UTF-8.
We used to convert the unicode object directly to a string in the server encoding by calling Python's PyUnicode_AsEncodedString function. In other words, we used Python's routines to do the encoding. However, that has a few problems. First of all, it required keeping a mapping table of Python encoding names and PostgreSQL encodings. But the real killer was that Python doesn't support the EUC_TW and MULE_INTERNAL encodings at all. Instead, convert the Python unicode object to UTF-8, and use PostgreSQL's encoding conversion functions to convert from UTF-8 to the server encoding. We were already doing the same in the other direction in PLyUnicode_FromString, so this is more consistent, too. Note: This makes SQL_ASCII behave more leniently. We used to map SQL_ASCII to Python's 'ascii', which in Python means strict 7-bit ASCII only, so you got an error if the Python string contained anything but pure ASCII. You no longer get an error; you get the UTF-8 representation of the string instead. Backpatch to 9.0, where these conversions were introduced. Jan Urbański
This commit is contained in:
parent
bb49e3551b
commit
7fbe5aaaa8
@ -1,52 +0,0 @@
|
||||
--
|
||||
-- Unicode handling
|
||||
--
|
||||
CREATE TABLE unicode_test (
|
||||
testvalue text NOT NULL
|
||||
);
|
||||
CREATE FUNCTION unicode_return() RETURNS text AS E'
|
||||
return u"\\x80"
|
||||
' LANGUAGE plpythonu;
|
||||
CREATE FUNCTION unicode_trigger() RETURNS trigger AS E'
|
||||
TD["new"]["testvalue"] = u"\\x80"
|
||||
return "MODIFY"
|
||||
' LANGUAGE plpythonu;
|
||||
CREATE TRIGGER unicode_test_bi BEFORE INSERT ON unicode_test
|
||||
FOR EACH ROW EXECUTE PROCEDURE unicode_trigger();
|
||||
CREATE FUNCTION unicode_plan1() RETURNS text AS E'
|
||||
plan = plpy.prepare("SELECT $1 AS testvalue", ["text"])
|
||||
rv = plpy.execute(plan, [u"\\x80"], 1)
|
||||
return rv[0]["testvalue"]
|
||||
' LANGUAGE plpythonu;
|
||||
CREATE FUNCTION unicode_plan2() RETURNS text AS E'
|
||||
plan = plpy.prepare("SELECT $1 || $2 AS testvalue", ["text", u"text"])
|
||||
rv = plpy.execute(plan, ["foo", "bar"], 1)
|
||||
return rv[0]["testvalue"]
|
||||
' LANGUAGE plpythonu;
|
||||
SELECT unicode_return();
|
||||
ERROR: PL/Python: could not convert Python Unicode object to PostgreSQL server encoding
|
||||
DETAIL: UnicodeError: ASCII encoding error: ordinal not in range(128)
|
||||
CONTEXT: while creating return value
|
||||
PL/Python function "unicode_return"
|
||||
INSERT INTO unicode_test (testvalue) VALUES ('test');
|
||||
ERROR: PL/Python: could not convert Python Unicode object to PostgreSQL server encoding
|
||||
DETAIL: UnicodeError: ASCII encoding error: ordinal not in range(128)
|
||||
CONTEXT: while modifying trigger row
|
||||
PL/Python function "unicode_trigger"
|
||||
SELECT * FROM unicode_test;
|
||||
testvalue
|
||||
-----------
|
||||
(0 rows)
|
||||
|
||||
SELECT unicode_plan1();
|
||||
WARNING: PL/Python: plpy.Error: unrecognized error in PLy_spi_execute_plan
|
||||
CONTEXT: PL/Python function "unicode_plan1"
|
||||
ERROR: PL/Python: could not convert Python Unicode object to PostgreSQL server encoding
|
||||
DETAIL: UnicodeError: ASCII encoding error: ordinal not in range(128)
|
||||
CONTEXT: PL/Python function "unicode_plan1"
|
||||
SELECT unicode_plan2();
|
||||
unicode_plan2
|
||||
---------------
|
||||
foobar
|
||||
(1 row)
|
||||
|
@ -1,52 +0,0 @@
|
||||
--
|
||||
-- Unicode handling
|
||||
--
|
||||
CREATE TABLE unicode_test (
|
||||
testvalue text NOT NULL
|
||||
);
|
||||
CREATE FUNCTION unicode_return() RETURNS text AS E'
|
||||
return u"\\x80"
|
||||
' LANGUAGE plpythonu;
|
||||
CREATE FUNCTION unicode_trigger() RETURNS trigger AS E'
|
||||
TD["new"]["testvalue"] = u"\\x80"
|
||||
return "MODIFY"
|
||||
' LANGUAGE plpythonu;
|
||||
CREATE TRIGGER unicode_test_bi BEFORE INSERT ON unicode_test
|
||||
FOR EACH ROW EXECUTE PROCEDURE unicode_trigger();
|
||||
CREATE FUNCTION unicode_plan1() RETURNS text AS E'
|
||||
plan = plpy.prepare("SELECT $1 AS testvalue", ["text"])
|
||||
rv = plpy.execute(plan, [u"\\x80"], 1)
|
||||
return rv[0]["testvalue"]
|
||||
' LANGUAGE plpythonu;
|
||||
CREATE FUNCTION unicode_plan2() RETURNS text AS E'
|
||||
plan = plpy.prepare("SELECT $1 || $2 AS testvalue", ["text", u"text"])
|
||||
rv = plpy.execute(plan, ["foo", "bar"], 1)
|
||||
return rv[0]["testvalue"]
|
||||
' LANGUAGE plpythonu;
|
||||
SELECT unicode_return();
|
||||
ERROR: PL/Python: could not convert Python Unicode object to PostgreSQL server encoding
|
||||
DETAIL: UnicodeEncodeError: 'ascii' codec can't encode character u'\x80' in position 0: ordinal not in range(128)
|
||||
CONTEXT: while creating return value
|
||||
PL/Python function "unicode_return"
|
||||
INSERT INTO unicode_test (testvalue) VALUES ('test');
|
||||
ERROR: PL/Python: could not convert Python Unicode object to PostgreSQL server encoding
|
||||
DETAIL: UnicodeEncodeError: 'ascii' codec can't encode character u'\x80' in position 0: ordinal not in range(128)
|
||||
CONTEXT: while modifying trigger row
|
||||
PL/Python function "unicode_trigger"
|
||||
SELECT * FROM unicode_test;
|
||||
testvalue
|
||||
-----------
|
||||
(0 rows)
|
||||
|
||||
SELECT unicode_plan1();
|
||||
WARNING: PL/Python: plpy.Error: unrecognized error in PLy_spi_execute_plan
|
||||
CONTEXT: PL/Python function "unicode_plan1"
|
||||
ERROR: PL/Python: could not convert Python Unicode object to PostgreSQL server encoding
|
||||
DETAIL: UnicodeEncodeError: 'ascii' codec can't encode character u'\x80' in position 0: ordinal not in range(128)
|
||||
CONTEXT: PL/Python function "unicode_plan1"
|
||||
SELECT unicode_plan2();
|
||||
unicode_plan2
|
||||
---------------
|
||||
foobar
|
||||
(1 row)
|
||||
|
@ -3686,66 +3686,56 @@ PLy_free(void *ptr)
|
||||
static PyObject *
|
||||
PLyUnicode_Bytes(PyObject *unicode)
|
||||
{
|
||||
PyObject *rv;
|
||||
const char *serverenc;
|
||||
PyObject *bytes, *rv;
|
||||
char *utf8string, *encoded;
|
||||
|
||||
/*
|
||||
* Map PostgreSQL encoding to a Python encoding name.
|
||||
*/
|
||||
switch (GetDatabaseEncoding())
|
||||
{
|
||||
case PG_SQL_ASCII:
|
||||
/*
|
||||
* Mapping SQL_ASCII to Python's 'ascii' is a bit bogus. Python's
|
||||
* 'ascii' means true 7-bit only ASCII, while PostgreSQL's
|
||||
* SQL_ASCII means that anything is allowed, and the system doesn't
|
||||
* try to interpret the bytes in any way. But not sure what else
|
||||
* to do, and we haven't heard any complaints...
|
||||
*/
|
||||
serverenc = "ascii";
|
||||
break;
|
||||
case PG_WIN1250:
|
||||
serverenc = "cp1250";
|
||||
break;
|
||||
case PG_WIN1251:
|
||||
serverenc = "cp1251";
|
||||
break;
|
||||
case PG_WIN1252:
|
||||
serverenc = "cp1252";
|
||||
break;
|
||||
case PG_WIN1253:
|
||||
serverenc = "cp1253";
|
||||
break;
|
||||
case PG_WIN1254:
|
||||
serverenc = "cp1254";
|
||||
break;
|
||||
case PG_WIN1255:
|
||||
serverenc = "cp1255";
|
||||
break;
|
||||
case PG_WIN1256:
|
||||
serverenc = "cp1256";
|
||||
break;
|
||||
case PG_WIN1257:
|
||||
serverenc = "cp1257";
|
||||
break;
|
||||
case PG_WIN1258:
|
||||
serverenc = "cp1258";
|
||||
break;
|
||||
case PG_WIN866:
|
||||
serverenc = "cp866";
|
||||
break;
|
||||
case PG_WIN874:
|
||||
serverenc = "cp874";
|
||||
break;
|
||||
default:
|
||||
/* Other encodings have the same name in Python. */
|
||||
serverenc = GetDatabaseEncodingName();
|
||||
break;
|
||||
/* First encode the Python unicode object with UTF-8. */
|
||||
bytes = PyUnicode_AsUTF8String(unicode);
|
||||
if (bytes == NULL)
|
||||
PLy_elog(ERROR, "could not convert Python Unicode object to bytes");
|
||||
|
||||
utf8string = PyBytes_AsString(bytes);
|
||||
if (utf8string == NULL) {
|
||||
Py_DECREF(bytes);
|
||||
PLy_elog(ERROR, "could not extract bytes from encoded string");
|
||||
}
|
||||
|
||||
rv = PyUnicode_AsEncodedString(unicode, serverenc, "strict");
|
||||
if (rv == NULL)
|
||||
PLy_elog(ERROR, "could not convert Python Unicode object to PostgreSQL server encoding");
|
||||
/*
|
||||
* Then convert to server encoding if necessary.
|
||||
*
|
||||
* PyUnicode_AsEncodedString could be used to encode the object directly
|
||||
* in the server encoding, but Python doesn't support all the encodings
|
||||
* that PostgreSQL does (EUC_TW and MULE_INTERNAL). UTF-8 is used as an
|
||||
* intermediary in PLyUnicode_FromString as well.
|
||||
*/
|
||||
if (GetDatabaseEncoding() != PG_UTF8)
|
||||
{
|
||||
PG_TRY();
|
||||
{
|
||||
encoded = (char *) pg_do_encoding_conversion(
|
||||
(unsigned char *) utf8string,
|
||||
strlen(utf8string),
|
||||
PG_UTF8,
|
||||
GetDatabaseEncoding());
|
||||
}
|
||||
PG_CATCH();
|
||||
{
|
||||
Py_DECREF(bytes);
|
||||
PG_RE_THROW();
|
||||
}
|
||||
PG_END_TRY();
|
||||
}
|
||||
else
|
||||
encoded = utf8string;
|
||||
|
||||
/* finally, build a bytes object in the server encoding */
|
||||
rv = PyBytes_FromStringAndSize(encoded, strlen(encoded));
|
||||
|
||||
/* if pg_do_encoding_conversion allocated memory, free it now */
|
||||
if (utf8string != encoded)
|
||||
pfree(encoded);
|
||||
|
||||
Py_DECREF(bytes);
|
||||
return rv;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user