From e2d088de032154eb2d2b91fdc94761662ff9b4ba Mon Sep 17 00:00:00 2001 From: Tatsuo Ishii Date: Fri, 24 Jun 2005 13:56:39 +0000 Subject: [PATCH] Allow direct conversion between EUC_JP and SJIS to improve performance. patches submitted by Atsushi Ogawa. --- .../euc_jp_and_sjis/euc_jp_and_sjis.c | 212 +++++++++++++++++- 1 file changed, 201 insertions(+), 11 deletions(-) diff --git a/src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c b/src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c index ddeffa57ac..37f6d2b1c5 100644 --- a/src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c +++ b/src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c,v 1.10 2005/06/10 16:43:56 ishii Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c,v 1.11 2005/06/24 13:56:39 ishii Exp $ * *------------------------------------------------------------------------- */ @@ -58,6 +58,8 @@ static void sjis2mic(unsigned char *sjis, unsigned char *p, int len); static void mic2sjis(unsigned char *mic, unsigned char *p, int len); static void euc_jp2mic(unsigned char *euc, unsigned char *p, int len); static void mic2euc_jp(unsigned char *mic, unsigned char *p, int len); +static void euc_jp2sjis(unsigned char *mic, unsigned char *p, int len); +static void sjis2euc_jp(unsigned char *mic, unsigned char *p, int len); Datum euc_jp_to_sjis(PG_FUNCTION_ARGS) @@ -65,16 +67,12 @@ euc_jp_to_sjis(PG_FUNCTION_ARGS) unsigned char *src = PG_GETARG_CSTRING(2); unsigned char *dest = PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); - unsigned char *buf; Assert(PG_GETARG_INT32(0) == PG_EUC_JP); Assert(PG_GETARG_INT32(1) == PG_SJIS); Assert(len >= 0); - buf = palloc(len * ENCODING_GROWTH_RATE); - euc_jp2mic(src, buf, len); - mic2sjis(buf, dest, strlen(buf)); - pfree(buf); + euc_jp2sjis(src, dest, len); PG_RETURN_VOID(); } @@ -85,16 +83,12 @@ sjis_to_euc_jp(PG_FUNCTION_ARGS) unsigned char *src = PG_GETARG_CSTRING(2); unsigned char *dest = PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); - unsigned char *buf; Assert(PG_GETARG_INT32(0) == PG_SJIS); Assert(PG_GETARG_INT32(1) == PG_EUC_JP); Assert(len >= 0); - buf = palloc(len * ENCODING_GROWTH_RATE); - sjis2mic(src, buf, len); - mic2euc_jp(buf, dest, strlen(buf)); - pfree(buf); + sjis2euc_jp(src, dest, len); PG_RETURN_VOID(); } @@ -454,3 +448,199 @@ mic2euc_jp(unsigned char *mic, unsigned char *p, int len) } *p = '\0'; } + +/* + * EUC_JP -> SJIS + */ +static void +euc_jp2sjis(unsigned char *euc, unsigned char *p, int len) +{ + int c1, + c2, + k; + unsigned char *euc_end = euc + len; + + while (euc_end >= euc && (c1 = *euc++)) + { + if(c1 < 0x80) + { + /* should be ASCII */ + *p++ = c1; + } + else if (c1 == SS2) + { + /* hankaku kana? */ + *p++ = *euc++; + } + else if (c1 == SS3) + { + /* JIS X0212 kanji? */ + c1 = *euc++; + c2 = *euc++; + k = c1 << 8 | c2; + if (k >= 0xf5a1) + { + /* UDC2 */ + c1 -= 0x54; + *p++ = ((c1 - 0xa1) >> 1) + ((c1 < 0xdf) ? 0x81 : 0xc1) + 0x74; + *p++ = c2 - ((c1 & 1) ? ((c2 < 0xe0) ? 0x61 : 0x60) : 2); + } + else + { + int i, k2; + + /* IBM kanji */ + for (i = 0;; i++) + { + k2 = ibmkanji[i].euc & 0xffff; + if (k2 == 0xffff) + { + *p++ = PGSJISALTCODE >> 8; + *p++ = PGSJISALTCODE & 0xff; + break; + } + if (k2 == k) + { + k = ibmkanji[i].sjis; + *p++ = k >> 8; + *p++ = k & 0xff; + break; + } + } + } + } + else + { + /* JIS X0208 kanji? */ + c2 = *euc++; + k = (c1 << 8) | (c2 & 0xff); + if (k >= 0xf5a1) + { + /* UDC1 */ + c1 -= 0x54; + *p++ = ((c1 - 0xa1) >> 1) + ((c1 < 0xdf) ? 0x81 : 0xc1) + 0x6f; + } + else + *p++ = ((c1 - 0xa1) >> 1) + ((c1 < 0xdf) ? 0x81 : 0xc1); + *p++ = c2 - ((c1 & 1) ? ((c2 < 0xe0) ? 0x61 : 0x60) : 2); + } + } + *p = '\0'; +} + +/* + * SJIS ---> EUC_JP + */ +static void +sjis2euc_jp(unsigned char *sjis, unsigned char *p, int len) +{ + int c1, + c2, + i, + k, + k2; + unsigned char *sjis_end = sjis + len; + + while (sjis_end >= sjis && (c1 = *sjis++)) + { + if(c1 < 0x80) + { + /* should be ASCII */ + *p++ = c1; + } + else if (c1 >= 0xa1 && c1 <= 0xdf) + { + /* JIS X0201 (1 byte kana) */ + *p++ = SS2; + *p++ = c1; + } + else + { + /* + * JIS X0208, X0212, user defined extended characters + */ + c2 = *sjis++; + k = (c1 << 8) + c2; + if (k >= 0xed40 && k < 0xf040) + { + /* NEC selection IBM kanji */ + for (i = 0;; i++) + { + k2 = ibmkanji[i].nec; + if (k2 == 0xffff) + break; + if (k2 == k) + { + k = ibmkanji[i].sjis; + c1 = (k >> 8) & 0xff; + c2 = k & 0xff; + } + } + } + + if (k < 0xeb3f) + { + /* JIS X0208 */ + *p++ = ((c1 & 0x3f) << 1) + 0x9f + (c2 > 0x9e); + *p++ = c2 + ((c2 > 0x9e) ? 2 : 0x60) + (c2 < 0x80); + } + else if ((k >= 0xeb40 && k < 0xf040) || (k >= 0xfc4c && k <= 0xfcfc)) + { + /* NEC selection IBM kanji - Other undecided justice */ + *p++ = PGEUCALTCODE >> 8; + *p++ = PGEUCALTCODE & 0xff; + } + else if (k >= 0xf040 && k < 0xf540) + { + /* + * UDC1 mapping to X0208 85 ku - 94 ku JIS code 0x7521 - + * 0x7e7e EUC 0xf5a1 - 0xfefe + */ + c1 -= 0x6f; + *p++ = ((c1 & 0x3f) << 1) + 0xf3 + (c2 > 0x9e); + *p++ = c2 + ((c2 > 0x9e) ? 2 : 0x60) + (c2 < 0x80); + } + else if (k >= 0xf540 && k < 0xfa40) + { + /* + * UDC2 mapping to X0212 85 ku - 94 ku JIS code 0x7521 - + * 0x7e7e EUC 0x8ff5a1 - 0x8ffefe + */ + *p++ = SS3; + c1 -= 0x74; + *p++ = ((c1 & 0x3f) << 1) + 0xf3 + (c2 > 0x9e); + *p++ = c2 + ((c2 > 0x9e) ? 2 : 0x60) + (c2 < 0x80); + } + else if (k >= 0xfa40) + { + /* + * mapping IBM kanji to X0208 and X0212 + * + */ + for (i = 0;; i++) + { + k2 = ibmkanji[i].sjis; + if (k2 == 0xffff) + break; + if (k2 == k) + { + k = ibmkanji[i].euc; + if (k >= 0x8f0000) + { + *p++ = SS3; + *p++ = 0x80 | ((k & 0xff00) >> 8); + *p++ = 0x80 | (k & 0xff); + } + else + { + *p++ = 0x80 | (k >> 8); + *p++ = 0x80 | (k & 0xff); + } + } + } + } + } + } + *p = '\0'; +} +