Allow direct conversion between EUC_JP and SJIS to improve performance.

Patches submitted by Atsushi Ogawa.
This commit is contained in:
Tatsuo Ishii 2005-06-24 13:56:39 +00:00
parent 1fa87fa780
commit e2d088de03

View File

@ -6,7 +6,7 @@
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c,v 1.10 2005/06/10 16:43:56 ishii Exp $ * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c,v 1.11 2005/06/24 13:56:39 ishii Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
@ -58,6 +58,8 @@ static void sjis2mic(unsigned char *sjis, unsigned char *p, int len);
static void mic2sjis(unsigned char *mic, unsigned char *p, int len); static void mic2sjis(unsigned char *mic, unsigned char *p, int len);
static void euc_jp2mic(unsigned char *euc, unsigned char *p, int len); static void euc_jp2mic(unsigned char *euc, unsigned char *p, int len);
static void mic2euc_jp(unsigned char *mic, unsigned char *p, int len); static void mic2euc_jp(unsigned char *mic, unsigned char *p, int len);
static void euc_jp2sjis(unsigned char *mic, unsigned char *p, int len);
static void sjis2euc_jp(unsigned char *mic, unsigned char *p, int len);
Datum Datum
euc_jp_to_sjis(PG_FUNCTION_ARGS) euc_jp_to_sjis(PG_FUNCTION_ARGS)
@ -65,16 +67,12 @@ euc_jp_to_sjis(PG_FUNCTION_ARGS)
unsigned char *src = PG_GETARG_CSTRING(2); unsigned char *src = PG_GETARG_CSTRING(2);
unsigned char *dest = PG_GETARG_CSTRING(3); unsigned char *dest = PG_GETARG_CSTRING(3);
int len = PG_GETARG_INT32(4); int len = PG_GETARG_INT32(4);
unsigned char *buf;
Assert(PG_GETARG_INT32(0) == PG_EUC_JP); Assert(PG_GETARG_INT32(0) == PG_EUC_JP);
Assert(PG_GETARG_INT32(1) == PG_SJIS); Assert(PG_GETARG_INT32(1) == PG_SJIS);
Assert(len >= 0); Assert(len >= 0);
buf = palloc(len * ENCODING_GROWTH_RATE); euc_jp2sjis(src, dest, len);
euc_jp2mic(src, buf, len);
mic2sjis(buf, dest, strlen(buf));
pfree(buf);
PG_RETURN_VOID(); PG_RETURN_VOID();
} }
@ -85,16 +83,12 @@ sjis_to_euc_jp(PG_FUNCTION_ARGS)
unsigned char *src = PG_GETARG_CSTRING(2); unsigned char *src = PG_GETARG_CSTRING(2);
unsigned char *dest = PG_GETARG_CSTRING(3); unsigned char *dest = PG_GETARG_CSTRING(3);
int len = PG_GETARG_INT32(4); int len = PG_GETARG_INT32(4);
unsigned char *buf;
Assert(PG_GETARG_INT32(0) == PG_SJIS); Assert(PG_GETARG_INT32(0) == PG_SJIS);
Assert(PG_GETARG_INT32(1) == PG_EUC_JP); Assert(PG_GETARG_INT32(1) == PG_EUC_JP);
Assert(len >= 0); Assert(len >= 0);
buf = palloc(len * ENCODING_GROWTH_RATE); sjis2euc_jp(src, dest, len);
sjis2mic(src, buf, len);
mic2euc_jp(buf, dest, strlen(buf));
pfree(buf);
PG_RETURN_VOID(); PG_RETURN_VOID();
} }
@ -454,3 +448,199 @@ mic2euc_jp(unsigned char *mic, unsigned char *p, int len)
} }
*p = '\0'; *p = '\0';
} }
/*
 * EUC_JP -> SJIS
 *
 * Converts the EUC_JP byte string at euc directly into SJIS, writing the
 * result to p followed by a terminating NUL.  Conversion stops after len
 * input bytes or at the first NUL byte, whichever comes first.
 *
 * The output is written through p without any bound check, so the caller
 * must supply a sufficiently large buffer.
 *
 * A multibyte character whose trailing bytes would lie beyond len is
 * dropped instead of being completed with bytes read from past the end of
 * the input.
 */
static void
euc_jp2sjis(unsigned char *euc, unsigned char *p, int len)
{
	int			c1,
				c2,
				k;
	unsigned char *euc_end = euc + len;

	/* strict bound: the byte at euc[len] is never examined */
	while (euc < euc_end && (c1 = *euc++))
	{
		if (c1 < 0x80)
		{
			/* should be ASCII */
			*p++ = c1;
		}
		else if (c1 == SS2)
		{
			/* hankaku kana?  The byte after SS2 is copied unchanged. */
			if (euc >= euc_end)
				break;			/* truncated character */
			*p++ = *euc++;
		}
		else if (c1 == SS3)
		{
			/* JIS X0212 kanji?  Two more bytes follow SS3. */
			if (euc_end - euc < 2)
				break;			/* truncated character */
			c1 = *euc++;
			c2 = *euc++;
			k = c1 << 8 | c2;
			if (k >= 0xf5a1)
			{
				/* UDC2 */
				c1 -= 0x54;
				*p++ = ((c1 - 0xa1) >> 1) + ((c1 < 0xdf) ? 0x81 : 0xc1) + 0x74;

				/*
				 * Odd lead byte: trail byte lands in 0x40-0x7e / 0x80-0x9e
				 * (0x7f is skipped); even lead byte: c2 - 2.
				 */
				*p++ = c2 - ((c1 & 1) ? ((c2 < 0xe0) ? 0x61 : 0x60) : 2);
			}
			else
			{
				int			i,
							k2;

				/*
				 * IBM kanji: scan ibmkanji[] for an entry whose low 16 bits
				 * of .euc equal k.  Reaching the 0xffff sentinel means no
				 * entry matched, and PGSJISALTCODE is emitted instead.
				 */
				for (i = 0;; i++)
				{
					k2 = ibmkanji[i].euc & 0xffff;
					if (k2 == 0xffff)
					{
						*p++ = PGSJISALTCODE >> 8;
						*p++ = PGSJISALTCODE & 0xff;
						break;
					}
					if (k2 == k)
					{
						k = ibmkanji[i].sjis;
						*p++ = k >> 8;
						*p++ = k & 0xff;
						break;
					}
				}
			}
		}
		else
		{
			/* JIS X0208 kanji?  One more byte follows the lead byte. */
			if (euc >= euc_end)
				break;			/* truncated character */
			c2 = *euc++;
			k = (c1 << 8) | (c2 & 0xff);
			if (k >= 0xf5a1)
			{
				/* UDC1 */
				c1 -= 0x54;
				*p++ = ((c1 - 0xa1) >> 1) + ((c1 < 0xdf) ? 0x81 : 0xc1) + 0x6f;
			}
			else
				*p++ = ((c1 - 0xa1) >> 1) + ((c1 < 0xdf) ? 0x81 : 0xc1);

			/* same trail-byte mapping as for UDC2 above */
			*p++ = c2 - ((c1 & 1) ? ((c2 < 0xe0) ? 0x61 : 0x60) : 2);
		}
	}
	*p = '\0';
}
/*
 * SJIS ---> EUC_JP
 *
 * Converts the SJIS byte string at sjis directly into EUC_JP, writing the
 * result to p followed by a terminating NUL.  Conversion stops after len
 * input bytes or at the first NUL byte, whichever comes first.
 *
 * The output is written through p without any bound check, so the caller
 * must supply a sufficiently large buffer.
 *
 * A two-byte character whose trail byte would lie beyond len is dropped
 * instead of being completed with a byte read from past the end of the
 * input.  Two-byte codes that fall into none of the ranges handled below,
 * or that match no ibmkanji[] entry in the final lookup, produce no output.
 */
static void
sjis2euc_jp(unsigned char *sjis, unsigned char *p, int len)
{
	int			c1,
				c2,
				i,
				k,
				k2;
	unsigned char *sjis_end = sjis + len;

	/* strict bound: the byte at sjis[len] is never examined */
	while (sjis < sjis_end && (c1 = *sjis++))
	{
		if (c1 < 0x80)
		{
			/* should be ASCII */
			*p++ = c1;
		}
		else if (c1 >= 0xa1 && c1 <= 0xdf)
		{
			/* JIS X0201 (1 byte kana): prefix with SS2 */
			*p++ = SS2;
			*p++ = c1;
		}
		else
		{
			/*
			 * JIS X0208, X0212, user defined extended characters
			 */
			if (sjis >= sjis_end)
				break;			/* truncated character */
			c2 = *sjis++;
			k = (c1 << 8) + c2;
			if (k >= 0xed40 && k < 0xf040)
			{
				/*
				 * NEC selection IBM kanji: if ibmkanji[] has a matching
				 * .nec entry, replace the code by that entry's .sjis value
				 * so the range checks below see the replacement.  Without a
				 * match k is left unchanged.
				 *
				 * NOTE(review): the scan does not stop at the first match
				 * but always runs to the 0xffff sentinel; presumably
				 * harmless if table entries are unique -- confirm against
				 * the table before adding an early exit.
				 */
				for (i = 0;; i++)
				{
					k2 = ibmkanji[i].nec;
					if (k2 == 0xffff)
						break;
					if (k2 == k)
					{
						k = ibmkanji[i].sjis;
						c1 = (k >> 8) & 0xff;
						c2 = k & 0xff;
					}
				}
			}

			if (k < 0xeb3f)
			{
				/* JIS X0208 */
				*p++ = ((c1 & 0x3f) << 1) + 0x9f + (c2 > 0x9e);
				*p++ = c2 + ((c2 > 0x9e) ? 2 : 0x60) + (c2 < 0x80);
			}
			else if ((k >= 0xeb40 && k < 0xf040) || (k >= 0xfc4c && k <= 0xfcfc))
			{
				/*
				 * Unmapped NEC selection IBM kanji and other unassigned
				 * codes: emit PGEUCALTCODE in their place.
				 */
				*p++ = PGEUCALTCODE >> 8;
				*p++ = PGEUCALTCODE & 0xff;
			}
			else if (k >= 0xf040 && k < 0xf540)
			{
				/*
				 * UDC1 mapping to X0208 85 ku - 94 ku JIS code 0x7521 -
				 * 0x7e7e EUC 0xf5a1 - 0xfefe
				 */
				c1 -= 0x6f;
				*p++ = ((c1 & 0x3f) << 1) + 0xf3 + (c2 > 0x9e);
				*p++ = c2 + ((c2 > 0x9e) ? 2 : 0x60) + (c2 < 0x80);
			}
			else if (k >= 0xf540 && k < 0xfa40)
			{
				/*
				 * UDC2 mapping to X0212 85 ku - 94 ku JIS code 0x7521 -
				 * 0x7e7e EUC 0x8ff5a1 - 0x8ffefe
				 */
				*p++ = SS3;
				c1 -= 0x74;
				*p++ = ((c1 & 0x3f) << 1) + 0xf3 + (c2 > 0x9e);
				*p++ = c2 + ((c2 > 0x9e) ? 2 : 0x60) + (c2 < 0x80);
			}
			else if (k >= 0xfa40)
			{
				/*
				 * mapping IBM kanji to X0208 and X0212
				 *
				 * A matching entry's .euc value at or above 0x8f0000 is
				 * written as SS3 plus two bytes; any other value is written
				 * as two bytes.  The high bit is forced on in every byte
				 * taken from the table.
				 *
				 * NOTE(review): as above, the scan continues to the
				 * sentinel after a match -- confirm table uniqueness
				 * before adding an early exit.
				 */
				for (i = 0;; i++)
				{
					k2 = ibmkanji[i].sjis;
					if (k2 == 0xffff)
						break;
					if (k2 == k)
					{
						k = ibmkanji[i].euc;
						if (k >= 0x8f0000)
						{
							*p++ = SS3;
							*p++ = 0x80 | ((k & 0xff00) >> 8);
							*p++ = 0x80 | (k & 0xff);
						}
						else
						{
							*p++ = 0x80 | (k >> 8);
							*p++ = 0x80 | (k & 0xff);
						}
					}
				}
			}
		}
	}
	*p = '\0';
}