gcc/libjava/gnu/gcj/convert/Output_UTF8.java

/* Copyright (C) 1999, 2000  Free Software Foundation

   This file is part of libgcj.

This software is copyrighted work licensed under the terms of the
Libgcj License.  Please consult the file "LIBGCJ_LICENSE" for
details.  */

package gnu.gcj.convert;

/**
 * Convert Unicode to UTF8.
 * @author Per Bothner <bothner@cygnus.com>
 * @date March 1999.
 */

public class Output_UTF8 extends UnicodeToBytes
{
  /** Returns the name of this encoding, "UTF8". */
  public String getName() { return "UTF8"; }

  /** True if a surrogate pair should be emitted as a single UTF8 sequence.
   * Otherwise, a surrogate pair is treated as two separate characters.
   * Also, '\0' is emitted as {0} if true, and as {0xC0,0x80} if false. */
  public boolean standardUTF8 = true;

  // Saves the previous char if it was a high-surrogate.
  // Zero means that no high surrogate is pending.
  char hi_part;
  // Value of incomplete character, i.e. the code point whose
  // continuation bytes have not all been written yet.
  int value;
  // Number of continuation bytes still to emit.
  int bytes_todo;

  /**
   * Encode characters from <code>inbuffer</code> as UTF8 into the output
   * buffer (<code>buf</code> / <code>count</code>, which are not declared
   * in this class; presumably inherited from UnicodeToBytes).
   *
   * Encoding stops when the output buffer is full, or when all input has
   * been consumed and no continuation bytes are pending.  State that does
   * not fit (pending continuation bytes, or a high surrogate waiting for
   * its low surrogate) is kept in the instance fields and picked up by
   * the next call.
   *
   * @param inbuffer the characters to encode
   * @param inpos index in <code>inbuffer</code> of the first character
   * @param inlength number of characters available starting at
   *        <code>inpos</code>
   * @return the number of characters consumed from <code>inbuffer</code>.
   *         A trailing high surrogate counts as consumed even though
   *         nothing has been written for it yet.
   */
  public int write (char[] inbuffer, int inpos, int inlength)
  {
    int start_pos = inpos;
    int avail = buf.length - count;
    for (;;)
      {
        if (avail == 0 || (inlength == 0 && bytes_todo == 0))
          break;
        // The algorithm is made more complicated because we want to write
        // at least one byte in the output buffer, if there is room for
        // that byte, and at least one input character is available.
        // This makes the code more robust, since client code will
        // always "make progress", even in the complicated cases,
        // where the output buffer only has room for only *part* of a
        // multi-byte sequence, or the input char buffer only has half
        // of a surrogate pair (when standardUTF8 is set), or both.

        // Handle continuation characters we did not have room for before.
        if (bytes_todo > 0)
          {
            do
              {
                bytes_todo--;
                buf[count++] = (byte)
                  (((value >> (bytes_todo * 6)) & 0x3F) | 0x80);
                avail--;
              }
            while (bytes_todo > 0 && avail > 0);
            continue;
          }

        char ch = inbuffer[inpos++];
        inlength--;

        if (hi_part != 0 && (ch <= 0xDBFF || ch > 0xDFFF))
          {
            // If the previous character was a high surrogate, and we
            // don't now have a low surrogate, we print the high
            // surrogate as an isolated character.  The current
            // character is pushed back and handled on a later
            // iteration, once hi_part has been flushed.
            --inpos;
            ++inlength;
            buf[count++] = (byte) (0xE0 | (hi_part >> 12));
            value = hi_part;
            hi_part = 0;
            avail--;
            bytes_todo = 2;
          }
        else if (hi_part == 0 && ch >= 0xDC00 && ch <= 0xDFFF)
          {
            // A low surrogate that was not preceded by a high surrogate
            // is printed as an isolated character.  It must be emitted
            // from `ch' (hi_part is zero here) and must stay consumed;
            // pushing it back would re-read the same character forever
            // without ever making progress on the input.
            buf[count++] = (byte) (0xE0 | (ch >> 12));
            value = ch;
            avail--;
            bytes_todo = 2;
          }
        else if (ch < 128 && (ch != 0 || standardUTF8))
          {
            // Single byte.  '\0' only takes this path in standard mode.
            avail--;
            buf[count++] = (byte) ch;
          }
        else if (ch <= 0x07FF)
          {
            // Two-byte sequence (also '\0' when standardUTF8 is false).
            buf[count++] = (byte) (0xC0 | (ch >> 6));
            avail--;
            value = ch;
            bytes_todo = 1;
          }
        else if (ch >= 0xD800 && ch <= 0xDFFF && standardUTF8)
          {
            if (ch <= 0xDBFF)  // High surrogates
              {
                // Just save the high surrogate until the next
                // character comes along.
                hi_part = ch;
              }
            else // Low surrogates
              {
                // Only reached with a pending high surrogate; combine
                // the pair into one code point and start a four-byte
                // sequence.
                value = (hi_part - 0xD800) * 0x400 + (ch - 0xDC00) + 0x10000;
                buf[count++] = (byte) (0xF0 | (value >> 18));
                // Account for the lead byte just stored, so that the
                // free-space count stays in step with `count'.
                avail--;
                bytes_todo = 3;
                hi_part = 0;
              }
          }
        else
          {
            // Three-byte sequence.
            buf[count++] = (byte) (0xE0 | (ch >> 12));
            value = ch;
            avail--;
            bytes_todo = 2;
          }
      }
    return inpos - start_pos;
  }
}
Input_UTF8.java (read): Fixed handling of surrogate characters. * gnu/gcj/convert/Input_UTF8.java (read): Fixed handling of surrogate characters. * gnu/gcj/convert/Output_UTF8.java (standardUTF8): Default to true. (write): Correct handling of surrogate characters. From-SVN: r35569 2000-08-09 01:35:32 +08:00			`/* Copyright (C) 1999, 2000 Free Software Foundation`
Initial revision From-SVN: r26263 1999-04-07 22:42:40 +08:00
			`This file is part of libgcj.`

			`This software is copyrighted work licensed under the terms of the`
			`Libgcj License. Please consult the file "LIBGCJ_LICENSE" for`
			`details. */`

			`package gnu.gcj.convert;`

UnicodeToBytes.java (write(String,int,int,char[])): New overloading, allows greater efficiency. � * gnu/gcj/convert/UnicodeToBytes.java (write(String,int,int,char[])): New overloading, allows greater efficiency. * gnu/gcj/convert/Output_8859_1.java (write(String,int,int,char[])): New overloading (for efficiency - avoids copying). * gnu/gcj/convert/Output_UTF8.java: Fix typo: 0xC0 -> 0c3F. * gnu/gcj/convert/Input_UTF8.java: Fix typos in bit masks. From-SVN: r26493 1999-04-17 01:21:59 +08:00			`/**`
			`* Convert Unicode to UTF8.`
			`* @author Per Bothner <bothner@cygnus.com>`
			`* @date Match 1999.`
			`*/`

Initial revision From-SVN: r26263 1999-04-07 22:42:40 +08:00			`public class Output_UTF8 extends UnicodeToBytes`
			`{`
			`public String getName() { return "UTF8"; }`

			`/** True if a surrogate pair should be emitted as a single UTF8 sequence.`
			`* Otherwise, a surrogate pair is treated as two separate characters.`
			`* Also, '\0' is emitted as {0} if true, and as {0xC0,0x80} if false. */`
Input_UTF8.java (read): Fixed handling of surrogate characters. * gnu/gcj/convert/Input_UTF8.java (read): Fixed handling of surrogate characters. * gnu/gcj/convert/Output_UTF8.java (standardUTF8): Default to true. (write): Correct handling of surrogate characters. From-SVN: r35569 2000-08-09 01:35:32 +08:00			`public boolean standardUTF8 = true;`
Initial revision From-SVN: r26263 1999-04-07 22:42:40 +08:00
			`// Saves the previous char if it was a high-surrogate.`
			`char hi_part;`
Output_UTF8.java (write): Don't exit loop unless both `inlength' and `bytes_todo' are 0. * gnu/gcj/convert/Output_UTF8.java (write): Don't exit loop unless both `inlength' and `bytes_todo' are 0. Simplified 2-byte case. From-SVN: r29570 1999-09-22 07:20:43 +08:00			`// Value of incomplete character.`
Initial revision From-SVN: r26263 1999-04-07 22:42:40 +08:00			`int value;`
			`// Number of continuation bytes still to emit.`
			`int bytes_todo;`

			`public int write (char[] inbuffer, int inpos, int inlength)`
			`{`
			`int start_pos = inpos;`
			`int avail = buf.length - count;`
			`for (;;)`
			`{`
Output_UTF8.java (write): Don't exit loop unless both `inlength' and `bytes_todo' are 0. * gnu/gcj/convert/Output_UTF8.java (write): Don't exit loop unless both `inlength' and `bytes_todo' are 0. Simplified 2-byte case. From-SVN: r29570 1999-09-22 07:20:43 +08:00			`if (avail == 0 \|\| (inlength == 0 && bytes_todo == 0))`
Initial revision From-SVN: r26263 1999-04-07 22:42:40 +08:00			`break;`
Output_UTF8.java (write): Don't exit loop unless both `inlength' and `bytes_todo' are 0. * gnu/gcj/convert/Output_UTF8.java (write): Don't exit loop unless both `inlength' and `bytes_todo' are 0. Simplified 2-byte case. From-SVN: r29570 1999-09-22 07:20:43 +08:00			`// The algorithm is made more complicated because we want to write`
Initial revision From-SVN: r26263 1999-04-07 22:42:40 +08:00			`// at least one byte in the output buffer, if there is room for`
			`// that byte, and at least one input character is available.`
			`// This makes the code more robust, since client code will`
			`// always "make progress", even in the complicated cases,`
			`// where the output buffer only has room for only part of a`
			`// multi-byte sequence, or the input char buffer only has half`
			`// of a surrogate pair (when standardUTF8 is set), or both.`

			`// Handle continuation characters we did not have room for before.`
			`if (bytes_todo > 0)`
			`{`
			`do`
			`{`
			`bytes_todo--;`
			`buf[count++] = (byte)`
UnicodeToBytes.java (write(String,int,int,char[])): New overloading, allows greater efficiency. � * gnu/gcj/convert/UnicodeToBytes.java (write(String,int,int,char[])): New overloading, allows greater efficiency. * gnu/gcj/convert/Output_8859_1.java (write(String,int,int,char[])): New overloading (for efficiency - avoids copying). * gnu/gcj/convert/Output_UTF8.java: Fix typo: 0xC0 -> 0c3F. * gnu/gcj/convert/Input_UTF8.java: Fix typos in bit masks. From-SVN: r26493 1999-04-17 01:21:59 +08:00			`(((value >> (bytes_todo * 6)) & 0x3F) \| 0x80);`
Initial revision From-SVN: r26263 1999-04-07 22:42:40 +08:00			`avail--;`
			`}`
			`while (bytes_todo > 0 && avail > 0);`
			`continue;`
			`}`
Input_UTF8.java (read): Fixed handling of surrogate characters. * gnu/gcj/convert/Input_UTF8.java (read): Fixed handling of surrogate characters. * gnu/gcj/convert/Output_UTF8.java (standardUTF8): Default to true. (write): Correct handling of surrogate characters. From-SVN: r35569 2000-08-09 01:35:32 +08:00
Initial revision From-SVN: r26263 1999-04-07 22:42:40 +08:00			`char ch = inbuffer[inpos++];`
			`inlength--;`
Input_UTF8.java (read): Fixed handling of surrogate characters. * gnu/gcj/convert/Input_UTF8.java (read): Fixed handling of surrogate characters. * gnu/gcj/convert/Output_UTF8.java (standardUTF8): Default to true. (write): Correct handling of surrogate characters. From-SVN: r35569 2000-08-09 01:35:32 +08:00
			`if ((hi_part != 0 && (ch <= 0xDBFF \|\| ch > 0xDFFF))`
			`\|\| (hi_part == 0 && ch >= 0xDC00 && ch <= 0xDFFF))`
			`{`
			`// If the previous character was a high surrogate, and we`
			`// don't now have a low surrogate, we print the high`
			`// surrogate as an isolated character. If this character`
			`// is a low surrogate and we didn't previously see a high`
			`// surrogate, we do the same thing.`
			`--inpos;`
			`++inlength;`
			`buf[count++] = (byte) (0xE0 \| (hi_part >> 12));`
			`value = hi_part;`
			`hi_part = 0;`
			`avail--;`
			`bytes_todo = 2;`
			`}`
			`else if (ch < 128 && (ch != 0 \|\| standardUTF8))`
Initial revision From-SVN: r26263 1999-04-07 22:42:40 +08:00			`{`
			`avail--;`
			`buf[count++] = (byte) ch;`
			`}`
			`else if (ch <= 0x07FF)`
			`{`
			`buf[count++] = (byte) (0xC0 \| (ch >> 6));`
Output_UTF8.java (write): Don't exit loop unless both `inlength' and `bytes_todo' are 0. * gnu/gcj/convert/Output_UTF8.java (write): Don't exit loop unless both `inlength' and `bytes_todo' are 0. Simplified 2-byte case. From-SVN: r29570 1999-09-22 07:20:43 +08:00			`avail--;`
			`value = ch;`
			`bytes_todo = 1;`
Initial revision From-SVN: r26263 1999-04-07 22:42:40 +08:00			`}`
			`else if (ch >= 0xD800 && ch <= 0xDFFF && standardUTF8)`
			`{`
			`if (ch <= 0xDBFF) // High surrogates`
			`{`
Input_UTF8.java (read): Fixed handling of surrogate characters. * gnu/gcj/convert/Input_UTF8.java (read): Fixed handling of surrogate characters. * gnu/gcj/convert/Output_UTF8.java (standardUTF8): Default to true. (write): Correct handling of surrogate characters. From-SVN: r35569 2000-08-09 01:35:32 +08:00			`// Just save the high surrogate until the next`
			`// character comes along.`
Initial revision From-SVN: r26263 1999-04-07 22:42:40 +08:00			`hi_part = ch;`
			`}`
			`else // Low surrogates`
			`{`
			`value = (hi_part - 0xD800) * 0x400 + (ch - 0xDC00) + 0x10000;`
Input_UTF8.java (read): Fixed handling of surrogate characters. * gnu/gcj/convert/Input_UTF8.java (read): Fixed handling of surrogate characters. * gnu/gcj/convert/Output_UTF8.java (standardUTF8): Default to true. (write): Correct handling of surrogate characters. From-SVN: r35569 2000-08-09 01:35:32 +08:00			`buf[count++] = (byte) (0xF0 \| (value >> 18));`
Initial revision From-SVN: r26263 1999-04-07 22:42:40 +08:00			`bytes_todo = 3;`
Input_UTF8.java (read): Fixed handling of surrogate characters. * gnu/gcj/convert/Input_UTF8.java (read): Fixed handling of surrogate characters. * gnu/gcj/convert/Output_UTF8.java (standardUTF8): Default to true. (write): Correct handling of surrogate characters. From-SVN: r35569 2000-08-09 01:35:32 +08:00			`hi_part = 0;`
Initial revision From-SVN: r26263 1999-04-07 22:42:40 +08:00			`}`
			`}`
			`else`
			`{`
			`buf[count++] = (byte) (0xE0 \| (ch >> 12));`
			`value = ch;`
			`avail--;`
			`bytes_todo = 2;`
			`}`
			`}`
			`return inpos - start_pos;`
			`}`
			`}`