mirror of
https://github.com/netwide-assembler/nasm.git
synced 2025-01-18 16:25:05 +08:00
quote: emit invalid UTF-8 rather than just dropping a strange value
If a UTF-8 value exceeds 0x7fffffff, there is no legitimate encoding for it. However, using FE or FF as leading bytes provides at least some kind of encoding. This is assembly, and the programmer is (almost?) always right. It might be worthwhile to add a suppressible warning for invalid UTF-8 strings in general, though, including any character > 0x10ffff, surrogates, or a string that is constructed by hand. Signed-off-by: H. Peter Anvin <hpa@zytor.com>
This commit is contained in:
parent
236f4a832b
commit
10d9589f02
12
asm/quote.c
12
asm/quote.c
@ -220,14 +220,16 @@ static unsigned char *emit_utf8(unsigned char *q, uint32_t v)
|
||||
goto out4;
|
||||
}
|
||||
|
||||
/*
|
||||
* Note: this is invalid even for "classic" (pre-UTF16) 31-bit
|
||||
* UTF-8 if the value is >= 0x80000000. This at least tries to do
|
||||
* something vaguely sensible with it. Caveat programmer.
|
||||
* The __utf*__ string transform functions do reject these
|
||||
* as invalid input.
|
||||
*/
|
||||
vb5 = vb4 >> 6;
|
||||
if (vb5 <= 0x01) {
|
||||
*q++ = 0xfc + vb5;
|
||||
goto out5;
|
||||
}
|
||||
|
||||
/* Otherwise invalid, even with 31-bit "extended Unicode" (pre-UTF-16) */
|
||||
goto out0;
|
||||
|
||||
/* Emit extension bytes as appropriate */
|
||||
out5: *q++ = 0x80 + (vb4 & 63);
|
||||
|
Loading…
Reference in New Issue
Block a user