iconv: Input buffering for the iconv program (bug 6050)

Do not read the entire input file into memory.

Reviewed-by: DJ Delorie <dj@redhat.com>
This commit is contained in:
Florian Weimer 2024-09-20 13:10:54 +02:00
parent 75819cdd29
commit fa1b0d5e9f
2 changed files with 108 additions and 105 deletions

View File

@ -118,8 +118,9 @@ static size_t output_buffer_size = 1024 * 1024;
/* Prototypes for the functions doing the actual work. */ /* Prototypes for the functions doing the actual work. */
static void prepare_output_file (char **argv); static void prepare_output_file (char **argv);
static void close_output_file (int status); static void close_output_file (__gconv_t cd, int status);
static int process_block (iconv_t cd, char *addr, size_t len); static int process_block (iconv_t cd, char **addr, size_t *len,
off64_t file_offset, bool *incomplete);
static int process_fd (iconv_t cd, int fd); static int process_fd (iconv_t cd, int fd);
static int process_file (iconv_t cd, FILE *input); static int process_file (iconv_t cd, FILE *input);
static void print_known_names (void); static void print_known_names (void);
@ -311,7 +312,7 @@ conversions from `%s' and to `%s' are not supported"),
status = EXIT_FAILURE; status = EXIT_FAILURE;
/* Close the output file now. */ /* Close the output file now. */
close_output_file (status); close_output_file (cd, status);
} }
return status; return status;
@ -599,7 +600,7 @@ flush_output (void)
} }
static void static void
close_output_file (int status) close_output_file (__gconv_t cd, int status)
{ {
/* Do not perform a flush if a temporary file or the in-memory /* Do not perform a flush if a temporary file or the in-memory
buffer is in use and there was an error. It would clobber the buffer is in use and there was an error. It would clobber the
@ -608,10 +609,28 @@ close_output_file (int status)
(output_using_temporary_file || output_fd < 0)) (output_using_temporary_file || output_fd < 0))
return; return;
/* The current_input_file_index variable is now larger than /* All the input text is processed. For state-dependent character
last_overlapping_file_index, so the flush_output call switches sets we have to flush the state now.
The current_input_file_index variable is now larger than
last_overlapping_file_index, so the flush_output calls switch
away from the temporary file. */ away from the temporary file. */
size_t n = iconv (cd, NULL, NULL,
&output_buffer_current, &output_buffer_remaining);
if (n == (size_t) -1 && errno == E2BIG)
{
/* Try again if the state flush exceeded the buffer space. */
flush_output (); flush_output ();
n = iconv (cd, NULL, NULL,
&output_buffer_current, &output_buffer_remaining);
}
int saved_errno = errno;
flush_output ();
if (n == (size_t) -1 && !omit_invalid)
{
errno = saved_errno;
output_error ();
}
if (output_fd == STDOUT_FILENO) if (output_fd == STDOUT_FILENO)
{ {
@ -625,52 +644,36 @@ close_output_file (int status)
output_error (); output_error ();
} }
/* CD is the iconv handle. Input processing starts at *ADDR, and
consumes up to *LEN bytes. *ADDR and *LEN are updated. FILE_OFFSET
is the file offset of the data initially at ADDR. *INCOMPLETE is
set to true if conversion stops due to an incomplete input
sequence. */
static int static int
process_block (iconv_t cd, char *addr, size_t len) process_block (iconv_t cd, char **addr, size_t *len, off64_t file_offset,
bool *incomplete)
{ {
const char *start = addr; const char *start = *addr;
size_t n; size_t n;
int ret = 0; int ret = 0;
while (len > 0) while (*len > 0)
{ {
n = iconv (cd, &addr, &len, n = iconv (cd, addr, len,
&output_buffer_current, &output_buffer_remaining); &output_buffer_current, &output_buffer_remaining);
if (n == (size_t) -1 && omit_invalid && errno == EILSEQ) if (n == (size_t) -1 && omit_invalid && errno == EILSEQ)
{ {
ret = 1; ret = 1;
if (len == 0) if (*len == 0)
n = 0; n = 0;
else else
errno = E2BIG; errno = E2BIG;
} }
if (n != (size_t) -1)
{
/* All the input test is processed. For state-dependent
character sets we have to flush the state now. */
n = iconv (cd, NULL, NULL,
&output_buffer_current, &output_buffer_remaining);
if (n == (size_t) -1 && errno == E2BIG)
{
/* Try again if the state flush exceeded the buffer space. */
flush_output ();
n = iconv (cd, NULL, NULL,
&output_buffer_current, &output_buffer_remaining);
}
bool errno_is_EILSEQ = errno == EILSEQ;
if (n != (size_t) -1) if (n != (size_t) -1)
break; break;
if (omit_invalid && errno_is_EILSEQ)
{
ret = 1;
break;
}
}
if (errno == E2BIG) if (errno == E2BIG)
flush_output (); flush_output ();
else else
@ -680,13 +683,12 @@ process_block (iconv_t cd, char *addr, size_t len)
{ {
case EILSEQ: case EILSEQ:
if (! omit_invalid) if (! omit_invalid)
error (0, 0, _("illegal input sequence at position %ld"), error (0, 0, _("illegal input sequence at position %lld"),
(long int) (addr - start)); (long long int) (file_offset + (*addr - start)));
break; break;
case EINVAL: case EINVAL:
error (0, 0, _("\ *incomplete = true;
incomplete character or shift sequence at end of buffer")); return ret;
break;
case EBADF: case EBADF:
error (0, 0, _("internal error (illegal descriptor)")); error (0, 0, _("internal error (illegal descriptor)"));
break; break;
@ -706,79 +708,49 @@ incomplete character or shift sequence at end of buffer"));
static int static int
process_fd (iconv_t cd, int fd) process_fd (iconv_t cd, int fd)
{ {
/* we have a problem with reading from a descriptor since we must not char inbuf[BUFSIZ];
provide the iconv() function an incomplete character or shift char *inbuf_end = inbuf + sizeof (inbuf);
sequence at the end of the buffer. Since we have to deal with size_t inbuf_used = 0;
arbitrary encodings we must read the whole text in a buffer and off64_t file_offset = 0;
process it in one step. */ int status = 0;
static char *inbuf = NULL; bool incomplete = false;
static size_t maxlen = 0;
char *inptr = inbuf;
size_t actlen = 0;
while (actlen < maxlen) while (true)
{ {
ssize_t n = read (fd, inptr, maxlen - actlen); char *p = inbuf + inbuf_used;
ssize_t read_ret = read (fd, p, inbuf_end - p);
if (n == 0) if (read_ret == 0)
/* No more text to read. */ {
break; /* On EOF, check if the previous iconv invocation saw an
incomplete sequence. */
if (n == -1) if (incomplete)
{
error (0, 0, _("\
incomplete character or shift sequence at end of buffer"));
return 1;
}
return 0;
}
if (read_ret < 0)
{ {
/* Error while reading. */
error (0, errno, _("error while reading the input")); error (0, errno, _("error while reading the input"));
return -1; return -1;
} }
inbuf_used += read_ret;
inptr += n; incomplete = false;
actlen += n; p = inbuf;
} int ret = process_block (cd, &p, &inbuf_used, file_offset, &incomplete);
if (ret != 0)
if (actlen == maxlen)
while (1)
{ {
ssize_t n; status = ret;
char *new_inbuf; if (ret < 0)
/* Increase the buffer. */
new_inbuf = (char *) realloc (inbuf, maxlen + 32768);
if (new_inbuf == NULL)
{
error (0, errno, _("unable to allocate buffer for input"));
return -1;
}
inbuf = new_inbuf;
maxlen += 32768;
inptr = inbuf + actlen;
do
{
n = read (fd, inptr, maxlen - actlen);
if (n == 0)
/* No more text to read. */
break;
if (n == -1)
{
/* Error while reading. */
error (0, errno, _("error while reading the input"));
return -1;
}
inptr += n;
actlen += n;
}
while (actlen < maxlen);
if (n == 0)
/* Break again so we leave both loops. */
break; break;
} }
/* The next loop iteration consumes the leftover bytes. */
/* Now we have all the input in the buffer. Process it in one run. */ memmove (inbuf, p, inbuf_used);
return process_block (cd, inbuf, actlen); file_offset += read_ret - inbuf_used;
}
return status;
} }

View File

@ -50,6 +50,9 @@ echo OUT > "$tmp/out-template"
: > "$tmp/empty" : > "$tmp/empty"
printf '\xff' > "$tmp/0xff" printf '\xff' > "$tmp/0xff"
# Length should be a prime number, to help with buffer alignment testing.
printf '\xc3\xa4\xe2\x80\x94\xe2\x80\x94\xc3\xa4\n' > "$tmp/utf8-sequence"
# Double all files to produce larger buffers. # Double all files to produce larger buffers.
for p in "$tmp"/* ; do for p in "$tmp"/* ; do
i=0 i=0
@ -270,6 +273,34 @@ expect_exit 1 run_iconv -o "$tmp/out" "$tmp/abc" - < "$tmp/0xff" "$tmp/def"
run_iconv -o "$tmp/out" "$tmp/xy" - - "$tmp/zt" < "$tmp/abc" run_iconv -o "$tmp/out" "$tmp/xy" - - "$tmp/zt" < "$tmp/abc"
expect_files xy abc zt expect_files xy abc zt
# NB: Extra iconv args are ignored after this point. Actual
# multi-byte conversion does not work with tiny buffers.
iconv_args="-f UTF-8 -t ASCII"
printf 'x\n\xc3' > "$tmp/incomplete"
expect_exit 1 run_iconv -o "$tmp/out" "$tmp/incomplete"
check_out <<EOF
x
EOF
# Test buffering behavior if the buffer ends with an incomplete
# multi-byte sequence.
prefix=""
prefix_length=0
while test $prefix_length -lt 12; do
echo "info: testing prefix length $prefix_length" 2>&$logfd
printf "%s" "$prefix" > "$tmp/prefix"
cat "$tmp/prefix" "$tmp/utf8-sequence" > "$tmp/tmp"
iconv_args="-f UTF-8 -t UCS-4"
run_iconv -o "$tmp/out1" "$tmp/tmp"
iconv_args="-f UCS-4 -t UTF-8"
run_iconv -o "$tmp/out" "$tmp/out1"
expect_files prefix utf8-sequence
prefix="$prefix@"
prefix_length=$(($prefix_length + 1))
done
if $failure ; then if $failure ; then
exit 1 exit 1
fi fi