mirror of
git://sourceware.org/git/glibc.git
synced 2024-11-21 01:12:26 +08:00
iconv: Input buffering for the iconv program (bug 6050)
Do not read the entire input file into memory. Reviewed-by: DJ Delorie <dj@redhat.com>
This commit is contained in:
parent
75819cdd29
commit
fa1b0d5e9f
@ -118,8 +118,9 @@ static size_t output_buffer_size = 1024 * 1024;
|
|||||||
|
|
||||||
/* Prototypes for the functions doing the actual work. */
|
/* Prototypes for the functions doing the actual work. */
|
||||||
static void prepare_output_file (char **argv);
|
static void prepare_output_file (char **argv);
|
||||||
static void close_output_file (int status);
|
static void close_output_file (__gconv_t cd, int status);
|
||||||
static int process_block (iconv_t cd, char *addr, size_t len);
|
static int process_block (iconv_t cd, char **addr, size_t *len,
|
||||||
|
off64_t file_offset, bool *incomplete);
|
||||||
static int process_fd (iconv_t cd, int fd);
|
static int process_fd (iconv_t cd, int fd);
|
||||||
static int process_file (iconv_t cd, FILE *input);
|
static int process_file (iconv_t cd, FILE *input);
|
||||||
static void print_known_names (void);
|
static void print_known_names (void);
|
||||||
@ -311,7 +312,7 @@ conversions from `%s' and to `%s' are not supported"),
|
|||||||
status = EXIT_FAILURE;
|
status = EXIT_FAILURE;
|
||||||
|
|
||||||
/* Close the output file now. */
|
/* Close the output file now. */
|
||||||
close_output_file (status);
|
close_output_file (cd, status);
|
||||||
}
|
}
|
||||||
|
|
||||||
return status;
|
return status;
|
||||||
@ -599,7 +600,7 @@ flush_output (void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
close_output_file (int status)
|
close_output_file (__gconv_t cd, int status)
|
||||||
{
|
{
|
||||||
/* Do not perform a flush if a temporary file or the in-memory
|
/* Do not perform a flush if a temporary file or the in-memory
|
||||||
buffer is in use and there was an error. It would clobber the
|
buffer is in use and there was an error. It would clobber the
|
||||||
@ -608,10 +609,28 @@ close_output_file (int status)
|
|||||||
(output_using_temporary_file || output_fd < 0))
|
(output_using_temporary_file || output_fd < 0))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
/* The current_input_file_index variable is now larger than
|
/* All the input text is processed. For state-dependent character
|
||||||
last_overlapping_file_index, so the flush_output call switches
|
sets we have to flush the state now.
|
||||||
|
|
||||||
|
The current_input_file_index variable is now larger than
|
||||||
|
last_overlapping_file_index, so the flush_output calls switch
|
||||||
away from the temporary file. */
|
away from the temporary file. */
|
||||||
|
size_t n = iconv (cd, NULL, NULL,
|
||||||
|
&output_buffer_current, &output_buffer_remaining);
|
||||||
|
if (n == (size_t) -1 && errno == E2BIG)
|
||||||
|
{
|
||||||
|
/* Try again if the state flush exceeded the buffer space. */
|
||||||
flush_output ();
|
flush_output ();
|
||||||
|
n = iconv (cd, NULL, NULL,
|
||||||
|
&output_buffer_current, &output_buffer_remaining);
|
||||||
|
}
|
||||||
|
int saved_errno = errno;
|
||||||
|
flush_output ();
|
||||||
|
if (n == (size_t) -1 && !omit_invalid)
|
||||||
|
{
|
||||||
|
errno = saved_errno;
|
||||||
|
output_error ();
|
||||||
|
}
|
||||||
|
|
||||||
if (output_fd == STDOUT_FILENO)
|
if (output_fd == STDOUT_FILENO)
|
||||||
{
|
{
|
||||||
@ -625,52 +644,36 @@ close_output_file (int status)
|
|||||||
output_error ();
|
output_error ();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* CD is the iconv handle. Input processing starts at *ADDR, and
|
||||||
|
consumes up to *LEN bytes. *ADDR and *LEN are updated. FILE_OFFSET
|
||||||
|
is the file offset of the data initially at *ADDR. *INCOMPLETE is
|
||||||
|
set to true if conversion stops due to an incomplete input
|
||||||
|
sequence. */
|
||||||
static int
|
static int
|
||||||
process_block (iconv_t cd, char *addr, size_t len)
|
process_block (iconv_t cd, char **addr, size_t *len, off64_t file_offset,
|
||||||
|
bool *incomplete)
|
||||||
{
|
{
|
||||||
const char *start = addr;
|
const char *start = *addr;
|
||||||
size_t n;
|
size_t n;
|
||||||
int ret = 0;
|
int ret = 0;
|
||||||
|
|
||||||
while (len > 0)
|
while (*len > 0)
|
||||||
{
|
{
|
||||||
n = iconv (cd, &addr, &len,
|
n = iconv (cd, addr, len,
|
||||||
&output_buffer_current, &output_buffer_remaining);
|
&output_buffer_current, &output_buffer_remaining);
|
||||||
|
|
||||||
if (n == (size_t) -1 && omit_invalid && errno == EILSEQ)
|
if (n == (size_t) -1 && omit_invalid && errno == EILSEQ)
|
||||||
{
|
{
|
||||||
ret = 1;
|
ret = 1;
|
||||||
if (len == 0)
|
if (*len == 0)
|
||||||
n = 0;
|
n = 0;
|
||||||
else
|
else
|
||||||
errno = E2BIG;
|
errno = E2BIG;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (n != (size_t) -1)
|
|
||||||
{
|
|
||||||
/* All the input test is processed. For state-dependent
|
|
||||||
character sets we have to flush the state now. */
|
|
||||||
n = iconv (cd, NULL, NULL,
|
|
||||||
&output_buffer_current, &output_buffer_remaining);
|
|
||||||
if (n == (size_t) -1 && errno == E2BIG)
|
|
||||||
{
|
|
||||||
/* Try again if the state flush exceeded the buffer space. */
|
|
||||||
flush_output ();
|
|
||||||
n = iconv (cd, NULL, NULL,
|
|
||||||
&output_buffer_current, &output_buffer_remaining);
|
|
||||||
}
|
|
||||||
bool errno_is_EILSEQ = errno == EILSEQ;
|
|
||||||
|
|
||||||
if (n != (size_t) -1)
|
if (n != (size_t) -1)
|
||||||
break;
|
break;
|
||||||
|
|
||||||
if (omit_invalid && errno_is_EILSEQ)
|
|
||||||
{
|
|
||||||
ret = 1;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (errno == E2BIG)
|
if (errno == E2BIG)
|
||||||
flush_output ();
|
flush_output ();
|
||||||
else
|
else
|
||||||
@ -680,13 +683,12 @@ process_block (iconv_t cd, char *addr, size_t len)
|
|||||||
{
|
{
|
||||||
case EILSEQ:
|
case EILSEQ:
|
||||||
if (! omit_invalid)
|
if (! omit_invalid)
|
||||||
error (0, 0, _("illegal input sequence at position %ld"),
|
error (0, 0, _("illegal input sequence at position %lld"),
|
||||||
(long int) (addr - start));
|
(long long int) (file_offset + (*addr - start)));
|
||||||
break;
|
break;
|
||||||
case EINVAL:
|
case EINVAL:
|
||||||
error (0, 0, _("\
|
*incomplete = true;
|
||||||
incomplete character or shift sequence at end of buffer"));
|
return ret;
|
||||||
break;
|
|
||||||
case EBADF:
|
case EBADF:
|
||||||
error (0, 0, _("internal error (illegal descriptor)"));
|
error (0, 0, _("internal error (illegal descriptor)"));
|
||||||
break;
|
break;
|
||||||
@ -706,79 +708,49 @@ incomplete character or shift sequence at end of buffer"));
|
|||||||
static int
|
static int
|
||||||
process_fd (iconv_t cd, int fd)
|
process_fd (iconv_t cd, int fd)
|
||||||
{
|
{
|
||||||
/* we have a problem with reading from a descriptor since we must not
|
char inbuf[BUFSIZ];
|
||||||
provide the iconv() function an incomplete character or shift
|
char *inbuf_end = inbuf + sizeof (inbuf);
|
||||||
sequence at the end of the buffer. Since we have to deal with
|
size_t inbuf_used = 0;
|
||||||
arbitrary encodings we must read the whole text in a buffer and
|
off64_t file_offset = 0;
|
||||||
process it in one step. */
|
int status = 0;
|
||||||
static char *inbuf = NULL;
|
bool incomplete = false;
|
||||||
static size_t maxlen = 0;
|
|
||||||
char *inptr = inbuf;
|
|
||||||
size_t actlen = 0;
|
|
||||||
|
|
||||||
while (actlen < maxlen)
|
while (true)
|
||||||
{
|
{
|
||||||
ssize_t n = read (fd, inptr, maxlen - actlen);
|
char *p = inbuf + inbuf_used;
|
||||||
|
ssize_t read_ret = read (fd, p, inbuf_end - p);
|
||||||
if (n == 0)
|
if (read_ret == 0)
|
||||||
/* No more text to read. */
|
{
|
||||||
break;
|
/* On EOF, check if the previous iconv invocation saw an
|
||||||
|
incomplete sequence. */
|
||||||
if (n == -1)
|
if (incomplete)
|
||||||
|
{
|
||||||
|
error (0, 0, _("\
|
||||||
|
incomplete character or shift sequence at end of buffer"));
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
if (read_ret < 0)
|
||||||
{
|
{
|
||||||
/* Error while reading. */
|
|
||||||
error (0, errno, _("error while reading the input"));
|
error (0, errno, _("error while reading the input"));
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
inbuf_used += read_ret;
|
||||||
inptr += n;
|
incomplete = false;
|
||||||
actlen += n;
|
p = inbuf;
|
||||||
}
|
int ret = process_block (cd, &p, &inbuf_used, file_offset, &incomplete);
|
||||||
|
if (ret != 0)
|
||||||
if (actlen == maxlen)
|
|
||||||
while (1)
|
|
||||||
{
|
{
|
||||||
ssize_t n;
|
status = ret;
|
||||||
char *new_inbuf;
|
if (ret < 0)
|
||||||
|
|
||||||
/* Increase the buffer. */
|
|
||||||
new_inbuf = (char *) realloc (inbuf, maxlen + 32768);
|
|
||||||
if (new_inbuf == NULL)
|
|
||||||
{
|
|
||||||
error (0, errno, _("unable to allocate buffer for input"));
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
inbuf = new_inbuf;
|
|
||||||
maxlen += 32768;
|
|
||||||
inptr = inbuf + actlen;
|
|
||||||
|
|
||||||
do
|
|
||||||
{
|
|
||||||
n = read (fd, inptr, maxlen - actlen);
|
|
||||||
|
|
||||||
if (n == 0)
|
|
||||||
/* No more text to read. */
|
|
||||||
break;
|
|
||||||
|
|
||||||
if (n == -1)
|
|
||||||
{
|
|
||||||
/* Error while reading. */
|
|
||||||
error (0, errno, _("error while reading the input"));
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
inptr += n;
|
|
||||||
actlen += n;
|
|
||||||
}
|
|
||||||
while (actlen < maxlen);
|
|
||||||
|
|
||||||
if (n == 0)
|
|
||||||
/* Break again so we leave both loops. */
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
/* The next loop iteration consumes the leftover bytes. */
|
||||||
/* Now we have all the input in the buffer. Process it in one run. */
|
memmove (inbuf, p, inbuf_used);
|
||||||
return process_block (cd, inbuf, actlen);
|
file_offset += read_ret - inbuf_used;
|
||||||
|
}
|
||||||
|
return status;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -50,6 +50,9 @@ echo OUT > "$tmp/out-template"
|
|||||||
: > "$tmp/empty"
|
: > "$tmp/empty"
|
||||||
printf '\xff' > "$tmp/0xff"
|
printf '\xff' > "$tmp/0xff"
|
||||||
|
|
||||||
|
# Length should be a prime number, to help with buffer alignment testing.
|
||||||
|
printf '\xc3\xa4\xe2\x80\x94\xe2\x80\x94\xc3\xa4\n' > "$tmp/utf8-sequence"
|
||||||
|
|
||||||
# Double all files to produce larger buffers.
|
# Double all files to produce larger buffers.
|
||||||
for p in "$tmp"/* ; do
|
for p in "$tmp"/* ; do
|
||||||
i=0
|
i=0
|
||||||
@ -270,6 +273,34 @@ expect_exit 1 run_iconv -o "$tmp/out" "$tmp/abc" - < "$tmp/0xff" "$tmp/def"
|
|||||||
run_iconv -o "$tmp/out" "$tmp/xy" - - "$tmp/zt" < "$tmp/abc"
|
run_iconv -o "$tmp/out" "$tmp/xy" - - "$tmp/zt" < "$tmp/abc"
|
||||||
expect_files xy abc zt
|
expect_files xy abc zt
|
||||||
|
|
||||||
|
# NB: Extra iconv args are ignored after this point. Actual
|
||||||
|
# multi-byte conversion does not work with tiny buffers.
|
||||||
|
iconv_args="-f UTF-8 -t ASCII"
|
||||||
|
|
||||||
|
printf 'x\n\xc3' > "$tmp/incomplete"
|
||||||
|
expect_exit 1 run_iconv -o "$tmp/out" "$tmp/incomplete"
|
||||||
|
check_out <<EOF
|
||||||
|
x
|
||||||
|
EOF
|
||||||
|
|
||||||
|
# Test buffering behavior if the buffer ends with an incomplete
|
||||||
|
# multi-byte sequence.
|
||||||
|
prefix=""
|
||||||
|
prefix_length=0
|
||||||
|
while test $prefix_length -lt 12; do
|
||||||
|
echo "info: testing prefix length $prefix_length" 2>&$logfd
|
||||||
|
printf "%s" "$prefix" > "$tmp/prefix"
|
||||||
|
cat "$tmp/prefix" "$tmp/utf8-sequence" > "$tmp/tmp"
|
||||||
|
iconv_args="-f UTF-8 -t UCS-4"
|
||||||
|
run_iconv -o "$tmp/out1" "$tmp/tmp"
|
||||||
|
iconv_args="-f UCS-4 -t UTF-8"
|
||||||
|
run_iconv -o "$tmp/out" "$tmp/out1"
|
||||||
|
expect_files prefix utf8-sequence
|
||||||
|
|
||||||
|
prefix="$prefix@"
|
||||||
|
prefix_length=$(($prefix_length + 1))
|
||||||
|
done
|
||||||
|
|
||||||
if $failure ; then
|
if $failure ; then
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
Loading…
Reference in New Issue
Block a user