Multimodal Textbox (Chat Input Component) (#7420)

* first pass

* multimodal textbox

* add changeset

* remove file

* more changes

* changes

* add changeset

* revert demo

* doc strings fix

* update demo

* file icons

* more updates

* format

* add story

* remove doc line

* type fixes

* chat interface

* new demo

* image upload fix

* ui changes

* addressing PR comments

* format

* type check

* more pr fixes

* format

* format

* test fixes

* test fixes

* Streaming fixes + other stuff

* optional keys to dict value

* final fixes

* notebook

* format

* Update guides/04_chatbots/01_creating-a-chatbot-fast.md

Co-authored-by: Abubakar Abid <abubakar@huggingface.co>

* Update guides/04_chatbots/01_creating-a-chatbot-fast.md

Co-authored-by: Abubakar Abid <abubakar@huggingface.co>

* Update guides/04_chatbots/01_creating-a-chatbot-fast.md

Co-authored-by: Abubakar Abid <abubakar@huggingface.co>

* merge

* backend fixes

* story fix

* ui test fix

* format

* story

* format

* demo fix

* streaming test fix

* stories fix

* stories fix

---------

Co-authored-by: gradio-pr-bot <gradio-pr-bot@users.noreply.github.com>
Co-authored-by: Abubakar Abid <abubakar@huggingface.co>
This commit is contained in:
Dawood Khan 2024-03-19 16:16:05 -04:00 committed by GitHub
parent c9aba8d8a5
commit 15da39fca0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
29 changed files with 1291 additions and 76 deletions

View File

@ -0,0 +1,8 @@
---
"@gradio/app": minor
"@gradio/multimodaltextbox": minor
"@gradio/upload": minor
"gradio": minor
---
feat: Multimodal Textbox (Chat Input Component)

View File

Before

Width:  |  Height:  |  Size: 5.2 KiB

After

Width:  |  Height:  |  Size: 5.2 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 18 KiB

View File

@ -1 +1 @@
{"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: chatbot_multimodal"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio "]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["# Downloading files from the demo repo\n", "import os\n", "!wget -q https://github.com/gradio-app/gradio/raw/main/demo/chatbot_multimodal/avatar.png"]}, {"cell_type": "code", "execution_count": null, "id": "44380577570523278879349135829904343037", "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "import os\n", "import time\n", "\n", "# Chatbot demo with multimodal input (text, markdown, LaTeX, code blocks, image, audio, & video). Plus shows support for streaming text.\n", "\n", "\n", "def print_like_dislike(x: gr.LikeData):\n", " print(x.index, x.value, x.liked)\n", "\n", "\n", "def add_text(history, text):\n", " history = history + [(text, None)]\n", " return history, gr.Textbox(value=\"\", interactive=False)\n", "\n", "\n", "def add_file(history, file):\n", " history = history + [((file.name,), None)]\n", " return history\n", "\n", "\n", "def bot(history):\n", " response = \"**That's cool!**\"\n", " history[-1][1] = \"\"\n", " for character in response:\n", " history[-1][1] += character\n", " time.sleep(0.05)\n", " yield history\n", "\n", "\n", "with gr.Blocks() as demo:\n", " chatbot = gr.Chatbot(\n", " [],\n", " elem_id=\"chatbot\",\n", " bubble_full_width=False,\n", " avatar_images=(None, (os.path.join(os.path.abspath(''), \"avatar.png\"))),\n", " )\n", "\n", " with gr.Row():\n", " txt = gr.Textbox(\n", " scale=4,\n", " show_label=False,\n", " placeholder=\"Enter text and press enter, or upload an image\",\n", " container=False,\n", " )\n", " btn = gr.UploadButton(\"\ud83d\udcc1\", 
file_types=[\"image\", \"video\", \"audio\"])\n", "\n", " txt_msg = txt.submit(add_text, [chatbot, txt], [chatbot, txt], queue=False).then(\n", " bot, chatbot, chatbot, api_name=\"bot_response\"\n", " )\n", " txt_msg.then(lambda: gr.Textbox(interactive=True), None, [txt], queue=False)\n", " file_msg = btn.upload(add_file, [chatbot, btn], [chatbot], queue=False).then(\n", " bot, chatbot, chatbot\n", " )\n", "\n", " chatbot.like(print_like_dislike, None, None)\n", "\n", "\n", "demo.queue()\n", "if __name__ == \"__main__\":\n", " demo.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
{"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: chatbot_multimodal"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio "]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["# Downloading files from the demo repo\n", "import os\n", "os.mkdir('files')\n", "!wget -q -O files/avatar.png https://github.com/gradio-app/gradio/raw/main/demo/chatbot_multimodal/files/avatar.png\n", "!wget -q -O files/lion.jpg https://github.com/gradio-app/gradio/raw/main/demo/chatbot_multimodal/files/lion.jpg"]}, {"cell_type": "code", "execution_count": null, "id": "44380577570523278879349135829904343037", "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "import os\n", "import time\n", "\n", "# Chatbot demo with multimodal input (text, markdown, LaTeX, code blocks, image, audio, & video). 
Plus shows support for streaming text.\n", "\n", "\n", "def print_like_dislike(x: gr.LikeData):\n", " print(x.index, x.value, x.liked)\n", "\n", "def add_message(history, message):\n", " for x in message[\"files\"]:\n", " history.append(((x[\"path\"],), None)) \n", " if message[\"text\"] is not None:\n", " history.append((message[\"text\"], None))\n", " return history, gr.MultimodalTextbox(value=None, interactive=False, file_types=[\"image\"])\n", "\n", "def bot(history):\n", " response = \"**That's cool!**\"\n", " history[-1][1] = \"\"\n", " for character in response:\n", " history[-1][1] += character\n", " time.sleep(0.05)\n", " yield history\n", "\n", "\n", "with gr.Blocks() as demo:\n", " chatbot = gr.Chatbot(\n", " [],\n", " elem_id=\"chatbot\",\n", " bubble_full_width=False,\n", " avatar_images=(None, (os.path.join(os.path.abspath(''), \"files/avatar.png\"))),\n", " )\n", "\n", " chat_input = gr.MultimodalTextbox(interactive=True, file_types=[\"image\"], placeholder=\"Enter message or upload file...\", show_label=False)\n", " chat_msg = chat_input.submit(add_message, [chatbot, chat_input], [chatbot, chat_input], queue=False).then(\n", " bot, chatbot, chatbot, api_name=\"bot_response\"\n", " )\n", " chat_msg.then(lambda: gr.Textbox(interactive=True), None, [chat_input], queue=False)\n", " chatbot.like(print_like_dislike, None, None)\n", "\n", "demo.queue()\n", "if __name__ == \"__main__\":\n", " demo.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}

View File

@ -8,16 +8,12 @@ import time
def print_like_dislike(x: gr.LikeData):
print(x.index, x.value, x.liked)
def add_text(history, text):
history = history + [(text, None)]
return history, gr.Textbox(value="", interactive=False)
def add_file(history, file):
history = history + [((file.name,), None)]
return history
def add_message(history, message):
for x in message["files"]:
history.append(((x["path"],), None))
if message["text"] is not None:
history.append((message["text"], None))
return history, gr.MultimodalTextbox(value=None, interactive=False, file_types=["image"])
def bot(history):
response = "**That's cool!**"
@ -33,29 +29,16 @@ with gr.Blocks() as demo:
[],
elem_id="chatbot",
bubble_full_width=False,
avatar_images=(None, (os.path.join(os.path.dirname(__file__), "avatar.png"))),
avatar_images=(None, (os.path.join(os.path.dirname(__file__), "files/avatar.png"))),
)
with gr.Row():
txt = gr.Textbox(
scale=4,
show_label=False,
placeholder="Enter text and press enter, or upload an image",
container=False,
)
btn = gr.UploadButton("📁", file_types=["image", "video", "audio"])
txt_msg = txt.submit(add_text, [chatbot, txt], [chatbot, txt], queue=False).then(
chat_input = gr.MultimodalTextbox(interactive=True, file_types=["image"], placeholder="Enter message or upload file...", show_label=False)
chat_msg = chat_input.submit(add_message, [chatbot, chat_input], [chatbot, chat_input], queue=False).then(
bot, chatbot, chatbot, api_name="bot_response"
)
txt_msg.then(lambda: gr.Textbox(interactive=True), None, [txt], queue=False)
file_msg = btn.upload(add_file, [chatbot, btn], [chatbot], queue=False).then(
bot, chatbot, chatbot
)
chat_msg.then(lambda: gr.Textbox(interactive=True), None, [chat_input], queue=False)
chatbot.like(print_like_dislike, None, None)
demo.queue()
if __name__ == "__main__":
demo.launch()

View File

@ -0,0 +1 @@
{"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: chatinterface_multimodal"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio "]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "\n", "def echo(message, history):\n", " return message[\"text\"]\n", "\n", "demo = gr.ChatInterface(fn=echo, examples=[{\"text\": \"hello\"}, {\"text\": \"hola\"}, {\"text\": \"merhaba\"}], title=\"Echo Bot\", multimodal=True)\n", "demo.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}

View File

@ -0,0 +1,7 @@
import gradio as gr
def echo(message, history):
return message["text"]
demo = gr.ChatInterface(fn=echo, examples=[{"text": "hello"}, {"text": "hola"}, {"text": "merhaba"}], title="Echo Bot", multimodal=True)
demo.launch()

View File

@ -0,0 +1 @@
{"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: multimodaltextbox_component"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio "]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "\n", "with gr.Blocks() as demo:\n", " gr.MultimodalTextbox(interactive=True)\n", "\n", "demo.launch()"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}

View File

@ -0,0 +1,6 @@
import gradio as gr
with gr.Blocks() as demo:
gr.MultimodalTextbox(interactive=True)
demo.launch()

View File

@ -44,6 +44,7 @@ from gradio.components import (
LogoutButton,
Markdown,
Model3D,
MultimodalTextbox,
Number,
ParamViewer,
Plot,

View File

@ -18,6 +18,7 @@ from gradio.components import (
Chatbot,
Component,
Markdown,
MultimodalTextbox,
State,
Textbox,
get_component_instance,
@ -47,7 +48,7 @@ class ChatInterface(Blocks):
demo = gr.ChatInterface(fn=echo, examples=["hello", "hola", "merhaba"], title="Echo Bot")
demo.launch()
Demos: chatinterface_random_response, chatinterface_streaming_echo
Demos: chatinterface_multimodal, chatinterface_random_response, chatinterface_streaming_echo
Guides: creating-a-chatbot-fast, sharing-your-app
"""
@ -55,12 +56,13 @@ class ChatInterface(Blocks):
self,
fn: Callable,
*,
multimodal: bool = False,
chatbot: Chatbot | None = None,
textbox: Textbox | None = None,
textbox: Textbox | MultimodalTextbox | None = None,
additional_inputs: str | Component | list[str | Component] | None = None,
additional_inputs_accordion_name: str | None = None,
additional_inputs_accordion: str | Accordion | None = None,
examples: list[str] | None = None,
examples: list[str] | list[dict[str, str | list]] | None = None,
cache_examples: bool | None = None,
title: str | None = None,
description: str | None = None,
@ -82,8 +84,9 @@ class ChatInterface(Blocks):
"""
Parameters:
fn: The function to wrap the chat interface around. Should accept two parameters: a string input message and list of two-element lists of the form [[user_message, bot_message], ...] representing the chat history, and return a string response. See the Chatbot documentation for more information on the chat history format.
multimodal: If True, the chat interface will use a gr.MultimodalTextbox component for the input, which allows for the uploading of multimedia files. If False, the chat interface will use a gr.Textbox component for the input.
chatbot: An instance of the gr.Chatbot component to use for the chat interface, if you would like to customize the chatbot properties. If not provided, a default gr.Chatbot component will be created.
textbox: An instance of the gr.Textbox component to use for the chat interface, if you would like to customize the textbox properties. If not provided, a default gr.Textbox component will be created.
textbox: An instance of the gr.Textbox or gr.MultimodalTextbox component to use for the chat interface, if you would like to customize the textbox properties. If not provided, a default gr.Textbox or gr.MultimodalTextbox component will be created.
additional_inputs: An instance or list of instances of gradio components (or their string shortcuts) to use as additional inputs to the chatbot. If components are not already rendered in a surrounding Blocks, then the components will be displayed under the chatbot, in an accordion.
additional_inputs_accordion_name: Deprecated. Will be removed in a future version of Gradio. Use the `additional_inputs_accordion` parameter instead.
additional_inputs_accordion: If a string is provided, this is the label of the `gr.Accordion` to use to contain additional inputs. A `gr.Accordion` object can be provided as well to configure other properties of the container holding the additional inputs. Defaults to a `gr.Accordion(label="Additional Inputs", open=False)`. This parameter is only used if `additional_inputs` is provided.
@ -117,6 +120,7 @@ class ChatInterface(Blocks):
fill_height=fill_height,
delete_cache=delete_cache,
)
self.multimodal = multimodal
self.concurrency_limit = concurrency_limit
self.fn = fn
self.is_async = inspect.iscoroutinefunction(
@ -202,11 +206,22 @@ class ChatInterface(Blocks):
textbox.container = False
textbox.show_label = False
textbox_ = textbox.render()
if not isinstance(textbox_, Textbox):
if not isinstance(textbox_, Textbox) or not isinstance(
textbox_, MultimodalTextbox
):
raise TypeError(
f"Expected a gr.Textbox, but got {type(textbox_)}"
f"Expected a gr.Textbox or gr.MultimodalTextbox component, but got {type(textbox_)}"
)
self.textbox = textbox_
elif self.multimodal:
submit_btn = None
self.textbox = MultimodalTextbox(
show_label=False,
label="Message",
placeholder="Type a message...",
scale=7,
autofocus=autofocus,
)
else:
self.textbox = Textbox(
container=False,
@ -216,7 +231,7 @@ class ChatInterface(Blocks):
scale=7,
autofocus=autofocus,
)
if submit_btn is not None:
if submit_btn is not None and not multimodal:
if isinstance(submit_btn, Button):
submit_btn.render()
elif isinstance(submit_btn, str):
@ -331,7 +346,7 @@ class ChatInterface(Blocks):
retry_event = (
self.retry_btn.click(
self._delete_prev_fn,
[self.chatbot_state],
[self.saved_input, self.chatbot_state],
[self.chatbot, self.saved_input, self.chatbot_state],
show_api=False,
queue=False,
@ -358,7 +373,7 @@ class ChatInterface(Blocks):
if self.undo_btn:
self.undo_btn.click(
self._delete_prev_fn,
[self.chatbot_state],
[self.saved_input, self.chatbot_state],
[self.chatbot, self.saved_input, self.chatbot_state],
show_api=False,
queue=False,
@ -439,23 +454,48 @@ class ChatInterface(Blocks):
),
)
def _clear_and_save_textbox(self, message: str) -> tuple[str, str]:
return "", message
def _clear_and_save_textbox(self, message: str) -> tuple[str | dict, str]:
if self.multimodal:
return {"text": "", "files": []}, message
else:
return "", message
def _append_multimodal_history(
self,
message: dict[str, list],
response: str | None,
history: list[list[str | tuple | None]],
):
for x in message["files"]:
history.append([(x["path"],), None])
if message["text"] is not None and isinstance(message["text"], str):
history.append([message["text"], response])
def _display_input(
self, message: str, history: list[list[str | None]]
) -> tuple[list[list[str | None]], list[list[str | None]]]:
history.append([message, None])
self, message: str | dict[str, list], history: list[list[str | tuple | None]]
) -> tuple[list[list[str | tuple | None]], list[list[str | tuple | None]]]:
if self.multimodal and isinstance(message, dict):
self._append_multimodal_history(message, None, history)
elif isinstance(message, str):
history.append([message, None])
return history, history
async def _submit_fn(
self,
message: str,
history_with_input: list[list[str | None]],
message: str | dict[str, list],
history_with_input: list[list[str | tuple | None]],
request: Request,
*args,
) -> tuple[list[list[str | None]], list[list[str | None]]]:
history = history_with_input[:-1]
) -> tuple[list[list[str | tuple | None]], list[list[str | tuple | None]]]:
if self.multimodal and isinstance(message, dict):
remove_input = (
len(message["files"]) + 1
if message["text"] is not None
else len(message["files"])
)
history = history_with_input[:-remove_input]
else:
history = history_with_input[:-1]
inputs, _, _ = special_args(
self.fn, inputs=[message, history, *args], request=request
)
@ -467,17 +507,28 @@ class ChatInterface(Blocks):
self.fn, *inputs, limiter=self.limiter
)
history.append([message, response])
if self.multimodal and isinstance(message, dict):
self._append_multimodal_history(message, response, history)
elif isinstance(message, str):
history.append([message, response])
return history, history
async def _stream_fn(
self,
message: str,
history_with_input: list[list[str | None]],
message: str | dict[str, list],
history_with_input: list[list[str | tuple | None]],
request: Request,
*args,
) -> AsyncGenerator:
history = history_with_input[:-1]
if self.multimodal and isinstance(message, dict):
remove_input = (
len(message["files"]) + 1
if message["text"] is not None
else len(message["files"])
)
history = history_with_input[:-remove_input]
else:
history = history_with_input[:-1]
inputs, _, _ = special_args(
self.fn, inputs=[message, history, *args], request=request
)
@ -491,14 +542,28 @@ class ChatInterface(Blocks):
generator = SyncToAsyncIterator(generator, self.limiter)
try:
first_response = await async_iteration(generator)
update = history + [[message, first_response]]
yield update, update
if self.multimodal and isinstance(message, dict):
for x in message["files"]:
history.append([(x["path"],), None])
update = history + [[message["text"], first_response]]
yield update, update
else:
update = history + [[message, first_response]]
yield update, update
except StopIteration:
update = history + [[message, None]]
yield update, update
if self.multimodal and isinstance(message, dict):
self._append_multimodal_history(message, None, history)
yield history, history
else:
update = history + [[message, None]]
yield update, update
async for response in generator:
update = history + [[message, response]]
yield update, update
if self.multimodal and isinstance(message, dict):
update = history + [[message["text"], response]]
yield update, update
else:
update = history + [[message, response]]
yield update, update
async def _api_submit_fn(
self, message: str, history: list[list[str | None]], request: Request, *args
@ -567,10 +632,21 @@ class ChatInterface(Blocks):
yield [[message, response]]
def _delete_prev_fn(
self, history: list[list[str | None]]
) -> tuple[list[list[str | None]], str, list[list[str | None]]]:
try:
message, _ = history.pop()
except IndexError:
message = ""
self,
message: str | dict[str, list],
history: list[list[str | tuple | None]],
) -> tuple[
list[list[str | tuple | None]],
str | dict[str, list],
list[list[str | tuple | None]],
]:
if self.multimodal and isinstance(message, dict):
remove_input = (
len(message["files"]) + 1
if message["text"] is not None
else len(message["files"])
)
history = history[:-remove_input]
else:
history = history[:-1]
return history, message or "", history

View File

@ -37,6 +37,7 @@ from gradio.components.login_button import LoginButton
from gradio.components.logout_button import LogoutButton
from gradio.components.markdown import Markdown
from gradio.components.model3d import Model3D
from gradio.components.multimodal_textbox import MultimodalTextbox
from gradio.components.number import Number
from gradio.components.paramviewer import ParamViewer
from gradio.components.plot import Plot
@ -114,4 +115,5 @@ __all__ = [
"StreamingOutput",
"ImageEditor",
"ParamViewer",
"MultimodalTextbox",
]

View File

@ -0,0 +1,183 @@
"""gr.MultimodalTextbox() component."""
from __future__ import annotations
from pathlib import Path
from typing import Any, Callable, List, Literal, Optional, TypedDict
import gradio_client.utils as client_utils
from gradio_client.documentation import document
from pydantic import Field
from gradio.components.base import FormComponent
from gradio.data_classes import FileData, GradioModel
from gradio.events import Events
class MultimodalData(GradioModel):
    # Wire-format value for MultimodalTextbox: optional text plus a list of
    # uploaded files. Used as the component's `data_model` (see MultimodalTextbox).
    text: Optional[str] = None
    # default_factory so each instance gets its own list (no shared mutable default)
    files: Optional[List[FileData]] = Field(default_factory=list)
class MultimodalPostprocess(TypedDict):
    # Plain-dict shape of a multimodal value ({"text": ..., "files": [...]}).
    # NOTE(review): not referenced elsewhere in this file — confirm it is used
    # by other modules before removing.
    text: str
    files: List[FileData]
@document()
class MultimodalTextbox(FormComponent):
    """
    Creates a textarea for users to enter string input or display string output and also allows for the uploading of multimedia files.

    Demos: chatbot_multimodal
    Guides: creating-a-chatbot
    """

    # Pydantic model describing the payload exchanged with the frontend.
    data_model = MultimodalData

    # Events this component can emit; wired up by the base class machinery.
    EVENTS = [
        Events.change,
        Events.input,
        Events.select,
        Events.submit,
        Events.focus,
        Events.blur,
    ]

    def __init__(
        self,
        value: dict[str, str | list] | Callable | None = None,
        *,
        file_types: list[str] | None = None,
        lines: int = 1,
        max_lines: int = 20,
        placeholder: str | None = None,
        label: str | None = None,
        info: str | None = None,
        every: float | None = None,
        show_label: bool | None = None,
        container: bool = True,
        scale: int | None = None,
        min_width: int = 160,
        interactive: bool | None = None,
        visible: bool = True,
        elem_id: str | None = None,
        autofocus: bool = False,
        autoscroll: bool = True,
        elem_classes: list[str] | str | None = None,
        render: bool = True,
        text_align: Literal["left", "right"] | None = None,
        rtl: bool = False,
        submit_btn: str | Literal[False] = "",
    ):
        """
        Parameters:
            value: Default value to show in MultimodalTextbox. A dictionary of the form {"text": "sample text", "files": [{path: "files/file.jpg", orig_name: "file.jpg", url: "http://image_url.jpg", size: 100}]}. If callable, the function will be called whenever the app loads to set the initial value of the component.
            file_types: List of file extensions or types of files to be uploaded (e.g. ['image', '.json', '.mp4']). "file" allows any file to be uploaded, "image" allows only image files to be uploaded, "audio" allows only audio files to be uploaded, "video" allows only video files to be uploaded, "text" allows only text files to be uploaded.
            lines: minimum number of line rows to provide in textarea.
            max_lines: maximum number of line rows to provide in textarea.
            placeholder: placeholder hint to provide behind textarea.
            label: The label for this component. Appears above the component and is also used as the header if there is a table of examples for this component. If None and used in a `gr.Interface`, the label will be the name of the parameter this component is assigned to.
            info: additional component description.
            every: If `value` is a callable, run the function 'every' number of seconds while the client connection is open. Has no effect otherwise. The event can be accessed (e.g. to cancel it) via this component's .load_event attribute.
            show_label: if True, will display label.
            container: If True, will place the component in a container - providing some extra padding around the border.
            scale: relative size compared to adjacent Components. For example if Components A and B are in a Row, and A has scale=2, and B has scale=1, A will be twice as wide as B. Should be an integer. scale applies in Rows, and to top-level Components in Blocks where fill_height=True.
            min_width: minimum pixel width, will wrap if not sufficient screen space to satisfy this value. If a certain scale value results in this Component being narrower than min_width, the min_width parameter will be respected first.
            interactive: if True, will be rendered as an editable textbox; if False, editing will be disabled. If not provided, this is inferred based on whether the component is used as an input or output.
            visible: If False, component will be hidden.
            autofocus: If True, will focus on the textbox when the page loads. Use this carefully, as it can cause usability issues for sighted and non-sighted users.
            elem_id: An optional string that is assigned as the id of this component in the HTML DOM. Can be used for targeting CSS styles.
            elem_classes: An optional list of strings that are assigned as the classes of this component in the HTML DOM. Can be used for targeting CSS styles.
            render: If False, component will not be rendered in the Blocks context. Should be used if the intention is to assign event listeners now but render the component later.
            text_align: How to align the text in the textbox, can be: "left", "right", or None (default). If None, the alignment is left if `rtl` is False, or right if `rtl` is True. Can only be changed if `type` is "text".
            rtl: If True and `type` is "text", sets the direction of the text to right-to-left (cursor appears on the left of the text). Default is False, which renders cursor on the right.
            autoscroll: If True, will automatically scroll to the bottom of the textbox when the value changes, unless the user scrolls up. If False, will not scroll to the bottom of the textbox when the value changes.
            submit_btn: If False, will not show a submit button. If a string, will use that string as the submit button text. Only applies if `interactive` is True.
        """
        self.file_types = file_types
        # Normalize a missing value into the empty multimodal payload so the
        # frontend always receives a {"text", "files"} dict.
        if value is None:
            value = {"text": "", "files": []}
        if file_types is not None and not isinstance(file_types, list):
            raise ValueError(
                f"Parameter file_types must be a list. Received {file_types.__class__.__name__}"
            )
        self.lines = lines
        # Guarantee max_lines is never smaller than the minimum line count.
        self.max_lines = max(lines, max_lines)
        self.placeholder = placeholder
        self.submit_btn = submit_btn
        self.autofocus = autofocus
        self.autoscroll = autoscroll
        super().__init__(
            label=label,
            info=info,
            every=every,
            show_label=show_label,
            container=container,
            scale=scale,
            min_width=min_width,
            interactive=interactive,
            visible=visible,
            elem_id=elem_id,
            elem_classes=elem_classes,
            render=render,
            value=value,
        )
        self.rtl = rtl
        self.text_align = text_align

    def preprocess(
        self, payload: MultimodalData | None
    ) -> dict[str, str | list] | None:
        """
        Parameters:
            payload: the text and list of file(s) entered in the multimodal textbox.
        Returns:
            Passes text value and list of file(s) as a {dict} into the function.
        """
        # model_dump() converts the pydantic model (including nested FileData)
        # into plain dicts/lists for the user's function.
        return None if payload is None else payload.model_dump()

    def postprocess(self, value: dict[str, str | list] | None) -> MultimodalData:
        """
        Parameters:
            value: Expects a {dict} with "text" and "files", both optional. The files array is a list of file paths or URLs.
        Returns:
            The value to display in the multimodal textbox. Files information as a list of FileData objects.
        """
        if value is None:
            return MultimodalData(text="", files=[])
        if not isinstance(value, dict):
            raise ValueError(
                f"MultimodalTextbox expects a dictionary with optional keys 'text' and 'files'. Received {value.__class__.__name__}"
            )
        # Coerce each entry of "files" into a FileData, accepting either an
        # existing FileData, a dict with optional keys, or a bare string.
        # NOTE(review): for a bare string entry, `"path" in file` is a substring
        # test and `Path(file).stat()` requires the path to exist on disk —
        # URL strings would raise OSError here; confirm intended behavior.
        if "files" in value and isinstance(value["files"], list):
            value["files"] = [
                file
                if isinstance(file, FileData)
                else FileData(
                    path=file["path"] if "path" in file else file,
                    mime_type=file["mime_type"]
                    if "mime_type" in file
                    else client_utils.get_mimetype(file),
                    orig_name=file["orig_name"]
                    if "orig_name" in file
                    else Path(file).name,
                    size=file["size"] if "size" in file else Path(file).stat().st_size,
                )
                for file in value["files"]
            ]
        text = value.get("text", "")
        files = value.get("files", [])
        if not isinstance(text, str):
            raise TypeError(
                f"Expected 'text' to be a string, but got {type(text).__name__}"
            )
        if not isinstance(files, list):
            raise TypeError(
                f"Expected 'files' to be a list, but got {type(files).__name__}"
            )
        return MultimodalData(text=text, files=files)

    def example_inputs(self) -> Any:
        # Sample payload used when auto-generating API/example documentation.
        return {"text": "sample text", "files": []}

View File

@ -124,6 +124,35 @@ gr.ChatInterface(
).launch()
```
## Add Multimodal Capability to your chatbot
You may want to add multimodal capability to your chatbot. For example, you may want users to be able to easily upload images or files to your chatbot and ask questions about them. You can make your chatbot "multimodal" by using the `gr.MultimodalTextbox` component for input, or — if you are using `gr.ChatInterface` — simply by passing a single parameter (`multimodal=True`).
```python
import gradio as gr
chat_input = gr.MultimodalTextbox(file_types=["image"], placeholder="Enter message or upload file...")
```
`gr.ChatInterface` also supports multimodality, simply pass in the `multimodal` parameter as `True`:
```python
import gradio as gr
import time
def echo(message, history):
    t = message["text"]
    for i in range(len(t)):
        time.sleep(0.5)
        yield t[:i+1]
demo = gr.ChatInterface(fn=echo, examples=[{"text": "hello"}, {"text": "hola"}, {"text": "merhaba"}], title="Echo Bot", multimodal=True)
demo.launch()
```
When `multimodal=True`, the first parameter of your function should receive a dictionary consisting of the submitted text and uploaded files that looks like this: `{"text": "user input", "files": ["file_path1", "file_path2", ...]}`.
## Additional Inputs
You may want to add additional parameters to your chatbot and expose them to your users through the Chatbot UI. For example, suppose you want to add a textbox for a system prompt, or a slider that sets the number of tokens in the chatbot's response. The `ChatInterface` class supports an `additional_inputs` parameter which can be used to add additional input components.

View File

@ -91,15 +91,18 @@ def bot(history):
return history
```
In addition, it can handle media files, such as images, audio, and video. To pass in a media file, we must pass in the file as a tuple of two strings, like this: `(filepath, alt_text)`. The `alt_text` is optional, so you can also just pass in a tuple with a single element `(filepath,)`, like this:
In addition, it can handle media files, such as images, audio, and video. You can use the `MultimodalTextbox` component to easily upload all types of media files to your chatbot. To pass in a media file, we must pass in the file as a tuple of two strings, like this: `(filepath, alt_text)`. The `alt_text` is optional, so you can also just pass in a tuple with a single element `(filepath,)`, like this:
```python
def add_file(history, file):
history = history + [((file.name,), None)]
return history
def add_message(history, message):
for x in message["files"]:
history.append(((x["path"],), None))
if message["text"] is not None:
history.append((message["text"], None))
return history, gr.MultimodalTextbox(value=None, interactive=False, file_types=["image"])
```
Putting this together, we can create a _multimodal_ chatbot with a textbox for a user to submit text and an file upload button to submit images / audio / video files. The rest of the code looks pretty much the same as before:
Putting this together, we can create a _multimodal_ chatbot with a multimodal textbox for a user to submit text and media files. The rest of the code looks pretty much the same as before:
$code_chatbot_multimodal
$demo_chatbot_multimodal

View File

@ -55,6 +55,7 @@
"@gradio/label": "workspace:^",
"@gradio/markdown": "workspace:^",
"@gradio/model3d": "workspace:^",
"@gradio/multimodaltextbox": "workspace:^",
"@gradio/number": "workspace:^",
"@gradio/paramviewer": "workspace:^",
"@gradio/plot": "workspace:^",

View File

@ -24,9 +24,10 @@ test("images uploaded by a user should be shown in the chat", async ({
page
}) => {
const fileChooserPromise = page.waitForEvent("filechooser");
await page.getByRole("button", { name: "📁" }).click();
await page.getByRole("button", { name: "+", exact: true }).click();
const fileChooser = await fileChooserPromise;
await fileChooser.setFiles("./test/files/cheetah1.jpg");
await page.getByTestId("textbox").click();
await page.keyboard.press("Enter");
const user_message = await page.getByTestId("user").first().getByRole("img");
@ -45,9 +46,10 @@ test("audio uploaded by a user should be shown in the chatbot", async ({
page
}) => {
const fileChooserPromise = page.waitForEvent("filechooser");
await page.getByRole("button", { name: "📁" }).click();
await page.getByRole("button", { name: "+" }).click();
const fileChooser = await fileChooserPromise;
await fileChooser.setFiles("../../test/test_files/audio_sample.wav");
await page.getByTestId("textbox").click();
await page.keyboard.press("Enter");
const user_message = await page.getByTestId("user").first().locator("audio");
@ -65,9 +67,10 @@ test("videos uploaded by a user should be shown in the chatbot", async ({
page
}) => {
const fileChooserPromise = page.waitForEvent("filechooser");
await page.getByRole("button", { name: "📁" }).click();
await page.getByRole("button", { name: "+" }).click();
const fileChooser = await fileChooserPromise;
await fileChooser.setFiles("../../test/test_files/video_sample.mp4");
await page.getByTestId("textbox").click();
await page.keyboard.press("Enter");
const user_message = await page.getByTestId("user").first().locator("video");

View File

@ -40,7 +40,7 @@ test("chatinterface works with streaming functions and all buttons behave as exp
await retry_button.click();
const expected_text_el_2 = page.locator(".bot p", {
hasText: "Run 3 - You typed: hello"
hasText: ""
});
await expect(expected_text_el_2).toBeVisible();

View File

@ -0,0 +1,72 @@
<script lang="ts">
	import { onMount } from "svelte";
	import { Image } from "@gradio/image/shared";
	import type { FileData } from "@gradio/client";

	/** Example value: the text plus any attached files to preview. */
	export let value: { text: string; files: FileData[] } = {
		text: "",
		files: []
	};
	/** Where the example is rendered: a gallery card or a table cell. */
	export let type: "gallery" | "table";
	/** Whether this example is the currently selected one. */
	export let selected = false;

	let size: number;
	let el: HTMLDivElement;

	/**
	 * Constrain the text width to the measured container width (capped at
	 * 200px) and allow wrapping. Fix: operate on the `element` argument
	 * rather than the captured `el`, so the helper does what its signature
	 * promises for any element passed in.
	 */
	function set_styles(element: HTMLElement, el_width: number): void {
		if (!element || !el_width) return;
		element.style.setProperty(
			"--local-text-width",
			`${el_width < 150 ? el_width : 200}px`
		);
		element.style.whiteSpace = "unset";
	}

	onMount(() => {
		set_styles(el, size);
	});
</script>

<div
	bind:clientWidth={size}
	bind:this={el}
	class:table={type === "table"}
	class:gallery={type === "gallery"}
	class:selected
>
	<p>{value.text ? value.text : ""}</p>
	{#each value.files as file}
		{#if file.mime_type && file.mime_type.includes("image")}
			<Image src={file.url} alt="" />
		{:else}
			{file.path}
		{/if}
	{/each}
</div>

<style>
	.gallery {
		padding: var(--size-1) var(--size-2);
		display: flex;
		align-items: center;
		gap: 20px;
		overflow-x: auto;
	}
	div {
		overflow: hidden;
		min-width: var(--local-text-width);
		white-space: nowrap;
	}
	/* Scope to this component's images: a bare `:global(img)` rule would
	   leak the 100px sizing to every image on the page. */
	div :global(img) {
		width: 100px;
		height: 100px;
	}
	div > :global(p) {
		font-size: var(--text-lg);
		white-space: normal;
	}
</style>

View File

@ -0,0 +1,94 @@
<svelte:options accessors={true} />
<!--
  Top-level Gradio wrapper for MultimodalTextbox: renders the shared
  textbox inside a Block, shows loading status, and re-dispatches the
  textbox's events through the Gradio event bridge.
-->
<script context="module" lang="ts">
// Re-export the bare building blocks for consumers outside the app shell.
export { default as BaseMultimodalTextbox } from "./shared/MultimodalTextbox.svelte";
export { default as BaseExample } from "./Example.svelte";
</script>
<script lang="ts">
import type { Gradio, SelectData } from "@gradio/utils";
import MultimodalTextbox from "./shared/MultimodalTextbox.svelte";
import { Block } from "@gradio/atoms";
import { StatusTracker } from "@gradio/statustracker";
import type { LoadingStatus } from "@gradio/statustracker";
import type { FileData } from "@gradio/client";
// Gradio event bridge; the type parameter enumerates every event this
// component may dispatch and its payload type.
export let gradio: Gradio<{
change: typeof value;
submit: never;
blur: never;
select: SelectData;
input: never;
focus: never;
}>;
export let elem_id = "";
export let elem_classes: string[] = [];
export let visible = true;
// Component value: free-form text plus the files attached by the user.
export let value: { text: string; files: FileData[] } = {
text: "",
files: []
};
// Accepted upload types (extensions or MIME families); null = unrestricted.
export let file_types: string[] | null = null;
export let lines: number;
export let placeholder = "";
export let label = "MultimodalTextbox";
export let info: string | undefined = undefined;
export let show_label: boolean;
export let max_lines: number;
export let container = true;
export let scale: number | null = null;
export let min_width: number | undefined = undefined;
// Label rendered on the submit button.
export let submit_btn = "⌲";
export let loading_status: LoadingStatus | undefined = undefined;
// True when the value came from the backend rather than user typing;
// the child component uses this to suppress the "input" event.
export let value_is_output = false;
export let rtl = false;
export let text_align: "left" | "right" | undefined = undefined;
export let autofocus = false;
export let autoscroll = true;
// When false, the textbox is rendered disabled.
export let interactive: boolean;
// Root URL the upload component uses to resolve file routes.
export let root: string;
</script>
<Block
{visible}
{elem_id}
{elem_classes}
{scale}
{min_width}
allow_overflow={false}
padding={container}
>
{#if loading_status}
<StatusTracker
autoscroll={gradio.autoscroll}
i18n={gradio.i18n}
{...loading_status}
/>
{/if}
<!-- Forward every child event to the Gradio app; when max_lines is unset
     (0/undefined) default it to one more than `lines`. -->
<MultimodalTextbox
bind:value
bind:value_is_output
{file_types}
{root}
{label}
{info}
{show_label}
{lines}
{rtl}
{text_align}
max_lines={!max_lines ? lines + 1 : max_lines}
{placeholder}
{submit_btn}
{autofocus}
{container}
{autoscroll}
on:change={() => gradio.dispatch("change", value)}
on:input={() => gradio.dispatch("input")}
on:submit={() => gradio.dispatch("submit")}
on:blur={() => gradio.dispatch("blur")}
on:select={(e) => gradio.dispatch("select", e.detail)}
on:focus={() => gradio.dispatch("focus")}
disabled={!interactive}
/>
</Block>

View File

@ -0,0 +1,75 @@
<!-- Storybook stories for the MultimodalTextbox Gradio component. -->
<script>
import { Meta, Template, Story } from "@storybook/addon-svelte-csf";
import MultimodalTextbox from "./Index.svelte";
</script>
<!-- Controls exposed in the Storybook UI for interactive tweaking. -->
<Meta
title="Components/MultimodalTextbox"
component={MultimodalTextbox}
argTypes={{
label: {
control: "text",
description: "The textbox label",
name: "label"
},
show_label: {
options: [true, false],
description: "Whether to show the label",
control: { type: "boolean" },
defaultValue: true
},
text_align: {
options: ["left", "right"],
description: "Whether to align the text left or right",
control: { type: "select" },
defaultValue: "left"
},
lines: {
options: [1, 5, 10, 20],
description: "The number of lines to display in the textbox",
control: { type: "select" },
defaultValue: 1
},
max_lines: {
options: [1, 5, 10, 20],
description:
"The maximum number of lines to allow users to type in the textbox",
control: { type: "select" },
defaultValue: 1
},
rtl: {
options: [true, false],
description: "Whether to render right-to-left",
control: { type: "boolean" },
defaultValue: false
}
}}
/>
<!-- Shared template: each story spreads its args onto the component. -->
<Template let:args>
<MultimodalTextbox {...args} />
</Template>
<!-- Pre-populated value with both text and an attached image file. -->
<Story
name="MultimodalTextbox with file and label"
args={{
value: {
text: "sample text",
files: [
{
path: "https://gradio-builds.s3.amazonaws.com/demo-files/ghepardo-primo-piano.jpg",
url: "https://gradio-builds.s3.amazonaws.com/demo-files/ghepardo-primo-piano.jpg",
orig_name: "cheetah.jpg"
}
]
},
label: "My simple label",
show_label: true
}}
/>
<Story
name="MultimodalTextbox with 5 lines and max 5 lines"
args={{ lines: 5, max_lines: 5 }}
/>
<Story name="Right aligned textbox" args={{ text_align: "right" }} />
<Story name="RTL textbox" args={{ rtl: true }} />

View File

@ -0,0 +1,69 @@
// Unit tests for the MultimodalTextbox Gradio component (vitest).
import { test, describe, assert, afterEach } from "vitest";
import { spy } from "tinyspy";
import { cleanup, fireEvent, render, get_text, wait } from "@gradio/tootils";
import event from "@testing-library/user-event";
import MultimodalTextbox from "./Index.svelte";
import type { LoadingStatus } from "@gradio/statustracker";
// Shared loading-status fixture passed to every render.
const loading_status: LoadingStatus = {
eta: 0,
queue_position: 1,
queue_size: 1,
status: "complete" as LoadingStatus["status"],
scroll_to_output: false,
visible: true,
fn_index: 0,
show_progress: "full"
};
describe("MultimodalTextbox", () => {
// Unmount rendered components between tests to avoid cross-test leakage.
afterEach(() => cleanup());
test("renders provided value", async () => {
const { getByDisplayValue } = await render(MultimodalTextbox, {
show_label: true,
max_lines: 1,
loading_status,
lines: 1,
value: { text: "hello world", files: [] },
label: "Textbox",
interactive: false,
root: ""
});
const item: HTMLInputElement = getByDisplayValue(
"hello world"
) as HTMLInputElement;
assert.equal(item.value, "hello world");
});
test("changing the text should update the value", async () => {
const { component, getByDisplayValue, listen } = await render(
MultimodalTextbox,
{
show_label: true,
max_lines: 10,
loading_status,
lines: 1,
value: { text: "hi ", files: [] },
label: "MultimodalTextbox",
interactive: true,
root: ""
}
);
// NOTE(review): the rendered value is "hi " (trailing space) — this
// lookup relies on testing-library's whitespace normalization.
const item: HTMLInputElement = getByDisplayValue("hi") as HTMLInputElement;
const mock = listen("change");
item.focus();
await event.keyboard("some text");
assert.equal(item.value, "hi some text");
assert.equal(component.value.text, "hi some text");
// One "change" event per typed character ("some text" = 9 characters).
assert.equal(mock.callCount, 9);
assert.equal(mock.calls[8][0].detail.data.text, "hi some text");
assert.equal(mock.calls[8][0].detail.data.files.length, 0);
});
});

View File

@ -0,0 +1,27 @@
<!-- Storybook stories for the MultimodalTextbox example (gallery/table) view. -->
<script>
import { Meta, Template, Story } from "@storybook/addon-svelte-csf";
import MultimodalTextbox from "./Example.svelte";
</script>
<Meta
title="Components/MultimodalTextbox/Example"
component={MultimodalTextbox}
/>
<!-- Shared template: each story spreads its args onto the component. -->
<Template let:args>
<MultimodalTextbox {...args} />
</Template>
<Story
name="Text value"
args={{
value: { text: "the quick brown fox", files: [] }
}}
/>
<Story
name="Empty Value"
args={{
value: { text: "", files: [] }
}}
/>

View File

@ -0,0 +1,34 @@
# `@gradio/multimodaltextbox`
```html
<script>
import { BaseMultimodalTextbox, BaseExample } from "@gradio/multimodaltextbox";
</script>
```
BaseMultimodalTextbox
```javascript
export let value: { text: string; files: FileData[] } = { text: "", files: [] };
export let value_is_output = false;
export let lines = 1;
export let placeholder = "Type here...";
export let disabled = false;
export let label: string;
export let info: string | undefined = undefined;
export let show_label = true;
export let container = true;
export let max_lines: number;
export let submit_btn = "⌲";
export let rtl = false;
export let autofocus = false;
export let text_align: "left" | "right" | undefined = undefined;
export let autoscroll = true;
export let root: string;
export let file_types: string[] | null = null;
```
BaseExample
```javascript
export let value: { text: string; files: FileData[] };
export let type: "gallery" | "table";
export let selected = false;
```

View File

@ -0,0 +1,25 @@
{
"name": "@gradio/multimodaltextbox",
"version": "0.1.0",
"description": "Multimodal textbox (text + file upload) component for Gradio",
"type": "module",
"author": "",
"license": "ISC",
"private": false,
"main_changeset": true,
"main": "Index.svelte",
"exports": {
".": "./Index.svelte",
"./example": "./Example.svelte",
"./package.json": "./package.json"
},
"dependencies": {
"@gradio/atoms": "workspace:^",
"@gradio/icons": "workspace:^",
"@gradio/statustracker": "workspace:^",
"@gradio/utils": "workspace:^",
"@gradio/upload": "workspace:^",
"@gradio/image": "workspace:^",
"@gradio/client": "workspace:^"
}
}

View File

@ -0,0 +1,421 @@
<script lang="ts">
import {
beforeUpdate,
afterUpdate,
createEventDispatcher,
tick
} from "svelte";
import { text_area_resize, resize } from "../shared/utils";
import { BlockTitle } from "@gradio/atoms";
import { Upload } from "@gradio/upload";
import { Image } from "@gradio/image/shared";
import type { FileData } from "@gradio/client";
import { Clear, File, Music, Video } from "@gradio/icons";
import type { SelectData } from "@gradio/utils";
// Component value: free text plus the list of files the user attached.
export let value: { text: string; files: FileData[] } = {
text: "",
files: []
};
// True when the value came from the backend (suppresses "input" events).
export let value_is_output = false;
export let lines = 1;
export let placeholder = "Type here...";
export let disabled = false;
export let label: string;
export let info: string | undefined = undefined;
export let show_label = true;
export let container = true;
export let max_lines: number;
// Label rendered on the submit button; falsy hides the button.
export let submit_btn = "⌲";
export let rtl = false;
export let autofocus = false;
export let text_align: "left" | "right" | undefined = undefined;
// When true, keep the view pinned to the bottom as content grows
// (unless the user has scrolled up).
export let autoscroll = true;
export let root: string;
// Accepted upload types (extensions or MIME families); null = unrestricted.
export let file_types: string[] | null = null;
let el: HTMLTextAreaElement | HTMLInputElement;
let can_scroll: boolean;
let previous_scroll_top = 0;
let user_has_scrolled_up = false;
let dragging = false;
// Last seen text, used to detect text edits in the reactive block below.
let oldValue = value.text;
$: dispatch("drag", dragging);
// Emit "change" whenever the text portion changes; file changes are
// dispatched from the upload/remove handlers instead.
$: if (oldValue !== value.text) {
dispatch("change", value);
oldValue = value.text;
}
// Build the <input accept="..."> string: extensions (".pdf") pass
// through, bare type families ("image") become MIME wildcards ("image/*").
let accept_file_types: string | null;
if (file_types == null) {
accept_file_types = null;
} else {
file_types = file_types.map((x) => {
if (x.startsWith(".")) {
return x;
}
return x + "/*";
});
accept_file_types = file_types.join(", ");
}
// Normalize a null value (e.g. cleared from the backend) to the empty shape.
$: if (value === null) value = { text: "", files: [] };
// Re-fit the textarea height whenever the value changes (skipped when the
// height is fixed because lines === max_lines).
$: value, el && lines !== max_lines && resize(el, lines, max_lines);
const dispatch = createEventDispatcher<{
change: typeof value;
submit: undefined;
blur: undefined;
select: SelectData;
input: undefined;
focus: undefined;
drag: boolean;
upload: FileData[] | FileData;
clear: undefined;
load: FileData[] | FileData;
error: string;
}>();
// Before the DOM updates, remember whether the view is near the bottom
// (within 100px) so we only auto-scroll when already following the end.
beforeUpdate(() => {
can_scroll = el && el.offsetHeight + el.scrollTop > el.scrollHeight - 100;
});
const scroll = (): void => {
if (can_scroll && autoscroll && !user_has_scrolled_up) {
el.scrollTo(0, el.scrollHeight);
}
};
// Emit "change"; "input" is only emitted for user-initiated edits.
async function handle_change(): Promise<void> {
dispatch("change", value);
if (!value_is_output) {
dispatch("input");
}
}
afterUpdate(() => {
if (autofocus && el !== null) {
el.focus();
}
if (can_scroll && autoscroll) {
scroll();
}
// Reset the backend-output flag so subsequent edits count as user input.
value_is_output = false;
});
// Report the user's text selection (selected substring plus its
// [start, end] character range) via a "select" event.
function handle_select(event: Event): void {
	const input = event.target as HTMLTextAreaElement | HTMLInputElement;
	const start = input.selectionStart as number;
	const end = input.selectionEnd as number;
	dispatch("select", {
		value: input.value.substring(start, end),
		index: [start, end]
	});
}
// Submit on Enter: plain Enter submits a single-line box (when a
// max_lines is in effect), Shift+Enter submits a multi-line box.
async function handle_keypress(e: KeyboardEvent): Promise<void> {
	await tick();
	if (e.key !== "Enter") return;
	const multiline_submit = e.shiftKey && lines > 1;
	const single_line_submit = !e.shiftKey && lines === 1 && max_lines >= 1;
	if (multiline_submit || single_line_submit) {
		e.preventDefault();
		dispatch("submit");
	}
}
// Track whether the user has scrolled away from the bottom: scrolling
// upward disables auto-follow; reaching the bottom re-enables it.
function handle_scroll(event: Event): void {
	const scrolled = event.target as HTMLElement;
	const scroll_top = scrolled.scrollTop;
	if (scroll_top < previous_scroll_top) {
		user_has_scrolled_up = true;
	}
	previous_scroll_top = scroll_top;
	const at_bottom =
		scroll_top >= scrolled.scrollHeight - scrolled.clientHeight;
	if (at_bottom) {
		user_has_scrolled_up = false;
	}
}
// Append newly uploaded file(s) to the value and notify listeners.
// Fixes: (1) the multi-file branch mutated value.files without
// reassigning `value`, so Svelte reactivity never fired for array
// uploads; (2) handle_change() was called BEFORE the mutation,
// dispatching a stale duplicate "change" event.
async function handle_upload({
	detail
}: CustomEvent<FileData | FileData[]>): Promise<void> {
	if (Array.isArray(detail)) {
		value.files.push(...detail);
	} else {
		value.files.push(detail);
	}
	// Reassign so Svelte notices the in-place array mutation.
	value = value;
	await tick();
	// Dispatch "change" (and "input" for user edits) with the updated value.
	handle_change();
	dispatch("upload", detail);
}
// Remove the file at `index` from the attached files and notify
// listeners. Fix: handle_change() was called BEFORE the splice, so the
// "change" event carried the pre-removal value; it now fires after.
function remove_thumbnail(event: MouseEvent, index: number): void {
	// Don't let the click bubble to the surrounding thumbnail/upload area.
	event.stopPropagation();
	value.files.splice(index, 1);
	// Reassign so Svelte notices the in-place array mutation.
	value = value;
	handle_change();
}
// Hidden <input type="file"> owned by the Upload child; the "+" button
// forwards its click here because Upload's own click handling is disabled.
let hidden_upload: HTMLInputElement;
function handle_upload_click(): void {
if (hidden_upload) {
hidden_upload.click();
}
}
// Submit-button handler: delegates to the "submit" event.
async function handle_submit(): Promise<void> {
dispatch("submit");
}
</script>
<!-- svelte-ignore a11y-autofocus -->
<label class:container>
<BlockTitle {show_label} {info}>{label}</BlockTitle>
<div class="input-container">
<!-- Upload wraps the whole input area so files can be dropped anywhere
     on it; direct clicking is disabled and delegated to the "+" button. -->
<Upload
on:load={handle_upload}
filetype={accept_file_types}
{root}
bind:dragging
disable_click={true}
bind:hidden_upload
>
{#if submit_btn}
<button class:disabled class="submit-button" on:click={handle_submit}
>{submit_btn}</button
>
{/if}
<button class:disabled class="plus-button" on:click={handle_upload_click}
>+</button
>
<!-- Thumbnails of attached files with per-file delete buttons.
     NOTE(review): a <button> nested inside another <button> is invalid
     HTML and can break click/keyboard handling in some browsers —
     consider making the outer element a <span>/<div>.
     NOTE(review): the inline display style is redundant with the
     surrounding {#if value.files.length > 0} guard. -->
{#if value.files.length > 0}
<div
class="thumbnails scroll-hide"
data-testid="container_el"
style="display: {value.files.length > 0 ? 'flex' : 'none'};"
>
{#each value.files as file, index}
<button class="thumbnail-item thumbnail-small">
<button
class:disabled
class="delete-button"
on:click={(event) => remove_thumbnail(event, index)}
><Clear /></button
>
{#if file.mime_type && file.mime_type.includes("image")}
<Image
src={file.url}
title={null}
alt=""
loading="lazy"
class={"thumbnail-image"}
/>
{:else if file.mime_type && file.mime_type.includes("audio")}
<Music />
{:else if file.mime_type && file.mime_type.includes("video")}
<Video />
{:else}
<File />
{/if}
</button>
{/each}
</div>
{/if}
<!-- The text input; auto-resized between `lines` and `max_lines` by the
     text_area_resize action. -->
<textarea
data-testid="textbox"
use:text_area_resize={{
text: value.text,
lines: lines,
max_lines: max_lines
}}
class="scroll-hide"
dir={rtl ? "rtl" : "ltr"}
bind:value={value.text}
bind:this={el}
{placeholder}
rows={lines}
{disabled}
{autofocus}
on:keypress={handle_keypress}
on:blur
on:select={handle_select}
on:focus
on:scroll={handle_scroll}
style={text_align ? "text-align: " + text_align : ""}
/>
</Upload>
</div>
</label>
<style>
.input-container {
display: flex;
flex-direction: column;
align-items: center;
justify-content: center;
}
textarea {
align-self: flex-start;
outline: none !important;
background: var(--input-background-fill);
padding: var(--input-padding);
width: 90%;
max-height: 100%;
height: 25px;
color: var(--body-text-color);
font-weight: var(--input-text-weight);
font-size: var(--input-text-size);
line-height: var(--line-sm);
border: none;
margin-top: 0px;
margin-bottom: 0px;
margin-left: 30px;
padding-top: 12px;
}
textarea:disabled {
-webkit-text-fill-color: var(--body-text-color);
-webkit-opacity: 1;
opacity: 1;
width: 100%;
margin-left: 0px;
}
textarea::placeholder {
color: var(--input-placeholder-color);
}
.plus-button,
.submit-button {
position: absolute;
background: var(--button-secondary-background-fill);
color: var(--button-secondary-text-color);
border: none;
text-align: center;
text-decoration: none;
font-size: 20px;
cursor: pointer;
border-radius: 50%;
width: 30px;
height: 30px;
bottom: 15px;
}
.plus-button:hover,
.submit-button:hover {
background: var(--button-secondary-background-fill-hover);
}
.plus-button:active,
.submit-button:active {
box-shadow: var(--button-shadow-active);
}
.submit-button {
right: 10px;
margin-left: 5px;
padding-bottom: 5px;
padding-left: 2px;
}
.plus-button {
left: 10px;
margin-right: 5px;
}
.thumbnails :global(img) {
width: var(--size-full);
height: var(--size-full);
object-fit: cover;
border-radius: var(--radius-lg);
}
.thumbnails {
align-self: flex-start;
display: flex;
justify-content: left;
align-items: center;
gap: var(--spacing-lg);
}
.thumbnail-item {
display: flex;
justify-content: center;
align-items: center;
--ring-color: transparent;
position: relative;
box-shadow:
0 0 0 2px var(--ring-color),
var(--shadow-drop);
border: 1px solid var(--border-color-primary);
border-radius: var(--radius-lg);
background: var(--background-fill-secondary);
aspect-ratio: var(--ratio-square);
width: var(--size-full);
height: var(--size-full);
cursor: default;
}
.thumbnail-small {
flex: none;
transform: scale(0.9);
transition: 0.075s;
width: var(--size-12);
height: var(--size-12);
}
.thumbnail-item :global(svg) {
width: 30px;
height: 30px;
}
.delete-button {
display: flex;
justify-content: center;
align-items: center;
position: absolute;
right: -7px;
top: -7px;
color: var(--button-secondary-text-color);
background: var(--button-secondary-background-fill);
border: none;
text-align: center;
text-decoration: none;
font-size: 10px;
cursor: pointer;
border-radius: 50%;
width: 20px;
height: 20px;
}
.disabled {
display: none;
}
.delete-button :global(svg) {
width: 12px;
height: 12px;
}
.delete-button:hover {
filter: brightness(1.2);
border: 0.8px solid var(--color-grey-500);
}
</style>

View File

@ -0,0 +1,58 @@
import { tick } from "svelte";
interface Value {
lines: number;
max_lines: number;
text: string;
}
/**
 * Auto-size a textarea/input between `lines` and `max_lines` rows.
 *
 * Heights are computed as 21px per row plus one extra row of padding
 * (21 * (n + 1)). An undefined `max_lines` means no upper bound.
 * No-op when `lines === max_lines` (the height is fixed).
 *
 * Fix: the original ternary tested `max_lines === undefined` twice,
 * leaving a dead `21 * 11` default branch that could never be taken;
 * the unreachable branch has been removed without changing behavior.
 */
export async function resize(
	target: HTMLTextAreaElement | HTMLInputElement,
	lines: number,
	max_lines: number
): Promise<void> {
	// Wait for pending DOM updates so scrollHeight reflects the new text.
	await tick();
	if (lines === max_lines) return;

	// Upper bound in px, or `false` for "no limit".
	let max = max_lines === undefined ? false : 21 * (max_lines + 1);
	let min = 21 * (lines + 1);

	// Collapse first so scrollHeight reports the content's natural height.
	target.style.height = "1px";

	let scroll_height;
	if (max && target.scrollHeight > max) {
		scroll_height = max;
	} else if (target.scrollHeight < min) {
		scroll_height = min;
	} else {
		scroll_height = target.scrollHeight;
	}

	target.style.height = `${scroll_height}px`;
}
/**
 * Svelte action: keep a textarea auto-sized as the user types.
 *
 * Fix: the original `destroy` passed a NEW arrow function to
 * removeEventListener, which never matches the listener added with
 * addEventListener, so the handler was never removed (leak). It also
 * returned no `destroy` at all when the initial text was blank. Both
 * paths now share one named handler and always return a cleanup.
 */
export function text_area_resize(
	_el: HTMLTextAreaElement,
	_value: Value
): any | undefined {
	// Fixed height: nothing to manage.
	if (_value.lines === _value.max_lines) return;

	_el.style.overflowY = "scroll";

	// Single handler reference so add/remove operate on the same function.
	const on_input = (event: Event): void => {
		resize(event.target as HTMLTextAreaElement, _value.lines, _value.max_lines);
	};
	_el.addEventListener("input", on_input);

	// Size once up-front only when there is initial text to fit.
	if (_value.text.trim()) {
		resize(_el, _value.lines, _value.max_lines);
	}

	return {
		destroy: () => _el.removeEventListener("input", on_input)
	};
}

View File

@ -16,6 +16,7 @@
export let hidden = false;
export let format: "blob" | "file" = "file";
export let uploading = false;
export let hidden_upload: HTMLInputElement | null = null;
let upload_id: string;
let file_data: FileData[];
@ -24,7 +25,6 @@
// Needed for wasm support
const upload_fn = getContext<typeof upload_files>("upload_files");
let hidden_upload: HTMLInputElement;
const dispatch = createEventDispatcher();
const validFileTypes = ["image", "video", "audio", "text", "file"];
const processFileType = (type: string): string => {
@ -70,8 +70,10 @@
export function open_file_upload(): void {
if (disable_click) return;
hidden_upload.value = "";
hidden_upload.click();
if (hidden_upload) {
hidden_upload.value = "";
hidden_upload.click();
}
}
async function handle_upload(
@ -92,7 +94,9 @@
if (!files.length) {
return;
}
let _files: File[] = files.map((f) => new File([f], f.name));
let _files: File[] = files.map(
(f) => new File([f], f.name, { type: f.type })
);
file_data = await prepare_files(_files);
return await handle_upload(file_data);
}
@ -191,6 +195,7 @@
class:center
class:boundedheight
class:flex
class:disable_click
style:height="100%"
tabindex={hidden ? -1 : 0}
on:drag|preventDefault|stopPropagation
@ -212,7 +217,7 @@
type="file"
bind:this={hidden_upload}
on:change={load_files_from_upload}
accept={accept_file_types}
accept={accept_file_types || undefined}
multiple={file_count === "multiple" || undefined}
webkitdirectory={file_count === "directory" || undefined}
mozdirectory={file_count === "directory" || undefined}
@ -240,9 +245,13 @@
}
.flex {
display: flex;
flex-direction: column;
justify-content: center;
align-items: center;
}
.disable_click {
cursor: default;
}
input {
display: none;

27
pnpm-lock.yaml generated
View File

@ -453,6 +453,9 @@ importers:
'@gradio/model3d':
specifier: workspace:^
version: link:../model3D
'@gradio/multimodaltextbox':
specifier: workspace:^
version: link:../multimodaltextbox
'@gradio/number':
specifier: workspace:^
version: link:../number
@ -1193,6 +1196,30 @@ importers:
specifier: ^1.0.5
version: 1.0.5
js/multimodaltextbox:
dependencies:
'@gradio/atoms':
specifier: workspace:^
version: link:../atoms
'@gradio/client':
specifier: workspace:^
version: link:../../client/js
'@gradio/icons':
specifier: workspace:^
version: link:../icons
'@gradio/image':
specifier: workspace:^
version: link:../image
'@gradio/statustracker':
specifier: workspace:^
version: link:../statustracker
'@gradio/upload':
specifier: workspace:^
version: link:../upload
'@gradio/utils':
specifier: workspace:^
version: link:../utils
js/number:
dependencies:
'@gradio/atoms':