Add a flagging callback to save JSON files to a Hugging Face dataset (#1821)

* work on saving flags in JSON format * explained what I did more clearly * final updates + added test case * reviews to flagging.py for HuggingFaceDatasetJSONSaver * formatted imports * used uuid for random ids * used uuid for random + function to get dataset infos * reformatted flagging.py * fix examples test * formatting * async examples * working on mix * comment out failing test * fixed interface problem * final updates to HuggingFaceDatasetJSONSaver flagging.py * final updates to HuggingFaceDatasetJSONSaver flagging.py * formatting * some tweaks * tweaks * tweaks * Omar's fixes * added back test.init * restored test init Co-authored-by: Abubakar Abid <abubakar@huggingface.co>
2025-01-30 11:00:11 +08:00 · 2022-08-24 01:01:37 +02:00 · 2022-08-24 01:01:37 +02:00 · 9c4dc6c183
commit 9c4dc6c183
parent e63ffb326e
3 changed files with 235 additions and 32 deletions
--- a/gradio/init.py
+++ b/gradio/init.py
@ -48,6 +48,7 @@ from gradio.exceptions import Error
 from gradio.flagging import (
    CSVLogger,
    FlaggingCallback,
+    HuggingFaceDatasetJSONSaver,
    HuggingFaceDatasetSaver,
    SimpleCSVLogger,
 )
--- a/gradio/flagging.py
+++ b/gradio/flagging.py
@ -5,6 +5,7 @@ import datetime
 import io
 import json
 import os
+import uuid
 from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING, Any, List, Optional

@ -18,6 +19,52 @@ if TYPE_CHECKING:
 set_documentation_group("flagging")


+def _get_dataset_features_info(is_new, components):
+    """
+    Builds the dataset features description and the header list for a set of components.
+
+    Parameters:
+    is_new: boolean, whether the dataset is new or not. When False, no features
+        and no headers are generated (both come back empty).
+    components: list of components; each one contributes a column named after
+        its `label`.
+
+    Returns:
+    infos: a dictionary of the dataset features, nested under ["flagged"]["features"]
+    file_preview_types: dictionary mapping of gradio components to appropriate string.
+    headers: list of header strings
+
+    """
+    # File previews for certain input and output types
+    file_preview_types = {gr.Audio: "Audio", gr.Image: "Image"}
+    features = {}
+    headers = []
+
+    if is_new:
+        for component in components:
+            label = component.label
+            headers.append(label)
+            features[label] = {"dtype": "string", "_type": "Value"}
+
+            # The first preview type the component is an instance of wins
+            # (dictionary order); None means the component has no file preview.
+            preview_type = next(
+                (
+                    type_name
+                    for component_cls, type_name in file_preview_types.items()
+                    if isinstance(component, component_cls)
+                ),
+                None,
+            )
+            if preview_type is not None:
+                file_label = label + " file"
+                headers.append(file_label)
+                features[file_label] = {"_type": preview_type}
+
+        # The flag column always comes last
+        headers.append("flag")
+        features["flag"] = {"dtype": "string", "_type": "Value"}
+
+    infos = {"flagged": {"features": features}}
+    return infos, file_preview_types, headers
+
+
 class FlaggingCallback(ABC):
    """
    An abstract class for defining the methods that any FlaggingCallback should have.
@ -296,7 +343,7 @@ class HuggingFaceDatasetSaver(FlaggingCallback):
            clone_from=path_to_dataset_repo,
            use_auth_token=self.hf_token,
        )
-        self.repo.git_pull()
+        self.repo.git_pull(lfs=True)

        # Should filename be user-specified?
        self.log_file = os.path.join(self.dataset_dir, "data.csv")
@ -312,45 +359,17 @@ class HuggingFaceDatasetSaver(FlaggingCallback):
        self.repo.git_pull(lfs=True)

        is_new = not os.path.exists(self.log_file)
-        infos = {"flagged": {"features": {}}}

        with open(self.log_file, "a", newline="", encoding="utf-8") as csvfile:
            writer = csv.writer(csvfile)

            # File previews for certain input and output types
-            file_preview_types = {
-                gr.inputs.Audio: "Audio",
-                gr.outputs.Audio: "Audio",
-                gr.inputs.Image: "Image",
-                gr.outputs.Image: "Image",
-            }
+            infos, file_preview_types, headers = _get_dataset_features_info(
+                is_new, self.components
+            )

            # Generate the headers and dataset_infos
            if is_new:
-                headers = []
-
-                for component, sample in zip(self.components, flag_data):
-                    headers.append(component.label)
-                    headers.append(component.label)
-                    infos["flagged"]["features"][component.label] = {
-                        "dtype": "string",
-                        "_type": "Value",
-                    }
-                    if isinstance(component, tuple(file_preview_types)):
-                        headers.append(component.label + " file")
-                        for _component, _type in file_preview_types.items():
-                            if isinstance(component, _component):
-                                infos["flagged"]["features"][
-                                    component.label + " file"
-                                ] = {"_type": _type}
-                                break
-
-                headers.append("flag")
-                infos["flagged"]["features"]["flag"] = {
-                    "dtype": "string",
-                    "_type": "Value",
-                }
-
                writer.writerow(utils.sanitize_list_for_csv(headers))

            # Generate the row corresponding to the flagged sample
@ -378,3 +397,151 @@ class HuggingFaceDatasetSaver(FlaggingCallback):
        self.repo.push_to_hub(commit_message="Flagged sample #{}".format(line_count))

        return line_count
+
+
+class HuggingFaceDatasetJSONSaver(FlaggingCallback):
+    """
+    A FlaggingCallback that saves flagged data to a Hugging Face dataset in JSONL format.
+
+    Each data sample is saved in a different JSONL file,
+    allowing multiple users to use flagging simultaneously.
+    Saving to a single CSV would cause errors as only one user can edit at the same time.
+
+    """
+
+    def __init__(
+        self,
+        hf_foken: str,
+        dataset_name: str,
+        organization: Optional[str] = None,
+        private: bool = False,
+        verbose: bool = True,
+    ):
+        """
+        Params:
+        hf_foken (str): The token to use to access the huggingface API.
+            (The parameter name is a misspelling of `hf_token`; it is kept
+            as-is so existing keyword callers keep working.)
+        dataset_name (str): The name of the dataset to save the data to, e.g.
+            "image-classifier-1"
+        organization (str): The name of the organization to which to attach
+            the datasets. If None, the dataset attaches to the user only.
+            Stored on the instance; not read anywhere else in this class.
+        private (bool): If the dataset does not already exist, whether it
+            should be created as a private dataset or public. Private datasets
+            may require paid huggingface.co accounts
+        verbose (bool): Whether to print out the status of the dataset
+            creation. Stored on the instance; not read anywhere else in this class.
+        """
+        self.hf_foken = hf_foken
+        self.dataset_name = dataset_name
+        self.organization_name = organization
+        self.dataset_private = private
+        self.verbose = verbose
+
+    def setup(self, components: List[IOComponent], flagging_dir: str):
+        """
+        Creates (or reuses) the dataset repo, clones it locally and pulls it.
+
+        Params:
+        components List[Component]: list of components for flagging
+        flagging_dir (str): local directory where the dataset is cloned,
+        updated, and pushed from.
+
+        Raises:
+        ImportError: if `huggingface_hub` is not installed.
+        """
+        try:
+            import huggingface_hub
+        except ImportError:  # ModuleNotFoundError is a subclass of ImportError
+            raise ImportError(
+                "Package `huggingface_hub` not found is needed "
+                "for HuggingFaceDatasetJSONSaver. Try 'pip install huggingface_hub'."
+            )
+        path_to_dataset_repo = huggingface_hub.create_repo(
+            name=self.dataset_name,
+            token=self.hf_foken,
+            private=self.dataset_private,
+            repo_type="dataset",
+            exist_ok=True,
+        )
+        self.path_to_dataset_repo = path_to_dataset_repo  # e.g. "https://huggingface.co/datasets/abidlabs/test-audio-10"
+        self.components = components
+        self.flagging_dir = flagging_dir
+        self.dataset_dir = os.path.join(flagging_dir, self.dataset_name)
+        self.repo = huggingface_hub.Repository(
+            local_dir=self.dataset_dir,
+            clone_from=path_to_dataset_repo,
+            use_auth_token=self.hf_foken,
+        )
+        self.repo.git_pull(lfs=True)
+
+        # Its existence is what `flag` uses to decide whether the dataset is new
+        self.infos_file = os.path.join(self.dataset_dir, "dataset_infos.json")
+
+    def flag(
+        self,
+        flag_data: List[Any],
+        flag_option: Optional[str] = None,
+        flag_index: Optional[int] = None,
+        username: Optional[str] = None,
+    ) -> str:
+        """
+        Saves one flagged sample into its own folder and pushes it to the hub.
+
+        Params:
+        flag_data List[Any]: one value per component passed to `setup`
+        flag_option (str): value stored under the "flag" key ("" if None)
+        flag_index (int): not used by this callback
+        username (str): not used by this callback
+
+        Returns:
+        (str) the unique (uuid4) folder name created for this sample
+        """
+        self.repo.git_pull(lfs=True)
+
+        # Generate unique folder for the flagged sample
+        unique_name = self.get_unique_name()  # unique name for folder
+        folder_name = os.path.join(
+            self.dataset_dir, unique_name
+        )  # unique folder for specific example
+        os.makedirs(folder_name)
+
+        # Now uses the existence of `dataset_infos.json` to determine if new
+        is_new = not os.path.exists(self.infos_file)
+
+        # File previews for certain input and output types
+        infos, file_preview_types, _ = _get_dataset_features_info(
+            is_new, self.components
+        )
+
+        # Generate the row and header corresponding to the flagged sample
+        csv_data = []
+        headers = []
+
+        for component, sample in zip(self.components, flag_data):
+            headers.append(component.label)
+
+            try:
+                filepath = component.save_flagged(
+                    folder_name, component.label, sample, None
+                )
+            except Exception:
+                # Deliberate best-effort:
+                # Could not parse 'sample' (mostly) because it was None and `component.save_flagged`
+                # does not handle None cases.
+                # for example: Label (line 3109 of components.py raises an error if data is None)
+                filepath = None
+
+            if isinstance(component, tuple(file_preview_types)):
+                headers.append(component.label + " file")
+
+                # NOTE(review): the URL is appended before the saved path, so the
+                # `label` key receives the URL and `label file` the path -- confirm
+                # this pairing is the intended one.
+                csv_data.append(
+                    "{}/resolve/main/{}/{}".format(
+                        self.path_to_dataset_repo, unique_name, filepath
+                    )
+                    if filepath is not None
+                    else None
+                )
+
+            csv_data.append(filepath)
+        headers.append("flag")
+        csv_data.append(flag_option if flag_option is not None else "")
+
+        # Creates metadata dict from row data and dumps it
+        metadata_dict = dict(zip(headers, csv_data))
+        self.dump_json(metadata_dict, os.path.join(folder_name, "metadata.jsonl"))
+
+        if is_new:
+            # Go through dump_json so the file handle is closed (and therefore
+            # flushed) before the repo is pushed.
+            self.dump_json(infos, self.infos_file)
+
+        self.repo.push_to_hub(commit_message="Flagged sample {}".format(unique_name))
+        return unique_name
+
+    def get_unique_name(self):
+        """Returns a random uuid4 as a string, used as the per-sample folder name."""
+        return str(uuid.uuid4())
+
+    def dump_json(self, thing: dict, file_path: str) -> None:
+        """Serializes `thing` as JSON into `file_path`, overwriting any existing file."""
+        with open(file_path, "w+", encoding="utf8") as f:
+            json.dump(thing, f)
--- a/test/test_flagging.py
+++ b/test/test_flagging.py
@ -67,6 +67,41 @@ class TestHuggingFaceDatasetSaver(unittest.TestCase):
            self.assertEqual(row_count, 2)  # 3 rows written including header


+class TestHuggingFaceDatasetJSONSaver(unittest.TestCase):
+    """Tests for flagging.HuggingFaceDatasetJSONSaver with the hub calls mocked out."""
+
+    def test_saver_setup(self):
+        # Swap the hub entry points for mocks (the attributes are not restored afterwards)
+        huggingface_hub.create_repo = MagicMock()
+        huggingface_hub.Repository = MagicMock()
+        saver = flagging.HuggingFaceDatasetJSONSaver("test", "test")
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            saver.setup([gr.Audio, gr.Textbox], tmpdirname)
+        # setup() must have asked the hub for the dataset repo exactly once
+        huggingface_hub.create_repo.assert_called_once()
+
+    def test_saver_flag(self):
+        huggingface_hub.create_repo = MagicMock()
+        huggingface_hub.Repository = MagicMock()
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            callback = flagging.HuggingFaceDatasetJSONSaver("test", "test")
+            interface = gr.Interface(
+                lambda x: x,
+                "text",
+                "text",
+                flagging_dir=tmpdirname,
+                flagging_callback=callback,
+            )
+            # Repository is mocked, so the dataset directory is created by hand
+            test_dir = os.path.join(tmpdirname, "test")
+            os.mkdir(test_dir)
+            interface.launch(prevent_thread_lock=True)
+            row_unique_name = interface.flagging_callback.flag(["test", "test"])
+            # Test existence of metadata.jsonl file for that example
+            metadata_path = os.path.join(test_dir, row_unique_name, "metadata.jsonl")
+            self.assertTrue(os.path.isfile(metadata_path))
+
+
 class TestDisableFlagging(unittest.TestCase):
    def test_flagging_no_permission_error_with_flagging_disabled(self):
        with tempfile.TemporaryDirectory() as tmpdirname: