mirror of
https://github.com/gradio-app/gradio.git
synced 2025-01-30 11:00:11 +08:00
Add a flagging callback to save JSON files to a Hugging Face dataset (#1821)
* work on saving flags in JSON format * explained what I did more clearly * final updates + added test case * reviews to flagging.py for HuggingFaceDatasetJSONSaver * formatted imports * used uuid for random ids * used uuid for random + function to get dataset infos * reformatted flagging.py * fix examples test * formatting * async examples * working on mix * comment out failing test * fixed interface problem * final updates to HuggingFaceDatasetJSONSaver flagging.py * final updates to HuggingFaceDatasetJSONSaver flagging.py * formatting * some tweaks * tweaks * tweaks * omar's fixes * added back test.init * restored test init Co-authored-by: Abubakar Abid <abubakar@huggingface.co>
This commit is contained in:
parent
e63ffb326e
commit
9c4dc6c183
@ -48,6 +48,7 @@ from gradio.exceptions import Error
|
||||
from gradio.flagging import (
|
||||
CSVLogger,
|
||||
FlaggingCallback,
|
||||
HuggingFaceDatasetJSONSaver,
|
||||
HuggingFaceDatasetSaver,
|
||||
SimpleCSVLogger,
|
||||
)
|
||||
|
@ -5,6 +5,7 @@ import datetime
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
import uuid
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import TYPE_CHECKING, Any, List, Optional
|
||||
|
||||
@ -18,6 +19,52 @@ if TYPE_CHECKING:
|
||||
set_documentation_group("flagging")
|
||||
|
||||
|
||||
def _get_dataset_features_info(is_new, components):
    """
    Builds the dataset feature description and the header list for a set of components.

    Parameters:
    is_new: boolean, whether the dataset is new or not. Features and headers are
        only generated when this is True; otherwise both come back empty.
    components: list of components; the `label` attribute of each one is used as
        its column name.

    Returns:
    infos: a dictionary of the dataset features, shaped {"flagged": {"features": {...}}}
    file_preview_types: dictionary mapping gradio component classes to the feature
        type string used for their file preview column.
    headers: list of header strings
    """
    # Component classes that get an extra "<label> file" preview column
    file_preview_types = {gr.Audio: "Audio", gr.Image: "Image"}
    features = {}
    headers = []

    if is_new:
        for component in components:
            label = component.label
            headers.append(label)
            features[label] = {"dtype": "string", "_type": "Value"}

            # The first matching class (in dict order) decides the preview type
            preview_type = next(
                (
                    type_name
                    for component_cls, type_name in file_preview_types.items()
                    if isinstance(component, component_cls)
                ),
                None,
            )
            if preview_type is not None:
                file_label = label + " file"
                headers.append(file_label)
                features[file_label] = {"_type": preview_type}

        # The flag option always occupies the last column
        headers.append("flag")
        features["flag"] = {"dtype": "string", "_type": "Value"}

    infos = {"flagged": {"features": features}}
    return infos, file_preview_types, headers
|
||||
|
||||
|
||||
class FlaggingCallback(ABC):
|
||||
"""
|
||||
An abstract class for defining the methods that any FlaggingCallback should have.
|
||||
@ -296,7 +343,7 @@ class HuggingFaceDatasetSaver(FlaggingCallback):
|
||||
clone_from=path_to_dataset_repo,
|
||||
use_auth_token=self.hf_token,
|
||||
)
|
||||
self.repo.git_pull()
|
||||
self.repo.git_pull(lfs=True)
|
||||
|
||||
# Should filename be user-specified?
|
||||
self.log_file = os.path.join(self.dataset_dir, "data.csv")
|
||||
@ -312,45 +359,17 @@ class HuggingFaceDatasetSaver(FlaggingCallback):
|
||||
self.repo.git_pull(lfs=True)
|
||||
|
||||
is_new = not os.path.exists(self.log_file)
|
||||
infos = {"flagged": {"features": {}}}
|
||||
|
||||
with open(self.log_file, "a", newline="", encoding="utf-8") as csvfile:
|
||||
writer = csv.writer(csvfile)
|
||||
|
||||
# File previews for certain input and output types
|
||||
file_preview_types = {
|
||||
gr.inputs.Audio: "Audio",
|
||||
gr.outputs.Audio: "Audio",
|
||||
gr.inputs.Image: "Image",
|
||||
gr.outputs.Image: "Image",
|
||||
}
|
||||
infos, file_preview_types, headers = _get_dataset_features_info(
|
||||
is_new, self.components
|
||||
)
|
||||
|
||||
# Generate the headers and dataset_infos
|
||||
if is_new:
|
||||
headers = []
|
||||
|
||||
for component, sample in zip(self.components, flag_data):
|
||||
headers.append(component.label)
|
||||
headers.append(component.label)
|
||||
infos["flagged"]["features"][component.label] = {
|
||||
"dtype": "string",
|
||||
"_type": "Value",
|
||||
}
|
||||
if isinstance(component, tuple(file_preview_types)):
|
||||
headers.append(component.label + " file")
|
||||
for _component, _type in file_preview_types.items():
|
||||
if isinstance(component, _component):
|
||||
infos["flagged"]["features"][
|
||||
component.label + " file"
|
||||
] = {"_type": _type}
|
||||
break
|
||||
|
||||
headers.append("flag")
|
||||
infos["flagged"]["features"]["flag"] = {
|
||||
"dtype": "string",
|
||||
"_type": "Value",
|
||||
}
|
||||
|
||||
writer.writerow(utils.sanitize_list_for_csv(headers))
|
||||
|
||||
# Generate the row corresponding to the flagged sample
|
||||
@ -378,3 +397,151 @@ class HuggingFaceDatasetSaver(FlaggingCallback):
|
||||
self.repo.push_to_hub(commit_message="Flagged sample #{}".format(line_count))
|
||||
|
||||
return line_count
|
||||
|
||||
|
||||
class HuggingFaceDatasetJSONSaver(FlaggingCallback):
    """
    A FlaggingCallback that saves flagged data to a Hugging Face dataset in JSONL format.

    Each data sample is saved in a different JSONL file,
    allowing multiple users to use flagging simultaneously.
    Saving to a single CSV would cause errors as only one user can edit at the same time.
    """

    def __init__(
        self,
        hf_foken: str,
        dataset_name: str,
        organization: Optional[str] = None,
        private: bool = False,
        verbose: bool = True,
    ):
        """
        Params:
        hf_foken (str): The token to use to access the huggingface API. The
            parameter name is misspelled ("foken"); it is kept as-is so that
            existing keyword callers keep working.
        dataset_name (str): The name of the dataset to save the data to, e.g.
            "image-classifier-1"
        organization (str): The name of the organization to which to attach
            the datasets. If None, the dataset attaches to the user only.
            Stored on the instance; not referenced elsewhere in this class.
        private (bool): If the dataset does not already exist, whether it
            should be created as a private dataset or public. Private datasets
            may require paid huggingface.co accounts
        verbose (bool): Whether to print out the status of the dataset
            creation. Stored on the instance; not referenced elsewhere in this class.
        """
        self.hf_foken = hf_foken
        self.dataset_name = dataset_name
        self.organization_name = organization
        self.dataset_private = private
        self.verbose = verbose

    def setup(self, components: List[IOComponent], flagging_dir: str):
        """
        Creates (or reuses) the dataset repo, clones it locally and pulls the latest state.

        Params:
        components List[Component]: list of components for flagging
        flagging_dir (str): local directory where the dataset is cloned,
            updated, and pushed from.
        Raises:
        ImportError: if the `huggingface_hub` package is not installed.
        """
        try:
            import huggingface_hub
        except ImportError as err:  # ModuleNotFoundError is a subclass of ImportError
            raise ImportError(
                "Package `huggingface_hub` not found is needed "
                "for HuggingFaceDatasetJSONSaver. Try 'pip install huggingface_hub'."
            ) from err
        path_to_dataset_repo = huggingface_hub.create_repo(
            name=self.dataset_name,
            token=self.hf_foken,
            private=self.dataset_private,
            repo_type="dataset",
            exist_ok=True,
        )
        self.path_to_dataset_repo = path_to_dataset_repo  # e.g. "https://huggingface.co/datasets/abidlabs/test-audio-10"
        self.components = components
        self.flagging_dir = flagging_dir
        self.dataset_dir = os.path.join(flagging_dir, self.dataset_name)
        self.repo = huggingface_hub.Repository(
            local_dir=self.dataset_dir,
            clone_from=path_to_dataset_repo,
            use_auth_token=self.hf_foken,
        )
        self.repo.git_pull(lfs=True)

        self.infos_file = os.path.join(self.dataset_dir, "dataset_infos.json")

    def flag(
        self,
        flag_data: List[Any],
        flag_option: Optional[str] = None,
        flag_index: Optional[int] = None,
        username: Optional[str] = None,
    ) -> str:
        """
        Saves one flagged sample into its own folder and pushes it to the hub.

        Params:
        flag_data (List[Any]): one value per component, in the same order as the
            components passed to `setup`.
        flag_option (str): the flag label to record; an empty string is stored when None.
        flag_index (int): accepted for interface compatibility; not used here.
        username (str): accepted for interface compatibility; not used here.
        Returns:
        (str) the unique name of the folder the sample was written to.
        """
        self.repo.git_pull(lfs=True)

        # Generate unique folder for the flagged sample
        unique_name = self.get_unique_name()
        folder_name = os.path.join(self.dataset_dir, unique_name)
        os.makedirs(folder_name)

        # The existence of `dataset_infos.json` determines whether the dataset is new
        is_new = not os.path.exists(self.infos_file)

        # File previews for certain input and output types
        infos, file_preview_types, _ = _get_dataset_features_info(
            is_new, self.components
        )
        preview_classes = tuple(file_preview_types)

        # Generate the row and header corresponding to the flagged sample
        csv_data = []
        headers = []

        for component, sample in zip(self.components, flag_data):
            headers.append(component.label)

            try:
                filepath = component.save_flagged(
                    folder_name, component.label, sample, None
                )
            except Exception:
                # Deliberate best-effort: 'sample' could not be saved, mostly because
                # it was None and `component.save_flagged` does not handle None
                # (e.g. Label raises when its data is None). Record it as missing.
                filepath = None

            if isinstance(component, preview_classes):
                headers.append(component.label + " file")
                # For preview components the resolvable URL is stored under the
                # label and the raw saved path under "<label> file".
                csv_data.append(
                    "{}/resolve/main/{}/{}".format(
                        self.path_to_dataset_repo, unique_name, filepath
                    )
                    if filepath is not None
                    else None
                )

            csv_data.append(filepath)

        headers.append("flag")
        csv_data.append(flag_option if flag_option is not None else "")

        # Creates metadata dict from row data and dumps it
        metadata_dict = dict(zip(headers, csv_data))
        self.dump_json(metadata_dict, os.path.join(folder_name, "metadata.jsonl"))

        if is_new:
            # Written through dump_json so the file handle is always closed
            self.dump_json(infos, self.infos_file)

        self.repo.push_to_hub(commit_message="Flagged sample {}".format(unique_name))
        return unique_name

    def get_unique_name(self):
        """Returns a random UUID4 string used as the folder name of a flagged sample."""
        return str(uuid.uuid4())

    def dump_json(self, thing: dict, file_path: str) -> None:
        """Serializes `thing` as JSON into `file_path`, overwriting any existing file."""
        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(thing, f)
|
||||
|
@ -67,6 +67,41 @@ class TestHuggingFaceDatasetSaver(unittest.TestCase):
|
||||
self.assertEqual(row_count, 2) # 3 rows written including header
|
||||
|
||||
|
||||
class TestHuggingFaceDatasetJSONSaver(unittest.TestCase):
    """Tests HuggingFaceDatasetJSONSaver with the huggingface_hub entry points mocked out."""

    def test_saver_setup(self):
        # Replace the hub calls with mocks so setup() can be observed in isolation
        huggingface_hub.create_repo = MagicMock()
        huggingface_hub.Repository = MagicMock()
        saver = flagging.HuggingFaceDatasetJSONSaver("test", "test")
        with tempfile.TemporaryDirectory() as tmpdirname:
            saver.setup([gr.Audio, gr.Textbox], tmpdirname)
        huggingface_hub.create_repo.assert_called_once()

    def test_saver_flag(self):
        huggingface_hub.create_repo = MagicMock()
        huggingface_hub.Repository = MagicMock()
        with tempfile.TemporaryDirectory() as tmpdirname:
            callback = flagging.HuggingFaceDatasetJSONSaver("test", "test")
            interface = gr.Interface(
                lambda x: x,
                "text",
                "text",
                flagging_dir=tmpdirname,
                flagging_callback=callback,
            )
            # Repository is a MagicMock here, so presumably nothing is cloned;
            # the dataset directory is created by hand.
            test_dir = os.path.join(tmpdirname, "test")
            os.mkdir(test_dir)
            interface.launch(prevent_thread_lock=True)
            row_unique_name = interface.flagging_callback.flag(["test", "test"])
            # Test existence of metadata.jsonl file for that example
            metadata_path = os.path.join(test_dir, row_unique_name, "metadata.jsonl")
            self.assertTrue(os.path.isfile(metadata_path))
|
||||
|
||||
|
||||
class TestDisableFlagging(unittest.TestCase):
|
||||
def test_flagging_no_permission_error_with_flagging_disabled(self):
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
|
Loading…
Reference in New Issue
Block a user