NER Improvements and Guide (#1869)

* ner accept hf format * formatting * added ner guide * guide fixes * added unittests for highlighttext * formatting
2025-03-31 12:20:26 +08:00 · 2022-07-25 12:16:00 -07:00 · 2022-07-25 12:16:00 -07:00 · c9b8ad80fd
commit c9b8ad80fd
parent cd0d520696
5 changed files with 162 additions and 5 deletions
--- a/demo/ner_pipeline/requirements.txt
+++ b/demo/ner_pipeline/requirements.txt
@ -0,0 +1,2 @@
+torch
+transformers
--- a/demo/ner_pipeline/run.py
+++ b/demo/ner_pipeline/run.py
@ -0,0 +1,18 @@
+from transformers import pipeline
+
+import gradio as gr
+
+# Hugging Face "ner" pipeline; no model name is specified here.
+ner_pipeline = pipeline("ner")
+
+# Sample sentence handed to gr.Interface through its `examples` argument.
+examples = [
+    "Does Chicago have any stores and does Joe live here?",
+]
+
+def ner(text):
+    """Run the NER pipeline on `text` and return it in dictionary form.
+
+    The result pairs the original text with the pipeline's list of
+    entities under the keys "text" and "entities".
+    """
+    return {
+        "text": text,
+        "entities": ner_pipeline(text),
+    }
+
+# Text box in, highlighted entities out; `ner` does the conversion.
+demo = gr.Interface(
+    fn=ner,
+    inputs=gr.Textbox(placeholder="Enter sentence here..."),
+    outputs=gr.HighlightedText(),
+    examples=examples,
+)
+demo.launch()
--- a/gradio/components.py
+++ b/gradio/components.py
@ -3148,14 +3148,14 @@ class HighlightedText(Changeable, IOComponent):
    """
    Displays text that contains spans that are highlighted by category or numerical value.
    Preprocessing: this component does *not* accept input.
-    Postprocessing: expects a {List[Tuple[str, float | str]]]} consisting of spans of text and their associated labels.
+    Postprocessing: expects a {List[Tuple[str, float | str]]} consisting of spans of text and their associated labels, or a {Dict} with two keys: (1) "text", whose value is the complete text, and (2) "entities", which is a list of dictionaries, each of which has the keys: "entity" (consisting of the entity label), "start" (the character index where the label starts), and "end" (the character index where the label ends).

    Demos: diff_texts, text_analysis
    """

    def __init__(
        self,
-        value: Optional[str] = None,
+        value: Optional[List[Tuple[str, str | float | None]] | Dict] = None,
        *,
        color_map: Dict[str, str] = None,  # Parameter moved to HighlightedText.style()
        show_legend: bool = False,
@ -3206,7 +3206,7 @@ class HighlightedText(Changeable, IOComponent):

    @staticmethod
    def update(
-        value: Optional[Any] = None,
+        value: Optional[List[Tuple[str, str | float | None]] | Dict] = None,
        color_map: Optional[Dict[str, str]] = None,
        show_legend: Optional[bool] = None,
        label: Optional[str] = None,
@ -3225,8 +3225,8 @@ class HighlightedText(Changeable, IOComponent):
        return updated_config

    def postprocess(
-        self, y: List[Tuple[str, str | float | None]]
-    ) -> List[Tuple[str, str | float | None]]:
+        self, y: Optional[List[Tuple[str, str | float | None]] | Dict]
+    ) -> Optional[List[Tuple[str, str | float | None]]]:
        """
        Parameters:
            y: List of (word, category) tuples
@ -3235,6 +3235,22 @@ class HighlightedText(Changeable, IOComponent):
        """
        if y is None:
            return None
+        if isinstance(y, dict):
+            text = y["text"]
+            entities = y["entities"]
+            if len(entities) == 0:
+                y = [(text, None)]
+            else:
+                list_format = []
+                index = 0
+                for entity in entities:
+                    list_format.append((text[index : entity["start"]], None))
+                    list_format.append(
+                        (text[entity["start"] : entity["end"]], entity["entity"])
+                    )
+                    index = entity["end"]
+                list_format.append((text[index:], None))
+                y = list_format
        if self.combine_adjacent:
            output = []
            running_text, running_category = None, None
--- a/guides/named_entity_recognition.md
+++ b/guides/named_entity_recognition.md
@ -0,0 +1,86 @@
+# Named-Entity Recognition 
+
+Related spaces: https://huggingface.co/spaces/rajistics/biobert_ner_demo, https://huggingface.co/spaces/abidlabs/ner, https://huggingface.co/spaces/rajistics/Financial_Analyst_AI
+Tags: NER, TEXT, HIGHLIGHT
+Docs: highlightedtext
+
+## Introduction
+
+Named-entity recognition (NER), also known as token classification or text tagging, is the task of taking a sentence and classifying every word (or "token") into different categories, such as names of people or names of locations, or different parts of speech. 
+
+For example, given the sentence:
+
+> Does Chicago have any Pakistani restaurants?
+
+A named-entity recognition algorithm may identify:
+
+* "Chicago" as a **location**
+* "Pakistani" as an **ethnicity**  
+
+
+and so on. 
+
+Using `gradio` (specifically the `HighlightedText` component), you can easily build a web demo of your NER model and share that with the rest of your team.
+
+Here is an example of a demo that you'll be able to build:
+
+$demo_ner_pipeline
+
+This tutorial will show how to take a pretrained NER model and deploy it with a Gradio interface. We will show two different ways to use the `HighlightedText` component -- depending on your NER model, either of these two ways may be easier to learn! 
+
+### Prerequisites
+
+Make sure you have the `gradio` Python package already [installed](/getting_started). You will also need a pretrained named-entity recognition model. You can use your own; in this tutorial, we will use one from the `transformers` library.
+
+### Approach 1: List of Entity Dictionaries
+
+Many named-entity recognition models output a list of dictionaries. Each dictionary consists of an *entity*, a "start" index, and an "end" index. This is, for example, how NER models in the `transformers` library operate:
+
+```py
+from transformers import pipeline 
+ner_pipeline = pipeline("ner")
+ner_pipeline("Does Chicago have any Pakistani restaurants")
+```
+
+Output:
+
+```bash
+[{'entity': 'I-LOC',
+  'score': 0.9988978,
+  'index': 2,
+  'word': 'Chicago',
+  'start': 5,
+  'end': 12},
+ {'entity': 'I-MISC',
+  'score': 0.9958592,
+  'index': 5,
+  'word': 'Pakistani',
+  'start': 22,
+  'end': 31}]
+```
+
+If you have such a model, it is very easy to hook it up to Gradio's `HighlightedText` component. All you need to do is pass in this **list of entities**, along with the **original text** given to the model, together as a dictionary, with the keys being `"entities"` and `"text"` respectively.
+
+Here is a complete example:
+
+$code_ner_pipeline
+$demo_ner_pipeline
+
+### Approach 2: List of Tuples
+
+An alternative way to pass data into the `HighlightedText` component is a list of tuples. The first element of each tuple should be the word or words that are being classified into a particular entity. The second element should be the entity label (or `None` if they should be unlabeled). The `HighlightedText` component automatically strings together the words and labels to display the entities.
+
+In some cases, this can be easier than the first approach. Here is a demo showing this approach using spaCy's part-of-speech tagger:
+
+$code_text_analysis
+$demo_text_analysis
+
+
+--------------------------------------------
+
+
+And you're done! That's all you need to know to build a web-based GUI for your NER model. 
+
+Fun tip: you can share your NER demo instantly with others simply by setting `share=True` in `launch()`. 
+
+
--- a/test/test_components.py
+++ b/test/test_components.py
@ -1508,6 +1508,41 @@ class TestLabel(unittest.TestCase):


 class TestHighlightedText(unittest.TestCase):
+    def test_postprocess(self):
+        """
+        postprocess with list-of-tuples input and with
+        {"text": ..., "entities": [...]} dictionary input
+        """
+        highlighted = gr.HighlightedText()
+
+        # List-of-tuples input is expected to come back unchanged.
+        expected_spans = [
+            ("", None),
+            ("Wolfgang", "PER"),
+            (" lives in ", None),
+            ("Berlin", "LOC"),
+            ("", None),
+        ]
+        self.assertEqual(expected_spans, highlighted.postprocess(expected_spans))
+
+        # Dictionary input with two entities should yield the same spans.
+        payload = {
+            "text": "Wolfgang lives in Berlin",
+            "entities": [
+                {"entity": "PER", "start": 0, "end": 8},
+                {"entity": "LOC", "start": 18, "end": 24},
+            ],
+        }
+        self.assertEqual(expected_spans, highlighted.postprocess(payload))
+
+        # No entities: the whole text becomes one unlabeled span.
+        sentence = "I live there"
+        self.assertEqual(
+            [(sentence, None)],
+            highlighted.postprocess({"text": sentence, "entities": []}),
+        )
+
+        # One entity covering the entire text: empty unlabeled spans on both sides.
+        name = "Wolfgang"
+        whole_text_entity = [{"entity": "PER", "start": 0, "end": 8}]
+        self.assertEqual(
+            [("", None), (name, "PER"), ("", None)],
+            highlighted.postprocess({"text": name, "entities": whole_text_entity}),
+        )
+
    def test_component_functions(self):
        """
        get_config, save_flagged, restore_flagged