mirror of
https://github.com/gradio-app/gradio.git
synced 2025-03-31 12:20:26 +08:00
NER Improvements and Guide (#1869)
* ner accept hf format * formatting * added ner guide * guide fixes * added unittests for highlighttext * formatting
This commit is contained in:
parent
cd0d520696
commit
c9b8ad80fd
2
demo/ner_pipeline/requirements.txt
Normal file
2
demo/ner_pipeline/requirements.txt
Normal file
@ -0,0 +1,2 @@
|
||||
torch
|
||||
transformers
|
18
demo/ner_pipeline/run.py
Normal file
18
demo/ner_pipeline/run.py
Normal file
@ -0,0 +1,18 @@
|
||||
from transformers import pipeline
|
||||
|
||||
import gradio as gr
|
||||
|
||||
ner_pipeline = pipeline("ner")

examples = [
    "Does Chicago have any stores and does Joe live here?",
]


def ner(text):
    """Run the NER pipeline on `text` and return the text with its entities.

    The result is a dict with the keys "text" (the input string) and
    "entities" (the pipeline output), which is the value returned to the
    HighlightedText output component.
    """
    entities = ner_pipeline(text)
    return {"text": text, "entities": entities}


demo = gr.Interface(
    fn=ner,
    inputs=gr.Textbox(placeholder="Enter sentence here..."),
    outputs=gr.HighlightedText(),
    examples=examples,
)
demo.launch()
|
@ -3148,14 +3148,14 @@ class HighlightedText(Changeable, IOComponent):
|
||||
"""
|
||||
Displays text that contains spans that are highlighted by category or numerical value.
|
||||
Preprocessing: this component does *not* accept input.
|
||||
Postprocessing: expects a {List[Tuple[str, float | str]]]} consisting of spans of text and their associated labels.
|
||||
Postprocessing: expects a {List[Tuple[str, float | str]]} consisting of spans of text and their associated labels, or a {Dict} with two keys: (1) "text" whose value is the complete text, and (2) "entities", which is a list of dictionaries, each of which has the keys: "entity" (consisting of the entity label), "start" (the character index where the label starts), and "end" (the character index where the label ends).
|
||||
|
||||
Demos: diff_texts, text_analysis
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
value: Optional[str] = None,
|
||||
value: Optional[List[Tuple[str, str | float | None]] | Dict] = None,
|
||||
*,
|
||||
color_map: Dict[str, str] = None, # Parameter moved to HighlightedText.style()
|
||||
show_legend: bool = False,
|
||||
@ -3206,7 +3206,7 @@ class HighlightedText(Changeable, IOComponent):
|
||||
|
||||
@staticmethod
|
||||
def update(
|
||||
value: Optional[Any] = None,
|
||||
value: Optional[List[Tuple[str, str | float | None]] | Dict] = None,
|
||||
color_map: Optional[Dict[str, str]] = None,
|
||||
show_legend: Optional[bool] = None,
|
||||
label: Optional[str] = None,
|
||||
@ -3225,8 +3225,8 @@ class HighlightedText(Changeable, IOComponent):
|
||||
return updated_config
|
||||
|
||||
def postprocess(
|
||||
self, y: List[Tuple[str, str | float | None]]
|
||||
) -> List[Tuple[str, str | float | None]]:
|
||||
self, y: Optional[List[Tuple[str, str | float | None]] | Dict]
|
||||
) -> Optional[List[Tuple[str, str | float | None]]]:
|
||||
"""
|
||||
Parameters:
|
||||
y: List of (word, category) tuples
|
||||
@ -3235,6 +3235,22 @@ class HighlightedText(Changeable, IOComponent):
|
||||
"""
|
||||
if y is None:
|
||||
return None
|
||||
if isinstance(y, dict):
|
||||
text = y["text"]
|
||||
entities = y["entities"]
|
||||
if len(entities) == 0:
|
||||
y = [(text, None)]
|
||||
else:
|
||||
list_format = []
|
||||
index = 0
|
||||
for entity in entities:
|
||||
list_format.append((text[index : entity["start"]], None))
|
||||
list_format.append(
|
||||
(text[entity["start"] : entity["end"]], entity["entity"])
|
||||
)
|
||||
index = entity["end"]
|
||||
list_format.append((text[index:], None))
|
||||
y = list_format
|
||||
if self.combine_adjacent:
|
||||
output = []
|
||||
running_text, running_category = None, None
|
||||
|
86
guides/named_entity_recognition.md
Normal file
86
guides/named_entity_recognition.md
Normal file
@ -0,0 +1,86 @@
|
||||
# Named-Entity Recognition
|
||||
|
||||
Related spaces: https://huggingface.co/spaces/rajistics/biobert_ner_demo, https://huggingface.co/spaces/abidlabs/ner, https://huggingface.co/spaces/rajistics/Financial_Analyst_AI
|
||||
Tags: NER, TEXT, HIGHLIGHT
|
||||
Docs: highlightedtext
|
||||
|
||||
## Introduction
|
||||
|
||||
Named-entity recognition (NER), also known as token classification or text tagging, is the task of taking a sentence and classifying every word (or "token") into different categories, such as names of people or names of locations, or different parts of speech.
|
||||
|
||||
For example, given the sentence:
|
||||
|
||||
> Does Chicago have any Pakistani restaurants?
|
||||
|
||||
A named-entity recognition algorithm may identify:
|
||||
|
||||
* "Chicago" as a **location**
|
||||
* "Pakistani" as an **ethnicity**
|
||||
|
||||
|
||||
and so on.
|
||||
|
||||
Using `gradio` (specifically the `HighlightedText` component), you can easily build a web demo of your NER model and share that with the rest of your team.
|
||||
|
||||
Here is an example of a demo that you'll be able to build:
|
||||
|
||||
$demo_ner_pipeline
|
||||
|
||||
This tutorial will show how to take a pretrained NER model and deploy it with a Gradio interface. We will show two different ways to use the `HighlightedText` component -- depending on your NER model, either of these two ways may be easier to learn!
|
||||
|
||||
### Prerequisites
|
||||
|
||||
Make sure you have the `gradio` Python package already [installed](/getting_started). You will also need a pretrained named-entity recognition model. You can use your own; in this tutorial, we will use one from the `transformers` library.
|
||||
|
||||
### Approach 1: List of Entity Dictionaries
|
||||
|
||||
Many named-entity recognition models output a list of dictionaries. Each dictionary consists of an *entity*, a "start" index, and an "end" index. This is, for example, how NER models in the `transformers` library operate:
|
||||
|
||||
```py
|
||||
from transformers import pipeline
|
||||
ner_pipeline = pipeline("ner")
|
||||
ner_pipeline("Does Chicago have any Pakistani restaurants")
|
||||
```
|
||||
|
||||
Output:
|
||||
|
||||
```bash
|
||||
[{'entity': 'I-LOC',
|
||||
'score': 0.9988978,
|
||||
'index': 2,
|
||||
'word': 'Chicago',
|
||||
'start': 5,
|
||||
'end': 12},
|
||||
{'entity': 'I-MISC',
|
||||
'score': 0.9958592,
|
||||
'index': 5,
|
||||
'word': 'Pakistani',
|
||||
'start': 22,
|
||||
'end': 31}]
|
||||
```
|
||||
|
||||
If you have such a model, it is very easy to hook it up to Gradio's `HighlightedText` component. All you need to do is pass this **list of entities**, along with the **original text** that was given to the model, to the component together as a dictionary, with the keys being `"entities"` and `"text"` respectively.
|
||||
|
||||
Here is a complete example:
|
||||
|
||||
$code_ner_pipeline
|
||||
$demo_ner_pipeline
|
||||
|
||||
### Approach 2: List of Tuples
|
||||
|
||||
An alternative way to pass data into the `HighlightedText` component is a list of tuples. The first element of each tuple should be the word or words that are being classified into a particular entity. The second element should be the entity label (or `None` if they should be unlabeled). The `HighlightedText` component automatically strings together the words and labels to display the entities.
|
||||
|
||||
In some cases, this can be easier than the first approach. Here is a demo showing this approach using spaCy's part-of-speech tagger:
|
||||
|
||||
$code_text_analysis
|
||||
$demo_text_analysis
|
||||
|
||||
|
||||
--------------------------------------------
|
||||
|
||||
|
||||
And you're done! That's all you need to know to build a web-based GUI for your NER model.
|
||||
|
||||
Fun tip: you can share your NER demo instantly with others simply by setting `share=True` in `launch()`.
|
||||
|
||||
|
@ -1508,6 +1508,41 @@ class TestLabel(unittest.TestCase):
|
||||
|
||||
|
||||
class TestHighlightedText(unittest.TestCase):
|
||||
def test_postprocess(self):
    """
    Check postprocess() on both accepted input formats: a list of
    (span, label) tuples and a {"text", "entities"} dictionary.
    """
    highlighted = gr.HighlightedText()
    expected = [
        ("", None),
        ("Wolfgang", "PER"),
        (" lives in ", None),
        ("Berlin", "LOC"),
        ("", None),
    ]

    # Tuple-list input: the test expects an equal list back.
    actual = highlighted.postprocess(expected)
    self.assertEqual(expected, actual)

    # Dict input with two entities must yield the same tuple list.
    sentence = "Wolfgang lives in Berlin"
    spans = [
        {"entity": "PER", "start": 0, "end": 8},
        {"entity": "LOC", "start": 18, "end": 24},
    ]
    actual = highlighted.postprocess({"text": sentence, "entities": spans})
    self.assertEqual(expected, actual)

    # Dict input without entities: the whole text is one unlabeled span.
    sentence = "I live there"
    spans = []
    actual = highlighted.postprocess({"text": sentence, "entities": spans})
    self.assertEqual([(sentence, None)], actual)

    # A single entity covering the whole text is padded with empty spans.
    sentence = "Wolfgang"
    spans = [
        {"entity": "PER", "start": 0, "end": 8},
    ]
    actual = highlighted.postprocess({"text": sentence, "entities": spans})
    self.assertEqual([("", None), (sentence, "PER"), ("", None)], actual)
|
||||
|
||||
def test_component_functions(self):
|
||||
"""
|
||||
get_config, save_flagged, restore_flagged
|
||||
|
Loading…
x
Reference in New Issue
Block a user