Commit 4d8d5696 authored by Thomas Wolf

NEC with GLiNER

parent 2c66220a
@@ -4,18 +4,19 @@ Makes evaluating models easier.
 """
 from src.models.llms_interface import available_models as llms
 from src.models.GLiNER import find_entities as find_entities_gliner
+from src.models.GLiNER import classify_entity as classify_entity_gliner
 from src.experiments.NER_with_T5 import classify_entity as classify_entity_t5
 from src.experiments.NER_with_LLMs.NER_with_LLMs import find_entities as find_entities_llm


 def classify_entity(model_name, sentence, entity, labels):
     """
-    Entity Classification
+    NEC. Returns label (string) for entity.
     """
     if model_name == "T5":
         return classify_entity_t5(sentence, entity, labels)
     elif model_name == "GLiNER":
-        pass # todo
+        return classify_entity_gliner(sentence, entity, labels)


 def predict_mask_mlm(model_name, masked_sentence, labels):

@@ -32,7 +33,7 @@ def predict_mask_nli(model_name, masked_sentence, labels):

 def find_entities(model_name, sentence, labels):
     """
-    NER
+    NER. Returns list of pairs [(entity, label), ...]
     """
     if model_name in llms:
         return find_entities_llm(model_name, sentence, labels)
...
@@ -4,6 +4,6 @@ Evaluates GLiNER as SotA and plots results using reusable functions in plotter.
 from src.metrics import NER_metrics, read_NER_metrics

-NER_metrics("GLiNER", "CoNLL", "results", test_instances=100)
+# NER_metrics("GLiNER", "CoNLL", "results", test_instances=100)
 read_NER_metrics("results")
"""
This file evaluates all NEC approaches.
"""
# todo: perform tests on datasets and store results
# todo read results and compare the models / plot the results
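The evaluation loop itself is still a todo above; a minimal sketch of how it could look, assuming a small list of (sentence, entity, gold label) test cases stands in for a real dataset such as CoNLL (the test cases and the accuracy count below are illustrative, not part of the commit):

from src.common_interface import classify_entity

# Illustrative test cases; a real run would load these from a dataset such as CoNLL.
test_cases = [
    ("Barack Obama was the president of the United States.", "Barack Obama", "person"),
    ("Barack Obama was the president of the United States.", "United States", "location"),
]
labels = ["person", "organization", "time", "location", "miscellaneous"]

for model in ["GLiNER", "T5"]:
    correct = sum(
        classify_entity(model, sentence, entity, labels) == gold
        for sentence, entity, gold in test_cases
    )
    print(f"{model}: {correct}/{len(test_cases)} correct")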
 from src.models.T5 import infer_nli


 def classify_entity(sentence, entity, labels):
     print("classify entity")
     for label in labels:
...
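The rest of this loop is collapsed in the diff; a sketch of how such a loop can use infer_nli for NEC, treating the sentence as the premise and each candidate label as a hypothesis (the hypothesis template and the "entailment" output string are assumptions, not taken from the collapsed code):

# Illustrative sketch only; the collapsed loop body may differ.
sentence = "Barack Obama was the president of the United States."
entity = "Barack Obama"
for label in ["person", "organization", "location"]:
    # Template "<entity> is a <label>." is assumed for illustration.
    if infer_nli(premise=sentence, hypothesis=f"{entity} is a {label}.") == "entailment":
        print(label)
        break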
@@ -16,3 +16,13 @@ def find_entities(sentence, labels):
         entity_list.append((entity["text"], entity["label"]))
     return entity_list
+
+
+def classify_entity(sentence, entity, labels):
+    entity_list = find_entities(sentence, labels)
+    for e in entity_list:
+        if e[0] == entity:
+            return e[1]  # Return label
+    return ""
@@ -10,9 +10,10 @@ tokenizer = T5Tokenizer.from_pretrained(model_name)
 model = T5ForConditionalGeneration.from_pretrained(model_name)
 print("Finished loading model: T5 NLI")


 def infer_nli(premise, hypothesis):
     input_text = f"nli hypothesis: {hypothesis} premise: {premise}"
     print("tokenize")
     inputs = tokenizer(input_text, return_tensors="pt")
@@ -24,19 +25,21 @@ def infer_nli(premise, hypothesis):
     return result


 def preprocess_data(sample):
     input_text = f"nli hypothesis: {sample['hypothesis']} premise: {sample['premise']}"
     target_text = label_map[bool(sample['entailment'])]
     tokenized_input = tokenizer(input_text, padding="max_length", truncation=True, max_length=512)
     tokenized_target = tokenizer(target_text, padding="max_length", truncation=True, max_length=10)
     tokenized_input["labels"] = tokenized_target["input_ids"]
     return tokenized_input


 def finetune_model(premises, hypotheses, entailment):
     # TODO: should we use dataset on a higher level as well?
-    data_dict = { "premise": premises, "hypothesis": hypotheses, "entailment": entailment}
+    data_dict = {"premise": premises, "hypothesis": hypotheses, "entailment": entailment}
     dataset = Dataset.from_dict(data_dict)
     print(dataset)
@@ -78,6 +81,3 @@ def finetune_model(premises, hypotheses, entailment):
     )
     trainer.train()
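A hypothetical call matching the visible signature, showing the shape of the three parallel lists finetune_model expects (the training pairs are illustrative only):

# Illustrative only -- small parallel lists of NLI training pairs.
premises = ["Barack Obama was the president of the United States."] * 2
hypotheses = ["Barack Obama is a person.", "Barack Obama is a location."]
entailment = [True, False]
finetune_model(premises, hypotheses, entailment)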
@@ -14,3 +14,6 @@ def plot_bars(data, x_column, y_column, grouping, title, ylabel, xlabel):
     """
     Reusable barchart plotting function
     """
+    # todo: bar chart with three grouped columns for each model: precision, recall, f1-score
\ No newline at end of file
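One way the todo could be filled in (a sketch, assuming data arrives as rows like {"model": ..., "metric": ..., "value": ...} and that pandas and matplotlib are available):

import matplotlib.pyplot as plt
import pandas as pd


def plot_bars(data, x_column, y_column, grouping, title, ylabel, xlabel):
    """
    Reusable barchart plotting function
    """
    # Grouped bars: one group per x value, one bar per grouping value
    # (e.g. precision / recall / f1-score side by side for each model).
    pivoted = pd.DataFrame(data).pivot(index=x_column, columns=grouping, values=y_column)
    pivoted.plot(kind="bar")
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel(xlabel)
    plt.tight_layout()
    plt.show()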
 from src.common_interface import classify_entity

-tested_models = ["T5"]
-test_sentences = ["Barack Obama was the president of the United States."]
-test_entity = ["Barack Obama"]
-true_labels = ["person"]
-labels = ["person", "organization", "time", "location"]
+tested_models = ["GLiNER", "T5"]
+test_sentence = "Barack Obama was the president of the United States."
+test_entities = ["Barack Obama", "United States"]
+labels = ["person", "organization", "time", "location", "miscellaneous"]

 print("Test NEC")
 for model in tested_models:
-    for index in range(len(test_sentences)):
-        classify_entity(model, test_sentences[index], test_entity[index], labels)
+    print("\n")
+    for test_entity in test_entities:
+        print(f"{model} prediction for {test_entity}:")
+        print(classify_entity(model, test_sentence, test_entity, labels))