diff --git a/.env-example b/.env-example
index b2c620706a514ec82c3fef12209f021d926f4573..2af4cdc4ab9b8b8612cac3645a1d2c9b8f9f041d 100644
--- a/.env-example
+++ b/.env-example
@@ -1,4 +1,4 @@
-To use DeepSeek-R1, please create a HuggingFace token (for free!).
+To use HuggingFace models, please create a HuggingFace token (for free!).
 Create a .env file with the following content:
 
 HF_ACCESS_TOKEN="Your HuggingFace Token"
diff --git a/README.md b/README.md
index 5fa6054023e556185174cc7f21d0ad31067fee09..2b739c3b5311b7f9c5d80402644853f3b41ae6f0 100644
--- a/README.md
+++ b/README.md
@@ -1,18 +1,28 @@
 # Named-Entity Recognition of Different Granularities
 
 ## Project Overview and Motivation
+This project explores various named entity recognition (NER) approaches, focusing on
+named entity classification (NEC).
+These methods include:
+- NLI
+- MLM (entity/class masking)
+- Word2Vec
+- LLM Prompting
+
+More about these approaches in the report (link report here)
 
 ## Setup
+1. Run `pip install -r requirements.txt`
+2. If you want to use DeepSeek-R1 (via HuggingFace), follow the instructions in [`.env-example`](.env-example).
+3. Run whichever script you want - the required models and datasets are downloaded automatically.
 
 ## Project Structure
+Test cases for models, datasets, and individual approaches (useful for debugging) can be found in [`/tests`](tests).
 
-## Datasets
-
-## Models
+Models are defined in [`/src/models`](src/models) and are accessed via the [`common_interface`](src/common_interface.py).
 
-## Experiments and Results
+Datasets can be found in [`/data`](data) and are accessed via the [`data manager`](data/data_manager.py).
 
-## Conclusion
+Scripts for executing code on the cluster are in [`/scripts`](scripts).
 
-## References
-https://aclanthology.org/P15-2048.pdf
\ No newline at end of file
+The experiments conducted during this project and some of their results are in [`/src/experiments`](src/experiments).
diff --git a/src/common_interface.py b/src/common_interface.py
index 578b666329e312b86ca43753b2b41f511e5ba663..dc6c73cd9c4d765d5bf75a801b9a109974b88dcc 100644
--- a/src/common_interface.py
+++ b/src/common_interface.py
@@ -10,7 +10,7 @@ from src.models.T5_MLM_label import classify_entity as classify_entity_t5_mlm_la
 from src.models.Word2Vec import classify_entity as classify_entity_word2vec
 from src.models.Word2Vec import set_label_dict as set_label_dict_word2vec
 from src.experiments.NER_with_LLMs.NER_with_LLMs import find_entities as find_entities_llm
-from src.experiments.NER_with_LLMs.NER_with_LLMs import classify_entity as classify_entities_llm
+from src.experiments.NER_with_LLMs.NER_with_LLMs import classify_entity as classify_entity_llm
 
 
 def classify_entity(model_name, sentence, entity, labels):
@@ -26,7 +26,7 @@ def classify_entity(model_name, sentence, entity, labels):
     elif model_name == "Word2Vec":
         return classify_entity_word2vec(entity, labels)
     elif model_name in llms:
-        return classify_entities_llm(model_name, sentence, entity, labels)
+        return classify_entity_llm(model_name, sentence, entity, labels)
     else:
         print(f"classify_entity not implemented for {model_name}")
@@ -42,6 +42,7 @@ def find_entities(model_name, sentence, labels):
     else:
         print(f"find_entities not implemented for {model_name}")
 
+
 def set_label_dict(model_name, label_dict):
     """
     NER. Sets the label dictionary required for the Word2Vec model
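Reviewer note: a minimal usage sketch of the dispatch functions renamed above. The signatures `classify_entity(model_name, sentence, entity, labels)` and `find_entities(model_name, sentence, labels)` come from this diff; the label set and the assumption that "GLiNER" is routed through `find_entities` are illustrative, not confirmed by the patch.

```python
# Hypothetical usage sketch of src/common_interface.py, based only on the
# signatures visible in this diff. The label set is an assumption, as is
# GLiNER support in find_entities.
from src.common_interface import classify_entity, find_entities

labels = ["person", "organization", "location"]
sentence = "Europe rejects German call to boycott British lamb."

# NEC: assign one of the candidate labels to a known entity span.
print(classify_entity("Word2Vec", sentence, "Europe", labels))

# NER: detect and label all entity spans in one pass.
print(find_entities("GLiNER", sentence, labels))
```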
diff --git a/src/experiments/NER_with_LLMs/NER_with_LLMs.py b/src/experiments/NER_with_LLMs/NER_with_LLMs.py
index d1374ed3fce6dd6e59cc12ca212b82d47c3f44e0..ae0f72edfe034a86511e35a8576649dbef3636e4 100644
--- a/src/experiments/NER_with_LLMs/NER_with_LLMs.py
+++ b/src/experiments/NER_with_LLMs/NER_with_LLMs.py
@@ -18,6 +18,7 @@ Result: <answer>[('Europe', 'organization'), ('Europe', 'location'), ('German',
 
 If there is no valid entity in the target sentence, answer "<answer>[]</answer>".
 
+Your Task:
 """
 
 nli_system_prompt = """You are part of a named entity classification pipeline.
@@ -31,6 +32,7 @@ Sentence: 'Europe rejects German call to boycott British lamb.'
 Target Entity: Europe
 Desired Result: <answer>organization</answer>
 
+Your Task:
 """
diff --git a/src/models/GLiNER.py b/src/models/GLiNER.py
index 6f431d4c5dc265fcb62c5a870c9bc4fb4030cba8..35908c1a046f6c80a90a866765a016c8bfb9c42d 100644
--- a/src/models/GLiNER.py
+++ b/src/models/GLiNER.py
@@ -24,5 +24,5 @@ def classify_entity(sentence, entity, labels):
         if e[0] == entity:
             return e[1]  # Return label
 
-    return "Not an entity. (acc. to GLiNER)"
+    return "Target entity not found during NER."
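Reviewer note: both system prompts above instruct the model to wrap its output in `<answer>...</answer>` tags, but the extraction code is not part of this diff. A hedged sketch of how such responses could be parsed (`extract_answer` is a hypothetical helper, not the repository's actual implementation):

```python
# Hypothetical parser for the <answer>...</answer> convention used by the
# system prompts in NER_with_LLMs.py; the project's real extraction code
# is not shown in this diff.
import ast
import re

def extract_answer(response: str):
    """Return the content of the last <answer>...</answer> block, if any."""
    matches = re.findall(r"<answer>(.*?)</answer>", response, re.DOTALL)
    if not matches:
        return None
    answer = matches[-1].strip()
    # NER answers are Python-style lists of (entity, label) tuples,
    # e.g. "[('Europe', 'organization')]"; NEC answers are bare labels.
    try:
        return ast.literal_eval(answer)
    except (SyntaxError, ValueError):
        return answer

print(extract_answer("<answer>[('Europe', 'organization')]</answer>"))
print(extract_answer("<answer>organization</answer>"))
```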
diff --git a/src/models/llms_interface.py b/src/models/llms_interface.py
index e00690f911916dd9f43c66cb75530c833dbe37da..31ed9fc5c5373a96f477c2fcd84cc7dd3ef00a59 100644
--- a/src/models/llms_interface.py
+++ b/src/models/llms_interface.py
@@ -19,6 +19,19 @@ available_models = [
 standard_system_prompt = ""
 
 
+def ensure_model_installed(model_name):
+    installed_models = ollama.list().models
+    # print(f"Installed models: {installed_models}")
+
+    installed_model_names = [m.model for m in installed_models]
+
+    if model_name not in installed_model_names:
+        print(f"Model '{model_name}' not found. Installing...")
+        ollama.pull(model_name)
+    else:
+        print(f"Model '{model_name}' is already installed.")
+
+
 class LLM:
     _registry = {}  # Registry to store child classes
 
@@ -52,6 +65,7 @@ class Llama8b:
     # Uses 8-Bit precision
     def __init__(self,
                  system_prompt=standard_system_prompt):
+        ensure_model_installed("llama3.1:8b")
         self.system_prompt = system_prompt
         self.model_id = "ollama/llama-3.1-8b"
         self.chat_history = [{"role": "system", "content": self.system_prompt}]
@@ -76,46 +90,6 @@ class Llama8b:
         return self.generate_response(prompt)
 
 
-@LLM.register("Llama-3.3-70B-Instruct")
-class Llama8b:
-    # Uses 8-Bit precision
-    def __init__(self,
-                 system_prompt=standard_system_prompt):
-        self.system_prompt = system_prompt
-        self.client = InferenceClient(api_key=os.getenv('HF_ACCESS_TOKEN'))
-        self.model_id = "meta-llama/Llama-3.3-70B-Instruct"
-        self.chat_history = [{"role": "system", "content": self.system_prompt}]
-        self.token_usage = []
-
-    def generate_response(self, prompt):
-        try:
-            output = self.client.chat.completions.create(
-                messages=[
-                    {
-                        "role": "user",
-                        "content": self.system_prompt + "\n" + prompt,
-                    }
-                ],
-                model=self.model_id,
-            )
-
-            token_usage = output.usage
-            self.token_usage.append({
-                "prompt_tokens": token_usage.prompt_tokens,
-                "completion_tokens": token_usage.completion_tokens,
-                "total_tokens": token_usage.total_tokens,
-            })
-            return output.choices[0].message.content
-        except Exception as e:
-            return f"Error: {e}"
-
-    def __call__(self, prompt, reset=True):
-        if reset:
-            self.chat_history = [{"role": "system", "content": self.system_prompt}]
-        # Else append new message to chat
-        return self.generate_response(prompt)
-
-
 @LLM.register("DeepSeek-R1-Distill-Qwen-32B")
 class R1:
     def __init__(self, system_prompt=standard_system_prompt):
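Reviewer note: the `@LLM.register(...)` decorators and the `_registry` attribute indicate a name-to-class registry, but the `register` implementation is not included in this diff. A minimal sketch of how such a registry pattern typically works (the `create` helper and the trimmed-down `R1` body are hypothetical):

```python
# Minimal sketch of the registry pattern suggested by LLM._registry and
# @LLM.register(...) in this diff; llms_interface.py's actual
# implementation is not shown here.
class LLM:
    _registry = {}  # Maps model names to their implementing classes.

    @classmethod
    def register(cls, name):
        def decorator(subclass):
            cls._registry[name] = subclass
            return subclass
        return decorator

    @classmethod
    def create(cls, name, **kwargs):
        # Hypothetical factory: look up the registered class and instantiate it.
        if name not in cls._registry:
            raise ValueError(f"Unknown model: {name}")
        return cls._registry[name](**kwargs)

@LLM.register("DeepSeek-R1-Distill-Qwen-32B")
class R1:
    def __init__(self, system_prompt=""):
        self.system_prompt = system_prompt

model = LLM.create("DeepSeek-R1-Distill-Qwen-32B", system_prompt="You are...")
```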