Remove keyword function (util.contains_any), inlining its check in summarize()

df1cc4c8 · vvye · 037dba51 · df1cc4c8 · df1cc4c8 · df1cc4c8
Commit df1cc4c8 authored 3 years ago by vvye
--- a/sentence_selection.py
+++ b/sentence_selection.py
 from datetime import datetime, timedelta
 from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np
-from scipy import sparse

 import util

@@ -48,8 +47,6 @@ def candidate_sentences(articles, date, vectorizer):


 def sentences_published_on_date(articles, date, tolerance_days, num_first_sentences):
-    # implementation details are the same as ghalandari et al:
-    # a sentence is not included in the final list if it also mentions any date at all
    sentences = []
    for article in articles:
        pub_date = datetime.strptime(article['pub_date'], '%Y-%m-%d')

--- a/summarization.py
+++ b/summarization.py
@@ -41,7 +41,7 @@ def summarize(sentences, vectorizer, keywords, num_sentences):
        for i in sorted_indices:
            remaining_indices.remove(i)
            sentence = sentences[i]
-            if not util.contains_any(sentence['text'], keywords):
+            if not any([kw.lower() in sentence['text'].lower() for kw in keywords]):
                continue
            if redundant(i, selected_indices, X):
                continue

--- a/util.py
+++ b/util.py
 import os
-import re


 def subdirs(path):
@@ -10,15 +9,6 @@ def files(path, extension=None):
    return [f for f in os.listdir(path) if os.path.isfile(path / f) and (extension is None or f.endswith(extension))]


-def contains_any(string, keywords):
-    for keyword in keywords:
-        # following ghalandari, don't account for word boundaries
-        # if re.search(fr'\b{keyword.lower()}\b', string.lower()):
-        if keyword.lower() in string.lower():
-            return True
-    return False
-
-
 def avg(lst):
    return sum(lst) / len(lst)