Skip to content
Snippets Groups Projects
Commit 97c971d6 authored by kreuzer's avatar kreuzer
Browse files

Aktualisieren structures.py

parent 3b956c5d
No related branches found
No related tags found
No related merge requests found
# DataSet
# assembles all datapoints, representing cnn_dailymail
class DataSetPreprocessed:
def __init__(self, dataset ):
# DataPoint # DataPoint
# preprocessed Repr of a Document for model (training, testing) # preprocessed Repr of a Document for model (training, testing)
import torch import torch
import numpy as np import numpy as np
# m = 3 sentences in summaries # m = 3 sentences in summaries
# p = 20 top scoring sentences # p = 20 top scoring sentences
# k = 5 top scoring summaries # k = 5 top scoring summaries
...@@ -22,8 +14,6 @@ class DataPoint: ...@@ -22,8 +14,6 @@ class DataPoint:
def __init__(self, doc_str, sum_str, model_gensim, nlp): def __init__(self, doc_str, sum_str, model_gensim, nlp):
def preprocess(string): def preprocess(string):
doc_preprocessed = nlp(string) doc_preprocessed = nlp(string)
...@@ -55,7 +45,6 @@ class DataPoint: ...@@ -55,7 +45,6 @@ class DataPoint:
self.raw_document, self.document = preprocess(doc_str) self.raw_document, self.document = preprocess(doc_str)
self.raw_summary, self.summary = preprocess(sum_str) self.raw_summary, self.summary = preprocess(sum_str)
# output of sentence encoder (not padded) # output of sentence encoder (not padded)
self.sent_vecs = None self.sent_vecs = None
...@@ -74,18 +63,27 @@ class DataPoint: ...@@ -74,18 +63,27 @@ class DataPoint:
pass pass
def preproces(doc_str): def compute_sent_vecs(mdoel):
# takes a document, split in sentences, and returns tokens in list of list #model.sentence_encoder
pass
def get_word_embeddings(model_gensim):
# takes string repr of document, after preprocessing returns word-embeddings => tensor repr for doc, summary
# DataSet
# assembles all datapoints, representing cnn_dailymail
class DataSetPreprocessed:
    """Preprocessed CNN/DailyMail dataset.

    Wraps every (article, highlights) pair of the 'train', 'test' and
    'validation' splits in a DataPoint, exposing them as the attributes
    ``train``, ``test`` and ``validation`` (each a list of DataPoint).
    """

    def __init__(self, dataset, model_gensim, nlp):
        # dataset: mapping with 'train'/'test'/'validation' splits; each split
        #   is an iterable of dicts with 'article' and 'highlights' keys
        #   (presumably the Hugging Face cnn_dailymail dataset — TODO confirm)
        # model_gensim / nlp: forwarded unchanged to every DataPoint
        def read_data(part):
            # One DataPoint per document of the requested split.
            return [
                DataPoint(entry['article'], entry['highlights'],
                          model_gensim, nlp)
                for entry in dataset[part]
            ]

        def compute_sent_vecs(model):
            # NOTE(review): unused stub in the original (its comment-only body
            # was a SyntaxError); kept as a no-op to preserve intent —
            # presumably meant to call model.sentence_encoder.
            pass

        self.train = read_data('train')
        self.test = read_data('test')
        self.validation = read_data('validation')
import stanza import stanza
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment