Skip to content
Snippets Groups Projects
Commit 97c971d6 authored by kreuzer's avatar kreuzer
Browse files

Aktualisieren structures.py

parent 3b956c5d
No related branches found
No related tags found
No related merge requests found
# DataSet
# assembles all datapoints, representing cnn_dailymail
class DataSetPreprocessed:
def __init__(self, dataset ):
# DataPoint # DataPoint
# preprocessed Repr of a Document for model (training, testing) # preprocessed Repr of a Document for model (training, testing)
import torch import torch
import numpy as np import numpy as np
# m = 3 sentences in summaries # m = 3 sentences in summaries
# p = 20 top scoring sentences # p = 20 top scoring sentences
# k = 5 top scoring summaries # k = 5 top scoring summaries
...@@ -22,8 +14,6 @@ class DataPoint: ...@@ -22,8 +14,6 @@ class DataPoint:
def __init__(self, doc_str, sum_str, model_gensim, nlp): def __init__(self, doc_str, sum_str, model_gensim, nlp):
def preprocess(string): def preprocess(string):
doc_preprocessed = nlp(string) doc_preprocessed = nlp(string)
...@@ -55,7 +45,6 @@ class DataPoint: ...@@ -55,7 +45,6 @@ class DataPoint:
self.raw_document, self.document = preprocess(doc_str) self.raw_document, self.document = preprocess(doc_str)
self.raw_summary, self.summary = preprocess(sum_str) self.raw_summary, self.summary = preprocess(sum_str)
# output of sentence encoder (not padded) # output of sentence encoder (not padded)
self.sent_vecs = None self.sent_vecs = None
...@@ -74,18 +63,27 @@ class DataPoint: ...@@ -74,18 +63,27 @@ class DataPoint:
pass pass
def preproces(doc_str): def compute_sent_vecs(mdoel):
# takes a document, split in sentences, and returns tokens in list of list #model.sentence_encoder
pass
def get_word_embeddings(model_gensim):
# takes string repr of document, after preprocessing returns word-embeddings => tensor repr for doc, summary
# DataSet
# assembles all datapoints, representing cnn_dailymail
class DataSetPreprocessed:
    """Preprocessed CNN/DailyMail dataset.

    Wraps every (article, highlights) pair of the 'train', 'test' and
    'validation' splits in a DataPoint, exposing them as the attributes
    ``train``, ``test`` and ``validation`` (each a list of DataPoint).
    """

    def __init__(self, dataset, model_gensim, nlp):
        # dataset: mapping with 'train'/'test'/'validation' splits; each split
        #   is an iterable of dicts with 'article' and 'highlights' keys
        #   (presumably the Hugging Face cnn_dailymail dataset — TODO confirm)
        # model_gensim / nlp: forwarded unchanged to every DataPoint
        def read_data(part):
            # One DataPoint per document of the requested split.
            return [
                DataPoint(entry['article'], entry['highlights'],
                          model_gensim, nlp)
                for entry in dataset[part]
            ]

        def compute_sent_vecs(model):
            # NOTE(review): unused stub in the original (its comment-only body
            # was a SyntaxError); kept as a no-op to preserve intent —
            # presumably meant to call model.sentence_encoder.
            pass

        self.train = read_data('train')
        self.test = read_data('test')
        self.validation = read_data('validation')
import stanza import stanza
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment