Commit 3b956c5d authored by wu

Merge branch 'wu-master-patch-75128' into 'master'

Wu master patch 75128

See merge request kreuzer/nn-projekt-ss22!1
parents bf26c03c be4e54af
@@ -92,6 +92,12 @@ import stanza
from gensim.models import KeyedVectors
from datasets import load_dataset

# helper: pad a sentence of 200-dim embeddings with zero vectors to length 100,
# or truncate it if it is longer
def pad(seq, size=100):
    if len(seq) < size:
        return torch.cat((seq, torch.zeros(size - len(seq), 200)), 0)
    return seq[:size]

# loads skip-gram gensim vectors
file_name = "data/1-billion-word-language-modeling-benchmark-r13output.word2vec.vec"
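The line that actually loads the vectors falls outside this hunk. As a minimal sketch (not part of the commit), the load would plausibly use gensim's KeyedVectors.load_word2vec_format, assuming the .vec file is in word2vec text format and the vectors are 200-dimensional to match pad() above; the demo tensors below are hypothetical:

import torch
from gensim.models import KeyedVectors

model_gensim = KeyedVectors.load_word2vec_format(file_name)  # assumed load call, not shown in the diff

sent = torch.randn(7, 200)                # a toy "sentence" of 7 token embeddings
print(pad(sent).shape)                    # zero-padded along dim 0: torch.Size([100, 200])
print(pad(torch.randn(150, 200)).shape)   # truncated: torch.Size([100, 200])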
@@ -107,37 +113,16 @@ for x in ['train', 'test']:
    for y in dataset[x]:
        # for every document: collect one preprocessed datapoint of the train/test split
        article, highlights = y['article'], y['highlights']
        datapoint = DataPoint(article, highlights, model_gensim, nlp)
        doc_preprocessed = nlp(article)
        #sum_preprocessed = nlp(highlights)
        doc_embeddings, raw_doc = [], []
        for sent in doc_preprocessed.sentences:
            sent_embeddings, raw_sent = [], []
            for tok in sent.tokens:
                # raw token text
                raw_sent.append(tok.text)
                # look up the 200-dim skip-gram embedding; fall back to zeros for OOV words
                try:
                    embedding = torch.from_numpy(model_gensim[tok.text])  # numpy array, convert to tensor
                except KeyError:
                    embedding = torch.zeros(200)
                sent_embeddings.append(embedding)
            raw_doc.append(raw_sent)
            doc_embeddings.append(pad(torch.stack(sent_embeddings, 0)))
        document = torch.stack(doc_embeddings, 0)
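Since pad() returns a (100, 200) tensor per sentence, document ends up with shape (num_sentences, 100, 200), and the sentence count still varies per document. As a minimal sketch (not part of the commit), the same OOV fallback can be written with an explicit membership test instead of a try/except, since gensim's KeyedVectors supports the `in` operator; embed() is a hypothetical helper name:

def embed(tok_text, kv, dim=200):
    # known word -> its 200-dim vector as a tensor; OOV word -> zero vector
    if tok_text in kv:
        return torch.from_numpy(kv[tok_text])
    return torch.zeros(dim)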