diff --git a/DD-GloVe/.gitignore b/DD-GloVe/.gitignore
new file mode 100755
index 0000000000000000000000000000000000000000..8d489efb902dea11ff5698f037c6278d0af273ee
--- /dev/null
+++ b/DD-GloVe/.gitignore
@@ -0,0 +1,27 @@
+__pycache__/
+
+# OS generated files #
+######################
+.DS_Store
+.DS_Store?
+._*
+.Spotlight-V100
+.Trashes
+ehthumbs.db
+Thumbs.db
+
+# Packages #
+############
+# it's better to unpack these files and commit the raw source
+# git has its own built in compression methods
+*.7z
+*.dmg
+*.gz
+*.iso
+*.jar
+*.rar
+*.tar
+*.zip
+
+.ipynb_checkpoints
+.idea
diff --git a/DD-GloVe/build_italian/common.o b/DD-GloVe/build_italian/common.o
new file mode 100644
index 0000000000000000000000000000000000000000..088c14ae1dc6738f16a896dc3f7e0da0a78991f8
Binary files /dev/null and b/DD-GloVe/build_italian/common.o differ
diff --git a/DD-GloVe/build_italian/cooccur b/DD-GloVe/build_italian/cooccur
new file mode 100755
index 0000000000000000000000000000000000000000..d3b18e4114a96308a0bd63629591f05b9e36ad7c
Binary files /dev/null and b/DD-GloVe/build_italian/cooccur differ
diff --git a/DD-GloVe/build_italian/cooccur.o b/DD-GloVe/build_italian/cooccur.o
new file mode 100644
index 0000000000000000000000000000000000000000..61c7705614003d3f8f510f99650f91d5705ac3e8
Binary files /dev/null and b/DD-GloVe/build_italian/cooccur.o differ
diff --git a/DD-GloVe/build_italian/glove b/DD-GloVe/build_italian/glove
new file mode 100755
index 0000000000000000000000000000000000000000..b16223d0bf62e3a3d294cc8677c8dd0703b7a367
Binary files /dev/null and b/DD-GloVe/build_italian/glove differ
diff --git a/DD-GloVe/build_italian/glove.o b/DD-GloVe/build_italian/glove.o
new file mode 100644
index 0000000000000000000000000000000000000000..512bbba7951aa67922f30c45c2aff94d571ed943
Binary files /dev/null and b/DD-GloVe/build_italian/glove.o differ
diff --git a/DD-GloVe/build_italian/shuffle b/DD-GloVe/build_italian/shuffle
new file mode 100755
index 0000000000000000000000000000000000000000..de823538782da1e4c2ca3ba4d60e9bcd8fd715f1
Binary files /dev/null and b/DD-GloVe/build_italian/shuffle differ
diff --git a/DD-GloVe/build_italian/shuffle.o b/DD-GloVe/build_italian/shuffle.o
new file mode 100644
index 0000000000000000000000000000000000000000..89d21794d4d9902303c291f1ae2aff237286817e
Binary files /dev/null and b/DD-GloVe/build_italian/shuffle.o differ
diff --git a/DD-GloVe/build_italian/vocab_count b/DD-GloVe/build_italian/vocab_count
new file mode 100755
index 0000000000000000000000000000000000000000..e75a09885e23036bc89cf9450d0862cd17efa506
Binary files /dev/null and b/DD-GloVe/build_italian/vocab_count differ
diff --git a/DD-GloVe/build_italian/vocab_count.o b/DD-GloVe/build_italian/vocab_count.o
new file mode 100644
index 0000000000000000000000000000000000000000..125cc4d26e5df037a3e816608f2427f103694376
Binary files /dev/null and b/DD-GloVe/build_italian/vocab_count.o differ
diff --git a/DD-GloVe/build_polish/common.o b/DD-GloVe/build_polish/common.o
new file mode 100644
index 0000000000000000000000000000000000000000..088c14ae1dc6738f16a896dc3f7e0da0a78991f8
Binary files /dev/null and b/DD-GloVe/build_polish/common.o differ
diff --git a/DD-GloVe/build_polish/cooccur b/DD-GloVe/build_polish/cooccur
new file mode 100755
index 0000000000000000000000000000000000000000..d3b18e4114a96308a0bd63629591f05b9e36ad7c
Binary files /dev/null and b/DD-GloVe/build_polish/cooccur differ
diff --git a/DD-GloVe/build_polish/cooccur.o b/DD-GloVe/build_polish/cooccur.o
new file mode 100644
index 0000000000000000000000000000000000000000..61c7705614003d3f8f510f99650f91d5705ac3e8
Binary files /dev/null and b/DD-GloVe/build_polish/cooccur.o differ
diff --git a/DD-GloVe/build_polish/glove b/DD-GloVe/build_polish/glove
new file mode 100755
index 0000000000000000000000000000000000000000..1fdb7a73d6be0addf28460d9f024d416bf6ae193
Binary files /dev/null and b/DD-GloVe/build_polish/glove differ
diff --git a/DD-GloVe/build_polish/glove.o b/DD-GloVe/build_polish/glove.o
new file mode 100644
index 0000000000000000000000000000000000000000..f38f17561c94ce417d9acd62343c73d1068c98aa
Binary files /dev/null and b/DD-GloVe/build_polish/glove.o differ
diff --git a/DD-GloVe/build_polish/shuffle b/DD-GloVe/build_polish/shuffle
new file mode 100755
index 0000000000000000000000000000000000000000..de823538782da1e4c2ca3ba4d60e9bcd8fd715f1
Binary files /dev/null and b/DD-GloVe/build_polish/shuffle differ
diff --git a/DD-GloVe/build_polish/shuffle.o b/DD-GloVe/build_polish/shuffle.o
new file mode 100644
index 0000000000000000000000000000000000000000..89d21794d4d9902303c291f1ae2aff237286817e
Binary files /dev/null and b/DD-GloVe/build_polish/shuffle.o differ
diff --git a/DD-GloVe/build_polish/vocab_count b/DD-GloVe/build_polish/vocab_count
new file mode 100755
index 0000000000000000000000000000000000000000..e75a09885e23036bc89cf9450d0862cd17efa506
Binary files /dev/null and b/DD-GloVe/build_polish/vocab_count differ
diff --git a/DD-GloVe/build_polish/vocab_count.o b/DD-GloVe/build_polish/vocab_count.o
new file mode 100644
index 0000000000000000000000000000000000000000..125cc4d26e5df037a3e816608f2427f103694376
Binary files /dev/null and b/DD-GloVe/build_polish/vocab_count.o differ
diff --git a/DD-GloVe/build_turkish/common.o b/DD-GloVe/build_turkish/common.o
new file mode 100644
index 0000000000000000000000000000000000000000..088c14ae1dc6738f16a896dc3f7e0da0a78991f8
Binary files /dev/null and b/DD-GloVe/build_turkish/common.o differ
diff --git a/DD-GloVe/build_turkish/cooccur b/DD-GloVe/build_turkish/cooccur
new file mode 100755
index 0000000000000000000000000000000000000000..d3b18e4114a96308a0bd63629591f05b9e36ad7c
Binary files /dev/null and b/DD-GloVe/build_turkish/cooccur differ
diff --git a/DD-GloVe/build_turkish/cooccur.o b/DD-GloVe/build_turkish/cooccur.o
new file mode 100644
index 0000000000000000000000000000000000000000..61c7705614003d3f8f510f99650f91d5705ac3e8
Binary files /dev/null and b/DD-GloVe/build_turkish/cooccur.o differ
diff --git a/DD-GloVe/build_turkish/glove b/DD-GloVe/build_turkish/glove
new file mode 100755
index 0000000000000000000000000000000000000000..d4a62a7bc0d6205c32236e877a130dc0d6308c83
Binary files /dev/null and b/DD-GloVe/build_turkish/glove differ
diff --git a/DD-GloVe/build_turkish/glove.o b/DD-GloVe/build_turkish/glove.o
new file mode 100644
index 0000000000000000000000000000000000000000..9293eee2b202955d5e01c125c31160e2aad6fb26
Binary files /dev/null and b/DD-GloVe/build_turkish/glove.o differ
diff --git a/DD-GloVe/build_turkish/shuffle b/DD-GloVe/build_turkish/shuffle
new file mode 100755
index 0000000000000000000000000000000000000000..de823538782da1e4c2ca3ba4d60e9bcd8fd715f1
Binary files /dev/null and b/DD-GloVe/build_turkish/shuffle differ
diff --git a/DD-GloVe/build_turkish/shuffle.o b/DD-GloVe/build_turkish/shuffle.o
new file mode 100644
index 0000000000000000000000000000000000000000..89d21794d4d9902303c291f1ae2aff237286817e
Binary files /dev/null and b/DD-GloVe/build_turkish/shuffle.o differ
diff --git a/DD-GloVe/build_turkish/vocab_count b/DD-GloVe/build_turkish/vocab_count
new file mode 100755
index 0000000000000000000000000000000000000000..e75a09885e23036bc89cf9450d0862cd17efa506
Binary files /dev/null and b/DD-GloVe/build_turkish/vocab_count differ
diff --git a/DD-GloVe/build_turkish/vocab_count.o b/DD-GloVe/build_turkish/vocab_count.o
new file mode 100644
index 0000000000000000000000000000000000000000..125cc4d26e5df037a3e816608f2427f103694376
Binary files /dev/null and b/DD-GloVe/build_turkish/vocab_count.o differ
diff --git a/DD-GloVe/crawl_english_definitions.py b/DD-GloVe/crawl_english_definitions.py
new file mode 100644
index 0000000000000000000000000000000000000000..2527a36a47203d72ad36ccffa44c47db277415da
--- /dev/null
+++ b/DD-GloVe/crawl_english_definitions.py
@@ -0,0 +1,86 @@
+"""Get definitions for all words in GloVe vocab from dictionary.com"""
+
+import json
+import logging
+import time
+import typing
+import requests
+from bs4 import BeautifulSoup
+from string import punctuation
+from tqdm import tqdm
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+
+def extract_definitions(word):
+    """Access dictionary.com and return a list
+    of found definitions for the given word."""
+
+    # format word for url
+    word = word.replace("'", "-")
+
+    # the website takes care of capitalisation issues on its own
+    # e.g., dictionary.com/browse/germany and dictionary.com/browse/Germany both direct
+    # to the same entry for Germany
+    url = f"https://www.dictionary.com/browse/{word}"
+
+    session = requests.Session()
+    retry = Retry(connect=3, backoff_factor=10)
+    adapter = HTTPAdapter(max_retries=retry)
+    session.mount('http://', adapter)
+    session.mount('https://', adapter)
+    response = session.get(url)
+
+    if response.status_code == 200:
+        soup = BeautifulSoup(response.content, "html.parser")
+        definition_divs = soup.find_all("div", {"data-type": "word-definition-content"})
+
+        definitions = []
+        for div in definition_divs:
+            p_tags = div.find_all("p")
+            for p_tag in p_tags:
+                definitions.append(p_tag.get_text())
+  
+        return definitions
+    else:
+        logging.debug(f"Failed to fetch definitions for '{word}'. Status code: {response.status_code}")
+        return []
+
+def main(start: int=0, end: typing.Optional[int]=None):
+    """Execute the crawling function for all words in the vocab,
+    or, alternatively, specify a range in which vocab words should be considered.
+    Save the definitions in a json file."""
+
+    logging.info("Reading vocab ...")
+    with open("/workspace/students/reichelt/BA/data/dd-glove/english_vocab.txt",
+              "r", encoding="utf-8") as f:
+        lines = f.readlines()
+        # keep the first field of each line as the token (NOTE(review): for entries like "0 the" this would be the index - confirm format)
+        word_list = [line.split()[0] for line in lines]
+    logging.info("Reading done.")
+
+    all_definitions = {}  # create empty dictionary to store definitions
+    empty_returns = 0  # count tokens without definition
+    logging.info("Crawling definitons ...")
+    for i, word in tqdm(enumerate(word_list[start:end])):  # TODO ideally log this instead
+        # wait between requests to not overload their servers
+        if i % 100 == 0:
+            time.sleep(5)
+        elif i % 20 == 0:
+            time.sleep(1)
+        definitions = extract_definitions(word)
+        if not definitions:
+            empty_returns += 1
+        all_definitions[word] = definitions  # store the definitions in the dictionary
+        logging.debug(f"Extracted {len(definitions)} definitions for {word}")
+    logging.info(f"Crawling done. {empty_returns} out of {len(word_list[start:end])} tokens had no definiton.")
+
+    # store the definitions in a json file
+    with open("/workspace/students/reichelt/BA/data/dd-glove/english_definitions.json", "w") as f:
+        logging.info("Saving definitions ...")
+        json.dump(all_definitions, f, indent=4)
+        logging.info("Saving done.")
+
+if __name__ == "__main__":
+    logging.basicConfig(filename='crawl_english_definitions.log', level=logging.INFO,
+                        format='%(asctime)s - %(levelname)s: %(message)s', filemode="w")
+    main()  # e.g. end=100 means use only first 100 vocab words
diff --git a/preprocess_wikipedia.py b/preprocess_wikipedia.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb8806d636f5eebbc6daa2bab2d80bf63366b2ef
--- /dev/null
+++ b/preprocess_wikipedia.py
@@ -0,0 +1,19 @@
+"""
+Download the English Wikipedia dump using huggingface datasets and preprocess it
+using the nltk tokenizer and lowercasing (NOTE: punctuation tokens are not removed here)
+"""
+
+from tqdm import tqdm
+from nltk.tokenize import word_tokenize
+from datasets import load_dataset
+
+wikipedia = load_dataset("wikipedia", "20220301.en")
+wikipedia = wikipedia["train"]
+
+with open("/workspace/students/reichelt/BA/data/wikipedia/english_wikipedia_preprocessed.txt",
+          "w+", encoding="utf-8") as f:
+    for article in tqdm(wikipedia):
+        tokenized = word_tokenize(article["text"], language='english')
+        tokenized = [token.lower() for token in tokenized]
+        JOINED = " ".join(tokenized)
+        f.write(JOINED + "\n")
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a784b9f812b75a9fe610ab648df6d4f2a7d2eeb2
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,115 @@
+aiohttp==3.8.4
+aiosignal==1.3.1
+apache-beam==2.49.0
+asttokens==2.2.1
+async-timeout==4.0.2
+attrs==23.1.0
+backcall==0.2.0
+beautifulsoup4==4.12.2
+bpemb==0.3.4
+bs4==0.0.1
+certifi==2023.5.7
+charset-normalizer==3.1.0
+click==8.1.3
+cloudpickle==2.2.1
+comm==0.1.3
+contourpy==1.1.1
+crcmod==1.7
+cycler==0.12.1
+datasets==2.12.0
+debugpy==1.6.7
+decorator==5.1.1
+dill==0.3.1.1
+dnspython==2.4.2
+docopt==0.6.2
+exceptiongroup==1.1.2
+executing==1.2.0
+fastavro==1.8.2
+fasteners==0.18
+fastenum==1.0.4
+fasttext==0.9.2
+filelock==3.12.0
+fonttools==4.47.0
+frozenlist==1.3.3
+fsspec==2023.5.0
+gensim==4.3.1
+germanetpy==0.2.2
+grpcio==1.57.0
+hdfs==2.7.2
+httplib2==0.22.0
+huggingface-hub==0.15.1
+idna==3.4
+importlib-metadata==6.7.0
+importlib-resources==6.1.1
+iniconfig==2.0.0
+ipykernel==6.24.0
+ipython==8.12.2
+jedi==0.18.2
+joblib==1.2.0
+jupyter-client==8.3.0
+jupyter-core==5.3.1
+kiwisolver==1.4.5
+lxml==4.9.3
+matplotlib==3.7.4
+matplotlib-inline==0.1.6
+multidict==6.0.4
+multiprocess==0.70.14
+nest-asyncio==1.5.6
+nltk==3.8.1
+numpy==1.24.3
+objsize==0.6.1
+orjson==3.9.5
+packaging==23.1
+pandas==2.0.2
+parso==0.8.3
+pexpect==4.8.0
+pickleshare==0.7.5
+Pillow==10.1.0
+platformdirs==3.8.0
+pluggy==1.2.0
+prompt-toolkit==3.0.39
+proto-plus==1.22.3
+protobuf==4.23.4
+psutil==5.9.5
+ptyprocess==0.7.0
+pure-eval==0.2.2
+pyarrow==11.0.0
+pybind11==2.10.4
+pydot==1.4.2
+Pygments==2.15.1
+pymongo==4.5.0
+pyparsing==3.1.1
+pytest==7.4.0
+python-dateutil==2.8.2
+python-Levenshtein==0.12.2
+pytz==2023.3
+PyYAML==6.0
+pyzmq==25.1.0
+regex==2023.6.3
+requests==2.31.0
+responses==0.18.0
+safetensors==0.3.1
+scikit-learn==1.2.2
+scipy==1.10.1
+seaborn==0.13.0
+sentencepiece==0.1.99
+six==1.16.0
+smart-open==6.3.0
+soupsieve==2.4.1
+stack-data==0.6.2
+threadpoolctl==3.1.0
+tokenizers==0.13.3
+tomli==2.0.1
+tornado==6.3.2
+tqdm==4.65.0
+traitlets==5.9.0
+transformers==4.30.2
+typing-extensions==4.6.2
+tzdata==2023.3
+Unidecode==1.3.7
+urllib3==2.0.2
+wcwidth==0.2.6
+xxhash==3.2.0
+yarl==1.9.2
+zipp==3.15.0
+zstandard==0.21.0