diff --git a/DD-GloVe/.gitignore b/DD-GloVe/.gitignore new file mode 100755 index 0000000000000000000000000000000000000000..8d489efb902dea11ff5698f037c6278d0af273ee --- /dev/null +++ b/DD-GloVe/.gitignore @@ -0,0 +1,27 @@ +__pycache__/ + +# OS generated files # +###################### +.DS_Store +.DS_Store? +._* +.Spotlight-V100 +.Trashes +ehthumbs.db +Thumbs.db + +# Packages # +############ +# it's better to unpack these files and commit the raw source +# git has its own built in compression methods +*.7z +*.dmg +*.gz +*.iso +*.jar +*.rar +*.tar +*.zip + +.ipynb_checkpoints +.idea diff --git a/DD-GloVe/build_italian/common.o b/DD-GloVe/build_italian/common.o new file mode 100644 index 0000000000000000000000000000000000000000..088c14ae1dc6738f16a896dc3f7e0da0a78991f8 Binary files /dev/null and b/DD-GloVe/build_italian/common.o differ diff --git a/DD-GloVe/build_italian/cooccur b/DD-GloVe/build_italian/cooccur new file mode 100755 index 0000000000000000000000000000000000000000..d3b18e4114a96308a0bd63629591f05b9e36ad7c Binary files /dev/null and b/DD-GloVe/build_italian/cooccur differ diff --git a/DD-GloVe/build_italian/cooccur.o b/DD-GloVe/build_italian/cooccur.o new file mode 100644 index 0000000000000000000000000000000000000000..61c7705614003d3f8f510f99650f91d5705ac3e8 Binary files /dev/null and b/DD-GloVe/build_italian/cooccur.o differ diff --git a/DD-GloVe/build_italian/glove b/DD-GloVe/build_italian/glove new file mode 100755 index 0000000000000000000000000000000000000000..b16223d0bf62e3a3d294cc8677c8dd0703b7a367 Binary files /dev/null and b/DD-GloVe/build_italian/glove differ diff --git a/DD-GloVe/build_italian/glove.o b/DD-GloVe/build_italian/glove.o new file mode 100644 index 0000000000000000000000000000000000000000..512bbba7951aa67922f30c45c2aff94d571ed943 Binary files /dev/null and b/DD-GloVe/build_italian/glove.o differ diff --git a/DD-GloVe/build_italian/shuffle b/DD-GloVe/build_italian/shuffle new file mode 100755 index 
0000000000000000000000000000000000000000..de823538782da1e4c2ca3ba4d60e9bcd8fd715f1 Binary files /dev/null and b/DD-GloVe/build_italian/shuffle differ diff --git a/DD-GloVe/build_italian/shuffle.o b/DD-GloVe/build_italian/shuffle.o new file mode 100644 index 0000000000000000000000000000000000000000..89d21794d4d9902303c291f1ae2aff237286817e Binary files /dev/null and b/DD-GloVe/build_italian/shuffle.o differ diff --git a/DD-GloVe/build_italian/vocab_count b/DD-GloVe/build_italian/vocab_count new file mode 100755 index 0000000000000000000000000000000000000000..e75a09885e23036bc89cf9450d0862cd17efa506 Binary files /dev/null and b/DD-GloVe/build_italian/vocab_count differ diff --git a/DD-GloVe/build_italian/vocab_count.o b/DD-GloVe/build_italian/vocab_count.o new file mode 100644 index 0000000000000000000000000000000000000000..125cc4d26e5df037a3e816608f2427f103694376 Binary files /dev/null and b/DD-GloVe/build_italian/vocab_count.o differ diff --git a/DD-GloVe/build_polish/common.o b/DD-GloVe/build_polish/common.o new file mode 100644 index 0000000000000000000000000000000000000000..088c14ae1dc6738f16a896dc3f7e0da0a78991f8 Binary files /dev/null and b/DD-GloVe/build_polish/common.o differ diff --git a/DD-GloVe/build_polish/cooccur b/DD-GloVe/build_polish/cooccur new file mode 100755 index 0000000000000000000000000000000000000000..d3b18e4114a96308a0bd63629591f05b9e36ad7c Binary files /dev/null and b/DD-GloVe/build_polish/cooccur differ diff --git a/DD-GloVe/build_polish/cooccur.o b/DD-GloVe/build_polish/cooccur.o new file mode 100644 index 0000000000000000000000000000000000000000..61c7705614003d3f8f510f99650f91d5705ac3e8 Binary files /dev/null and b/DD-GloVe/build_polish/cooccur.o differ diff --git a/DD-GloVe/build_polish/glove b/DD-GloVe/build_polish/glove new file mode 100755 index 0000000000000000000000000000000000000000..1fdb7a73d6be0addf28460d9f024d416bf6ae193 Binary files /dev/null and b/DD-GloVe/build_polish/glove differ diff --git a/DD-GloVe/build_polish/glove.o 
b/DD-GloVe/build_polish/glove.o new file mode 100644 index 0000000000000000000000000000000000000000..f38f17561c94ce417d9acd62343c73d1068c98aa Binary files /dev/null and b/DD-GloVe/build_polish/glove.o differ diff --git a/DD-GloVe/build_polish/shuffle b/DD-GloVe/build_polish/shuffle new file mode 100755 index 0000000000000000000000000000000000000000..de823538782da1e4c2ca3ba4d60e9bcd8fd715f1 Binary files /dev/null and b/DD-GloVe/build_polish/shuffle differ diff --git a/DD-GloVe/build_polish/shuffle.o b/DD-GloVe/build_polish/shuffle.o new file mode 100644 index 0000000000000000000000000000000000000000..89d21794d4d9902303c291f1ae2aff237286817e Binary files /dev/null and b/DD-GloVe/build_polish/shuffle.o differ diff --git a/DD-GloVe/build_polish/vocab_count b/DD-GloVe/build_polish/vocab_count new file mode 100755 index 0000000000000000000000000000000000000000..e75a09885e23036bc89cf9450d0862cd17efa506 Binary files /dev/null and b/DD-GloVe/build_polish/vocab_count differ diff --git a/DD-GloVe/build_polish/vocab_count.o b/DD-GloVe/build_polish/vocab_count.o new file mode 100644 index 0000000000000000000000000000000000000000..125cc4d26e5df037a3e816608f2427f103694376 Binary files /dev/null and b/DD-GloVe/build_polish/vocab_count.o differ diff --git a/DD-GloVe/build_turkish/common.o b/DD-GloVe/build_turkish/common.o new file mode 100644 index 0000000000000000000000000000000000000000..088c14ae1dc6738f16a896dc3f7e0da0a78991f8 Binary files /dev/null and b/DD-GloVe/build_turkish/common.o differ diff --git a/DD-GloVe/build_turkish/cooccur b/DD-GloVe/build_turkish/cooccur new file mode 100755 index 0000000000000000000000000000000000000000..d3b18e4114a96308a0bd63629591f05b9e36ad7c Binary files /dev/null and b/DD-GloVe/build_turkish/cooccur differ diff --git a/DD-GloVe/build_turkish/cooccur.o b/DD-GloVe/build_turkish/cooccur.o new file mode 100644 index 0000000000000000000000000000000000000000..61c7705614003d3f8f510f99650f91d5705ac3e8 Binary files /dev/null and 
b/DD-GloVe/build_turkish/cooccur.o differ diff --git a/DD-GloVe/build_turkish/glove b/DD-GloVe/build_turkish/glove new file mode 100755 index 0000000000000000000000000000000000000000..d4a62a7bc0d6205c32236e877a130dc0d6308c83 Binary files /dev/null and b/DD-GloVe/build_turkish/glove differ diff --git a/DD-GloVe/build_turkish/glove.o b/DD-GloVe/build_turkish/glove.o new file mode 100644 index 0000000000000000000000000000000000000000..9293eee2b202955d5e01c125c31160e2aad6fb26 Binary files /dev/null and b/DD-GloVe/build_turkish/glove.o differ diff --git a/DD-GloVe/build_turkish/shuffle b/DD-GloVe/build_turkish/shuffle new file mode 100755 index 0000000000000000000000000000000000000000..de823538782da1e4c2ca3ba4d60e9bcd8fd715f1 Binary files /dev/null and b/DD-GloVe/build_turkish/shuffle differ diff --git a/DD-GloVe/build_turkish/shuffle.o b/DD-GloVe/build_turkish/shuffle.o new file mode 100644 index 0000000000000000000000000000000000000000..89d21794d4d9902303c291f1ae2aff237286817e Binary files /dev/null and b/DD-GloVe/build_turkish/shuffle.o differ diff --git a/DD-GloVe/build_turkish/vocab_count b/DD-GloVe/build_turkish/vocab_count new file mode 100755 index 0000000000000000000000000000000000000000..e75a09885e23036bc89cf9450d0862cd17efa506 Binary files /dev/null and b/DD-GloVe/build_turkish/vocab_count differ diff --git a/DD-GloVe/build_turkish/vocab_count.o b/DD-GloVe/build_turkish/vocab_count.o new file mode 100644 index 0000000000000000000000000000000000000000..125cc4d26e5df037a3e816608f2427f103694376 Binary files /dev/null and b/DD-GloVe/build_turkish/vocab_count.o differ diff --git a/DD-GloVe/crawl_english_definitions.py b/DD-GloVe/crawl_english_definitions.py new file mode 100644 index 0000000000000000000000000000000000000000..2527a36a47203d72ad36ccffa44c47db277415da --- /dev/null +++ b/DD-GloVe/crawl_english_definitions.py @@ -0,0 +1,86 @@ +"""Get definitions for all words in GloVe vocab from dictionary.com""" + +import json +import logging +import time +import 
typing +import requests +from bs4 import BeautifulSoup +from string import punctuation +from tqdm import tqdm +from requests.adapters import HTTPAdapter +from urllib3.util.retry import Retry + +def extract_definitions(word): + """Access dictionary.com and return a list + of found definitions for the given word.""" + + # format word for url + word = word.replace("'", "-") + + # the website takes care of capitalisation issues on its own + # e.g., dictionary.com/browse/germany and dictionary.com/browse/Germany both direct + # to the same entry for Germany + url = f"https://www.dictionary.com/browse/{word}" + + session = requests.Session() + retry = Retry(connect=3, backoff_factor=10) + adapter = HTTPAdapter(max_retries=retry) + session.mount('http://', adapter) + session.mount('https://', adapter) + response = session.get(url) + + if response.status_code == 200: + soup = BeautifulSoup(response.content, "html.parser") + definition_divs = soup.find_all("div", {"data-type": "word-definition-content"}) + + definitions = [] + for div in definition_divs: + p_tags = div.find_all("p") + for p_tag in p_tags: + definitions.append(p_tag.get_text()) + + return definitions + else: + logging.debug(f"Failed to fetch definitions for '{word}'. Status code: {response.status_code}") + return [] + +def main(start: int=0, end: typing.Optional[int]=None): + """Execute the crawling function for all words in the vocab, + or, alternatively, specify a range in which vocab words should be considered. 
+ Save the definitions in a json file.""" + + logging.info("Reading vocab ...") + with open("/workspace/students/reichelt/BA/data/dd-glove/english_vocab.txt", + "r", encoding="utf-8") as f: + lines = f.readlines() + # keep only the first whitespace-separated field of each vocab line (NOTE(review): for an entry like "0 the" this yields the index "0", not the token - confirm the vocab file format) + word_list = [line.split()[0] for line in lines] + logging.info("Reading done.") + + all_definitions = {} # create empty dictionary to store definitions + empty_returns = 0 # count tokens without definition + logging.info("Crawling definitons ...") + for i, word in tqdm(enumerate(word_list[start:end])): # TODO ideally log this instead + # wait between requests to not overload their servers + if i % 100 == 0: + time.sleep(5) + elif i % 20 == 0: + time.sleep(1) + definitions = extract_definitions(word) + if not definitions: + empty_returns += 1 + all_definitions[word] = definitions # store the definitions in the dictionary + logging.debug(f"Extracted {len(definitions)} definitions for {word}") + logging.info(f"Crawling done. {empty_returns} out of {len(word_list[start:end])} tokens had no definiton.") + + # store the definitions in a json file + with open("/workspace/students/reichelt/BA/data/dd-glove/english_definitions.json", "w") as f: + logging.info("Saving definitions ...") + json.dump(all_definitions, f, indent=4) + logging.info("Saving done.") + +if __name__ == "__main__": + logging.basicConfig(filename='crawl_english_definitions.log', level=logging.INFO, + format='%(asctime)s - %(levelname)s: %(message)s', filemode="w") + main() # e.g. 
end=100 means use only first 100 vocab words diff --git a/preprocess_wikipedia.py b/preprocess_wikipedia.py new file mode 100644 index 0000000000000000000000000000000000000000..eb8806d636f5eebbc6daa2bab2d80bf63366b2ef --- /dev/null +++ b/preprocess_wikipedia.py @@ -0,0 +1,19 @@ +""" +Download Wikipedia dump using huggingface and preprocess it +using nltk tokenizer and lowercasing (NOTE: punctuation tokens are currently not removed) +""" + +from tqdm import tqdm +from nltk.tokenize import word_tokenize +from datasets import load_dataset + +wikipedia = load_dataset("wikipedia", "20220301.en") +wikipedia = wikipedia["train"] + +with open("/workspace/students/reichelt/BA/data/wikipedia/english_wikipedia_preprocessed.txt", + "w+", encoding="utf-8") as f: + for article in tqdm(wikipedia): + tokenized = word_tokenize(article["text"], language='english') + tokenized = [token.lower() for token in tokenized] + JOINED = " ".join(tokenized) + f.write(JOINED + "\n") diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..a784b9f812b75a9fe610ab648df6d4f2a7d2eeb2 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,115 @@ +aiohttp==3.8.4 +aiosignal==1.3.1 +apache-beam==2.49.0 +asttokens==2.2.1 +async-timeout==4.0.2 +attrs==23.1.0 +backcall==0.2.0 +beautifulsoup4==4.12.2 +bpemb==0.3.4 +bs4==0.0.1 +certifi==2023.5.7 +charset-normalizer==3.1.0 +click==8.1.3 +cloudpickle==2.2.1 +comm==0.1.3 +contourpy==1.1.1 +crcmod==1.7 +cycler==0.12.1 +datasets==2.12.0 +debugpy==1.6.7 +decorator==5.1.1 +dill==0.3.1.1 +dnspython==2.4.2 +docopt==0.6.2 +exceptiongroup==1.1.2 +executing==1.2.0 +fastavro==1.8.2 +fasteners==0.18 +fastenum==1.0.4 +fasttext==0.9.2 +filelock==3.12.0 +fonttools==4.47.0 +frozenlist==1.3.3 +fsspec==2023.5.0 +gensim==4.3.1 +germanetpy==0.2.2 +grpcio==1.57.0 +hdfs==2.7.2 +httplib2==0.22.0 +huggingface-hub==0.15.1 +idna==3.4 +importlib-metadata==6.7.0 +importlib-resources==6.1.1 +iniconfig==2.0.0 +ipykernel==6.24.0 +ipython==8.12.2 +jedi==0.18.2 
+joblib==1.2.0 +jupyter-client==8.3.0 +jupyter-core==5.3.1 +kiwisolver==1.4.5 +lxml==4.9.3 +matplotlib==3.7.4 +matplotlib-inline==0.1.6 +multidict==6.0.4 +multiprocess==0.70.14 +nest-asyncio==1.5.6 +nltk==3.8.1 +numpy==1.24.3 +objsize==0.6.1 +orjson==3.9.5 +packaging==23.1 +pandas==2.0.2 +parso==0.8.3 +pexpect==4.8.0 +pickleshare==0.7.5 +Pillow==10.1.0 +platformdirs==3.8.0 +pluggy==1.2.0 +prompt-toolkit==3.0.39 +proto-plus==1.22.3 +protobuf==4.23.4 +psutil==5.9.5 +ptyprocess==0.7.0 +pure-eval==0.2.2 +pyarrow==11.0.0 +pybind11==2.10.4 +pydot==1.4.2 +Pygments==2.15.1 +pymongo==4.5.0 +pyparsing==3.1.1 +pytest==7.4.0 +python-dateutil==2.8.2 +python-Levenshtein==0.12.2 +pytz==2023.3 +PyYAML==6.0 +pyzmq==25.1.0 +regex==2023.6.3 +requests==2.31.0 +responses==0.18.0 +safetensors==0.3.1 +scikit-learn==1.2.2 +scipy==1.10.1 +seaborn==0.13.0 +sentencepiece==0.1.99 +six==1.16.0 +smart-open==6.3.0 +soupsieve==2.4.1 +stack-data==0.6.2 +threadpoolctl==3.1.0 +tokenizers==0.13.3 +tomli==2.0.1 +tornado==6.3.2 +tqdm==4.65.0 +traitlets==5.9.0 +transformers==4.30.2 +typing-extensions==4.6.2 +tzdata==2023.3 +Unidecode==1.3.7 +urllib3==2.0.2 +wcwidth==0.2.6 +xxhash==3.2.0 +yarl==1.9.2 +zipp==3.15.0 +zstandard==0.21.0