diff --git a/WEAT/run_weat.sh b/WEAT/run_weat.sh
new file mode 100644
index 0000000000000000000000000000000000000000..e29a45b155f4208cb75679e2fe8371669143d901
--- /dev/null
+++ b/WEAT/run_weat.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+#
+#SBATCH --job-name=weat
+#SBATCH --output=weat_output_4.txt
+#SBATCH --mem=32G
+#SBATCH --partition=compute
+#SBATCH --cpus-per-task=32
+#SBATCH --mail-user=reichelt@cl.uni-heidelberg.de
+#SBATCH --mail-type=ALL
+#SBATCH --time=3-00:00:00
+
+# JOB STEPS
+source /home/students/reichelt/ba/bias-mitigation-ba/bias-venv/bin/activate
+
+srun python /home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py --attribute italian --vector_location /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/glove/dd-glove/glove_hard_debiased_polish_w2vformat.txt
+srun python /home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py --attribute turkish --vector_location /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/glove/dd-glove/glove_hard_debiased_italian_w2vformat.txt
+srun python /home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py --attribute polish --vector_location /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/glove/dd-glove/glove_hard_debiased_italian_w2vformat.txt
diff --git a/WEAT/weat_experiments.ipynb b/WEAT/weat_experiments.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..78a949cd605052e835669fad68c40d8b010e4c9c
--- /dev/null
+++ b/WEAT/weat_experiments.ipynb
@@ -0,0 +1,1164 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "8EUMdJYWwb5H"
+ },
+ "source": [
+ "# WEAT experiments\n",
+ "\n",
+ "This notebook checks bias in various embeddings (pretrained German deepset GloVe embeddings, pretrained German fastText embeddings, self-trained GloVe embeddings, self-trained \"debiased\" DD-GloVe embeddings), using my WEAT reimplementation and either my new name lists or the original translated German lists from Kurpicz-Briki."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Ss1A3qPTOhLN"
+ },
+ "source": [
+ "## Preparation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "Psu1iIJyv4w0",
+ "outputId": "f536db15-615b-4660-c1a3-a4996fc25409"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Collecting fasttext\n",
+ "  Downloading fasttext-0.9.2.tar.gz (68 kB)\n",
+ "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m68.8/68.8 kB\u001b[0m \u001b[31m1.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25h  Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ "Collecting pybind11>=2.2 (from fasttext)\n",
+ "  Using cached pybind11-2.11.1-py3-none-any.whl (227 kB)\n",
+ "Requirement already satisfied: setuptools>=0.7.0 in /usr/local/lib/python3.10/dist-packages (from fasttext) (67.7.2)\n",
+ "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from fasttext) (1.23.5)\n",
+ "Building wheels for collected packages: fasttext\n",
+ "  Building wheel for fasttext (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + " Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4199770 sha256=957cb165a6122eee2f7cb2d97ef261efcec55e77b30450f5453dfda81d903666\n", + " Stored in directory: /root/.cache/pip/wheels/a5/13/75/f811c84a8ab36eedbaef977a6a58a98990e8e0f1967f98f394\n", + "Successfully built fasttext\n", + "Installing collected packages: pybind11, fasttext\n", + "Successfully installed fasttext-0.9.2 pybind11-2.11.1\n" + ] + } + ], + "source": [ + "!pip install fasttext\n", + "import fasttext" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "bJD6jAi74Qvl", + "outputId": "3660dc64-ccb3-4695-ba79-d9c443d95aaf" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mounted at /content/drive\n" + ] + } + ], + "source": [ + "from google.colab import drive\n", + "drive.mount('/content/drive')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3F_XXAic4GT6" + }, + "outputs": [], + "source": [ + "from sklearn.metrics.pairwise import cosine_similarity as cosine\n", + "import numpy as np\n", + "from itertools import combinations, islice\n", + "import pandas as pd\n", + "from tqdm import tqdm\n", + "import gzip\n", + "from gensim.models import KeyedVectors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0Cfc4Heu4GT_" + }, + "outputs": [], + "source": [ + "np.random.seed(42)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZdvHcbGfOq6X" + }, + "source": [ + "### Target words\n", + "\n", + "Choose a WEAT set." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-iCKpnmBzUlt" + }, + "outputs": [], + "source": [ + "# load German target words for origin as per Kurpicz-Briki 2020 (WEAT 5)\n", + "\n", + "pleasant = {\n", + " \"Spaß\": [],\n", + " \"Liebe\": [],\n", + " \"Frieden\": [],\n", + " \"wunderbar\": [],\n", + " \"Freude\": [],\n", + " \"Lachen\": [],\n", + " \"Glück\": [],\n", + "}\n", + "\n", + "unpleasant = {\n", + " \"Qual\": [],\n", + " \"furchtbar\": [],\n", + " \"schrecklich\": [],\n", + " \"übel\": [],\n", + " \"böse\": [],\n", + " \"Krieg\": [],\n", + " \"grauenhaft\": [],\n", + " \"Versagen\": []\n", + "}\n", + "\n", + "target_dicts = {\"pleasant\": pleasant, \"unpleasant\": unpleasant}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vKG6On74vNYl" + }, + "outputs": [], + "source": [ + "# load English target for gender as per Caliskan et al. 2017 (WEAT 6)\n", + "\n", + "career = {\n", + " \"executive\": [],\n", + " \"management\": [],\n", + " \"professional\": [],\n", + " \"corporation\": [],\n", + " \"salary\": [],\n", + " \"office\": [],\n", + " \"business\": [],\n", + " \"career\": []\n", + "}\n", + "\n", + "family = {\n", + " \"home\": [],\n", + " \"parents\": [],\n", + " \"children\": [],\n", + " \"family\": [],\n", + " \"cousins\": [],\n", + " \"marriage\": [],\n", + " \"wedding\": [],\n", + " \"relatives\": []\n", + "}\n", + "\n", + "target_dicts = {\"pleasant\": career, \"unpleasant\": family}\n", + "# 'pleasant' and 'unpleasant' here refer to the other WEAT experiments where\n", + "# the target word groups are named such" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nt4eT41zYYOe" + }, + "outputs": [], + "source": [ + "# load English target for race as per Caliskan et al. 
2017 (WEAT 5)\n", + "\n", + "pleasant = {\n", + " \"joy\": [],\n", + " \"love\": [],\n", + " \"peace\": [],\n", + " \"wonderful\": [],\n", + " \"pleasure\": [],\n", + " \"friend\": [],\n", + " \"laughter\": [],\n", + " \"happy\": []\n", + "}\n", + "\n", + "unpleasant = {\n", + " \"agony\": [],\n", + " \"terrible\": [],\n", + " \"horrible\": [],\n", + " \"nasty\": [],\n", + " \"evil\": [],\n", + " \"war\": [],\n", + " \"awful\": [],\n", + " \"failure\": []\n", + "}\n", + "\n", + "target_dicts = {\"pleasant\": pleasant, \"unpleasant\": unpleasant}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WcdxB4eMOyFW" + }, + "source": [ + "### Attribute words\n", + "\n", + "Choose **one** of the following ways to load attribute words. (new or old name lists, different WEAT experiments)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_scMqMc84GUA" + }, + "outputs": [], + "source": [ + "# loading expanded name lists (attribute words)\n", + "names_df = pd.read_csv(\"/content/drive/MyDrive/BA/name_selection.csv\", usecols=[\"name\", \"keep\", \"nationality\"])\n", + "names_df = names_df[names_df[\"keep\"] == \"y\"] # discard all rows where \"keep\" is not \"y\"\n", + "\n", + "grouped_df = names_df.groupby(\"nationality\")\n", + "\n", + "name_dicts = {} # will be: {\"German\": {\"Sabine\": [], \"Susanne\": [], ...}, ...}\n", + "\n", + "for nationality, group in grouped_df:\n", + " names = group[\"name\"].tolist()\n", + " name_dict = {key: [] for key in names}\n", + " name_dicts[nationality] = name_dict" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dDtHna2JVIqv" + }, + "outputs": [], + "source": [ + "# loading German attribute words for origin as per Kurpicz-Briki 2020 (WEAT 5)\n", + "german = {\n", + " \"Peter\": [],\n", + " \"Daniel\": [],\n", + " \"Hans\": [],\n", + " \"Thomas\": [],\n", + " \"Andreas\": [],\n", + " \"Martin\": [],\n", + " \"Markus\": [],\n", + " \"Michael\": [],\n", + " \"Maria\": [],\n", + " \"Anna\": [],\n", + " \"Ursula\": [],\n", + " \"Ruth\": [],\n", + " \"Monika\": [],\n", + " \"Elisabeth\": [],\n", + " \"Verena\": [],\n", + " \"Sandra\": []\n", + "}\n", + "\n", + "foreign = {\n", + " \"Ladina\": [],\n", + " \"Fatima\": [],\n", + " \"Fatma\": [],\n", + " \"Alma\": [],\n", + " \"Soraya\": [],\n", + " \"Svetlana\": [],\n", + " \"Elif\": [],\n", + " \"Vesna\": [],\n", + " \"Mehmet\": [],\n", + " \"Mustafa\": [],\n", + " \"Aleksandar\": [],\n", + " \"Mohamed\": [],\n", + " \"Ibrahim\": [],\n", + " \"Dragan\": [],\n", + " \"Hasan\": [],\n", + " \"Mohammad\": []\n", + "}\n", + "\n", + "name_dicts = {\"German\": german, \"Foreign\": foreign}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lGhhucvEyNVO" + }, + "outputs": [], + "source": [ + "# loading English target for gender as per Caliskan et al. 
2017 (WEAT 6)\n", + "\n", + "male = {\n", + " \"John\": [],\n", + " \"Paul\": [],\n", + " \"Mike\": [],\n", + " \"Kevin\": [],\n", + " \"Steve\": [],\n", + " \"Greg\": [],\n", + " \"Jeff\": [],\n", + " \"Bill\": []\n", + "}\n", + "\n", + "female = {\n", + " \"Amy\": [],\n", + " \"Joan\": [],\n", + " \"Lisa\": [],\n", + " \"Sarah\": [],\n", + " \"Diana\": [],\n", + " \"Kate\": [],\n", + " \"Ann\": [],\n", + " \"Donna\": []\n", + "}\n", + "\n", + "name_dicts = {\"German\": male, \"Foreign\": female}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3f6iiv-QY9z1" + }, + "outputs": [], + "source": [ + "# loading English target for race as per Caliskan et al. 2017 (WEAT 5)\n", + "\n", + "european_american = {\n", + " \"Brad\": [],\n", + " \"Brendan\": [],\n", + " \"Geoffrey\": [],\n", + " \"Greg\": [],\n", + " \"Brett\": [],\n", + " \"Jay\": [],\n", + " \"Matthew\": [],\n", + " \"Neil\": [],\n", + " \"Todd\": [],\n", + " \"Allison\": [],\n", + " \"Anne\": [],\n", + " \"Carrie\": [],\n", + " \"Emily\": [],\n", + " \"Jill\": [],\n", + " \"Laurie\": [],\n", + " \"Kristen\": [],\n", + " \"Meredith\": [],\n", + " \"Sarah\": []\n", + "}\n", + "\n", + "african_american = {\n", + " \"Darnell\": [],\n", + " \"Hakim\": [],\n", + " \"Jermaine\": [],\n", + " \"Kareem\": [],\n", + " \"Jamal\": [],\n", + " \"Leroy\": [],\n", + " \"Rasheed\": [],\n", + " \"Tremayne\": [],\n", + " \"Tyrone\": [],\n", + " \"Aisha\": [],\n", + " \"Ebony\": [],\n", + " \"Keisha\": [],\n", + " \"Kenya\": [],\n", + " \"Latonya\": [],\n", + " \"Lakisha\": [],\n", + " \"Latoya\": [],\n", + " \"Tamika\": [],\n", + " \"Tanisha\": []\n", + "}\n", + "\n", + "name_dicts = {\"German\": european_american, \"Foreign\": african_american}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vsrlh17mPAWp" + }, + "source": [ + "### Embeddings\n", + "\n", + "Choose **one** of the following ways to load embeddings." 
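+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# (Added sketch, not part of the original workflow.) After running one of\n",
+ "# the loaders below, this checks that every word actually received a\n",
+ "# vector and no [] placeholder survived:\n",
+ "def check_filled(all_dicts):\n",
+ "    for label, groups in all_dicts.items():\n",
+ "        for group, words in groups.items():\n",
+ "            missing = [w for w, v in words.items() if len(v) == 0]\n",
+ "            if missing:\n",
+ "                print(f\"{label}/{group} still empty: {missing}\")\n",
+ "\n",
+ "# usage: check_filled({\"names\": name_dicts, \"targets\": target_dicts})"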
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "K62RNLToPGkM"
+ },
+ "source": [
+ "#### fastText"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "jKFt0Eq_Z05c",
+ "outputId": "25983f7e-4418-4565-b88e-79e7b1dcac6a"
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n"
+ ]
+ }
+ ],
+ "source": [
+ "# load pretrained fastText embeddings\n",
+ "!gunzip /content/drive/MyDrive/BA/cc.de.300.bin.gz -c > /content/cc.de.300.bin\n",
+ "ft = fasttext.load_model(\"/content/cc.de.300.bin\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "68y2X9xvaw92"
+ },
+ "outputs": [],
+ "source": [
+ "# get fastText attribute vectors\n",
+ "for i, name_dict in name_dicts.items():\n",
+ "    for key, _ in name_dict.items():\n",
+ "        token = key.lower()\n",
+ "        vec = ft.get_word_vector(token) # vec is numpy array\n",
+ "        name_dicts[i][key] = vec.tolist()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "H2LNWi0yMLHt"
+ },
+ "outputs": [],
+ "source": [
+ "# get fastText target vectors\n",
+ "for i, target_dict in target_dicts.items():\n",
+ "    for key, _ in target_dict.items():\n",
+ "        token = key.lower()\n",
+ "        vec = ft.get_word_vector(token) # vec is numpy array\n",
+ "        target_dicts[i][key] = vec.tolist()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "p0YHNgKGLjkl"
+ },
+ "outputs": [],
+ "source": [
+ "# Check if all the words are in the fastText vocabulary\n",
+ "X = name_dicts[\"German\"]\n",
+ "Y = name_dicts[\"Turkish\"]\n",
+ "A = target_dicts[\"pleasant\"]\n",
+ "B = target_dicts[\"unpleasant\"]\n",
+ "for words in [A, B, X, Y]:\n",
+ "    for word in words:\n",
+ "        # get_word_id returns -1 for tokens outside the model's vocabulary\n",
+ "        # (lower-cased to match the lookup cells above)\n",
+ "        if ft.get_word_id(word.lower()) == -1:\n",
+ "            print(f\"{word} not in vocabulary\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "J1nHSRZtz0PZ"
+ },
+ "outputs": [],
+ "source": [
+ "# Substitute oov name\n",
+ "del name_dicts[\"Turkish\"][\"Miraç\"]\n",
+ "name_dicts[\"Turkish\"][\"Hasan\"] = []"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "QTbMC3ukPpdd"
+ },
+ "source": [
+ "#### GloVe Embeddings"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "it8AtBmPttSN"
+ },
+ "source": [
+ "Choose one of the following."
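+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# (Added sketch, not part of the original workflow.) The lookup cells\n",
+ "# further down scan the whole DataFrame once per token; building a\n",
+ "# token -> vector dict once, after embeddings_df has been loaded in the\n",
+ "# next cell, makes repeated lookups O(1):\n",
+ "if \"embeddings_df\" in globals():\n",
+ "    embedding_index = {row[0]: list(row[1:]) for row in embeddings_df.itertuples(index=False)}"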
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "1KysLezBt46I"
+ },
+ "outputs": [],
+ "source": [
+ "# Loading might take some time, only needs to be run once.\n",
+ "\n",
+ "#embeddings_df = pd.read_csv(\"/content/drive/MyDrive/BA/deepset_german_glove_vectors.txt\", sep=\" \")\n",
+ "#embeddings_df = pd.read_csv(\"/content/drive/MyDrive/BA/german_glove_vectors_no_debiasing.txt\", sep=\" \", skiprows=1, header=None)\n",
+ "#embeddings_df = pd.read_csv(\"/content/drive/MyDrive/BA/vectors_debiased_auslaender_deutscher_399994cap.txt\", sep=\" \", skiprows=1, header=None)\n",
+ "#embeddings_df = pd.read_csv(\"/content/drive/MyDrive/BA/vectors_deutscher_ausländer_top10.txt\", sep=\" \", skiprows=1, header=None)\n",
+ "#embeddings_df = pd.read_csv(\"/content/drive/MyDrive/BA/german_dd-glove_vectors_hard-debiased.txt\", sep=\" \")\n",
+ "#embeddings_df = pd.read_csv(\"/content/drive/MyDrive/BA/english_vectors_no_debiasing.txt\", sep=\" \", skiprows=1, header=None)\n",
+ "embeddings_df = pd.read_csv(\"/content/drive/MyDrive/BA/english_dd-glove_vectors_gender.txt\", sep=\" \", skiprows=1, header=None)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "mBYqT_XmtyRl"
+ },
+ "source": [
+ "Run the following independent of GloVe embedding choice:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "QE0tjubv1-q8"
+ },
+ "outputs": [],
+ "source": [
+ "# look up relevant target vectors\n",
+ "for category, target_dict in target_dicts.items():\n",
+ "    for key, _ in target_dict.items():\n",
+ "        token = key.lower()\n",
+ "        if token in embeddings_df.iloc[:, 0].values:\n",
+ "            # get the row where the token was found\n",
+ "            row = embeddings_df.loc[embeddings_df.iloc[:, 0] == token]\n",
+ "            # get the vector\n",
+ "            vec = row.iloc[:, 1:].values.flatten().tolist()\n",
+ "            target_dicts[category][key] = vec\n",
+ "        else:\n",
+ "            # token not in vocab\n",
+ "            print(f'{token} not in vocab')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "6SEoeNm9qK0W"
+ },
+ "outputs": [],
+ "source": [
+ "# replacement for oov tokens - ONLY IF NECESSARY\n",
+ "del target_dicts[\"unpleasant\"][\"scheußlich\"]\n",
+ "target_dicts[\"unpleasant\"][\"entsetzlich\"] = []"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "i0001jeu7kcX"
+ },
+ "outputs": [],
+ "source": [
+ "# look up relevant attribute vectors\n",
+ "for nationality, name_dict in name_dicts.items():\n",
+ "    for name, _ in name_dict.items():\n",
+ "        token = name.lower()\n",
+ "        if token in embeddings_df.iloc[:, 0].values:\n",
+ "            # get the row where the token was found\n",
+ "            row = embeddings_df.loc[embeddings_df.iloc[:, 0] == token]\n",
+ "            # get the vector\n",
+ "            vec = row.iloc[:, 1:].values.flatten().tolist()\n",
+ "            name_dicts[nationality][name] = vec\n",
+ "        else:\n",
+ "            # token not in vocab\n",
+ "            print(f'{token} not in vocab')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "37euUn74s-6U"
+ },
+ "outputs": [],
+ "source": [
+ "# replacement for oov tokens - ONLY IF NECESSARY\n",
+ "\n",
+ "del name_dicts[\"Polish\"][\"Oliwia\"]\n",
+ "name_dicts[\"Polish\"][\"Wiktoria\"] = []\n",
+ "\n",
+ "del name_dicts[\"Turkish\"][\"Miraç\"]\n",
+ "name_dicts[\"Turkish\"][\"Yusuf\"] = []\n",
+ "\n",
+ "del name_dicts[\"Turkish\"][\"Ayşegül\"]\n",
+ "name_dicts[\"Turkish\"][\"Rabia\"] = []\n",
+ "\n",
+ "del name_dicts[\"Turkish\"][\"Büşra\"]\n",
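+ "# NOTE: \"Amina\" is used again below as the stand-in for \"Zehra\"; pick a\n",
+ "# second distinct name, or the Turkish list ends up one name short.\n",
+ 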
"name_dicts[\"Turkish\"][\"Amina\"] = []\n", + "\n", + "del name_dicts[\"Turkish\"][\"Melike\"]\n", + "name_dicts[\"Turkish\"][\"Elif\"] = []\n", + "\n", + "del name_dicts[\"Turkish\"][\"Zehra\"]\n", + "name_dicts[\"Turkish\"][\"Amina\"] = []" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YMsRXwQTpPxe" + }, + "source": [ + "#### Word2Vec" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "DFWIIhNHpWbi" + }, + "outputs": [], + "source": [ + "# CHOOSE either original or debiased (Bolukbasi) embeddings\n", + "#file_path = \"/content/drive/MyDrive/BA/GoogleNews-vectors-negative300-hard-debiased.bin.gz\"\n", + "#file_path = \"/content/drive/MyDrive/BA/GoogleNews-vectors-negative300.bin.gz\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "neRMLx7JAO3N" + }, + "outputs": [], + "source": [ + "# unzip\n", + "with gzip.open(file_path, \"rb\") as gz_file:\n", + " with open(\"/content/drive/MyDrive/BA/w2v_hard-debiased.bin\", \"wb\") as out_file:\n", + " out_file.write(gz_file.read())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_VLAXq2XAYq5" + }, + "outputs": [], + "source": [ + "# load model\n", + "model = KeyedVectors.load_word2vec_format(\"/content/drive/MyDrive/BA/english_w2v_original.bin\", binary=True)\n", + "#model = KeyedVectors.load_word2vec_format(\"/content/drive/MyDrive/BA/english_w2v_hard-debiased.bin\", binary=True)\n", + "#model = KeyedVectors.load_word2vec_format(\"/content/drive/MyDrive/BA/english_w2v_hard-debiased_reproduction.bin\", binary=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MDWGTkEHsDPf" + }, + "outputs": [], + "source": [ + "# look up relevant target vectors\n", + "for category, target_dict in target_dicts.items():\n", + " for key, _ in target_dict.items():\n", + " #token = key.lower()\n", + " token = key\n", + " if token in model:\n", + " vec = model[token]\n", + " target_dicts[category][key] = vec\n", + " else:\n", + " print(f'{token} not in vocab')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "TV5yj17utULE" + }, + "outputs": [], + "source": [ + "# replacement for oov tokens\n", + "\n", + "# run this after the cell below and only if any of the names/target words were\n", + "# not in the vocabulary. 
here is an example of how to replace an oov word.\n",
+ "# after replacing all oov words, rerun the lookup cell above (target\n",
+ "# vectors) and the one below (attribute vectors).\n",
+ "\n",
+ "del name_dicts[\"Polish\"][\"Oliwia\"]\n",
+ "name_dicts[\"Polish\"][\"Wiktoria\"] = []"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "cQFQB_wttt5g"
+ },
+ "outputs": [],
+ "source": [
+ "# look up relevant attribute vectors\n",
+ "for category, name_dict in name_dicts.items():\n",
+ "    for name, _ in name_dict.items():\n",
+ "        #token = name.lower()\n",
+ "        token = name\n",
+ "        if token in model:\n",
+ "            vec = model[token]\n",
+ "            name_dicts[category][name] = vec\n",
+ "        else:\n",
+ "            print(f'{token} not in vocab')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "2_9g4NLyS-BQ"
+ },
+ "source": [
+ "### WEAT functions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "VbaSa5KC4GUR"
+ },
+ "outputs": [],
+ "source": [
+ "def attribute_association_s(word_vector, target_set1, target_set2):\n",
+ "    \"\"\"Mean cosine similarity to set 1 minus mean cosine similarity to set 2.\"\"\"\n",
+ "    reshaped_word_vector = np.array(word_vector).reshape(1, -1)\n",
+ "    sims1 = [cosine(reshaped_word_vector, np.array(vec).reshape(1, -1)) for vec in list(target_set1.values())]\n",
+ "    sims2 = [cosine(reshaped_word_vector, np.array(vec).reshape(1, -1)) for vec in list(target_set2.values())]\n",
+ "    return np.mean(sims1) - np.mean(sims2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "79qgg0Ee4GUQ"
+ },
+ "outputs": [],
+ "source": [
+ "def differential_association_s(attr1, attr2, target1, target2):\n",
+ "    \"\"\"WEAT test statistic: summed association scores of attr1 minus those of attr2.\"\"\"\n",
+ "    sum1 = sum([attribute_association_s(vec, target1, target2) for vec in list(attr1.values())])\n",
+ "    sum2 = sum([attribute_association_s(vec, target1, target2) for vec in list(attr2.values())])\n",
+ "    return sum1 - sum2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "OYrYOo4Z4GUT"
+ },
+ "outputs": [],
+ "source": [
+ "def cohens_d(target1, target2, attr1, attr2):\n",
+ "    \"\"\"Effect size: difference of mean associations over the std dev of all associations.\"\"\"\n",
+ "    mean1 = np.mean([attribute_association_s(x, attr1, attr2) for x in list(target1.values())])\n",
+ "    mean2 = np.mean([attribute_association_s(x, attr1, attr2) for x in list(target2.values())])\n",
+ "    join = list(target1.values()) + list(target2.values())\n",
+ "    joint_association = [attribute_association_s(x, attr1, attr2) for x in join]\n",
+ "    stddev = np.std(joint_association)\n",
+ "    return (mean1 - mean2) / stddev"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "2bVhqL7c4GUW"
+ },
+ "outputs": [],
+ "source": [
+ "def permutations(target1, target2):\n",
+ "    \"\"\"Build up to 10,000 equal-size re-groupings of the joined keys.\"\"\"\n",
+ "    join = list(target1.keys()) + list(target2.keys())\n",
+ "    combs = list(islice(combinations(join, len(join) // 2), 10000))\n",
+ "    first_groups = []\n",
+ "    second_groups = []\n",
+ "    for c in combs:\n",
+ "        rest = []\n",
+ "        for e in join:\n",
+ "            if e not in c:\n",
+ "                rest.append(e)\n",
+ "        first_groups.append(c)\n",
+ "        second_groups.append(rest)\n",
+ "    return first_groups, second_groups"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "rtLo55zP4GUX"
+ },
+ "outputs": [],
+ "source": [
+ "def p_value(comparison, X_perms, Y_perms, target1, target2, attr1, attr2):\n",
+ "    \"\"\"Count permutations whose differential association exceeds the observed one.\"\"\"\n",
+ "    counter = 0\n",
+ "    joint_dict = {**target1, **target2} # to easily get access to all word-vector pairs\n",
+ "    for i, _ in tqdm(enumerate(X_perms)):\n",
+ "        X_subset = dict() # in X_perms there are only the keys, we need the values (vectors) back\n",
+ "        for key in X_perms[i]:\n",
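+ "            # rebuild the key -> vector mapping for this permuted grouping\n",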
+ " X_subset[key] = joint_dict[key]\n", + " Y_subset = dict()\n", + " for key in Y_perms[i]:\n", + " Y_subset[key] = joint_dict[key]\n", + " if differential_association_s(X_subset, Y_subset, A, B) > comparison: # why not >=?\n", + " counter += 1\n", + " return counter" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "x0SzFOLyTW5U" + }, + "source": [ + "## Calculate scores" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZjzVNvuy-bEv" + }, + "outputs": [], + "source": [ + "# determine X, Y, A, B\n", + "X = name_dicts[\"German\"]\n", + "Y = name_dicts[\"Foreign\"]\n", + "A = target_dicts[\"pleasant\"]\n", + "B = target_dicts[\"unpleasant\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "e6HGjHFS4GUS", + "outputId": "fa319197-48d7-4490-83d3-ef5d1c819c7d" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.992015104149375" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "original_diff_association = differential_association_s(X, Y, A, B)\n", + "original_diff_association" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "U3xW8KHu4GUV", + "outputId": "7255627b-6df9-4b35-bc6a-d065833a76e7" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "1.8821553290741988" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cohens_d(X, Y, A, B)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "LaCXCNDh4GUX" + }, + "outputs": [], + "source": [ + "X_i, Y_i = permutations(X, Y)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "7NxAxxFX4GUY", + "outputId": "3d30b795-244e-4c37-a7e0-ad1caf8d1ccb" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "10000it [15:23, 10.83it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "p-value: 0.0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "p_value_count = p_value(original_diff_association, X_i, Y_i, X, Y, A, B)\n", + "result = p_value_count/10000\n", + "print(f\"\\np-value: {result}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tQSxqS6t4eOa" + }, + "source": [ + "# Sanity checks\n", + "\n", + "Trying to find out if my trained vectors are potentially semantically sensible." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "bECLGNf35VS6" + }, + "outputs": [], + "source": [ + "def get_vec(token: str) -> np.array:\n", + " \"\"\"For the given token, look up the corresponding embedding.\"\"\"\n", + " if token in embeddings_df.iloc[:, 0].values:\n", + " # get the row where the token was found\n", + " row = embeddings_df.loc[embeddings_df.iloc[:, 0] == token]\n", + " # get the vector\n", + " vec = row.iloc[:, 1:].values.flatten().tolist()\n", + " return np.array(vec)\n", + " else:\n", + " raise ValueError(f'{token} not in vocab')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "iI_wva_74hIS" + }, + "outputs": [], + "source": [ + "blue_vec = get_vec(\"blau\")\n", + "red_vec = get_vec(\"rot\")\n", + "table_vec = get_vec(\"tisch\")\n", + "tulip_vec = get_vec(\"tulpe\")\n", + "flower_vec = get_vec(\"blume\")\n", + "king_vec = get_vec(\"könig\")\n", + "queen_vec = get_vec(\"königin\")\n", + "beautiful_vec = get_vec(\"schön\")\n", + "pretty_vec = get_vec(\"hübsch\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "R2KEDAMl-URB", + "outputId": "a4c106c1-d2cf-4985-ffca-21c6404bad29" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cos blau zu rot: [[0.84387678]]\n", + "cos blau zu tisch: [[0.21278686]]\n", + "cos tulpe zu blume: [[0.23828755]]\n", + "cos königin zu könig: [[0.63659369]]\n", + "cos schön zu hübsch: [[0.52444947]]\n" + ] + } + ], + "source": [ + "# for self-trained glove embeddings\n", + "print(f\"cos blau zu rot: {cosine(blue_vec.reshape(1, -1), red_vec.reshape(1, -1))}\")\n", + "print(f\"cos blau zu tisch: {cosine(blue_vec.reshape(1, -1), table_vec.reshape(1, -1))}\")\n", + "print(f\"cos tulpe zu blume: {cosine(tulip_vec.reshape(1, -1), flower_vec.reshape(1, -1))}\")\n", + "print(f\"cos königin zu könig: {cosine(queen_vec.reshape(1, -1), king_vec.reshape(1, -1))}\")\n", + "print(f\"cos schön zu hübsch: {cosine(beautiful_vec.reshape(1, -1), pretty_vec.reshape(1, -1))}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "n5o4CP_6IFBy", + "outputId": "10fde1cd-2697-4985-a3e1-a08883bd225b" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cos blau zu rot: [[0.84387678]]\n", + "cos blau zu tisch: [[0.21278686]]\n", + "cos tulpe zu blume: [[0.23828755]]\n", + "cos königin zu könig: [[0.63659369]]\n", + "cos schön zu hübsch: [[0.52444947]]\n" + ] + } + ], + "source": [ + "# for self-trained dd-glove embeddings\n", + "print(f\"cos blau zu rot: {cosine(blue_vec.reshape(1, -1), red_vec.reshape(1, -1))}\")\n", + "print(f\"cos blau zu tisch: {cosine(blue_vec.reshape(1, -1), table_vec.reshape(1, -1))}\")\n", + "print(f\"cos tulpe zu blume: {cosine(tulip_vec.reshape(1, -1), flower_vec.reshape(1, -1))}\")\n", + "print(f\"cos königin zu könig: {cosine(queen_vec.reshape(1, -1), king_vec.reshape(1, -1))}\")\n", + "print(f\"cos schön zu hübsch: {cosine(beautiful_vec.reshape(1, -1), pretty_vec.reshape(1, -1))}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "N2OjHFe7IOqa", + "outputId": "c4f8650d-1061-46bf-94ef-20cddbd35463" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cos blau zu rot: 
[[0.84387678]]\n", + "cos blau zu tisch: [[0.21278686]]\n", + "cos tulpe zu blume: [[0.23828755]]\n", + "cos königin zu könig: [[0.63659369]]\n", + "cos schön zu hübsch: [[0.52444947]]\n" + ] + } + ], + "source": [ + "# for deepset glove embeddings\n", + "print(f\"cos blau zu rot: {cosine(blue_vec.reshape(1, -1), red_vec.reshape(1, -1))}\")\n", + "print(f\"cos blau zu tisch: {cosine(blue_vec.reshape(1, -1), table_vec.reshape(1, -1))}\")\n", + "print(f\"cos tulpe zu blume: {cosine(tulip_vec.reshape(1, -1), flower_vec.reshape(1, -1))}\")\n", + "print(f\"cos königin zu könig: {cosine(queen_vec.reshape(1, -1), king_vec.reshape(1, -1))}\")\n", + "print(f\"cos schön zu hübsch: {cosine(beautiful_vec.reshape(1, -1), pretty_vec.reshape(1, -1))}\")" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [ + "K62RNLToPGkM", + "YMsRXwQTpPxe", + "tQSxqS6t4eOa" + ], + "provenance": [] + }, + "interpreter": { + "hash": "a4920fb32b6031dd460b788b209f98e92a11c7dc559d382140b62028ad10923d" + }, + "kernelspec": { + "display_name": "Python 3.8.10 ('bias-venv': venv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/WEAT/weat_experiments.py b/WEAT/weat_experiments.py new file mode 100644 index 0000000000000000000000000000000000000000..007a6f7c3b17a07cc47b88d8413b717264052147 --- /dev/null +++ b/WEAT/weat_experiments.py @@ -0,0 +1,242 @@ +""" +A .py version of the WEAT experiments notebook, created so I can +run the script using SBATCH with various parameters. Limited to +German options and does not include W2V, unlike notebook. + +Parallelizes the permutation test. + +Checks bias in various pre-trained embeddings using the WEAT. Different +embeddings and attribute lists can be chosen. +""" +import argparse +from itertools import combinations, islice +from sklearn.metrics.pairwise import cosine_similarity as cosine +import numpy as np +from gensim.models import KeyedVectors +from concurrent.futures import ProcessPoolExecutor + + +def get_target_words(version: str) -> dict: + """ + load German target words for origin as per Kurpicz-Briki 2020 (WEAT 5), + or alternatively WEAT 6 target words from Caliskan et al 2017 + """ + if version == "reproduction": + pleasant = { # aka career + "executive": [], "management": [], "professional": [], "corporation": [], + "salary": [], "office": [], "business": [], "career": [], + } + unpleasant = { # aka family + "home": [], "parents": [], "children": [], "family": [], "cousins": [], + "marriage": [], "wedding": [], "relatives": [], + } + else: + pleasant = { + "spaß": [], + "liebe": [], + "frieden": [], + "wunderbar": [], + "freude": [], + "lachen": [], + "glück": [], + } + unpleasant = { + "qual": [], + "furchtbar": [], + "schrecklich": [], + "übel": [], + "böse": [], + "krieg": [], + "grausam": [], + "versagen": [] + } + return {"pleasant": pleasant, "unpleasant": unpleasant} + + +def get_attribute_words(version: str) -> dict: + """Load name lists depending on country. 
Alternatively load the version by Kurpicz-Briki."""
+    if version == "kurpicz":
+        german = {
+            "Peter": [], "Daniel": [], "Hans": [], "Thomas": [], "Andreas": [],
+            "Martin": [], "Markus": [], "Michael": [], "Maria": [], "Anna": [],
+            "Ursula": [], "Ruth": [], "Monika": [], "Elisabeth": [], "Verena": [],
+            "Sandra": []
+        }
+
+        foreign = {
+            "Ladina": [], "Fatima": [], "Fatma": [], "Alma": [], "Soraya": [],
+            "Svetlana": [], "Elif": [], "Vesna": [], "Mehmet": [], "Mustafa": [],
+            "Aleksandar": [], "Mohamed": [], "Ibrahim": [], "Dragan": [],
+            "Hasan": [], "Mohammad": []
+        }
+
+        return {"german": german, "foreign": foreign}
+    elif version == "reproduction":
+        german = {  # aka male
+            "John": [], "Paul": [], "Mike": [], "Kevin": [], "Steve": [],
+            "Greg": [], "Jeff": [], "Bill": []
+        }
+        foreign = {  # aka female
+            "Amy": [], "Joan": [], "Lisa": [], "Sarah": [], "Diana": [],
+            "Kate": [], "Ann": [], "Donna": []
+        }
+        return {"german": german, "foreign": foreign}
+    else:
+        german = {
+            "Katharina": [], "Susanne": [], "Karin": [], "Ulrike": [], "Renate": [],
+            "Birgit": [], "Bettina": [], "Jutta": [], "Ute": [], "Cornelia": [],
+            "Katja": [], "Heike": [], "Stefanie": [], "Kerstin": [], "Tanja": [],
+            "Hans": [], "Carl": [], "Wolfgang": [], "Andreas": [], "Werner": [],
+            "Christoph": [], "Klaus": [], "Philipp": [], "Joachim": [], "Jürgen": [],
+            "Dieter": [], "Matthias": [], "Manfred": [], "Sebastian": [], "Rainer": []
+        }
+
+        if version == "turkish":
+            foreign = {
+                "Esra": [], "Merve": [], "Fatma": [], "Sibel": [], "Elif": [], "Ayşe": [],
+                "Emine": [], "Özlem": [], "Zeynep": [], "Hatice": [], "Dilek": [], "Ebru": [],
+                "Pınar": [], "Hülya": [], "Derya": [], "Mustafa": [], "Murat": [],
+                "Ahmet": [], "Kemal": [], "Orhan": [], "Hüseyin": [], "Bülent": [],
+                "Metin": [], "Ömer": [], "Emre": [], "Halil": [], "Erkan": [],
+                "Uğur": [], "Burak": [], "Volkan": []
+            }
+
+        elif version == "polish":
+            foreign = {
+                "Magdalena": [], "Ewa": [], "Zofia": [], "Beata": [], "Katarzyna": [],
+                "Krystyna": [], "Małgorzata": [], "Jadwiga": [], "Danuta": [],
+                "Elżbieta": [], "Urszula": [], "Alicja": [], "Aneta": [], "Iwona": [],
+                "Edyta": [], "Andrzej": [], "Stanisław": [], "Marek": [], "Józef": [],
+                "Henryk": [], "Krzysztof": [], "Władysław": [], "Tadeusz": [], "Piotr": [],
+                "Janusz": [], "Tomasz": [], "Wojciech": [], "Jakub": [], "Marcin": [],
+                "Franciszek": []
+            }
+
+        elif version == "italian":
+            foreign = {
+                "Caterina": [], "Francesca": [], "Paola": [], "Giulia": [], "Chiara": [],
+                "Giovanna": [], "Alessandra": [], "Gioia": [], "Antonella": [],
+                "Giuseppina": [], "Azzurra": [], "Antonietta": [], "Ambra": [],
+                "Alessia": [], "Giorgia": [], "Giovanni": [], "Carlo": [],
+                "Francesco": [], "Giuseppe": [], "Pietro": [], "Luigi": [], "Paolo": [],
+                "Alessandro": [], "Angelo": [], "Giorgio": [], "Domenico": [],
+                "Enrico": [], "Stefano": [], "Vincenzo": [], "Matteo": []
+            }
+
+        else:
+            raise ValueError("Invalid version specified. 
See --help") + + return {"german": german, "foreign": foreign} + + +def get_embeddings(lookup_dict: dict, embeddings) -> dict(): + """Go through nested seed dicts and look up embedding for each word""" + for category, seeds in lookup_dict.items(): + for word, _ in seeds.items(): + if word.lower() in embeddings: + seeds[word] = embeddings[word.lower()] + else: + raise KeyError(f"'{word}' not in vocabulary") + lookup_dict[category] = seeds + return lookup_dict + + +def attribute_association_s(word_vector, target_set1, target_set2): + reshaped_word_vector = np.array(word_vector).reshape(1, -1) + sims1 = [cosine(reshaped_word_vector, np.array(vec).reshape(1, -1)) for vec in list(target_set1.values())] + sims2 = [cosine(reshaped_word_vector, np.array(vec).reshape(1, -1)) for vec in list(target_set2.values())] + return np.mean(sims1) - np.mean(sims2) + + +def differential_association_s(attr1, attr2, target1, target2): + sum1 = sum([attribute_association_s(vec, target1, target2) for vec in list(attr1.values())]) + sum2 = sum([attribute_association_s(vec, target1, target2) for vec in list(attr2.values())]) + return sum1 - sum2 + + +def cohens_d_calc(target1, target2, attr1, attr2): + mean1 = np.mean([attribute_association_s(x, attr1, attr2) for x in list(target1.values())]) + mean2 = np.mean([attribute_association_s(x, attr1, attr2) for x in list(target2.values())]) + join = list(target1.values()) + (list(target2.values())) + joint_association = [attribute_association_s(x, attr1, attr2) for x in join] + stddev = np.std(joint_association) + return (mean1 - mean2) / stddev + + +def permutations(target1, target2): + join = list(target1.keys()) + list(target2.keys()) + combs = list(islice(combinations(join, int(len(join)/2)), 100000)) + first_groups = [] + second_groups = [] + for c in combs: + rest = [] + for e in join: + if e not in c: + rest.append(e) + first_groups.append(c) + second_groups.append(rest) + return first_groups, second_groups + + +def p_value_calc_worker(args): + X_subset, Y_subset, comparison, attr1, attr2 = args + return differential_association_s(X_subset, Y_subset, attr1, attr2) > comparison + +def p_value_calc(comparison, X_perms, Y_perms, target1, target2, attr1, attr2): + counter = 0 + joint_dict = {**target1, **target2} + + with ProcessPoolExecutor() as executor: + args_list = [] + for i, _ in enumerate(X_perms): + X_subset = {key: joint_dict[key] for key in X_perms[i]} + Y_subset = {key: joint_dict[key] for key in Y_perms[i]} + args_list.append((X_subset, Y_subset, comparison, attr1, attr2)) + + results = list(executor.map(p_value_calc_worker, args_list)) + counter = sum(results) + + return counter + +def calculate_WEAT(target_data: dict, attribute_data: dict) -> tuple: + X = attribute_data["german"] + Y = attribute_data["foreign"] + A = target_data["pleasant"] + B = target_data["unpleasant"] + + original_diff_association = differential_association_s(X, Y, A, B) + d = cohens_d_calc(X, Y, A, B) + X_i, Y_i = permutations(X, Y) + p_value_count = p_value_calc(original_diff_association, X_i, Y_i, X, Y, A, B) + p = p_value_count/100000 + + return d, p + + + +if __name__ == "__main__": + np.random.seed(42) + + parser = argparse.ArgumentParser( + description="Calculate WEAT score for given attributes and vectors") + parser.add_argument("--attribute", help="'kurpicz', 'turkish', 'polish', 'reproduction'") + parser.add_argument("--vector_location", help="specify a file path to embeddings") + args = parser.parse_args() + + print("Loading seed words...") + target_dicts = 
get_target_words(args.attribute) + attribute_dicts = get_attribute_words(args.attribute) + + print("Loading model...") + model = KeyedVectors.load_word2vec_format(args.vector_location, binary=False) + + print("Retrieving embeddings...") + target_dicts = get_embeddings(target_dicts, model) + attribute_dicts = get_embeddings(attribute_dicts, model) + + print("Calculating WEAT...") + cohens_d, p_value = calculate_WEAT(target_dicts, attribute_dicts) + + print(f"WEAT scores for: {args.attribute} test, vectors from {args.vector_location}") + print(f"Cohen's d: {cohens_d:.4f}, p-value: {p_value:.4f}") + print("-----------------------------------------------") diff --git a/WEAT/weat_output.txt b/WEAT/weat_output.txt new file mode 100644 index 0000000000000000000000000000000000000000..ba28d8e93d6795b6c492305025da7b51c8847a92 --- /dev/null +++ b/WEAT/weat_output.txt @@ -0,0 +1,226 @@ +Loading seed words... +Loading model... +Retrieving embeddings... +Calculating WEAT... +WEAT scores for: turkish test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/fasttext/wiki.de.vec +Cohen's d: 1.3571, p-value: 0.0109 +----------------------------------------------- + +Loading seed words... +Loading model... +Retrieving embeddings... +Calculating WEAT... +WEAT scores for: polish test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/fasttext/wiki.de.vec +Cohen's d: 0.2829, p-value: 0.5185 +----------------------------------------------- + +Loading seed words... +Loading model... +Retrieving embeddings... +Calculating WEAT... +WEAT scores for: italian test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/fasttext/wiki.de.vec +Cohen's d: 1.0331, p-value: 0.1082 +----------------------------------------------- + +Loading seed words... +Loading model... +Retrieving embeddings... +Calculating WEAT... +WEAT scores for: turkish test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/fasttext/fasttext_hard_debiased_turkish_w2vformat.txt +Cohen's d: 1.1332, p-value: 0.0573 +----------------------------------------------- + +Loading seed words... +Loading model... +Retrieving embeddings... +Calculating WEAT... +WEAT scores for: polish test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/fasttext/fasttext_hard_debiased_polish_w2vformat.txt +Cohen's d: 0.1786, p-value: 0.5699 +----------------------------------------------- + +Loading seed words... +Loading model... +Retrieving embeddings... +Calculating WEAT... +WEAT scores for: italian test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/fasttext/fasttext_hard_debiased_italian_w2vformat.txt +Cohen's d: 0.5896, p-value: 0.3510 +----------------------------------------------- + +Loading seed words... +Loading model... +Retrieving embeddings... +Traceback (most recent call last): + File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 212, in <module> + target_dicts = get_embeddings(target_dicts, model) + File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 117, in get_embeddings + raise KeyError(f"'{word}' not in vocabulary") +KeyError: "'scheußlich' not in vocabulary" +srun: error: node37: task 0: Exited with exit code 1 +Loading seed words... +Loading model... +Retrieving embeddings... 
+Traceback (most recent call last): + File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 212, in <module> + target_dicts = get_embeddings(target_dicts, model) + File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 117, in get_embeddings + raise KeyError(f"'{word}' not in vocabulary") +KeyError: "'scheußlich' not in vocabulary" +srun: error: node37: task 0: Exited with exit code 1 +Loading seed words... +Loading model... +Retrieving embeddings... +Traceback (most recent call last): + File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 212, in <module> + target_dicts = get_embeddings(target_dicts, model) + File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 117, in get_embeddings + raise KeyError(f"'{word}' not in vocabulary") +KeyError: "'scheußlich' not in vocabulary" +srun: error: node37: task 0: Exited with exit code 1 +Loading seed words... +Loading model... +Retrieving embeddings... +Traceback (most recent call last): + File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 212, in <module> + target_dicts = get_embeddings(target_dicts, model) + File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 117, in get_embeddings + raise KeyError(f"'{word}' not in vocabulary") +KeyError: "'scheußlich' not in vocabulary" +srun: error: node37: task 0: Exited with exit code 1 +Loading seed words... +Loading model... +Retrieving embeddings... +Traceback (most recent call last): + File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 212, in <module> + target_dicts = get_embeddings(target_dicts, model) + File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 117, in get_embeddings + raise KeyError(f"'{word}' not in vocabulary") +KeyError: "'scheußlich' not in vocabulary" +srun: error: node37: task 0: Exited with exit code 1 +Loading seed words... +Loading model... +Retrieving embeddings... +Traceback (most recent call last): + File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 212, in <module> + target_dicts = get_embeddings(target_dicts, model) + File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 117, in get_embeddings + raise KeyError(f"'{word}' not in vocabulary") +KeyError: "'scheußlich' not in vocabulary" +srun: error: node37: task 0: Exited with exit code 1 +Loading seed words... +Loading model... +Retrieving embeddings... +Traceback (most recent call last): + File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 212, in <module> + target_dicts = get_embeddings(target_dicts, model) + File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 117, in get_embeddings + raise KeyError(f"'{word}' not in vocabulary") +KeyError: "'scheußlich' not in vocabulary" +srun: error: node37: task 0: Exited with exit code 1 +Loading seed words... +Loading model... +Retrieving embeddings... +Calculating WEAT... +WEAT scores for: polish test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/fasttext/fasttext_hard_debiased_turkish_w2vformat.txt +Cohen's d: 0.2123, p-value: 0.5639 +----------------------------------------------- + +Loading seed words... +Loading model... +Retrieving embeddings... +Calculating WEAT... 
+WEAT scores for: italian test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/fasttext/fasttext_hard_debiased_turkish_w2vformat.txt +Cohen's d: 0.9480, p-value: 0.1469 +----------------------------------------------- + +Loading seed words... +Loading model... +Retrieving embeddings... +Calculating WEAT... +WEAT scores for: turkish test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/fasttext/fasttext_hard_debiased_polish_w2vformat.txt +Cohen's d: 1.3612, p-value: 0.0103 +----------------------------------------------- + +Loading seed words... +Loading model... +Retrieving embeddings... +Calculating WEAT... +WEAT scores for: italian test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/fasttext/fasttext_hard_debiased_polish_w2vformat.txt +Cohen's d: 1.0477, p-value: 0.1018 +----------------------------------------------- + +Loading seed words... +Loading model... +Retrieving embeddings... +Calculating WEAT... +WEAT scores for: turkish test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/fasttext/fasttext_hard_debiased_italian_w2vformat.txt +Cohen's d: 1.3299, p-value: 0.0139 +----------------------------------------------- + +Loading seed words... +Loading model... +Retrieving embeddings... +Calculating WEAT... +WEAT scores for: polish test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/fasttext/fasttext_hard_debiased_italian_w2vformat.txt +Cohen's d: 0.1901, p-value: 0.5747 +----------------------------------------------- + +Loading seed words... +Loading model... +Retrieving embeddings... +Traceback (most recent call last): + File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 212, in <module> + target_dicts = get_embeddings(target_dicts, model) + File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 117, in get_embeddings + raise KeyError(f"'{word}' not in vocabulary") +KeyError: "'scheußlich' not in vocabulary" +srun: error: node37: task 0: Exited with exit code 1 +Loading seed words... +Loading model... +Retrieving embeddings... +Traceback (most recent call last): + File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 212, in <module> + target_dicts = get_embeddings(target_dicts, model) + File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 117, in get_embeddings + raise KeyError(f"'{word}' not in vocabulary") +KeyError: "'scheußlich' not in vocabulary" +srun: error: node37: task 0: Exited with exit code 1 +Loading seed words... +Loading model... +Retrieving embeddings... +Traceback (most recent call last): + File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 212, in <module> + target_dicts = get_embeddings(target_dicts, model) + File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 117, in get_embeddings + raise KeyError(f"'{word}' not in vocabulary") +KeyError: "'scheußlich' not in vocabulary" +srun: error: node37: task 0: Exited with exit code 1 +Loading seed words... +Loading model... +Retrieving embeddings... 
+Traceback (most recent call last): + File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 212, in <module> + target_dicts = get_embeddings(target_dicts, model) + File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 117, in get_embeddings + raise KeyError(f"'{word}' not in vocabulary") +KeyError: "'scheußlich' not in vocabulary" +srun: error: node37: task 0: Exited with exit code 1 +Loading seed words... +Loading model... +Retrieving embeddings... +Traceback (most recent call last): + File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 212, in <module> + target_dicts = get_embeddings(target_dicts, model) + File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 117, in get_embeddings + raise KeyError(f"'{word}' not in vocabulary") +KeyError: "'scheußlich' not in vocabulary" +srun: error: node37: task 0: Exited with exit code 1 +Loading seed words... +Loading model... +Retrieving embeddings... +Traceback (most recent call last): + File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 212, in <module> + target_dicts = get_embeddings(target_dicts, model) + File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 117, in get_embeddings + raise KeyError(f"'{word}' not in vocabulary") +KeyError: "'scheußlich' not in vocabulary" +srun: error: node37: task 0: Exited with exit code 1 diff --git a/WEAT/weat_output_2.txt b/WEAT/weat_output_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..850e307c2b70409da4e713e5544efd0332a479b4 --- /dev/null +++ b/WEAT/weat_output_2.txt @@ -0,0 +1,96 @@ +Loading seed words... +Loading model... +Retrieving embeddings... +Calculating WEAT... +WEAT scores for: kurpicz test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/glove/dd-glove/vectors_no_debiasing.txt +Cohen's d: 1.7889, p-value: 0.0000 +----------------------------------------------- + +Loading seed words... +Loading model... +Retrieving embeddings... +Calculating WEAT... +WEAT scores for: turkish test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/glove/dd-glove/vectors_no_debiasing.txt +Cohen's d: 1.8321, p-value: 0.0000 +----------------------------------------------- + +Loading seed words... +Loading model... +Retrieving embeddings... +Calculating WEAT... +WEAT scores for: italian test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/glove/dd-glove/vectors_no_debiasing.txt +Cohen's d: 1.4650, p-value: 0.0004 +----------------------------------------------- + +Loading seed words... +Loading model... +Retrieving embeddings... +Calculating WEAT... +WEAT scores for: turkish test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/glove/dd-glove/glove_dd_turkish.txt +Cohen's d: 1.8209, p-value: 0.0000 +----------------------------------------------- + +Loading seed words... +Loading model... +Retrieving embeddings... +Calculating WEAT... +WEAT scores for: polish test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/glove/dd-glove/glove_dd_polish.txt +Cohen's d: 1.5567, p-value: 0.0000 +----------------------------------------------- + +Loading seed words... +Loading model... +Retrieving embeddings... +Calculating WEAT... 
+WEAT scores for: italian test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/glove/dd-glove/glove_dd_italian.txt +Cohen's d: 1.3057, p-value: 0.0001 +----------------------------------------------- + +Loading seed words... +Loading model... +Retrieving embeddings... +Calculating WEAT... +WEAT scores for: polish test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/glove/dd-glove/glove_dd_turkish.txt +Cohen's d: 1.6582, p-value: 0.0000 +----------------------------------------------- + +Loading seed words... +Loading model... +Retrieving embeddings... +Calculating WEAT... +WEAT scores for: italian test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/glove/dd-glove/glove_dd_turkish.txt +Cohen's d: 1.4522, p-value: 0.0000 +----------------------------------------------- + +Loading seed words... +Loading model... +Retrieving embeddings... +Calculating WEAT... +WEAT scores for: turkish test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/glove/dd-glove/glove_dd_polish.txt +Cohen's d: 1.7717, p-value: 0.0000 +----------------------------------------------- + +Loading seed words... +Loading model... +Retrieving embeddings... +Calculating WEAT... +WEAT scores for: italian test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/glove/dd-glove/glove_dd_polish.txt +Cohen's d: 1.3303, p-value: 0.0001 +----------------------------------------------- + +Loading seed words... +Loading model... +Retrieving embeddings... +Calculating WEAT... +WEAT scores for: turkish test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/glove/dd-glove/glove_dd_italian.txt +Cohen's d: 1.7891, p-value: 0.0000 +----------------------------------------------- + +Loading seed words... +Loading model... +Retrieving embeddings... +Calculating WEAT... +WEAT scores for: polish test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/glove/dd-glove/glove_dd_italian.txt +Cohen's d: 1.7005, p-value: 0.0000 +----------------------------------------------- + diff --git a/WEAT/weat_output_3.txt b/WEAT/weat_output_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..910f0b652785e75f3142cf78308f102e8380f815 --- /dev/null +++ b/WEAT/weat_output_3.txt @@ -0,0 +1,39 @@ +Loading seed words... +Loading model... +Retrieving embeddings... +Calculating WEAT... +WEAT scores for: polish test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/glove/dd-glove/glove_hard_debiased_turkish_w2vformat.txt +Cohen's d: 1.5315, p-value: 0.0003 +----------------------------------------------- + +Loading seed words... +Loading model... +Retrieving embeddings... +Calculating WEAT... +WEAT scores for: italian test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/glove/dd-glove/glove_hard_debiased_turkish_w2vformat.txt +Cohen's d: 1.1688, p-value: 0.0115 +----------------------------------------------- + +Loading seed words... +Loading model... +Retrieving embeddings... +Calculating WEAT... 
+WEAT scores for: turkish test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/glove/dd-glove/glove_hard_debiased_polish_w2vformat.txt +Cohen's d: 1.7833, p-value: 0.0000 +----------------------------------------------- + + File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 241 + print("-----------------------------------------------) + ^ +SyntaxError: EOL while scanning string literal +srun: error: node37: task 0: Exited with exit code 1 + File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 241 + print("-----------------------------------------------) + ^ +SyntaxError: EOL while scanning string literal +srun: error: node37: task 0: Exited with exit code 1 + File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 241 + print("-----------------------------------------------) + ^ +SyntaxError: EOL while scanning string literal +srun: error: node37: task 0: Exited with exit code 1 diff --git a/WEAT/weat_output_4.txt b/WEAT/weat_output_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..89de5f14a8922bc3747b8dc8b1e095799c05b7ef --- /dev/null +++ b/WEAT/weat_output_4.txt @@ -0,0 +1,21 @@ +Loading seed words... +Loading model... +Retrieving embeddings... +Calculating WEAT... +WEAT scores for: italian test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/glove/dd-glove/glove_hard_debiased_polish_w2vformat.txt +Cohen's d: 1.2579, p-value: 0.0046 +----------------------------------------------- +Loading seed words... +Loading model... +Retrieving embeddings... +Calculating WEAT... +WEAT scores for: turkish test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/glove/dd-glove/glove_hard_debiased_italian_w2vformat.txt +Cohen's d: 1.7934, p-value: 0.0000 +----------------------------------------------- +Loading seed words... +Loading model... +Retrieving embeddings... +Calculating WEAT... +WEAT scores for: polish test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/glove/dd-glove/glove_hard_debiased_italian_w2vformat.txt +Cohen's d: 1.6358, p-value: 0.0000 +-----------------------------------------------