Commit cd6b0fa5 authored by Aileen Reichelt's avatar Aileen Reichelt
Browse files

Restore WEAT files

parent 34e7918a
Loading
Loading
Loading
Loading

WEAT/run_weat.sh

0 → 100644
+17 −0
Original line number Diff line number Diff line
#!/bin/bash
#
#SBATCH --job-name=weat
#SBATCH --output=weat_output_4.txt
#SBATCH --mem=32G
#SBATCH --partition=compute
#SBATCH --cpus-per-task=32
#SBATCH --mail-user=reichelt@cl.uni-heidelberg.de
#SBATCH --mail-type=ALL
#SBATCH --time=3-00:00:00

# JOB STEPS
# Shared path prefixes, so each srun line only spells out what differs.
PROJECT_DIR=/home/students/reichelt/ba/bias-mitigation-ba
WEAT_SCRIPT="${PROJECT_DIR}/WEAT/weat_experiments.py"
GLOVE_DIR="${PROJECT_DIR}/data/embeddings/glove/dd-glove"

# Activate the project's virtual environment before running any step.
source "${PROJECT_DIR}/bias-venv/bin/activate"

# Each step runs one attribute list against an embedding file whose name
# refers to a different nationality than the attribute being tested.
srun python "${WEAT_SCRIPT}" --attribute italian --vector_location "${GLOVE_DIR}/glove_hard_debiased_polish_w2vformat.txt"
srun python "${WEAT_SCRIPT}" --attribute turkish --vector_location "${GLOVE_DIR}/glove_hard_debiased_italian_w2vformat.txt"
srun python "${WEAT_SCRIPT}" --attribute polish --vector_location "${GLOVE_DIR}/glove_hard_debiased_italian_w2vformat.txt"
+1164 −0

File added.

Preview size limit exceeded, changes collapsed.

+242 −0
Original line number Diff line number Diff line
"""
A .py version of the WEAT experiments notebook, created so that the
script can be run via SBATCH with various parameters. Limited to
German options and, unlike the notebook, does not include W2V.

Parallelizes the permutation test.

Checks bias in various pre-trained embeddings using the WEAT. Different
embeddings and attribute lists can be chosen.
"""
import argparse
from itertools import combinations, islice
from sklearn.metrics.pairwise import cosine_similarity as cosine
import numpy as np
from gensim.models import KeyedVectors
from concurrent.futures import ProcessPoolExecutor


def get_target_words(version: str) -> dict:
    """
    Build the pleasant/unpleasant target word sets.

    For version "reproduction" the WEAT 6 target words from Caliskan et al
    2017 are used; for any other value the German target words as per
    Kurpicz-Briki 2020 (WEAT 5) are used.

    Returns a dict with the keys "pleasant" and "unpleasant"; each maps
    every word to its own fresh empty list (a placeholder value).
    """
    if version == "reproduction":
        # aka career
        pleasant_words = (
            "executive", "management", "professional", "corporation",
            "salary", "office", "business", "career",
        )
        # aka family
        unpleasant_words = (
            "home", "parents", "children", "family", "cousins",
            "marriage", "wedding", "relatives",
        )
    else:
        pleasant_words = (
            "spaß", "liebe", "frieden", "wunderbar", "freude", "lachen",
            "glück",
        )
        unpleasant_words = (
            "qual", "furchtbar", "schrecklich", "übel", "böse", "krieg",
            "grausam", "versagen",
        )

    # A comprehension (not dict.fromkeys) so no two words share one list.
    return {
        "pleasant": {word: [] for word in pleasant_words},
        "unpleasant": {word: [] for word in unpleasant_words},
    }


def get_attribute_words(version: str) -> dict:
    """
    Build the "german" and "foreign" name sets for the requested version.

    "kurpicz" loads the name lists by Kurpicz-Briki, "reproduction" loads
    English male/female names (stored under the same two keys), and
    "turkish", "polish" or "italian" pair a common German name list with
    names from that country. Any other value raises ValueError.

    Every name maps to its own fresh empty list (a placeholder value).
    """
    def as_seed_dict(names):
        return {name: [] for name in names}

    if version == "kurpicz":
        german_names = [
            "Peter", "Daniel", "Hans", "Thomas", "Andreas", "Martin",
            "Markus", "Michael", "Maria", "Anna", "Ursula", "Ruth",
            "Monika", "Elisabeth", "Verena", "Sandra",
        ]
        foreign_names = [
            "Ladina", "Fatima", "Fatma", "Alma", "Soraya", "Svetlana",
            "Elif", "Vesna", "Mehmet", "Mustafa", "Aleksandar", "Mohamed",
            "Ibrahim", "Dragan", "Hasan", "Mohammad",
        ]
    elif version == "reproduction":
        # aka male
        german_names = [
            "John", "Paul", "Mike", "Kevin", "Steve", "Greg", "Jeff", "Bill",
        ]
        # aka female
        foreign_names = [
            "Amy", "Joan", "Lisa", "Sarah", "Diana", "Kate", "Ann", "Donna",
        ]
    else:
        foreign_by_country = {
            "turkish": [
                "Esra", "Merve", "Fatma", "Sibel", "Elif", "Ayşe", "Emine",
                "Özlem", "Zeynep", "Hatice", "Dilek", "Ebru", "Pınar",
                "Hülya", "Derya", "Mustafa", "Murat", "Ahmet", "Kemal",
                "Orhan", "Hüseyin", "Bülent", "Metin", "Ömer", "Emre",
                "Halil", "Erkan", "Uğur", "Burak", "Volkan",
            ],
            "polish": [
                "Magdalena", "Ewa", "Zofia", "Beata", "Katarzyna",
                "Krystyna", "Małgorzata", "Jadwiga", "Danuta", "Elżbieta",
                "Urszula", "Alicja", "Aneta", "Iwona", "Edyta", "Andrzej",
                "Stanisław", "Marek", "Józef", "Henryk", "Krzysztof",
                "Władysław", "Tadeusz", "Piotr", "Janusz", "Tomasz",
                "Wojciech", "Jakub", "Marcin", "Franciszek",
            ],
            "italian": [
                "Caterina", "Francesca", "Paola", "Giulia", "Chiara",
                "Giovanna", "Alessandra", "Gioia", "Antonella",
                "Giuseppina", "Azzurra", "Antonietta", "Ambra", "Alessia",
                "Giorgia", "Giovanni", "Carlo", "Francesco", "Giuseppe",
                "Pietro", "Luigi", "Paolo", "Alessandro", "Angelo",
                "Giorgio", "Domenico", "Enrico", "Stefano", "Vincenzo",
                "Matteo",
            ],
        }
        foreign_names = foreign_by_country.get(version)
        if foreign_names is None:
            raise ValueError("Invalid version specified. See --help")

        german_names = [
            "Katharina", "Susanne", "Karin", "Ulrike", "Renate", "Birgit",
            "Bettina", "Jutta", "Ute", "Cornelia", "Katja", "Heike",
            "Stefanie", "Kerstin", "Tanja", "Hans", "Carl", "Wolfgang",
            "Andreas", "Werner", "Christoph", "Klaus", "Philipp", "Joachim",
            "Jürgen", "Dieter", "Matthias", "Manfred", "Sebastian", "Rainer",
        ]

    return {
        "german": as_seed_dict(german_names),
        "foreign": as_seed_dict(foreign_names),
    }


def get_embeddings(lookup_dict: dict, embeddings) -> dict:
    """Go through nested seed dicts and look up the embedding for each word.

    Args:
        lookup_dict: Mapping of category name to a dict of seed words; the
            value stored for each word is overwritten in place.
        embeddings: Any object supporting ``in`` and ``[]`` with the
            lower-cased word as key.

    Returns:
        The same ``lookup_dict`` object, with every seed word mapped to its
        embedding.

    Raises:
        KeyError: If the lower-cased form of a seed word is not in
            ``embeddings``.
    """
    for seeds in lookup_dict.values():
        # Only values of existing keys are replaced, so the dict size never
        # changes while it is being iterated.
        for word in seeds:
            # The vocabulary is queried with the lower-cased form only.
            key = word.lower()
            if key not in embeddings:
                raise KeyError(f"'{word}' not in vocabulary")
            seeds[word] = embeddings[key]
    return lookup_dict


def attribute_association_s(word_vector, target_set1, target_set2):
    """Return the association of one word vector with two vector sets.

    The result is the mean cosine similarity between ``word_vector`` and the
    values of ``target_set1`` minus the mean cosine similarity between
    ``word_vector`` and the values of ``target_set2``.
    """
    # cosine() is called on 2-D inputs, so reshape to a single row.
    query = np.array(word_vector).reshape(1, -1)

    def mean_similarity(vector_set):
        scores = []
        for vec in vector_set.values():
            scores.append(cosine(query, np.array(vec).reshape(1, -1)))
        return np.mean(scores)

    return mean_similarity(target_set1) - mean_similarity(target_set2)


def differential_association_s(attr1, attr2, target1, target2):
    """Return the WEAT test statistic for two groups of word vectors.

    Sums ``attribute_association_s`` (against ``target1``/``target2``) over
    the vectors in ``attr1`` and subtracts the same sum over ``attr2``.
    """
    def summed_association(group):
        return sum(
            attribute_association_s(vec, target1, target2)
            for vec in group.values()
        )

    first_total = summed_association(attr1)
    second_total = summed_association(attr2)
    return first_total - second_total


def cohens_d_calc(target1, target2, attr1, attr2):
    """Return the effect size between two groups of word vectors.

    Computes the difference between the mean association of ``target1`` and
    the mean association of ``target2`` (both measured against ``attr1`` and
    ``attr2``), divided by ``np.std`` of the associations of all vectors
    from both groups taken together.
    """
    associations1 = [
        attribute_association_s(vec, attr1, attr2) for vec in target1.values()
    ]
    associations2 = [
        attribute_association_s(vec, attr1, attr2) for vec in target2.values()
    ]
    # The joint list is target1's associations followed by target2's.
    spread = np.std(associations1 + associations2)
    mean_gap = np.mean(associations1) - np.mean(associations2)
    return mean_gap / spread


def permutations(target1, target2, max_permutations=100000):
    """Split the joined keys of two dicts into pairs of complementary groups.

    If the number of possible splits does not exceed ``max_permutations``,
    every split is enumerated in ``itertools.combinations`` order. Otherwise
    ``max_permutations`` splits are drawn at random. Simply truncating the
    enumeration would keep only the lexicographically first splits, which
    nearly all share the same leading members and therefore do not
    represent the space of possible splits.

    Random draws use NumPy's global RNG, so ``np.random.seed`` controls
    reproducibility.

    Args:
        target1: First dict; only its keys are used.
        target2: Second dict; only its keys are used.
        max_permutations: Upper bound on the number of splits returned.

    Returns:
        A tuple ``(first_groups, second_groups)`` of equally long lists.
        ``first_groups[i]`` is a tuple holding half of the joined keys
        (rounded down) and ``second_groups[i]`` is a list of the remaining
        keys, both in their original order.
    """
    join = list(target1.keys()) + list(target2.keys())
    half = len(join) // 2

    # Peek one element past the cap to find out whether the full enumeration
    # fits, without ever materialising more than max_permutations + 1 items.
    combs = list(islice(combinations(join, half), max_permutations + 1))
    if len(combs) > max_permutations:
        combs = []
        for _ in range(max_permutations):
            # Sorted indices keep the members in their original order, the
            # same shape the enumerated combinations have.
            chosen = np.sort(np.random.choice(len(join), half, replace=False))
            combs.append(tuple(join[i] for i in chosen))

    first_groups = []
    second_groups = []
    for c in combs:
        members = set(c)  # O(1) membership instead of scanning the tuple
        first_groups.append(c)
        second_groups.append([e for e in join if e not in members])
    return first_groups, second_groups


def p_value_calc_worker(args):
    """Check whether one permuted split beats the observed statistic.

    ``args`` is a 5-tuple ``(X_subset, Y_subset, comparison, attr1, attr2)``
    packed into a single argument so it can be passed through
    ``executor.map``. Returns whether the differential association of the
    two subsets is strictly greater than ``comparison``.
    """
    x_group, y_group, threshold, attr1, attr2 = args
    statistic = differential_association_s(x_group, y_group, attr1, attr2)
    return statistic > threshold

def p_value_calc(comparison, X_perms, Y_perms, target1, target2, attr1, attr2):
    """Count the permuted splits whose statistic exceeds ``comparison``.

    Args:
        comparison: Test statistic of the original, unpermuted grouping.
        X_perms: Sequence of key collections forming the first group of
            each permuted split.
        Y_perms: Sequence of key collections forming the matching second
            group of each permuted split.
        target1: Dict mapping the keys of the first original group to
            vectors.
        target2: Dict mapping the keys of the second original group to
            vectors.
        attr1: First vector set the groups are compared against.
        attr2: Second vector set the groups are compared against.

    Returns:
        The number of splits for which ``p_value_calc_worker`` returned a
        true value (a count, not yet a probability).
    """
    import os

    joint_dict = {**target1, **target2}
    args_list = [
        (
            {key: joint_dict[key] for key in x_keys},
            {key: joint_dict[key] for key in y_keys},
            comparison,
            attr1,
            attr2,
        )
        for x_keys, y_keys in zip(X_perms, Y_perms)
    ]
    if not args_list:
        return 0

    # Each task is tiny, so submitting them one by one (the default
    # chunksize of 1) is dominated by inter-process overhead. Hand them out
    # in batches, aiming for a few batches per CPU to keep the load even.
    chunksize = max(1, len(args_list) // ((os.cpu_count() or 1) * 4))
    with ProcessPoolExecutor() as executor:
        results = executor.map(p_value_calc_worker, args_list, chunksize=chunksize)
        return sum(results)

def calculate_WEAT(target_data: dict, attribute_data: dict) -> tuple:
    """Compute the WEAT effect size and permutation-test p-value.

    Args:
        target_data: Dict with the keys "pleasant" and "unpleasant", each
            mapping words to their vectors.
        attribute_data: Dict with the keys "german" and "foreign", each
            mapping names to their vectors.

    Returns:
        A tuple ``(d, p)``: the effect size from ``cohens_d_calc`` and the
        fraction of permuted splits whose test statistic is greater than
        that of the original grouping.
    """
    X = attribute_data["german"]
    Y = attribute_data["foreign"]
    A = target_data["pleasant"]
    B = target_data["unpleasant"]

    original_diff_association = differential_association_s(X, Y, A, B)
    d = cohens_d_calc(X, Y, A, B)
    X_i, Y_i = permutations(X, Y)
    p_value_count = p_value_calc(original_diff_association, X_i, Y_i, X, Y, A, B)
    # Normalise by the number of splits actually evaluated. Small name sets
    # produce fewer than 100000 splits (8 + 8 names give only 12870), so a
    # fixed divisor of 100000 would understate the p-value.
    p = p_value_count / len(X_i)

    return d, p



if __name__ == "__main__":
    # Fix the state of NumPy's global RNG so repeated runs are comparable.
    np.random.seed(42)

    parser = argparse.ArgumentParser(
        description="Calculate WEAT score for given attributes and vectors")
    # Both options are needed by the steps below, so let argparse reject a
    # missing one up front with a usage message.
    parser.add_argument(
        "--attribute", required=True,
        help="'kurpicz', 'turkish', 'polish', 'italian', 'reproduction'")
    parser.add_argument(
        "--vector_location", required=True,
        help="specify a file path to embeddings")
    args = parser.parse_args()

    print("Loading seed words...")
    target_dicts = get_target_words(args.attribute)
    attribute_dicts = get_attribute_words(args.attribute)

    print("Loading model...")
    # Embeddings are read in the textual word2vec format.
    model = KeyedVectors.load_word2vec_format(args.vector_location, binary=False)

    print("Retrieving embeddings...")
    target_dicts = get_embeddings(target_dicts, model)
    attribute_dicts = get_embeddings(attribute_dicts, model)

    print("Calculating WEAT...")
    cohens_d, p_value = calculate_WEAT(target_dicts, attribute_dicts)

    print(f"WEAT scores for: {args.attribute} test, vectors from {args.vector_location}")
    print(f"Cohen's d: {cohens_d:.4f}, p-value: {p_value:.4f}")
    print("-----------------------------------------------")

WEAT/weat_output.txt

0 → 100644
+226 −0
Original line number Diff line number Diff line
Loading seed words...
Loading model...
Retrieving embeddings...
Calculating WEAT...
WEAT scores for: turkish test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/fasttext/wiki.de.vec
Cohen's d: 1.3571, p-value: 0.0109
-----------------------------------------------

Loading seed words...
Loading model...
Retrieving embeddings...
Calculating WEAT...
WEAT scores for: polish test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/fasttext/wiki.de.vec
Cohen's d: 0.2829, p-value: 0.5185
-----------------------------------------------

Loading seed words...
Loading model...
Retrieving embeddings...
Calculating WEAT...
WEAT scores for: italian test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/fasttext/wiki.de.vec
Cohen's d: 1.0331, p-value: 0.1082
-----------------------------------------------

Loading seed words...
Loading model...
Retrieving embeddings...
Calculating WEAT...
WEAT scores for: turkish test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/fasttext/fasttext_hard_debiased_turkish_w2vformat.txt
Cohen's d: 1.1332, p-value: 0.0573
-----------------------------------------------

Loading seed words...
Loading model...
Retrieving embeddings...
Calculating WEAT...
WEAT scores for: polish test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/fasttext/fasttext_hard_debiased_polish_w2vformat.txt
Cohen's d: 0.1786, p-value: 0.5699
-----------------------------------------------

Loading seed words...
Loading model...
Retrieving embeddings...
Calculating WEAT...
WEAT scores for: italian test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/fasttext/fasttext_hard_debiased_italian_w2vformat.txt
Cohen's d: 0.5896, p-value: 0.3510
-----------------------------------------------

Loading seed words...
Loading model...
Retrieving embeddings...
Traceback (most recent call last):
  File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 212, in <module>
    target_dicts = get_embeddings(target_dicts, model)
  File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 117, in get_embeddings
    raise KeyError(f"'{word}' not in vocabulary")
KeyError: "'scheußlich' not in vocabulary"
srun: error: node37: task 0: Exited with exit code 1
Loading seed words...
Loading model...
Retrieving embeddings...
Traceback (most recent call last):
  File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 212, in <module>
    target_dicts = get_embeddings(target_dicts, model)
  File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 117, in get_embeddings
    raise KeyError(f"'{word}' not in vocabulary")
KeyError: "'scheußlich' not in vocabulary"
srun: error: node37: task 0: Exited with exit code 1
Loading seed words...
Loading model...
Retrieving embeddings...
Traceback (most recent call last):
  File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 212, in <module>
    target_dicts = get_embeddings(target_dicts, model)
  File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 117, in get_embeddings
    raise KeyError(f"'{word}' not in vocabulary")
KeyError: "'scheußlich' not in vocabulary"
srun: error: node37: task 0: Exited with exit code 1
Loading seed words...
Loading model...
Retrieving embeddings...
Traceback (most recent call last):
  File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 212, in <module>
    target_dicts = get_embeddings(target_dicts, model)
  File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 117, in get_embeddings
    raise KeyError(f"'{word}' not in vocabulary")
KeyError: "'scheußlich' not in vocabulary"
srun: error: node37: task 0: Exited with exit code 1
Loading seed words...
Loading model...
Retrieving embeddings...
Traceback (most recent call last):
  File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 212, in <module>
    target_dicts = get_embeddings(target_dicts, model)
  File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 117, in get_embeddings
    raise KeyError(f"'{word}' not in vocabulary")
KeyError: "'scheußlich' not in vocabulary"
srun: error: node37: task 0: Exited with exit code 1
Loading seed words...
Loading model...
Retrieving embeddings...
Traceback (most recent call last):
  File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 212, in <module>
    target_dicts = get_embeddings(target_dicts, model)
  File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 117, in get_embeddings
    raise KeyError(f"'{word}' not in vocabulary")
KeyError: "'scheußlich' not in vocabulary"
srun: error: node37: task 0: Exited with exit code 1
Loading seed words...
Loading model...
Retrieving embeddings...
Traceback (most recent call last):
  File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 212, in <module>
    target_dicts = get_embeddings(target_dicts, model)
  File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 117, in get_embeddings
    raise KeyError(f"'{word}' not in vocabulary")
KeyError: "'scheußlich' not in vocabulary"
srun: error: node37: task 0: Exited with exit code 1
Loading seed words...
Loading model...
Retrieving embeddings...
Calculating WEAT...
WEAT scores for: polish test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/fasttext/fasttext_hard_debiased_turkish_w2vformat.txt
Cohen's d: 0.2123, p-value: 0.5639
-----------------------------------------------

Loading seed words...
Loading model...
Retrieving embeddings...
Calculating WEAT...
WEAT scores for: italian test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/fasttext/fasttext_hard_debiased_turkish_w2vformat.txt
Cohen's d: 0.9480, p-value: 0.1469
-----------------------------------------------

Loading seed words...
Loading model...
Retrieving embeddings...
Calculating WEAT...
WEAT scores for: turkish test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/fasttext/fasttext_hard_debiased_polish_w2vformat.txt
Cohen's d: 1.3612, p-value: 0.0103
-----------------------------------------------

Loading seed words...
Loading model...
Retrieving embeddings...
Calculating WEAT...
WEAT scores for: italian test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/fasttext/fasttext_hard_debiased_polish_w2vformat.txt
Cohen's d: 1.0477, p-value: 0.1018
-----------------------------------------------

Loading seed words...
Loading model...
Retrieving embeddings...
Calculating WEAT...
WEAT scores for: turkish test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/fasttext/fasttext_hard_debiased_italian_w2vformat.txt
Cohen's d: 1.3299, p-value: 0.0139
-----------------------------------------------

Loading seed words...
Loading model...
Retrieving embeddings...
Calculating WEAT...
WEAT scores for: polish test, vectors from /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/fasttext/fasttext_hard_debiased_italian_w2vformat.txt
Cohen's d: 0.1901, p-value: 0.5747
-----------------------------------------------

Loading seed words...
Loading model...
Retrieving embeddings...
Traceback (most recent call last):
  File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 212, in <module>
    target_dicts = get_embeddings(target_dicts, model)
  File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 117, in get_embeddings
    raise KeyError(f"'{word}' not in vocabulary")
KeyError: "'scheußlich' not in vocabulary"
srun: error: node37: task 0: Exited with exit code 1
Loading seed words...
Loading model...
Retrieving embeddings...
Traceback (most recent call last):
  File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 212, in <module>
    target_dicts = get_embeddings(target_dicts, model)
  File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 117, in get_embeddings
    raise KeyError(f"'{word}' not in vocabulary")
KeyError: "'scheußlich' not in vocabulary"
srun: error: node37: task 0: Exited with exit code 1
Loading seed words...
Loading model...
Retrieving embeddings...
Traceback (most recent call last):
  File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 212, in <module>
    target_dicts = get_embeddings(target_dicts, model)
  File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 117, in get_embeddings
    raise KeyError(f"'{word}' not in vocabulary")
KeyError: "'scheußlich' not in vocabulary"
srun: error: node37: task 0: Exited with exit code 1
Loading seed words...
Loading model...
Retrieving embeddings...
Traceback (most recent call last):
  File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 212, in <module>
    target_dicts = get_embeddings(target_dicts, model)
  File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 117, in get_embeddings
    raise KeyError(f"'{word}' not in vocabulary")
KeyError: "'scheußlich' not in vocabulary"
srun: error: node37: task 0: Exited with exit code 1
Loading seed words...
Loading model...
Retrieving embeddings...
Traceback (most recent call last):
  File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 212, in <module>
    target_dicts = get_embeddings(target_dicts, model)
  File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 117, in get_embeddings
    raise KeyError(f"'{word}' not in vocabulary")
KeyError: "'scheußlich' not in vocabulary"
srun: error: node37: task 0: Exited with exit code 1
Loading seed words...
Loading model...
Retrieving embeddings...
Traceback (most recent call last):
  File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 212, in <module>
    target_dicts = get_embeddings(target_dicts, model)
  File "/home/students/reichelt/ba/bias-mitigation-ba/WEAT/weat_experiments.py", line 117, in get_embeddings
    raise KeyError(f"'{word}' not in vocabulary")
KeyError: "'scheußlich' not in vocabulary"
srun: error: node37: task 0: Exited with exit code 1

WEAT/weat_output_2.txt

0 → 100644
+96 −0

File added.

Preview size limit exceeded, changes collapsed.

Loading