diff --git a/debiaswe-master/.gitignore b/debiaswe-master/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..02724e2d3005f2e7e39b281aabc33cdd6f27ce9a --- /dev/null +++ b/debiaswe-master/.gitignore @@ -0,0 +1,94 @@ +# PROJECT SPECIFIC + + +# PYTHON RELATED + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*,cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# IPython Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# dotenv +.env + +# virtualenv +venv/ +ENV/ + +# Spyder project settings +.spyderproject + +# Rope project settings +.ropeproject diff --git a/debiaswe-master/LICENSE b/debiaswe-master/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..963fdd22206b067eec644ef7b2f679aff9b13c18 --- /dev/null +++ b/debiaswe-master/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2016 Tolga + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, 
and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/debiaswe-master/README.md b/debiaswe-master/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ce94c5fae6ef2805e551b8b93df669d021d0f7f5 --- /dev/null +++ b/debiaswe-master/README.md @@ -0,0 +1,37 @@ +# Debiaswe: try to make word embeddings less sexist + +🔴[FAT* 2018 tutorial slides](https://drive.google.com/file/d/1IxIdmreH4qVYnx68QVkqCC9-_yyksoxR/view?usp=sharing) + + +Here we have the code and data for the following paper: +[Man is to Computer Programmer as Woman is to +Homemaker? Debiasing Word Embeddings](http://papers.nips.cc/paper/6228-man-is-to-computer-programmer-as-woman-is-to-homemaker-debiasing-word-embeddings.pdf) by +Tolga Bolukbasi, Kai-Wei Chang, James Zou, Venkatesh Saligrama, and Adam Kalai. Proceedings of [NIPS 2016](https://papers.nips.cc/paper/6228-man-is-to-computer-programmer-as-woman-is-to-homemaker-debiasing-word-embeddings). 
+ +**Just looking to download a debiased embedding?** + +You can download the [binary](https://drive.google.com/file/d/0B5vZVlu2WoS5ZTBSekpUX0RSNDg/view?usp=sharing&resourcekey=0-qO1UY06KB42G1T6IeJ2XCQ)/[txt](https://drive.google.com/file/d/1_PvT4ZvtZjhq4HPywA8-u06epht9ccOw/view?usp=sharing) hard-debiased version of Google's Word2Vec embedding trained on Google News (Origin: GoogleNews-vectors-negative300.bin.gz found [here](https://code.google.com/archive/p/word2vec/)). + +**Python scripts:** +- **learn_gender_specific.py**: given a word embedding and a seed set of gender-specific words (like <i>king</i>, <i>she</i>, etc.), it learns a much larger list of gender-specific words +- **debias.py**: given a word embedding, sets of gender-pairs, gender-specific words, and pairs to equalize, it outputs a new word embedding. This version basically reads/writes the word2vec binary file format. + +``` +python learn_gender_specific.py ../embeddings/GoogleNews-vectors-negative300.bin 50000 ../data/gender_specific_seed.json gender_specific_full.json +``` + +``` +python debias.py ../embeddings/GoogleNews-vectors-negative300.bin ../data/definitional_pairs.json ../data/gender_specific_full.json ../data/equalize_pairs.json ../embeddings/GoogleNews-vectors-negative300-hard-debiased.bin +``` + + +We also have seed data used to debias and crowd data used to evaluate the embeddings. + +**Data files:** +- **gender_specific_seed.json**: A list of 218 gender-specific words +- **gender_specific_full.json**: A list of 1441 gender-specific words +- **definitional_pairs.json**: The ten pairs of words we use to define the gender direction +- **equalize_pairs.json**: Some crowdsourced F-M pairs of words that represent gender direction + + +(All external files that I refer to within this repo can be found in [this folder](https://drive.google.com/drive/folders/0B5vZVlu2WoS5dkRFY19YUXVIU2M?resourcekey=0-rZ1HR4Fb0XCi4HFUERGhRA&usp=sharing).) 
diff --git a/debiaswe-master/data/general_origin/bias_specific_full.json b/debiaswe-master/data/general_origin/bias_specific_full.json new file mode 100644 index 0000000000000000000000000000000000000000..21d96db29b05499cc36ff495716a8e4d22f32f53 --- /dev/null +++ b/debiaswe-master/data/general_origin/bias_specific_full.json @@ -0,0 +1 @@ +["germane", "ostgeld", "focaccia", "ostalgie", "volksgenosse", "hetman", "auslandsdeutscher", "sinto", "lech", "auslandsgeschäft", "bambino", "reichsbahn", "engadin", "schilling", "grundgesetz", "prosecco", "kleindeutsch", "aventiure", "europide", "flüchtlingsausweis", "weser", "völkerwanderung", "azzurri", "landammann", "trecento", "deutschfeindlichkeit", "polnisch", "baron", "mitteldeutsch", "bundesminister", "germanisch", "itaker", "groschen", "quempaslied", "flüchtlingshilfe", "baden-württemberg", "thai", "zuwanderin", "edeling", "italienisch", "ausländerhass", "confoederatio helvetica", "germanisieren", "vaudeville", "italowestern", "mittelhochdeutsch", "schwarz-rot-gold", "westmitteldeutsch", "tamtam", "janitscharenmusik", "öterreichisch-ungarisch", "weichsel", "germanentum", "jungdeutscher", "plattdeutsch", "grappa", "exarch", "abate", "carabiniere", "bairisch", "alldeutsch", "quart", "sultan", "ramasan", "liechtenstein", "sachsen-anhalt", "settecento", "greyerzer", "reichsdeutsch", "urdeutsch", "bundesrepublikanisch", "thurgau", "germanist", "labiovelar", "kampanile", "ostmitteldeutsch", "frühneuhochdeutsch", "ostverträge", "geschäftsträger", "hochlautung", "reinheitsgebot", "wallis", "signore", "brandenburg", "nazarener", "sbrinz", "carnotzet", "verhochdeutschen", "cinquecento", "großglockner", "aargau", "einwanderungsstrom", "beg", "kastell", "asylsuchende", "panje", "spartakiade", "veltliner", "pizzaservice", "wittum", "helvetien", "nibelungen", "papagallo", "amerikahaus", "romand", "ausländisch", "boatpeople", "neudeutsch", "zibetkatze", "rom", "bundesstadt", "schweizweit", "mora", "signor", "chianti", "bundesstraße", 
"asylsuchender", "indogermane", "kenning", "sejm", "stadtammann", "wessi", "normanne", "ostdeutschland", "volksdeutscher", "rugier", "sütterlinschrift", "secondo", "gambir", "hochdeutsch", "bel paese", "frühneuhochdeutsch", "bundesliga", "devisenbewirtschaftung", "signorina", "zweigelt", "toskana", "auswärtig", "bundesadler", "deutschtum", "khan", "hodscha", "hinterindien", "franken", "deutschlandchef", "helvetier", "afroasiatisch", "bundesbürger", "eindeutschung", "oberdeutsch", "deutschtümelei", "deutschsprachlich", "radicchio", "missingsch", "ostblock", "himalaja", "puzzolan", "taverne", "sundainseln", "außerdeutsch", "signoria", "cinzano", "wirtschaftsflüchtling", "markomanne", "austrofaschismus", "swissair", "ddr-bürger", "deutsche", "migrationshintergrund", "asylbewerberin", "deutschamerikanisch", "levante", "piefke", "padre", "theatiner", "westdeutschland", "zimber", "stasimitarbeiter", "deutschnational", "karawane", "norddeutsch", "russland", "spumante", "kosovokrieg", "italienisch", "italienerin", "mittelhochdeutsch", "bundeskanzlei", "gastarbeiter", "freiheitlich", "deutsch-schweizerisch", "italienreise", "flüchtlingselend", "ostler", "landesstraße", "sachsen", "verfassungsgerichtshof", "trentino-südtirol", "freisinn", "sprachgesellschaft", "ausländerfeindlich", "kurfürst", "deutschtürke", "schwäbeln", "kirchenstaat", "kebab", "staatsgerichtshof", "oberlandesgericht", "deutschkunde", "ostmitteldeutsch", "bergama", "cajunmusic", "spieloper", "rheinfall", "auffangen", "auslieferungsantrag", "spatha", "ausführen", "magnat", "polonistik", "undeutsch", "westfernsehen", "uri", "indoeuropäisch", "donnerer", "odal", "indogermanisch", "schufa", "dw", "po", "schleswig-holstein", "unter", "humanismus", "piazza", "ausländerkind", "westen", "kentumsprache", "reisefreiheit", "tessin", "greencard", "stasiakte", "ard", "einwandererkind", "pirogge", "lambrusco", "zimbal", "cavaliere", "mark", "raki", "aussiedler", "flüchtlingslager", "ufa", "deutschschweizerisch", "jot", 
"gorgonzola", "kanake", "u-häkchen", "fichtelberg", "südtirol", "asti", "burgunde", "ost-west-dialog", "centime", "tiber", "jul", "hartz", "quattrocento", "alphorn", "bundesverdienstkreuz", "pasta", "madrigal", "kelte", "deutschschweizer", "displaced person", "zentralasien", "handballbundesliga", "rentenmark", "althochdeutsch", "fremdenpolizei", "tarantella", "frühmittelhochdeutsch", "sibirien", "urgermanisch", "migrantenkind", "schwyzertütsch", "arte povera", "gastarbeiterin", "weißherbst", "bundespräsident", "öterreichweit", "ural", "pidginenglisch", "uckermark", "immigrantin", "ger", "scudo", "spätaussiedler", "lastenausgleichsgesetz", "kulturinstitut", "welschschweizer", "menschenhandel", "austriazismus", "bundeshaus", "italer", "mitternachtssonne", "deutschlandfunk", "welsche", "latsche", "weißbuch", "hilfswillige", "schweizerdeutsch", "kufe", "kolonialherrschaft", "bundesdeutsche", "mitteldeutschland", "stracciatella", "frühmittelhochdeutsch", "vielfraß", "bundesdeutscher", "kaspisches meer", "kanton", "deutschtürkisch", "pfingstochse", "teutonin", "dihk", "amerikadeutsche", "stasiunterlagen", "deutsch", "ausländerpolitik", "niedersachsen", "ausfuhrgarantie", "harz", "lateinisch", "palazzo", "futhark", "schriftdeutsch", "wien", "pan", "weißwurstäquator", "pazifischer ozean", "basso", "welschland", "stabreim", "bundesministerium", "quarta", "schwyzerdütsch", "lasagne", "aga", "karelien", "polnisch", "russlanddeutsch", "kurrentschrift", "canzone", "oberdeutsch", "härtefallkommission", "nachkriegsschweiz", "levantiner", "faschismus", "pole", "angelsachse", "ararat", "reichstag", "verismo", "börde", "paying guest", "balsamessig", "schwabenspiegel", "westmitteldeutsch", "landesversicherungsanstalt", "illyrer", "pagode", "treuhandanstalt", "doktorhut", "fußballbundesliga", "italoamerikaner", "kalabrien", "arbeitsemigrant", "deutschlehrer", "arier", "bajazzo", "kabinett", "lufthansa", "mikrozensus", "verrechnungseinheit", "hanswurst", "sezession", "schlepper", 
"aufenthaltsgenehmigung", "deutschjüdisch", "einwandererstrom", "außenwert", "misereor", "bundesgartenschau", "bezirkstag", "alpenrepublik", "zwangsumtausch", "auslandsdeutsch", "teehaus", "panasiatisch", "einwanderin", "westdeutscher", "duce", "konsularkorps", "italianisieren", "siamkatze", "auslese", "isolationismus", "expedition", "zav", "einreisen", "turkisieren", "ostöterreich", "anwerbestopp", "waadt", "ausländerin", "franzöisch-deutsch", "vorderasien", "administrator", "stradivari", "welschschweizerisch", "ostgermane", "kolonie", "bundesbetreuung", "gefolgschaft", "bundeshaushalt", "sizilien", "vendetta", "botschafter", "hermesbürgschaft", "nidwalden", "zahlungsbilanz", "apo", "generaloberst", "altnordisch", "jura", "ostasien", "pandschabi", "volksdeutsche", "einwanderer", "saarland", "effendi", "deutschlandlied", "intershop", "eisheiligen", "ch-laut", "bundschuh", "landeshauptmann", "cherusker", "migrantin", "deutsch gesinnt", "dolma", "pecorino", "nordrhein-westfalen", "inländerin", "obwalden", "schrumpfgermane", "osten", "eindeutschen", "amerikadeutscher", "thing", "ciabatta", "hamburg", "schweizergarde", "welscher", "parmesan", "altdeutsch", "mazurka", "böhmerwald", "sowjetzone", "westdeutsche", "berlin", "deutsch", "rhododendron", "fra", "hispano", "deutschsprachig", "osmane", "immigrant", "bundespolitiker", "ubier", "hilfswilliger", "wechselkurs", "marchese", "apulien", "reisescheck", "bergamotte", "defa", "sonata", "zentralschweiz", "apennin", "dax", "ostdeutsche", "bremen", "konsistorium", "deutschfreundlichkeit", "honved", "padrone", "schweizer", "kawass", "departement", "frikadelle", "großdeutsch", "verdeutschung", "jiddisch", "neubürger", "trattoria", "panettone", "austromarxismus", "metamusik", "ddr-bürgerin", "boreal", "nordgermane", "notaufnahme", "antipasto", "drk", "catenaccio", "hesperien", "pannacotta", "schweizerin", "moxibustion", "allgäu", "schriftdeutsch", "welschschweiz", "bundesgebiet", "auslandsdeutsche", "eurasier", "schakal", 
"jass", "bundesrat", "warenumsatzsteuer", "deutscher", "swiss", "westschweiz", "trakehner", "gote", "fürstentag", "autarkie", "flühtlingsstrom", "landesgartenschau", "futurismus", "ligurien", "bundesautobahn", "ku-klux-klan", "standarddeutsch", "kappadozien", "westdeutsch", "westlich", "innerschweiz", "steppenhuhn", "ösi", "orient", "achtundvierziger", "entsendegesetz", "hethiter", "deutsch-türkisch", "romanismus", "schweizerbürgerin", "daus", "franke", "senat", "bundesnachrichtendienst", "bundesbahn", "beamtendeutsch", "zuwandrer", "lombardei", "rittmeister", "lori", "alta moda", "standarddeutsch", "buntnessel", "belcanto", "deutschkenntnis", "piccolo", "tschibuk", "auffanglager", "elba", "arlecchino", "lira", "exilliteratur", "niederdeutsch", "bundesausbildungsförderungsgesetz", "ehrenspielführer", "durchgangslager", "apenninen-halbinsel", "cassata", "schwarz-weiß-rot", "deutschlandsender", "autark", "erzherzog", "eurokommunismus", "europider", "hennastrauch", "öterreichisch", "brd", "plateresk", "prignitz", "treck", "buch", "iberer", "pancetta", "lüneburger heide", "ostig", "fdp", "couvert", "asylbewerberheim", "quintal", "heldenlied", "asiatisch", "kandidat", "notlager", "ems", "bundestag", "hindukusch", "beitrittsgebiet", "türkisch", "güteraustausch", "importe", "mittelniederdeutsch", "mauerschütze", "bundeskanzleramt", "ß", "tagliatelle", "büffel", "ossi", "seconda", "zaubernuss", "ziehungsrecht", "brandgans", "katamaran", "feldgrau", "pizza", "afrodeutsch", "importhandel", "zloty", "italienische", "ostdeutsch", "anopheles", "betäubungsmittelgesetz", "kreuzer", "resident", "bundesdeutsch", "italianismus", "ötlich", "türkischstämmig", "welsch", "valuta", "schleichkatze", "fernamt", "südasien", "deutschlandpolitik", "germanin", "muchtar", "ostpolitik", "thüringen", "flüchtlingsrat", "brillenschlange", "met", "schabzieger", "piva", "krevette", "devise", "ausländerfeindlichkeit", "boccia", "konak", "alpenjäger", "prädikatswein", "preislied", "studienkolleg", 
"sudetenland", "chassidismus", "hemlocktanne", "baba", "novecento", "großdeutschland", "rheinland-pfalz", "lizenziat", "nachkriegsöterreich", "binnendeutsch", "geest", "billigflagge", "bundeswehr", "amischer", "getto", "kanzleideutsch", "moschustier", "neudeutsch", "polentum", "italienischsprachig", "kamtschatka", "vacherin", "fantasia", "volksgericht", "nationalratspräsident", "kontor", "scampi", "teutonisch", "plattdeutsch", "germanistik", "biedermeier", "certosa", "eurocityzug", "ausländer", "seele", "staatsrat", "bundeskabinett", "alitalia", "italien", "migrationspolitik", "verfassungsinitiative", "diplomatie", "neuhochdeutsch", "zwergkiefer", "marktamt", "dienstpragmatik", "deutschschweiz", "frascati", "kurrent", "türkisch", "fpö", "eurasien", "kemalismus", "landeskirche", "mittelmeerländer", "eidgenosse", "friedensfahrt", "renaissance", "rotwelsch", "hyäne", "italianist", "prälat", "pfalz", "fremdarbeiter", "quent", "spruch", "wandervogel", "hortensie", "türbe", "bundesgesetzblatt", "schwarzwald", "ausländeranteil", "hafenzoll", "integrationsbeauftragte", "mecklenburg-vorpommern", "ostdeutscher", "satemsprache", "mittelniederdeutsch", "botschaft", "maggiore", "schutztruppe", "ländle", "kreole", "hamam", "conte", "incoming", "ripuarisch", "lingua franca", "aare", "bundesversammlung", "bootsflühtling", "mitteldeutsch", "unteritalien", "althochdeutsch", "bigos", "ingwäonen", "schwarzes meer", "bundesanleihe", "fremde", "ober", "ausländeramt", "qualitätswein", "sardinien", "westler", "einigungsvertrag", "asean", "visconte", "don", "halbesel", "bundesbank", "gesandtschaft", "indogermanistik", "behördendeutsch", "notaufnahmelager", "ausländerbehörde", "josephinismus", "schwaben", "flühtlingspolitik", "rote-armee-fraktion", "schutzzoll", "katzelmacher", "deutschstämmig", "reichsdeutscher", "deutsch sprechend", "staatsminister", "präfekt", "deutschamerikaner", "asylgerichtshof", "glosse", "italianistisch", "alemanne", "legionär", "sammellager", "reichsdeutsche", 
"kapitalflucht", "ostschweiz", "germanien", "orientteppich", "landeshauptfrau", "romandie", "ultra", "oder-neiße-linie", "platt", "neuhochdeutsch", "staatssicherheitsdienst", "südeuropäisch", "deutschstämmige", "umweltflühtling", "ostzone", "mezzogiorno", "villanell", "frisör", "oberitalien", "süddeutsch", "treudeutsch", "bundesverfassungsgericht", "ischia", "mozzarella", "sudetendeutsch", "tramontana", "bayern", "einwandererfamilie", "sprachführer", "durchgangsverkehr", "arno", "rütlischwur", "volkskammer", "mad", "ns-staat", "volksmarine", "dienstleistungsverkehr", "expatriate", "gemeindeutsch", "österreicherin", "zonenrandgebiet", "amtssprache", "tifoso", "schweizerisch", "studienaufenthalt", "hansestadt", "hessen", "bure", "ostflüchtling", "flüchtlingstreck", "ristorante", "osteria", "teutonengrill", "assisen", "riviera", "kolonialherr", "wendezeit", "flüchtlingsheim", "bundesverwaltungsgericht", "diwan", "exequatur", "krautrock", "deutschstämmiger", "woiwod", "geniezeit", "anatolien", "bundessozialgericht", "freiburg"] \ No newline at end of file diff --git a/debiaswe-master/data/general_origin/bias_specific_seed.json b/debiaswe-master/data/general_origin/bias_specific_seed.json new file mode 100644 index 0000000000000000000000000000000000000000..21d96db29b05499cc36ff495716a8e4d22f32f53 --- /dev/null +++ b/debiaswe-master/data/general_origin/bias_specific_seed.json @@ -0,0 +1 @@ +["germane", "ostgeld", "focaccia", "ostalgie", "volksgenosse", "hetman", "auslandsdeutscher", "sinto", "lech", "auslandsgeschäft", "bambino", "reichsbahn", "engadin", "schilling", "grundgesetz", "prosecco", "kleindeutsch", "aventiure", "europide", "flüchtlingsausweis", "weser", "völkerwanderung", "azzurri", "landammann", "trecento", "deutschfeindlichkeit", "polnisch", "baron", "mitteldeutsch", "bundesminister", "germanisch", "itaker", "groschen", "quempaslied", "flüchtlingshilfe", "baden-württemberg", "thai", "zuwanderin", "edeling", "italienisch", "ausländerhass", "confoederatio 
helvetica", "germanisieren", "vaudeville", "italowestern", "mittelhochdeutsch", "schwarz-rot-gold", "westmitteldeutsch", "tamtam", "janitscharenmusik", "öterreichisch-ungarisch", "weichsel", "germanentum", "jungdeutscher", "plattdeutsch", "grappa", "exarch", "abate", "carabiniere", "bairisch", "alldeutsch", "quart", "sultan", "ramasan", "liechtenstein", "sachsen-anhalt", "settecento", "greyerzer", "reichsdeutsch", "urdeutsch", "bundesrepublikanisch", "thurgau", "germanist", "labiovelar", "kampanile", "ostmitteldeutsch", "frühneuhochdeutsch", "ostverträge", "geschäftsträger", "hochlautung", "reinheitsgebot", "wallis", "signore", "brandenburg", "nazarener", "sbrinz", "carnotzet", "verhochdeutschen", "cinquecento", "großglockner", "aargau", "einwanderungsstrom", "beg", "kastell", "asylsuchende", "panje", "spartakiade", "veltliner", "pizzaservice", "wittum", "helvetien", "nibelungen", "papagallo", "amerikahaus", "romand", "ausländisch", "boatpeople", "neudeutsch", "zibetkatze", "rom", "bundesstadt", "schweizweit", "mora", "signor", "chianti", "bundesstraße", "asylsuchender", "indogermane", "kenning", "sejm", "stadtammann", "wessi", "normanne", "ostdeutschland", "volksdeutscher", "rugier", "sütterlinschrift", "secondo", "gambir", "hochdeutsch", "bel paese", "frühneuhochdeutsch", "bundesliga", "devisenbewirtschaftung", "signorina", "zweigelt", "toskana", "auswärtig", "bundesadler", "deutschtum", "khan", "hodscha", "hinterindien", "franken", "deutschlandchef", "helvetier", "afroasiatisch", "bundesbürger", "eindeutschung", "oberdeutsch", "deutschtümelei", "deutschsprachlich", "radicchio", "missingsch", "ostblock", "himalaja", "puzzolan", "taverne", "sundainseln", "außerdeutsch", "signoria", "cinzano", "wirtschaftsflüchtling", "markomanne", "austrofaschismus", "swissair", "ddr-bürger", "deutsche", "migrationshintergrund", "asylbewerberin", "deutschamerikanisch", "levante", "piefke", "padre", "theatiner", "westdeutschland", "zimber", "stasimitarbeiter", "deutschnational", 
"karawane", "norddeutsch", "russland", "spumante", "kosovokrieg", "italienisch", "italienerin", "mittelhochdeutsch", "bundeskanzlei", "gastarbeiter", "freiheitlich", "deutsch-schweizerisch", "italienreise", "flüchtlingselend", "ostler", "landesstraße", "sachsen", "verfassungsgerichtshof", "trentino-südtirol", "freisinn", "sprachgesellschaft", "ausländerfeindlich", "kurfürst", "deutschtürke", "schwäbeln", "kirchenstaat", "kebab", "staatsgerichtshof", "oberlandesgericht", "deutschkunde", "ostmitteldeutsch", "bergama", "cajunmusic", "spieloper", "rheinfall", "auffangen", "auslieferungsantrag", "spatha", "ausführen", "magnat", "polonistik", "undeutsch", "westfernsehen", "uri", "indoeuropäisch", "donnerer", "odal", "indogermanisch", "schufa", "dw", "po", "schleswig-holstein", "unter", "humanismus", "piazza", "ausländerkind", "westen", "kentumsprache", "reisefreiheit", "tessin", "greencard", "stasiakte", "ard", "einwandererkind", "pirogge", "lambrusco", "zimbal", "cavaliere", "mark", "raki", "aussiedler", "flüchtlingslager", "ufa", "deutschschweizerisch", "jot", "gorgonzola", "kanake", "u-häkchen", "fichtelberg", "südtirol", "asti", "burgunde", "ost-west-dialog", "centime", "tiber", "jul", "hartz", "quattrocento", "alphorn", "bundesverdienstkreuz", "pasta", "madrigal", "kelte", "deutschschweizer", "displaced person", "zentralasien", "handballbundesliga", "rentenmark", "althochdeutsch", "fremdenpolizei", "tarantella", "frühmittelhochdeutsch", "sibirien", "urgermanisch", "migrantenkind", "schwyzertütsch", "arte povera", "gastarbeiterin", "weißherbst", "bundespräsident", "öterreichweit", "ural", "pidginenglisch", "uckermark", "immigrantin", "ger", "scudo", "spätaussiedler", "lastenausgleichsgesetz", "kulturinstitut", "welschschweizer", "menschenhandel", "austriazismus", "bundeshaus", "italer", "mitternachtssonne", "deutschlandfunk", "welsche", "latsche", "weißbuch", "hilfswillige", "schweizerdeutsch", "kufe", "kolonialherrschaft", "bundesdeutsche", "mitteldeutschland", 
"stracciatella", "frühmittelhochdeutsch", "vielfraß", "bundesdeutscher", "kaspisches meer", "kanton", "deutschtürkisch", "pfingstochse", "teutonin", "dihk", "amerikadeutsche", "stasiunterlagen", "deutsch", "ausländerpolitik", "niedersachsen", "ausfuhrgarantie", "harz", "lateinisch", "palazzo", "futhark", "schriftdeutsch", "wien", "pan", "weißwurstäquator", "pazifischer ozean", "basso", "welschland", "stabreim", "bundesministerium", "quarta", "schwyzerdütsch", "lasagne", "aga", "karelien", "polnisch", "russlanddeutsch", "kurrentschrift", "canzone", "oberdeutsch", "härtefallkommission", "nachkriegsschweiz", "levantiner", "faschismus", "pole", "angelsachse", "ararat", "reichstag", "verismo", "börde", "paying guest", "balsamessig", "schwabenspiegel", "westmitteldeutsch", "landesversicherungsanstalt", "illyrer", "pagode", "treuhandanstalt", "doktorhut", "fußballbundesliga", "italoamerikaner", "kalabrien", "arbeitsemigrant", "deutschlehrer", "arier", "bajazzo", "kabinett", "lufthansa", "mikrozensus", "verrechnungseinheit", "hanswurst", "sezession", "schlepper", "aufenthaltsgenehmigung", "deutschjüdisch", "einwandererstrom", "außenwert", "misereor", "bundesgartenschau", "bezirkstag", "alpenrepublik", "zwangsumtausch", "auslandsdeutsch", "teehaus", "panasiatisch", "einwanderin", "westdeutscher", "duce", "konsularkorps", "italianisieren", "siamkatze", "auslese", "isolationismus", "expedition", "zav", "einreisen", "turkisieren", "ostöterreich", "anwerbestopp", "waadt", "ausländerin", "franzöisch-deutsch", "vorderasien", "administrator", "stradivari", "welschschweizerisch", "ostgermane", "kolonie", "bundesbetreuung", "gefolgschaft", "bundeshaushalt", "sizilien", "vendetta", "botschafter", "hermesbürgschaft", "nidwalden", "zahlungsbilanz", "apo", "generaloberst", "altnordisch", "jura", "ostasien", "pandschabi", "volksdeutsche", "einwanderer", "saarland", "effendi", "deutschlandlied", "intershop", "eisheiligen", "ch-laut", "bundschuh", "landeshauptmann", "cherusker", 
"migrantin", "deutsch gesinnt", "dolma", "pecorino", "nordrhein-westfalen", "inländerin", "obwalden", "schrumpfgermane", "osten", "eindeutschen", "amerikadeutscher", "thing", "ciabatta", "hamburg", "schweizergarde", "welscher", "parmesan", "altdeutsch", "mazurka", "böhmerwald", "sowjetzone", "westdeutsche", "berlin", "deutsch", "rhododendron", "fra", "hispano", "deutschsprachig", "osmane", "immigrant", "bundespolitiker", "ubier", "hilfswilliger", "wechselkurs", "marchese", "apulien", "reisescheck", "bergamotte", "defa", "sonata", "zentralschweiz", "apennin", "dax", "ostdeutsche", "bremen", "konsistorium", "deutschfreundlichkeit", "honved", "padrone", "schweizer", "kawass", "departement", "frikadelle", "großdeutsch", "verdeutschung", "jiddisch", "neubürger", "trattoria", "panettone", "austromarxismus", "metamusik", "ddr-bürgerin", "boreal", "nordgermane", "notaufnahme", "antipasto", "drk", "catenaccio", "hesperien", "pannacotta", "schweizerin", "moxibustion", "allgäu", "schriftdeutsch", "welschschweiz", "bundesgebiet", "auslandsdeutsche", "eurasier", "schakal", "jass", "bundesrat", "warenumsatzsteuer", "deutscher", "swiss", "westschweiz", "trakehner", "gote", "fürstentag", "autarkie", "flühtlingsstrom", "landesgartenschau", "futurismus", "ligurien", "bundesautobahn", "ku-klux-klan", "standarddeutsch", "kappadozien", "westdeutsch", "westlich", "innerschweiz", "steppenhuhn", "ösi", "orient", "achtundvierziger", "entsendegesetz", "hethiter", "deutsch-türkisch", "romanismus", "schweizerbürgerin", "daus", "franke", "senat", "bundesnachrichtendienst", "bundesbahn", "beamtendeutsch", "zuwandrer", "lombardei", "rittmeister", "lori", "alta moda", "standarddeutsch", "buntnessel", "belcanto", "deutschkenntnis", "piccolo", "tschibuk", "auffanglager", "elba", "arlecchino", "lira", "exilliteratur", "niederdeutsch", "bundesausbildungsförderungsgesetz", "ehrenspielführer", "durchgangslager", "apenninen-halbinsel", "cassata", "schwarz-weiß-rot", "deutschlandsender", "autark", 
"erzherzog", "eurokommunismus", "europider", "hennastrauch", "öterreichisch", "brd", "plateresk", "prignitz", "treck", "buch", "iberer", "pancetta", "lüneburger heide", "ostig", "fdp", "couvert", "asylbewerberheim", "quintal", "heldenlied", "asiatisch", "kandidat", "notlager", "ems", "bundestag", "hindukusch", "beitrittsgebiet", "türkisch", "güteraustausch", "importe", "mittelniederdeutsch", "mauerschütze", "bundeskanzleramt", "ß", "tagliatelle", "büffel", "ossi", "seconda", "zaubernuss", "ziehungsrecht", "brandgans", "katamaran", "feldgrau", "pizza", "afrodeutsch", "importhandel", "zloty", "italienische", "ostdeutsch", "anopheles", "betäubungsmittelgesetz", "kreuzer", "resident", "bundesdeutsch", "italianismus", "ötlich", "türkischstämmig", "welsch", "valuta", "schleichkatze", "fernamt", "südasien", "deutschlandpolitik", "germanin", "muchtar", "ostpolitik", "thüringen", "flüchtlingsrat", "brillenschlange", "met", "schabzieger", "piva", "krevette", "devise", "ausländerfeindlichkeit", "boccia", "konak", "alpenjäger", "prädikatswein", "preislied", "studienkolleg", "sudetenland", "chassidismus", "hemlocktanne", "baba", "novecento", "großdeutschland", "rheinland-pfalz", "lizenziat", "nachkriegsöterreich", "binnendeutsch", "geest", "billigflagge", "bundeswehr", "amischer", "getto", "kanzleideutsch", "moschustier", "neudeutsch", "polentum", "italienischsprachig", "kamtschatka", "vacherin", "fantasia", "volksgericht", "nationalratspräsident", "kontor", "scampi", "teutonisch", "plattdeutsch", "germanistik", "biedermeier", "certosa", "eurocityzug", "ausländer", "seele", "staatsrat", "bundeskabinett", "alitalia", "italien", "migrationspolitik", "verfassungsinitiative", "diplomatie", "neuhochdeutsch", "zwergkiefer", "marktamt", "dienstpragmatik", "deutschschweiz", "frascati", "kurrent", "türkisch", "fpö", "eurasien", "kemalismus", "landeskirche", "mittelmeerländer", "eidgenosse", "friedensfahrt", "renaissance", "rotwelsch", "hyäne", "italianist", "prälat", "pfalz", 
"fremdarbeiter", "quent", "spruch", "wandervogel", "hortensie", "türbe", "bundesgesetzblatt", "schwarzwald", "ausländeranteil", "hafenzoll", "integrationsbeauftragte", "mecklenburg-vorpommern", "ostdeutscher", "satemsprache", "mittelniederdeutsch", "botschaft", "maggiore", "schutztruppe", "ländle", "kreole", "hamam", "conte", "incoming", "ripuarisch", "lingua franca", "aare", "bundesversammlung", "bootsflühtling", "mitteldeutsch", "unteritalien", "althochdeutsch", "bigos", "ingwäonen", "schwarzes meer", "bundesanleihe", "fremde", "ober", "ausländeramt", "qualitätswein", "sardinien", "westler", "einigungsvertrag", "asean", "visconte", "don", "halbesel", "bundesbank", "gesandtschaft", "indogermanistik", "behördendeutsch", "notaufnahmelager", "ausländerbehörde", "josephinismus", "schwaben", "flühtlingspolitik", "rote-armee-fraktion", "schutzzoll", "katzelmacher", "deutschstämmig", "reichsdeutscher", "deutsch sprechend", "staatsminister", "präfekt", "deutschamerikaner", "asylgerichtshof", "glosse", "italianistisch", "alemanne", "legionär", "sammellager", "reichsdeutsche", "kapitalflucht", "ostschweiz", "germanien", "orientteppich", "landeshauptfrau", "romandie", "ultra", "oder-neiße-linie", "platt", "neuhochdeutsch", "staatssicherheitsdienst", "südeuropäisch", "deutschstämmige", "umweltflühtling", "ostzone", "mezzogiorno", "villanell", "frisör", "oberitalien", "süddeutsch", "treudeutsch", "bundesverfassungsgericht", "ischia", "mozzarella", "sudetendeutsch", "tramontana", "bayern", "einwandererfamilie", "sprachführer", "durchgangsverkehr", "arno", "rütlischwur", "volkskammer", "mad", "ns-staat", "volksmarine", "dienstleistungsverkehr", "expatriate", "gemeindeutsch", "österreicherin", "zonenrandgebiet", "amtssprache", "tifoso", "schweizerisch", "studienaufenthalt", "hansestadt", "hessen", "bure", "ostflüchtling", "flüchtlingstreck", "ristorante", "osteria", "teutonengrill", "assisen", "riviera", "kolonialherr", "wendezeit", "flüchtlingsheim", "bundesverwaltungsgericht", 
"diwan", "exequatur", "krautrock", "deutschstämmiger", "woiwod", "geniezeit", "anatolien", "bundessozialgericht", "freiburg"] \ No newline at end of file diff --git a/debiaswe-master/data/general_origin/definitional_pairs.json b/debiaswe-master/data/general_origin/definitional_pairs.json new file mode 100644 index 0000000000000000000000000000000000000000..210f5a3f05185a4f6777a7376ac2589dbab84a5e --- /dev/null +++ b/debiaswe-master/data/general_origin/definitional_pairs.json @@ -0,0 +1,10 @@ +[ + ["deutscher", "ausländer"], + ["deutsche", "ausländerin"], + ["deutschland", "ausland"], + ["einheimischer", "immigrant"], + ["einheimische", "immigrantin"], + ["deutsch", "fremd"], + ["heimat", "gast"], + ["deutschsprachig", "fremdsprachig"] +] \ No newline at end of file diff --git a/debiaswe-master/data/general_origin/equalize_pairs.json b/debiaswe-master/data/general_origin/equalize_pairs.json new file mode 100644 index 0000000000000000000000000000000000000000..191e93c78d71e17602ba7c0acdb77085ceceda27 --- /dev/null +++ b/debiaswe-master/data/general_origin/equalize_pairs.json @@ -0,0 +1,35 @@ +[ + ["deutscher", "ausländer"], + ["deutsche", "ausländerin"], + ["inländer", "ausländer"], + ["inländerin", "ausländerin"], + ["deutschland", "ausland"], + ["einheimischer", "immigrant"], + ["einheimische", "immigrantin"], + ["deutsch", "fremd"], + ["deutsch", "ausländisch"], + ["traditionell", "exotisch"], + ["heimat", "gast"], + ["deutscher", "migrant"], + ["deutsche", "migrantin"], + ["deutschsprachig", "fremdsprachig"], + ["sabine", "elif"], + ["klaus", "mehmet"], + ["staatsbürger", "einwanderer"], + ["staatsbürgerin", "einwanderin"], + ["christlich", "muslimisch"], + ["christlich", "jüdisch"], + ["deutsch", "türkisch"], + ["deutsch", "polnisch"], + ["deutsch", "italienisch"], + ["euro", "złoty"], + ["euro", "lira"], + ["berlin", "warschau"], + ["berlin", "rom"], + ["berlin", "istanbul"], + ["hamburg", "krakau"], + ["hamburg", "ankara"], + ["hamburg", "mailand"], + 
["deutscher", "flüchtling"], + ["deutscher", "asylbewerber"] +] \ No newline at end of file diff --git a/debiaswe-master/data/general_origin/professions.json b/debiaswe-master/data/general_origin/professions.json new file mode 100644 index 0000000000000000000000000000000000000000..b6897fe6ce421396385a31d2624aaa928ea18808 --- /dev/null +++ b/debiaswe-master/data/general_origin/professions.json @@ -0,0 +1 @@ +[["accountant", 0.0, 0.4], ["acquaintance", 0.0, 0.0], ["actor", 0.8, 0.0], ["actress", -1.0, 0.0], ["adjunct_professor", 0.0, 0.5], ["administrator", 0.0, 0.2], ["adventurer", 0.0, 0.5], ["advocate", 0.0, -0.1], ["aide", 0.0, -0.2], ["alderman", 0.7, 0.2], ["alter_ego", 0.0, 0.0], ["ambassador", 0.0, 0.7], ["analyst", 0.0, 0.4], ["anthropologist", 0.0, 0.4], ["archaeologist", 0.0, 0.6], ["archbishop", 0.4, 0.5], ["architect", 0.1, 0.6], ["artist", 0.0, -0.2], ["artiste", -0.1, -0.2], ["assassin", 0.1, 0.8], ["assistant_professor", 0.1, 0.4], ["associate_dean", 0.0, 0.4], ["associate_professor", 0.0, 0.4], ["astronaut", 0.1, 0.8], ["astronomer", 0.1, 0.5], ["athlete", 0.0, 0.7], ["athletic_director", 0.1, 0.7], ["attorney", 0.0, 0.3], ["author", 0.0, 0.1], ["baker", 0.0, -0.1], ["ballerina", -0.5, -0.5], ["ballplayer", 0.2, 0.8], ["banker", 0.0, 0.6], ["barber", 0.5, 0.5], ["baron", 0.6, 0.3], ["barrister", 0.1, 0.4], ["bartender", 0.0, 0.3], ["biologist", 0.0, 0.1], ["bishop", 0.6, 0.4], ["bodyguard", 0.1, 0.9], ["bookkeeper", 0.0, -0.4], ["boss", 0.0, 0.7], ["boxer", 0.1, 0.9], ["broadcaster", -0.1, 0.4], ["broker", 0.1, 0.5], ["bureaucrat", 0.1, 0.5], ["businessman", 0.8, 0.2], ["businesswoman", -0.9, -0.1], ["butcher", 0.1, 0.9], ["butler", 0.5, 0.5], ["cab_driver", 0.1, 0.8], ["cabbie", 0.1, 0.6], ["cameraman", 0.8, 0.1], ["campaigner", 0.0, 0.2], ["captain", 0.1, 0.6], ["cardiologist", 0.1, 0.5], ["caretaker", 0.0, -0.9], ["carpenter", 0.1, 0.8], ["cartoonist", 0.0, 0.5], ["cellist", -0.1, 0.0], ["chancellor", 0.1, 0.6], ["chaplain", 0.1, 0.6], 
["character", 0.0, 0.0], ["chef", 0.0, 0.5], ["chemist", 0.0, 0.2], ["choreographer", -0.2, -0.2], ["cinematographer", 0.0, 0.5], ["citizen", 0.0, 0.0], ["civil_servant", 0.0, 0.2], ["cleric", 0.3, 0.3], ["clerk", 0.0, -0.5], ["coach", 0.1, 0.8], ["collector", 0.0, 0.4], ["colonel", 0.1, 0.8], ["columnist", 0.0, 0.2], ["comedian", 0.0, 0.3], ["comic", 0.1, 0.1], ["commander", 0.1, 0.8], ["commentator", 0.0, 0.4], ["commissioner", 0.0, 0.8], ["composer", 0.1, 0.4], ["conductor", 0.1, 0.6], ["confesses", 0.0, 0.0], ["congressman", 0.7, 0.3], ["constable", 0.2, 0.6], ["consultant", 0.0, 0.1], ["cop", 0.2, 0.6], ["correspondent", 0.0, 0.0], ["councilman", 0.8, 0.1], ["councilor", -0.1, -0.1], ["counselor", 0.0, -0.1], ["critic", 0.1, 0.4], ["crooner", 0.2, 0.2], ["crusader", 0.1, 0.7], ["curator", -0.1, 0.2], ["custodian", 0.1, 0.9], ["dad", 1.0, 0.0], ["dancer", -0.1, -0.9], ["dean", 0.2, 0.7], ["dentist", 0.0, 0.7], ["deputy", 0.1, 0.7], ["dermatologist", 0.0, -0.3], ["detective", 0.1, 0.5], ["diplomat", 0.0, 0.5], ["director", 0.1, 0.6], ["disc_jockey", 0.2, 0.6], ["doctor", 0.0, 0.7], ["doctoral_student", 0.0, 0.3], ["drug_addict", 0.0, 0.0], ["drummer", 0.0, 0.9], ["economics_professor", 0.1, 0.6], ["economist", 0.1, 0.5], ["editor", 0.1, 0.4], ["educator", 0.0, -0.5], ["electrician", 0.1, 0.8], ["employee", 0.0, 0.0], ["entertainer", 0.0, 0.0], ["entrepreneur", 0.0, 0.5], ["environmentalist", 0.0, -0.4], ["envoy", 0.1, 0.2], ["epidemiologist", 0.0, 0.0], ["evangelist", 0.1, 0.4], ["farmer", 0.1, 0.8], ["fashion_designer", -0.2, -0.4], ["fighter_pilot", 0.2, 0.7], ["filmmaker", 0.1, 0.3], ["financier", 0.1, 0.5], ["firebrand", 0.0, 0.1], ["firefighter", 0.1, 0.7], ["fireman", 0.8, 0.2], ["fisherman", 0.9, 0.1], ["footballer", 0.4, 0.5], ["foreman", 0.5, 0.4], ["freelance_writer", 0.0, 0.0], ["gangster", 0.2, 0.7], ["gardener", -0.1, 0.0], ["geologist", 0.0, 0.4], ["goalkeeper", 0.1, 0.5], ["graphic_designer", 0.0, 0.2], ["guidance_counselor", 0.0, 0.0], 
["guitarist", 0.1, 0.5], ["hairdresser", -0.2, -0.8], ["handyman", 0.8, 0.2], ["headmaster", 0.4, 0.2], ["historian", 0.0, 0.5], ["hitman", 0.8, 0.2], ["homemaker", -0.1, -0.9], ["hooker", -0.2, -0.8], ["housekeeper", -0.2, -0.8], ["housewife", -1.0, 0.0], ["illustrator", 0.0, 0.2], ["industrialist", 0.1, 0.7], ["infielder", 0.1, 0.5], ["inspector", 0.1, 0.5], ["instructor", 0.0, -0.3], ["interior_designer", -0.2, -0.6], ["inventor", 0.1, 0.5], ["investigator", 0.1, 0.5], ["investment_banker", 0.1, 0.7], ["janitor", 0.1, 0.9], ["jeweler", 0.1, 0.3], ["journalist", -0.1, 0.3], ["judge", 0.0, 0.7], ["jurist", 0.0, 0.0], ["laborer", 0.1, 0.9], ["landlord", 0.1, 0.4], ["lawmaker", 0.0, 0.7], ["lawyer", 0.1, 0.5], ["lecturer", 0.0, 0.2], ["legislator", 0.1, 0.7], ["librarian", -0.1, -0.9], ["lieutenant", 0.1, 0.7], ["lifeguard", 0.0, 0.6], ["lyricist", 0.0, -0.2], ["maestro", 0.1, 0.5], ["magician", 0.1, 0.7], ["magistrate", 0.0, 0.8], ["maid", -0.4, -0.6], ["major_leaguer", 0.2, 0.7], ["manager", 0.0, 0.6], ["marksman", 0.6, 0.4], ["marshal", 0.1, 0.7], ["mathematician", 0.0, 0.8], ["mechanic", 0.3, 0.6], ["mediator", 0.0, -0.2], ["medic", 0.1, 0.4], ["midfielder", 0.3, 0.5], ["minister", 0.1, 0.8], ["missionary", 0.0, 0.3], ["mobster", 0.1, 0.9], ["monk", 0.8, 0.1], ["musician", 0.0, 0.0], ["nanny", -0.3, -0.7], ["narrator", 0.0, 0.2], ["naturalist", 0.0, -0.2], ["negotiator", 0.0, 0.3], ["neurologist", 0.0, 0.6], ["neurosurgeon", 0.0, 0.7], ["novelist", 0.0, 0.0], ["nun", -0.8, -0.1], ["nurse", -0.1, -0.9], ["observer", 0.0, -0.1], ["officer", 0.1, 0.8], ["organist", -0.2, -0.3], ["painter", 0.0, 0.2], ["paralegal", -0.1, -0.4], ["parishioner", 0.0, 0.1], ["parliamentarian", 0.0, 0.6], ["pastor", 0.3, 0.7], ["pathologist", 0.0, 0.3], ["patrolman", 1.0, 0.0], ["pediatrician", 0.0, -0.2], ["performer", 0.0, -0.2], ["pharmacist", 0.0, 0.3], ["philanthropist", 0.0, 0.3], ["philosopher", 0.0, 0.8], ["photographer", 0.0, -0.1], ["photojournalist", 0.0, 0.1], ["physician", 
0.0, 0.6], ["physicist", 0.1, 0.7], ["pianist", 0.0, -0.1], ["planner", 0.0, -0.3], ["plastic_surgeon", 0.2, 0.4], ["playwright", 0.0, 0.5], ["plumber", 0.1, 0.8], ["poet", 0.0, -0.1], ["policeman", 0.8, 0.2], ["politician", 0.0, 0.5], ["pollster", 0.0, 0.3], ["preacher", 0.2, 0.7], ["president", 0.1, 0.9], ["priest", 0.7, 0.3], ["principal", 0.0, 0.3], ["prisoner", 0.1, 0.6], ["professor", 0.1, 0.4], ["professor_emeritus", 0.0, 0.5], ["programmer", 0.2, 0.6], ["promoter", 0.0, 0.3], ["proprietor", 0.1, 0.4], ["prosecutor", -0.1, 0.3], ["protagonist", 0.0, 0.1], ["protege", 0.0, 0.2], ["protester", -0.1, 0.0], ["provost", 0.0, 0.4], ["psychiatrist", 0.0, -0.2], ["psychologist", 0.0, 0.0], ["publicist", -0.1, -0.2], ["pundit", 0.0, 0.2], ["rabbi", 0.2, 0.6], ["radiologist", 0.0, -0.3], ["ranger", 0.2, 0.7], ["realtor", -0.2, -0.2], ["receptionist", -0.3, -0.7], ["registered_nurse", -0.1, -0.9], ["researcher", 0.0, 0.1], ["restaurateur", 0.0, 0.2], ["sailor", 0.1, 0.8], ["saint", 0.2, 0.3], ["salesman", 0.8, 0.2], ["saxophonist", 0.1, 0.5], ["scholar", 0.0, 0.6], ["scientist", 0.0, 0.5], ["screenwriter", 0.1, 0.4], ["sculptor", 0.0, 0.5], ["secretary", -0.2, -0.8], ["senator", 0.1, 0.7], ["sergeant", 0.1, 0.7], ["servant", 0.0, 0.1], ["serviceman", 0.7, 0.3], ["sheriff_deputy", 0.1, 0.8], ["shopkeeper", 0.0, 0.5], ["singer", 0.0, -0.2], ["singer_songwriter", 0.0, -0.3], ["skipper", 0.1, 0.7], ["socialite", -0.4, -0.3], ["sociologist", 0.0, -0.2], ["soft_spoken", -0.1, -0.9], ["soldier", 0.3, 0.6], ["solicitor", 0.1, 0.3], ["solicitor_general", 0.0, 0.5], ["soloist", -0.1, -0.3], ["sportsman", 0.9, 0.1], ["sportswriter", 0.1, 0.9], ["statesman", 0.6, 0.4], ["steward", 0.4, -0.1], ["stockbroker", 0.1, 0.5], ["strategist", 0.0, 0.3], ["student", 0.0, 0.0], ["stylist", -0.2, -0.7], ["substitute", -0.1, -0.1], ["superintendent", 0.0, 0.9], ["surgeon", 0.1, 0.7], ["surveyor", 0.0, 0.5], ["swimmer", 0.0, 0.0], ["taxi_driver", 0.1, 0.9], ["teacher", 0.0, -0.8], 
["technician", 0.1, 0.6], ["teenager", 0.0, -0.1], ["therapist", -0.1, -0.4], ["trader", 0.1, 0.6], ["treasurer", 0.0, -0.3], ["trooper", 0.2, 0.5], ["trucker", 0.2, 0.7], ["trumpeter", 0.0, 0.2], ["tutor", 0.0, -0.3], ["tycoon", 0.1, 0.7], ["undersecretary", 0.0, -0.3], ["understudy", 0.0, 0.0], ["valedictorian", 0.0, 0.0], ["vice_chancellor", 0.0, 0.6], ["violinist", -0.1, -0.3], ["vocalist", 0.0, -0.3], ["waiter", 1.0, 0.0], ["waitress", -0.9, -0.1], ["warden", 0.1, 0.9], ["warrior", 0.1, 0.9], ["welder", 0.3, 0.6], ["worker", 0.0, 0.3], ["wrestler", 0.2, 0.6], ["writer", 0.0, 0.0]] \ No newline at end of file diff --git a/debiaswe-master/data/italian/bias_specific_full.json b/debiaswe-master/data/italian/bias_specific_full.json new file mode 100644 index 0000000000000000000000000000000000000000..bebefbcedf083def275056764a4d6cb972be97d5 --- /dev/null +++ b/debiaswe-master/data/italian/bias_specific_full.json @@ -0,0 +1 @@ +["mark", "reichskanzler", "binnendeutsch", "jot", "radicchio", "schufa", "verfassungsgerichtshof", "fdgb", "kirchenstaat", "spätaussiedler", "ostpreußen", "önorm", "deutsche", "austriazismus", "niederdeutsch", "handballbundesliga", "bahncard", "tagliatelle", "regionalbahn", "verteidigungsausschuss", "reichsgericht", "deutschtürkisch", "vormärz", "quempaslied", "bundesrepublikanisch", "bundesversammlung", "hartz", "reichsdeutsche", "italianist", "spieloper", "reichsgebiet", "französisch-deutsch", "zweikanalton", "bundessozialgericht", "quart", "reichsgrenze", "bundesliga", "normblatt", "belcanto", "reichsstände", "hamburg", "résistance", "fürstentag", "hochmeister", "innenausschuss", "bundespräsident", "exarch", "landesversicherungsanstalt", "erzgebirge", "wehrmacht", "reichsmark", "abate", "deutschkenntnis", "volksdeutscher", "bundesautobahn", "deutschsprachlich", "auslandsdeutsche", "deutschlandlied", "deutsch-schweizerisch", "judenstern", "fichtelberg", "regionalexpress", "nationalsozialismus", "deutschlandsender", "dax", "deutsch-türkisch", 
"pizza", "außerdeutsch", "bundesministerium", "trakehner", "bremen", "deutschstämmig", "deutschrock", "fra", "geniezeit", "oberlandesgericht", "din-norm", "reichsadler", "reichsregierung", "hermesbürgschaft", "bundesgerichtshof", "tagesschau", "landesstraße", "pecorino", "baron", "hitlerdeutschland", "kanzleisprache", "novecento", "reichsacht", "beitrittsgebiet", "bundesstraße", "bundeskanzler", "italoamerikaner", "kleindeutsch", "ostmark", "standarddeutsch", "zentralbankrat", "italienische", "deutschlandweit", "lufthansa", "ehrenspielführer", "quent", "reichsdeutsch", "vereinigungskriminalität", "bundesdeutsch", "ländle", "mitteldeutschland", "pfalz", "westmark", "adfc", "bure", "bundeshaus", "adac", "ichlaut", "lingua franca", "narrativum", "balsamessig", "ciabatta", "ß", "mikrozensus", "reichsautobahn", "ch-laut", "kurfürst", "mezzogiorno", "schoah", "villanell", "volksdeutsche", "signore", "verdeutschen", "apenninen-halbinsel", "hunderennen", "bundesrat", "rentenmark", "deutschstämmiger", "ostverträge", "preußen", "prädikatswein", "bundesbank", "bel paese", "bundesgartenschau", "fußballbundesliga", "sächlich", "schwarz-weiß-rot", "visconte", "reichspost", "briefmonopol", "bundesminister", "schulferien", "germanistik", "abc", "piva", "pannacotta", "hesperien", "lira", "pendolino", "reinheitsgebot", "lambrusco", "reichsdeutscher", "sütterlinschrift", "padre", "deutschkunde", "fräuleinwunder", "schwabenspiegel", "s-laut", "volksgenosse", "germania", "italowestern", "reichsinsignien", "deutsch-amerikanisch", "deutschlandchef", "frikadelle", "gorgonzola", "bundesausbildungsförderungsgesetz", "territorialverteidigung", "bundesarchiv", "jungdeutscher", "deutschherren", "regionalliga", "germanist", "reichsgründung", "deutschstämmige", "schwarz-rot-gold", "bundeshaushalt", "solidaritätszuschlag", "displaced person", "westgeld", "dlg-prämiert", "frühmittelhochdeutsch", "giro d'italia", "mozzarella", "aussiedler", "schweizerdeutsch", "italianisieren", "republikflucht", 
"kurrentschrift", "deutsch-französisch", "neuromantik", "reichsstadt", "parmesan", "unter", "elsass-lothringen", "pizzaservice", "amerikahaus", "commedia dell'arte", "deutscher", "deutschlandtour", "deutschritterorden", "auslandsdeutsch", "feldgrau", "scudo", "auslandsdeutscher", "deutschlehrer", "achlaut", "altdeutsch", "neuklassizismus", "trentino-südtirol", "deutsch-jüdisch", "ns-staat", "novemberrevolution", "lastenausgleichsgesetz", "reichspräsident", "deutschfeindlich", "eisenbahnerwohnung", "donna", "gulden", "reichsritter", "fdj", "sudetenland", "bundeskanzleramt", "deutsch-deutsch", "misereor", "reichskammergericht", "hanswurst", "teutonengrill", "bundesverwaltungsgericht", "sprachgesellschaft", "deutschenhass", "bundesanleihe", "landeskirche", "germanismus", "futurismus", "achtundvierziger", "deutschfreundlich", "gerundium", "schluss-s", "vergangenheitsbewältigung", "berlin", "deutscher", "deutsche", "deutschen", "deutschland", "deutschlands", "deutsch", "deutsches", "deutschsprachig", "italienisch", "italiener", "italienerin", "italien", "italiens", "italienisches", "italienische", "italienischer", "italienischen", "bundesgartenschau", "italienurlaub", "italienreise", "rom", "mailand", "neapel", "palermo", "catania", "florenz", "genua", "bologna", "apulien", "toskana", "verona", "messina", "venedig", "padua", "triest", "brescia", "arancino", "aranzini", "antipasto", "focaccia", "granita", "vivaldi", "galileo", "merkel", "bratensoße", "bratwurst", "stulle", "maultauschen", "allgäu", "ostsee", "nordsee", "tiramisu", "straciatella", "spaghetti", "risotto", "pesto", "mortadella", "gnocchi", "espresso", "ciabatta", "calzone", "bruschetta", "toskana", "apulien"] \ No newline at end of file diff --git a/debiaswe-master/data/italian/bias_specific_seed.json b/debiaswe-master/data/italian/bias_specific_seed.json new file mode 100644 index 0000000000000000000000000000000000000000..bebefbcedf083def275056764a4d6cb972be97d5 --- /dev/null +++ 
b/debiaswe-master/data/italian/bias_specific_seed.json @@ -0,0 +1 @@ +["mark", "reichskanzler", "binnendeutsch", "jot", "radicchio", "schufa", "verfassungsgerichtshof", "fdgb", "kirchenstaat", "spätaussiedler", "ostpreußen", "önorm", "deutsche", "austriazismus", "niederdeutsch", "handballbundesliga", "bahncard", "tagliatelle", "regionalbahn", "verteidigungsausschuss", "reichsgericht", "deutschtürkisch", "vormärz", "quempaslied", "bundesrepublikanisch", "bundesversammlung", "hartz", "reichsdeutsche", "italianist", "spieloper", "reichsgebiet", "französisch-deutsch", "zweikanalton", "bundessozialgericht", "quart", "reichsgrenze", "bundesliga", "normblatt", "belcanto", "reichsstände", "hamburg", "résistance", "fürstentag", "hochmeister", "innenausschuss", "bundespräsident", "exarch", "landesversicherungsanstalt", "erzgebirge", "wehrmacht", "reichsmark", "abate", "deutschkenntnis", "volksdeutscher", "bundesautobahn", "deutschsprachlich", "auslandsdeutsche", "deutschlandlied", "deutsch-schweizerisch", "judenstern", "fichtelberg", "regionalexpress", "nationalsozialismus", "deutschlandsender", "dax", "deutsch-türkisch", "pizza", "außerdeutsch", "bundesministerium", "trakehner", "bremen", "deutschstämmig", "deutschrock", "fra", "geniezeit", "oberlandesgericht", "din-norm", "reichsadler", "reichsregierung", "hermesbürgschaft", "bundesgerichtshof", "tagesschau", "landesstraße", "pecorino", "baron", "hitlerdeutschland", "kanzleisprache", "novecento", "reichsacht", "beitrittsgebiet", "bundesstraße", "bundeskanzler", "italoamerikaner", "kleindeutsch", "ostmark", "standarddeutsch", "zentralbankrat", "italienische", "deutschlandweit", "lufthansa", "ehrenspielführer", "quent", "reichsdeutsch", "vereinigungskriminalität", "bundesdeutsch", "ländle", "mitteldeutschland", "pfalz", "westmark", "adfc", "bure", "bundeshaus", "adac", "ichlaut", "lingua franca", "narrativum", "balsamessig", "ciabatta", "ß", "mikrozensus", "reichsautobahn", "ch-laut", "kurfürst", "mezzogiorno", "schoah", 
"villanell", "volksdeutsche", "signore", "verdeutschen", "apenninen-halbinsel", "hunderennen", "bundesrat", "rentenmark", "deutschstämmiger", "ostverträge", "preußen", "prädikatswein", "bundesbank", "bel paese", "bundesgartenschau", "fußballbundesliga", "sächlich", "schwarz-weiß-rot", "visconte", "reichspost", "briefmonopol", "bundesminister", "schulferien", "germanistik", "abc", "piva", "pannacotta", "hesperien", "lira", "pendolino", "reinheitsgebot", "lambrusco", "reichsdeutscher", "sütterlinschrift", "padre", "deutschkunde", "fräuleinwunder", "schwabenspiegel", "s-laut", "volksgenosse", "germania", "italowestern", "reichsinsignien", "deutsch-amerikanisch", "deutschlandchef", "frikadelle", "gorgonzola", "bundesausbildungsförderungsgesetz", "territorialverteidigung", "bundesarchiv", "jungdeutscher", "deutschherren", "regionalliga", "germanist", "reichsgründung", "deutschstämmige", "schwarz-rot-gold", "bundeshaushalt", "solidaritätszuschlag", "displaced person", "westgeld", "dlg-prämiert", "frühmittelhochdeutsch", "giro d'italia", "mozzarella", "aussiedler", "schweizerdeutsch", "italianisieren", "republikflucht", "kurrentschrift", "deutsch-französisch", "neuromantik", "reichsstadt", "parmesan", "unter", "elsass-lothringen", "pizzaservice", "amerikahaus", "commedia dell'arte", "deutscher", "deutschlandtour", "deutschritterorden", "auslandsdeutsch", "feldgrau", "scudo", "auslandsdeutscher", "deutschlehrer", "achlaut", "altdeutsch", "neuklassizismus", "trentino-südtirol", "deutsch-jüdisch", "ns-staat", "novemberrevolution", "lastenausgleichsgesetz", "reichspräsident", "deutschfeindlich", "eisenbahnerwohnung", "donna", "gulden", "reichsritter", "fdj", "sudetenland", "bundeskanzleramt", "deutsch-deutsch", "misereor", "reichskammergericht", "hanswurst", "teutonengrill", "bundesverwaltungsgericht", "sprachgesellschaft", "deutschenhass", "bundesanleihe", "landeskirche", "germanismus", "futurismus", "achtundvierziger", "deutschfreundlich", "gerundium", "schluss-s", 
"vergangenheitsbewältigung", "berlin", "deutscher", "deutsche", "deutschen", "deutschland", "deutschlands", "deutsch", "deutsches", "deutschsprachig", "italienisch", "italiener", "italienerin", "italien", "italiens", "italienisches", "italienische", "italienischer", "italienischen", "bundesgartenschau", "italienurlaub", "italienreise", "rom", "mailand", "neapel", "palermo", "catania", "florenz", "genua", "bologna", "apulien", "toskana", "verona", "messina", "venedig", "padua", "triest", "brescia", "arancino", "aranzini", "antipasto", "focaccia", "granita", "vivaldi", "galileo", "merkel", "bratensoße", "bratwurst", "stulle", "maultauschen", "allgäu", "ostsee", "nordsee", "tiramisu", "straciatella", "spaghetti", "risotto", "pesto", "mortadella", "gnocchi", "espresso", "ciabatta", "calzone", "bruschetta", "toskana", "apulien"] \ No newline at end of file diff --git a/debiaswe-master/data/italian/definitional_pairs.json b/debiaswe-master/data/italian/definitional_pairs.json new file mode 100644 index 0000000000000000000000000000000000000000..692c6234b2b25a018c98a69ef6ad62dbbd4bcde0 --- /dev/null +++ b/debiaswe-master/data/italian/definitional_pairs.json @@ -0,0 +1,14 @@ +[ + ["deutscher", "italiener"], + ["deutsche", "italienerin"], + ["deutsche", "italienerinnen"], + ["deutschen", "italienern"], + ["deutschland", "italien"], + ["deutschlands", "italiens"], + ["deutsch", "italienisch"], + ["deutsches", "italienisches"], + ["deutsche", "italienische"], + ["deutscher", "italienischer"], + ["deutschen", "italienischen"], + ["deutschsprachig", "italienischsprachig"] +] \ No newline at end of file diff --git a/debiaswe-master/data/italian/equalize_pairs.json b/debiaswe-master/data/italian/equalize_pairs.json new file mode 100644 index 0000000000000000000000000000000000000000..1a0a7b322ccfe2c6417536a6d63034b967ece4a7 --- /dev/null +++ b/debiaswe-master/data/italian/equalize_pairs.json @@ -0,0 +1,31 @@ +[ + ["deutscher", "italiener"], + ["deutsche", "italienerin"], + 
["deutsche", "italienerinnen"], + ["deutschen", "italienern"], + ["deutschland", "italien"], + ["deutschlands", "italiens"], + ["deutsch", "italienisch"], + ["deutsches", "italienisches"], + ["deutsche", "italienische"], + ["deutscher", "italienischer"], + ["deutschen", "italienischen"], + ["deutschsprachig", "italienischsprachig"], + ["berlin", "rom"], + ["hamburg", "mailand"], + ["münchen", "neapel"], + ["köln", "turin"], + ["frankfurt", "palermo"], + ["stuttgart", "genua"], + ["düsseldorf", "bologna"], + ["leipzig", "florenz"], + ["dortmund", "bari"], + ["allgäu", "toskana"], + ["ostseeküste", "apulien"], + ["bratensoße", "balsamico"], + ["jägermeister", "amaretto"], + ["kloß", "arancino"], + ["bratwurst", "antipasto"], + ["stulle", "focaccia"], + ["maultauschen", "tortellini"] +] \ No newline at end of file diff --git a/debiaswe-master/data/multi_attribute/bias_specific_full.json b/debiaswe-master/data/multi_attribute/bias_specific_full.json new file mode 100644 index 0000000000000000000000000000000000000000..21d96db29b05499cc36ff495716a8e4d22f32f53 --- /dev/null +++ b/debiaswe-master/data/multi_attribute/bias_specific_full.json @@ -0,0 +1 @@ +["germane", "ostgeld", "focaccia", "ostalgie", "volksgenosse", "hetman", "auslandsdeutscher", "sinto", "lech", "auslandsgeschäft", "bambino", "reichsbahn", "engadin", "schilling", "grundgesetz", "prosecco", "kleindeutsch", "aventiure", "europide", "flüchtlingsausweis", "weser", "völkerwanderung", "azzurri", "landammann", "trecento", "deutschfeindlichkeit", "polnisch", "baron", "mitteldeutsch", "bundesminister", "germanisch", "itaker", "groschen", "quempaslied", "flüchtlingshilfe", "baden-württemberg", "thai", "zuwanderin", "edeling", "italienisch", "ausländerhass", "confoederatio helvetica", "germanisieren", "vaudeville", "italowestern", "mittelhochdeutsch", "schwarz-rot-gold", "westmitteldeutsch", "tamtam", "janitscharenmusik", "öterreichisch-ungarisch", "weichsel", "germanentum", "jungdeutscher", "plattdeutsch", 
"grappa", "exarch", "abate", "carabiniere", "bairisch", "alldeutsch", "quart", "sultan", "ramasan", "liechtenstein", "sachsen-anhalt", "settecento", "greyerzer", "reichsdeutsch", "urdeutsch", "bundesrepublikanisch", "thurgau", "germanist", "labiovelar", "kampanile", "ostmitteldeutsch", "frühneuhochdeutsch", "ostverträge", "geschäftsträger", "hochlautung", "reinheitsgebot", "wallis", "signore", "brandenburg", "nazarener", "sbrinz", "carnotzet", "verhochdeutschen", "cinquecento", "großglockner", "aargau", "einwanderungsstrom", "beg", "kastell", "asylsuchende", "panje", "spartakiade", "veltliner", "pizzaservice", "wittum", "helvetien", "nibelungen", "papagallo", "amerikahaus", "romand", "ausländisch", "boatpeople", "neudeutsch", "zibetkatze", "rom", "bundesstadt", "schweizweit", "mora", "signor", "chianti", "bundesstraße", "asylsuchender", "indogermane", "kenning", "sejm", "stadtammann", "wessi", "normanne", "ostdeutschland", "volksdeutscher", "rugier", "sütterlinschrift", "secondo", "gambir", "hochdeutsch", "bel paese", "frühneuhochdeutsch", "bundesliga", "devisenbewirtschaftung", "signorina", "zweigelt", "toskana", "auswärtig", "bundesadler", "deutschtum", "khan", "hodscha", "hinterindien", "franken", "deutschlandchef", "helvetier", "afroasiatisch", "bundesbürger", "eindeutschung", "oberdeutsch", "deutschtümelei", "deutschsprachlich", "radicchio", "missingsch", "ostblock", "himalaja", "puzzolan", "taverne", "sundainseln", "außerdeutsch", "signoria", "cinzano", "wirtschaftsflüchtling", "markomanne", "austrofaschismus", "swissair", "ddr-bürger", "deutsche", "migrationshintergrund", "asylbewerberin", "deutschamerikanisch", "levante", "piefke", "padre", "theatiner", "westdeutschland", "zimber", "stasimitarbeiter", "deutschnational", "karawane", "norddeutsch", "russland", "spumante", "kosovokrieg", "italienisch", "italienerin", "mittelhochdeutsch", "bundeskanzlei", "gastarbeiter", "freiheitlich", "deutsch-schweizerisch", "italienreise", "flüchtlingselend", "ostler", 
"landesstraße", "sachsen", "verfassungsgerichtshof", "trentino-südtirol", "freisinn", "sprachgesellschaft", "ausländerfeindlich", "kurfürst", "deutschtürke", "schwäbeln", "kirchenstaat", "kebab", "staatsgerichtshof", "oberlandesgericht", "deutschkunde", "ostmitteldeutsch", "bergama", "cajunmusic", "spieloper", "rheinfall", "auffangen", "auslieferungsantrag", "spatha", "ausführen", "magnat", "polonistik", "undeutsch", "westfernsehen", "uri", "indoeuropäisch", "donnerer", "odal", "indogermanisch", "schufa", "dw", "po", "schleswig-holstein", "unter", "humanismus", "piazza", "ausländerkind", "westen", "kentumsprache", "reisefreiheit", "tessin", "greencard", "stasiakte", "ard", "einwandererkind", "pirogge", "lambrusco", "zimbal", "cavaliere", "mark", "raki", "aussiedler", "flüchtlingslager", "ufa", "deutschschweizerisch", "jot", "gorgonzola", "kanake", "u-häkchen", "fichtelberg", "südtirol", "asti", "burgunde", "ost-west-dialog", "centime", "tiber", "jul", "hartz", "quattrocento", "alphorn", "bundesverdienstkreuz", "pasta", "madrigal", "kelte", "deutschschweizer", "displaced person", "zentralasien", "handballbundesliga", "rentenmark", "althochdeutsch", "fremdenpolizei", "tarantella", "frühmittelhochdeutsch", "sibirien", "urgermanisch", "migrantenkind", "schwyzertütsch", "arte povera", "gastarbeiterin", "weißherbst", "bundespräsident", "öterreichweit", "ural", "pidginenglisch", "uckermark", "immigrantin", "ger", "scudo", "spätaussiedler", "lastenausgleichsgesetz", "kulturinstitut", "welschschweizer", "menschenhandel", "austriazismus", "bundeshaus", "italer", "mitternachtssonne", "deutschlandfunk", "welsche", "latsche", "weißbuch", "hilfswillige", "schweizerdeutsch", "kufe", "kolonialherrschaft", "bundesdeutsche", "mitteldeutschland", "stracciatella", "frühmittelhochdeutsch", "vielfraß", "bundesdeutscher", "kaspisches meer", "kanton", "deutschtürkisch", "pfingstochse", "teutonin", "dihk", "amerikadeutsche", "stasiunterlagen", "deutsch", "ausländerpolitik", 
"niedersachsen", "ausfuhrgarantie", "harz", "lateinisch", "palazzo", "futhark", "schriftdeutsch", "wien", "pan", "weißwurstäquator", "pazifischer ozean", "basso", "welschland", "stabreim", "bundesministerium", "quarta", "schwyzerdütsch", "lasagne", "aga", "karelien", "polnisch", "russlanddeutsch", "kurrentschrift", "canzone", "oberdeutsch", "härtefallkommission", "nachkriegsschweiz", "levantiner", "faschismus", "pole", "angelsachse", "ararat", "reichstag", "verismo", "börde", "paying guest", "balsamessig", "schwabenspiegel", "westmitteldeutsch", "landesversicherungsanstalt", "illyrer", "pagode", "treuhandanstalt", "doktorhut", "fußballbundesliga", "italoamerikaner", "kalabrien", "arbeitsemigrant", "deutschlehrer", "arier", "bajazzo", "kabinett", "lufthansa", "mikrozensus", "verrechnungseinheit", "hanswurst", "sezession", "schlepper", "aufenthaltsgenehmigung", "deutschjüdisch", "einwandererstrom", "außenwert", "misereor", "bundesgartenschau", "bezirkstag", "alpenrepublik", "zwangsumtausch", "auslandsdeutsch", "teehaus", "panasiatisch", "einwanderin", "westdeutscher", "duce", "konsularkorps", "italianisieren", "siamkatze", "auslese", "isolationismus", "expedition", "zav", "einreisen", "turkisieren", "ostöterreich", "anwerbestopp", "waadt", "ausländerin", "franzöisch-deutsch", "vorderasien", "administrator", "stradivari", "welschschweizerisch", "ostgermane", "kolonie", "bundesbetreuung", "gefolgschaft", "bundeshaushalt", "sizilien", "vendetta", "botschafter", "hermesbürgschaft", "nidwalden", "zahlungsbilanz", "apo", "generaloberst", "altnordisch", "jura", "ostasien", "pandschabi", "volksdeutsche", "einwanderer", "saarland", "effendi", "deutschlandlied", "intershop", "eisheiligen", "ch-laut", "bundschuh", "landeshauptmann", "cherusker", "migrantin", "deutsch gesinnt", "dolma", "pecorino", "nordrhein-westfalen", "inländerin", "obwalden", "schrumpfgermane", "osten", "eindeutschen", "amerikadeutscher", "thing", "ciabatta", "hamburg", "schweizergarde", "welscher", 
"parmesan", "altdeutsch", "mazurka", "böhmerwald", "sowjetzone", "westdeutsche", "berlin", "deutsch", "rhododendron", "fra", "hispano", "deutschsprachig", "osmane", "immigrant", "bundespolitiker", "ubier", "hilfswilliger", "wechselkurs", "marchese", "apulien", "reisescheck", "bergamotte", "defa", "sonata", "zentralschweiz", "apennin", "dax", "ostdeutsche", "bremen", "konsistorium", "deutschfreundlichkeit", "honved", "padrone", "schweizer", "kawass", "departement", "frikadelle", "großdeutsch", "verdeutschung", "jiddisch", "neubürger", "trattoria", "panettone", "austromarxismus", "metamusik", "ddr-bürgerin", "boreal", "nordgermane", "notaufnahme", "antipasto", "drk", "catenaccio", "hesperien", "pannacotta", "schweizerin", "moxibustion", "allgäu", "schriftdeutsch", "welschschweiz", "bundesgebiet", "auslandsdeutsche", "eurasier", "schakal", "jass", "bundesrat", "warenumsatzsteuer", "deutscher", "swiss", "westschweiz", "trakehner", "gote", "fürstentag", "autarkie", "flühtlingsstrom", "landesgartenschau", "futurismus", "ligurien", "bundesautobahn", "ku-klux-klan", "standarddeutsch", "kappadozien", "westdeutsch", "westlich", "innerschweiz", "steppenhuhn", "ösi", "orient", "achtundvierziger", "entsendegesetz", "hethiter", "deutsch-türkisch", "romanismus", "schweizerbürgerin", "daus", "franke", "senat", "bundesnachrichtendienst", "bundesbahn", "beamtendeutsch", "zuwandrer", "lombardei", "rittmeister", "lori", "alta moda", "standarddeutsch", "buntnessel", "belcanto", "deutschkenntnis", "piccolo", "tschibuk", "auffanglager", "elba", "arlecchino", "lira", "exilliteratur", "niederdeutsch", "bundesausbildungsförderungsgesetz", "ehrenspielführer", "durchgangslager", "apenninen-halbinsel", "cassata", "schwarz-weiß-rot", "deutschlandsender", "autark", "erzherzog", "eurokommunismus", "europider", "hennastrauch", "öterreichisch", "brd", "plateresk", "prignitz", "treck", "buch", "iberer", "pancetta", "lüneburger heide", "ostig", "fdp", "couvert", "asylbewerberheim", "quintal", 
"heldenlied", "asiatisch", "kandidat", "notlager", "ems", "bundestag", "hindukusch", "beitrittsgebiet", "türkisch", "güteraustausch", "importe", "mittelniederdeutsch", "mauerschütze", "bundeskanzleramt", "ß", "tagliatelle", "büffel", "ossi", "seconda", "zaubernuss", "ziehungsrecht", "brandgans", "katamaran", "feldgrau", "pizza", "afrodeutsch", "importhandel", "zloty", "italienische", "ostdeutsch", "anopheles", "betäubungsmittelgesetz", "kreuzer", "resident", "bundesdeutsch", "italianismus", "ötlich", "türkischstämmig", "welsch", "valuta", "schleichkatze", "fernamt", "südasien", "deutschlandpolitik", "germanin", "muchtar", "ostpolitik", "thüringen", "flüchtlingsrat", "brillenschlange", "met", "schabzieger", "piva", "krevette", "devise", "ausländerfeindlichkeit", "boccia", "konak", "alpenjäger", "prädikatswein", "preislied", "studienkolleg", "sudetenland", "chassidismus", "hemlocktanne", "baba", "novecento", "großdeutschland", "rheinland-pfalz", "lizenziat", "nachkriegsöterreich", "binnendeutsch", "geest", "billigflagge", "bundeswehr", "amischer", "getto", "kanzleideutsch", "moschustier", "neudeutsch", "polentum", "italienischsprachig", "kamtschatka", "vacherin", "fantasia", "volksgericht", "nationalratspräsident", "kontor", "scampi", "teutonisch", "plattdeutsch", "germanistik", "biedermeier", "certosa", "eurocityzug", "ausländer", "seele", "staatsrat", "bundeskabinett", "alitalia", "italien", "migrationspolitik", "verfassungsinitiative", "diplomatie", "neuhochdeutsch", "zwergkiefer", "marktamt", "dienstpragmatik", "deutschschweiz", "frascati", "kurrent", "türkisch", "fpö", "eurasien", "kemalismus", "landeskirche", "mittelmeerländer", "eidgenosse", "friedensfahrt", "renaissance", "rotwelsch", "hyäne", "italianist", "prälat", "pfalz", "fremdarbeiter", "quent", "spruch", "wandervogel", "hortensie", "türbe", "bundesgesetzblatt", "schwarzwald", "ausländeranteil", "hafenzoll", "integrationsbeauftragte", "mecklenburg-vorpommern", "ostdeutscher", "satemsprache", 
"mittelniederdeutsch", "botschaft", "maggiore", "schutztruppe", "ländle", "kreole", "hamam", "conte", "incoming", "ripuarisch", "lingua franca", "aare", "bundesversammlung", "bootsflühtling", "mitteldeutsch", "unteritalien", "althochdeutsch", "bigos", "ingwäonen", "schwarzes meer", "bundesanleihe", "fremde", "ober", "ausländeramt", "qualitätswein", "sardinien", "westler", "einigungsvertrag", "asean", "visconte", "don", "halbesel", "bundesbank", "gesandtschaft", "indogermanistik", "behördendeutsch", "notaufnahmelager", "ausländerbehörde", "josephinismus", "schwaben", "flühtlingspolitik", "rote-armee-fraktion", "schutzzoll", "katzelmacher", "deutschstämmig", "reichsdeutscher", "deutsch sprechend", "staatsminister", "präfekt", "deutschamerikaner", "asylgerichtshof", "glosse", "italianistisch", "alemanne", "legionär", "sammellager", "reichsdeutsche", "kapitalflucht", "ostschweiz", "germanien", "orientteppich", "landeshauptfrau", "romandie", "ultra", "oder-neiße-linie", "platt", "neuhochdeutsch", "staatssicherheitsdienst", "südeuropäisch", "deutschstämmige", "umweltflühtling", "ostzone", "mezzogiorno", "villanell", "frisör", "oberitalien", "süddeutsch", "treudeutsch", "bundesverfassungsgericht", "ischia", "mozzarella", "sudetendeutsch", "tramontana", "bayern", "einwandererfamilie", "sprachführer", "durchgangsverkehr", "arno", "rütlischwur", "volkskammer", "mad", "ns-staat", "volksmarine", "dienstleistungsverkehr", "expatriate", "gemeindeutsch", "österreicherin", "zonenrandgebiet", "amtssprache", "tifoso", "schweizerisch", "studienaufenthalt", "hansestadt", "hessen", "bure", "ostflüchtling", "flüchtlingstreck", "ristorante", "osteria", "teutonengrill", "assisen", "riviera", "kolonialherr", "wendezeit", "flüchtlingsheim", "bundesverwaltungsgericht", "diwan", "exequatur", "krautrock", "deutschstämmiger", "woiwod", "geniezeit", "anatolien", "bundessozialgericht", "freiburg"] \ No newline at end of file diff --git 
a/debiaswe-master/data/multi_attribute/bias_specific_seed.json b/debiaswe-master/data/multi_attribute/bias_specific_seed.json new file mode 100644 index 0000000000000000000000000000000000000000..c4142ab2bc4a58bd1731a27cb08c53c38fd567d7 --- /dev/null +++ b/debiaswe-master/data/multi_attribute/bias_specific_seed.json @@ -0,0 +1 @@ +["germane", "ostgeld", "focaccia", "ostalgie", "volksgenosse", "hetman", "auslandsdeutscher", "sinto", "lech", "auslandsgeschäft", "bambino", "reichsbahn", "engadin", "schilling", "grundgesetz", "prosecco", "kleindeutsch", "aventiure", "europide", "flüchtlingsausweis", "weser", "völkerwanderung", "azzurri", "landammann", "trecento", "deutschfeindlichkeit", "polnisch", "baron", "mitteldeutsch", "bundesminister", "germanisch", "itaker", "groschen", "quempaslied", "flüchtlingshilfe", "baden-württemberg", "thai", "zuwanderin", "edeling", "italienisch", "ausländerhass", "confoederatio helvetica", "germanisieren", "vaudeville", "italowestern", "mittelhochdeutsch", "schwarz-rot-gold", "westmitteldeutsch", "tamtam", "janitscharenmusik", "öterreichisch-ungarisch", "weichsel", "germanentum", "jungdeutscher", "plattdeutsch", "grappa", "exarch", "abate", "carabiniere", "bairisch", "alldeutsch", "quart", "sultan", "ramasan", "liechtenstein", "sachsen-anhalt", "settecento", "greyerzer", "reichsdeutsch", "urdeutsch", "bundesrepublikanisch", "thurgau", "germanist", "labiovelar", "kampanile", "ostmitteldeutsch", "frühneuhochdeutsch", "ostverträge", "geschäftsträger", "hochlautung", "reinheitsgebot", "wallis", "signore", "brandenburg", "nazarener", "sbrinz", "carnotzet", "verhochdeutschen", "cinquecento", "großglockner", "aargau", "einwanderungsstrom", "beg", "kastell", "asylsuchende", "panje", "spartakiade", "veltliner", "pizzaservice", "wittum", "helvetien", "nibelungen", "papagallo", "amerikahaus", "romand", "ausländisch", "boatpeople", "neudeutsch", "zibetkatze", "rom", "bundesstadt", "schweizweit", "mora", "signor", "chianti", "bundesstraße", 
"asylsuchender", "indogermane", "kenning", "sejm", "stadtammann", "wessi", "normanne", "ostdeutschland", "volksdeutscher", "rugier", "sütterlinschrift", "secondo", "gambir", "hochdeutsch", "bel paese", "frühneuhochdeutsch", "bundesliga", "devisenbewirtschaftung", "signorina", "zweigelt", "toskana", "auswärtig", "bundesadler", "deutschtum", "khan", "hodscha", "hinterindien", "franken", "deutschlandchef", "helvetier", "afroasiatisch", "bundesbürger", "eindeutschung", "oberdeutsch", "deutschtümelei", "deutschsprachlich", "radicchio", "missingsch", "ostblock", "himalaja", "puzzolan", "taverne", "sundainseln", "außerdeutsch", "signoria", "cinzano", "wirtschaftsflüchtling", "markomanne", "austrofaschismus", "swissair", "ddr-bürger", "deutsche", "migrationshintergrund", "asylbewerberin", "deutschamerikanisch", "levante", "piefke", "padre", "theatiner", "westdeutschland", "zimber", "stasimitarbeiter", "deutschnational", "karawane", "norddeutsch", "russland", "spumante", "kosovokrieg", "italienisch", "italienerin", "mittelhochdeutsch", "bundeskanzlei", "gastarbeiter", "freiheitlich", "deutsch-schweizerisch", "italienreise", "flüchtlingselend", "ostler", "landesstraße", "sachsen", "verfassungsgerichtshof", "trentino-südtirol", "freisinn", "sprachgesellschaft", "ausländerfeindlich", "kurfürst", "deutschtürke", "schwäbeln", "kirchenstaat", "kebab", "staatsgerichtshof", "oberlandesgericht", "deutschkunde", "ostmitteldeutsch", "bergama", "cajunmusic", "spieloper", "rheinfall", "auffangen", "auslieferungsantrag", "spatha", "ausführen", "magnat", "polonistik", "undeutsch", "westfernsehen", "uri", "indoeuropäisch", "donnerer", "odal", "indogermanisch", "schufa", "dw", "po", "schleswig-holstein", "unter", "humanismus", "piazza", "ausländerkind", "westen", "kentumsprache", "reisefreiheit", "tessin", "greencard", "stasiakte", "ard", "einwandererkind", "pirogge", "lambrusco", "zimbal", "cavaliere", "mark", "raki", "aussiedler", "flüchtlingslager", "ufa", "deutschschweizerisch", "jot", 
"gorgonzola", "kanake", "u-häkchen", "fichtelberg", "südtirol", "asti", "burgunde", "ost-west-dialog", "centime", "tiber", "jul", "hartz", "quattrocento", "alphorn", "bundesverdienstkreuz", "pasta", "madrigal", "kelte", "deutschschweizer", "displaced person", "zentralasien", "handballbundesliga", "rentenmark", "althochdeutsch", "fremdenpolizei", "tarantella", "frühmittelhochdeutsch", "sibirien", "urgermanisch", "migrantenkind", "schwyzertütsch", "arte povera", "gastarbeiterin", "weißherbst", "bundespräsident", "öterreichweit", "ural", "pidginenglisch", "uckermark", "immigrantin", "ger", "scudo", "spätaussiedler", "lastenausgleichsgesetz", "kulturinstitut", "welschschweizer", "menschenhandel", "austriazismus", "bundeshaus", "italer", "mitternachtssonne", "deutschlandfunk", "welsche", "latsche", "weißbuch", "hilfswillige", "schweizerdeutsch", "kufe", "kolonialherrschaft", "bundesdeutsche", "mitteldeutschland", "stracciatella", "frühmittelhochdeutsch", "vielfraß", "bundesdeutscher", "kaspisches meer", "kanton", "deutschtürkisch", "pfingstochse", "teutonin", "dihk", "amerikadeutsche", "stasiunterlagen", "deutsch", "ausländerpolitik", "niedersachsen", "ausfuhrgarantie", "harz", "lateinisch", "palazzo", "futhark", "schriftdeutsch", "wien", "pan", "weißwurstäquator", "pazifischer ozean", "basso", "welschland", "stabreim", "bundesministerium", "quarta", "schwyzerdütsch", "lasagne", "aga", "karelien", "polnisch", "russlanddeutsch", "kurrentschrift", "canzone", "oberdeutsch", "härtefallkommission", "nachkriegsschweiz", "levantiner", "faschismus", "pole", "angelsachse", "ararat", "reichstag", "verismo", "börde", "paying guest", "balsamessig", "schwabenspiegel", "westmitteldeutsch", "landesversicherungsanstalt", "illyrer", "pagode", "treuhandanstalt", "doktorhut", "fußballbundesliga", "italoamerikaner", "kalabrien", "arbeitsemigrant", "deutschlehrer", "arier", "bajazzo", "kabinett", "lufthansa", "mikrozensus", "verrechnungseinheit", "hanswurst", "sezession", "schlepper", 
"aufenthaltsgenehmigung", "deutschjüdisch", "einwandererstrom", "außenwert", "misereor", "bundesgartenschau", "bezirkstag", "alpenrepublik", "zwangsumtausch", "auslandsdeutsch", "teehaus", "panasiatisch", "einwanderin", "westdeutscher", "duce", "konsularkorps", "italianisieren", "siamkatze", "auslese", "isolationismus", "expedition", "zav", "einreisen", "turkisieren", "ostöterreich", "anwerbestopp", "waadt", "ausländerin", "franzöisch-deutsch", "vorderasien", "administrator", "stradivari", "welschschweizerisch", "ostgermane", "kolonie", "bundesbetreuung", "gefolgschaft", "bundeshaushalt", "sizilien", "vendetta", "botschafter", "hermesbürgschaft", "nidwalden", "zahlungsbilanz", "apo", "generaloberst", "altnordisch", "jura", "ostasien", "pandschabi", "volksdeutsche", "einwanderer", "saarland", "effendi", "deutschlandlied", "intershop", "eisheiligen", "ch-laut", "bundschuh", "landeshauptmann", "cherusker", "migrantin", "deutsch gesinnt", "dolma", "pecorino", "nordrhein-westfalen", "inländerin", "obwalden", "schrumpfgermane", "osten", "eindeutschen", "amerikadeutscher", "thing", "ciabatta", "hamburg", "schweizergarde", "welscher", "parmesan", "altdeutsch", "mazurka", "böhmerwald", "sowjetzone", "westdeutsche", "berlin", "deutsch", "rhododendron", "fra", "hispano", "deutschsprachig", "osmane", "immigrant", "bundespolitiker", "ubier", "hilfswilliger", "wechselkurs", "marchese", "apulien", "reisescheck", "bergamotte", "defa", "sonata", "zentralschweiz", "apennin", "dax", "ostdeutsche", "bremen", "konsistorium", "deutschfreundlichkeit", "honved", "padrone", "schweizer", "kawass", "departement", "frikadelle", "großdeutsch", "verdeutschung", "jiddisch", "neubürger", "trattoria", "panettone", "austromarxismus", "metamusik", "ddr-bürgerin", "boreal", "nordgermane", "notaufnahme", "antipasto", "drk", "catenaccio", "hesperien", "pannacotta", "schweizerin", "moxibustion", "allgäu", "schriftdeutsch", "welschschweiz", "bundesgebiet", "auslandsdeutsche", "eurasier", "schakal", 
"jass", "bundesrat", "warenumsatzsteuer", "deutscher", "swiss", "westschweiz", "trakehner", "gote", "fürstentag", "autarkie", "flühtlingsstrom", "landesgartenschau", "futurismus", "ligurien", "bundesautobahn", "ku-klux-klan", "standarddeutsch", "kappadozien", "westdeutsch", "westlich", "innerschweiz", "steppenhuhn", "ösi", "orient", "achtundvierziger", "entsendegesetz", "hethiter", "deutsch-türkisch", "romanismus", "schweizerbürgerin", "daus", "franke", "senat", "bundesnachrichtendienst", "bundesbahn", "beamtendeutsch", "zuwandrer", "lombardei", "rittmeister", "lori", "alta moda", "standarddeutsch", "buntnessel", "belcanto", "deutschkenntnis", "piccolo", "tschibuk", "auffanglager", "elba", "arlecchino", "lira", "exilliteratur", "niederdeutsch", "bundesausbildungsförderungsgesetz", "ehrenspielführer", "durchgangslager", "apenninen-halbinsel", "cassata", "schwarz-weiß-rot", "deutschlandsender", "autark", "erzherzog", "eurokommunismus", "europider", "hennastrauch", "öterreichisch", "brd", "plateresk", "prignitz", "treck", "buch", "iberer", "pancetta", "lüneburger heide", "ostig", "fdp", "couvert", "asylbewerberheim", "quintal", "heldenlied", "asiatisch", "kandidat", "notlager", "ems", "bundestag", "hindukusch", "beitrittsgebiet", "türkisch", "güteraustausch", "importe", "mittelniederdeutsch", "mauerschütze", "bundeskanzleramt", "ß", "tagliatelle", "büffel", "ossi", "seconda", "zaubernuss", "ziehungsrecht", "brandgans", "katamaran", "feldgrau", "pizza", "afrodeutsch", "importhandel", "zloty", "italienische", "ostdeutsch", "anopheles", "betäubungsmittelgesetz", "kreuzer", "resident", "bundesdeutsch", "italianismus", "ötlich", "türkischstämmig", "welsch", "valuta", "schleichkatze", "fernamt", "südasien", "deutschlandpolitik", "germanin", "muchtar", "ostpolitik", "thüringen", "flüchtlingsrat", "brillenschlange", "met", "schabzieger", "piva", "krevette", "devise", "ausländerfeindlichkeit", "boccia", "konak", "alpenjäger", "prädikatswein", "preislied", "studienkolleg", 
"sudetenland", "chassidismus", "hemlocktanne", "baba", "novecento", "großdeutschland", "rheinland-pfalz", "lizenziat", "nachkriegsöterreich", "binnendeutsch", "geest", "billigflagge", "bundeswehr", "amischer", "getto", "kanzleideutsch", "moschustier", "neudeutsch", "polentum", "italienischsprachig", "kamtschatka", "vacherin", "fantasia", "volksgericht", "nationalratspräsident", "kontor", "scampi", "teutonisch", "plattdeutsch", "germanistik", "biedermeier", "certosa", "eurocityzug", "ausländer", "seele", "staatsrat", "bundeskabinett", "alitalia", "italien", "migrationspolitik", "verfassungsinitiative", "diplomatie", "neuhochdeutsch", "zwergkiefer", "marktamt", "dienstpragmatik", "deutschschweiz", "frascati", "kurrent", "türkisch", "fpö", "eurasien", "kemalismus", "landeskirche", "mittelmeerländer", "eidgenosse", "friedensfahrt", "renaissance", "rotwelsch", "hyäne", "italianist", "prälat", "pfalz", "fremdarbeiter", "quent", "spruch", "wandervogel", "hortensie", "türbe", "bundesgesetzblatt", "schwarzwald", "ausländeranteil", "hafenzoll", "integrationsbeauftragte", "mecklenburg-vorpommern", "ostdeutscher", "satemsprache", "mittelniederdeutsch", "botschaft", "maggiore", "schutztruppe", "ländle", "kreole", "hamam", "conte", "incoming", "ripuarisch", "lingua franca", "aare", "bundesversammlung", "bootsflühtling", "mitteldeutsch", "unteritalien", "althochdeutsch", "bigos", "ingwäonen", "schwarzes meer", "bundesanleihe", "fremde", "ober", "ausländeramt", "qualitätswein", "sardinien", "westler", "einigungsvertrag", "asean", "visconte", "don", "halbesel", "bundesbank", "gesandtschaft", "indogermanistik", "behördendeutsch", "notaufnahmelager", "ausländerbehörde", "josephinismus", "schwaben", "flühtlingspolitik", "rote-armee-fraktion", "schutzzoll", "katzelmacher", "deutschstämmig", "reichsdeutscher", "deutsch sprechend", "staatsminister", "präfekt", "deutschamerikaner", "asylgerichtshof", "glosse", "italianistisch", "alemanne", "legionär", "sammellager", "reichsdeutsche", 
"kapitalflucht", "ostschweiz", "germanien", "orientteppich", "landeshauptfrau", "romandie", "ultra", "oder-neiße-linie", "platt", "neuhochdeutsch", "staatssicherheitsdienst", "südeuropäisch", "deutschstämmige", "umweltflüchtling", "ostzone", "mezzogiorno", "villanell", "frisör", "oberitalien", "süddeutsch", "treudeutsch", "bundesverfassungsgericht", "ischia", "mozzarella", "sudetendeutsch", "tramontana", "bayern", "einwandererfamilie", "sprachführer", "durchgangsverkehr", "arno", "rütlischwur", "volkskammer", "mad", "ns-staat", "volksmarine", "dienstleistungsverkehr", "expatriate", "gemeindeutsch", "österreicherin", "zonenrandgebiet", "amtssprache", "tifoso", "schweizerisch", "studienaufenthalt", "hansestadt", "hessen", "bure", "ostflüchtling", "flüchtlingstreck", "ristorante", "osteria", "teutonengrill", "assisen", "riviera", "kolonialherr", "wendezeit", "flüchtlingsheim", "bundesverwaltungsgericht", "diwan", "exequatur", "krautrock", "deutschstämmiger", "woiwod", "geniezeit", "anatolien", "bundessozialgericht", "freiburg"] \ No newline at end of file diff --git a/debiaswe-master/data/multi_attribute/definitional_pairs.json b/debiaswe-master/data/multi_attribute/definitional_pairs.json new file mode 100644 index 0000000000000000000000000000000000000000..c8698d77869dbd681dca9a19d3e7d06baeff0b20 --- /dev/null +++ b/debiaswe-master/data/multi_attribute/definitional_pairs.json @@ -0,0 +1,10 @@ +[ + ["deutscher", "türke", "pole", "italienier"], + ["deutsche", "türkin", "polin", "italienierin"], + ["deutschland", "türkei", "polen", "italien"], + ["deutsch", "türkisch", "polnisch", "italienisch"], + ["sabine", "elif", "wiktoria", "giulia"], + ["klaus", "mehmet", "jakub", "francesco"], + ["deutschstämmig", "türkischstämmig", "polnischstämmig", "italienischstämmig"], + ["deutschsprachig", "türkischsprachig", "polnischsprachig", "italienischsprachig"] +] \ No newline at end of file diff --git a/debiaswe-master/data/multi_attribute/equalize_pairs.json 
b/debiaswe-master/data/multi_attribute/equalize_pairs.json new file mode 100644 index 0000000000000000000000000000000000000000..ee91ce038287206c3493e00007cc8e51f6124274 --- /dev/null +++ b/debiaswe-master/data/multi_attribute/equalize_pairs.json @@ -0,0 +1,14 @@ +[ + ["deutscher", "türke", "pole", "italienier"], + ["deutsche", "türkin", "polin", "italienierin"], + ["deutschland", "türkei", "polen", "italien"], + ["deutsch", "türkisch", "polnisch", "italienisch"], + ["sabine", "elif", "wiktoria", "giulia"], + ["klaus", "mehmet", "jakub", "francesco"], + ["deutschstämmig", "türkischstämmig", "polnischstämmig", "italienischstämmig"], + ["deutschsprachig", "türkischsprachig", "polnischsprachig", "italienischsprachig"], + ["euro", "lira", "złoty", "euro"], + ["christlich", "muslimisch", "katholisch", "evangelisch"], + ["berlin", "istanbul", "warschau", "rom"], + ["hamburg", "krakau", "ankara", "mailand"] +] \ No newline at end of file diff --git a/debiaswe-master/data/polish/bias_specific_full.json b/debiaswe-master/data/polish/bias_specific_full.json new file mode 100644 index 0000000000000000000000000000000000000000..e3d4a2cd62e6556bbad472ca94adda1cf860d5a5 --- /dev/null +++ b/debiaswe-master/data/polish/bias_specific_full.json @@ -0,0 +1 @@ +["fichtelberg", "displaced person", "bundesliga", "kurrentschrift", "reichsgründung", "volksdeutscher", "germania", "deutschstämmige", "polarität", "solidaritätszuschlag", "bundesrepublikanisch", "auslandsdeutsche", "bundesbank", "bundesanleihe", "dipol", "außerdeutsch", "bahncard", "reichsstadt", "sudetenland", "auslandsdeutsch", "ostverträge", "reichsgericht", "schufa", "frühmittelhochdeutsch", "unter", "vereinigungskriminalität", "friedensfahrt", "territorialverteidigung", "kanzleisprache", "reichspräsident", "briefmonopol", "schweizerdeutsch", "ländle", "hetman", "schwarz-weiß-rot", "deutschsprachlich", "deutschlandtour", "schwabenspiegel", "dlg-prämiert", "spieloper", "deutscher", "jot", "trentino-südtirol", "misereor",
"ichlaut", "bundeshaus", "bundesrat", "landesstraße", "deutschkunde", "reichsgebiet", "regionalbahn", "bundesminister", "reichskanzler", "gulden", "bure", "deutschkenntnis", "deutschfreundlich", "deutschherren", "mazurka", "ß", "reichsritter", "republikflucht", "reichsinsignien", "wehrmacht", "adfc", "deutschtürkisch", "hitlerdeutschland", "oberlandesgericht", "pfalz", "deutschlehrer", "eisenbahnerwohnung", "kurfürst", "landeskirche", "quempaslied", "narrativum", "bundesarchiv", "reichspost", "elsass-lothringen", "bundesministerium", "zweikanalton", "germanist", "reichsadler", "polnisch", "reichsstände", "verteidigungsausschuss", "germanistik", "germanismus", "reichsacht", "önorm", "hamburg", "bundeshaushalt", "bundessozialgericht", "bigos", "regionalliga", "schluss-s", "teutonengrill", "schoah", "normblatt", "bundeskanzleramt", "westmark", "hartz", "preußen", "volksdeutsche", "nationalsozialismus", "mitteldeutschland", "fdj", "ehrenspielführer", "adac", "deutschfeindlich", "schulferien", "bundesdeutsch", "deutsch-schweizerisch", "deutschstämmiger", "berlin", "handballbundesliga", "mehrpolig", "deutschlandweit", "binnendeutsch", "innenausschuss", "feldgrau", "achtundvierziger", "din-norm", "ostmark", "sächlich", "deutsch-amerikanisch", "sprachgesellschaft", "landesversicherungsanstalt", "tagesschau", "reichsmark", "deutsch gesinnt", "bremen", "volksgenosse", "bundesgerichtshof", "jungdeutscher", "reichsgrenze", "reichsregierung", "erzgebirge", "deutschritterorden", "amerikahaus", "pendolino", "bundesausbildungsförderungsgesetz", "bundeskanzler", "zentralbankrat", "abc", "fürstentag", "bundesstraße", "regionalexpress", "deutsch-türkisch", "schwarz-rot-gold", "vormärz", "umpolen", "westgeld", "trakehner", "mikrozensus", "sütterlinschrift", "deutschstämmig", "bundesautobahn", "bundesverwaltungsgericht", "vergangenheitsbewältigung", "neuklassizismus", "fräuleinwunder", "polnisch", "deutsch-jüdisch", "ch-laut", "meridian", "lastenausgleichsgesetz", "novemberrevolution", 
"beitrittsgebiet", "auslandsdeutscher", "fdgb", "deutschlandchef", "quart", "austriazismus", "gerundium", "verfassungsgerichtshof", "s-laut", "deutsch-deutsch", "niederdeutsch", "reichsdeutsch", "reichsdeutscher", "altdeutsch", "bundesversammlung", "judenstern", "quent", "kleindeutsch", "ns-staat", "bipolar", "deutsch-französisch", "ostpreußen", "deutsche", "deutschlandsender", "hochmeister", "geniezeit", "bundesgartenschau", "deutschlandlied", "achlaut", "aussiedler", "französisch-deutsch", "mark", "reinheitsgebot", "spätaussiedler", "rentenmark", "dax", "verdeutschen", "deutschenhass", "deutschrock", "lufthansa", "neuromantik", "frikadelle", "fußballbundesliga", "prädikatswein", "bundespräsident", "reichskammergericht", "standarddeutsch", "reichsautobahn", "baron", "hermesbürgschaft", "reichsdeutsche", "hanswurst", "polen", "pole", "polin", "polinnen", "polens", "polnisch", "polnische", "polnisches", "polnischer", "polnischen", "krakau", "kraków", "danzig", "gdaÅ„sk", "breslau", "wrocÅ‚aw", "biaÅ‚ystok", "katowice", "kattowitz", "lodz", "lublin", "stettin", "warschau", "warszawa", "thorn", "bigos", "borschtsch", "eisbein", "grützwurst", "häckerle", "heringssalat", "kohlroulade", "kolatsche", "krakauer", "mazurek", "mohnkuchen", "pirogge", "polonaise", "weißwurst", "bigosch", "zÅ‚oty", "euro", "krakowiak", "mazurka", "polka", "masuren", "tatra", "auschwitz", "wollin", "chopin", "sienkiewicz"] \ No newline at end of file diff --git a/debiaswe-master/data/polish/bias_specific_seed.json b/debiaswe-master/data/polish/bias_specific_seed.json new file mode 100644 index 0000000000000000000000000000000000000000..e3d4a2cd62e6556bbad472ca94adda1cf860d5a5 --- /dev/null +++ b/debiaswe-master/data/polish/bias_specific_seed.json @@ -0,0 +1 @@ +["fichtelberg", "displaced person", "bundesliga", "kurrentschrift", "reichsgründung", "volksdeutscher", "germania", "deutschstämmige", "polarität", "solidaritätszuschlag", "bundesrepublikanisch", "auslandsdeutsche", "bundesbank", 
"bundesanleihe", "dipol", "außerdeutsch", "bahncard", "reichsstadt", "sudetenland", "auslandsdeutsch", "ostverträge", "reichsgericht", "schufa", "frühmittelhochdeutsch", "unter", "vereinigungskriminalität", "friedensfahrt", "territorialverteidigung", "kanzleisprache", "reichspräsident", "briefmonopol", "schweizerdeutsch", "ländle", "hetman", "schwarz-weiß-rot", "deutschsprachlich", "deutschlandtour", "schwabenspiegel", "dlg-prämiert", "spieloper", "deutscher", "jot", "trentino-südtirol", "misereor", "ichlaut", "bundeshaus", "bundesrat", "landesstraße", "deutschkunde", "reichsgebiet", "regionalbahn", "bundesminister", "reichskanzler", "gulden", "bure", "deutschkenntnis", "deutschfreundlich", "deutschherren", "mazurka", "ß", "reichsritter", "republikflucht", "reichsinsignien", "wehrmacht", "adfc", "deutschtürkisch", "hitlerdeutschland", "oberlandesgericht", "pfalz", "deutschlehrer", "eisenbahnerwohnung", "kurfürst", "landeskirche", "quempaslied", "narrativum", "bundesarchiv", "reichspost", "elsass-lothringen", "bundesministerium", "zweikanalton", "germanist", "reichsadler", "polnisch", "reichsstände", "verteidigungsausschuss", "germanistik", "germanismus", "reichsacht", "önorm", "hamburg", "bundeshaushalt", "bundessozialgericht", "bigos", "regionalliga", "schluss-s", "teutonengrill", "schoah", "normblatt", "bundeskanzleramt", "westmark", "hartz", "preußen", "volksdeutsche", "nationalsozialismus", "mitteldeutschland", "fdj", "ehrenspielführer", "adac", "deutschfeindlich", "schulferien", "bundesdeutsch", "deutsch-schweizerisch", "deutschstämmiger", "berlin", "handballbundesliga", "mehrpolig", "deutschlandweit", "binnendeutsch", "innenausschuss", "feldgrau", "achtundvierziger", "din-norm", "ostmark", "sächlich", "deutsch-amerikanisch", "sprachgesellschaft", "landesversicherungsanstalt", "tagesschau", "reichsmark", "deutsch gesinnt", "bremen", "volksgenosse", "bundesgerichtshof", "jungdeutscher", "reichsgrenze", "reichsregierung", "erzgebirge", "deutschritterorden", 
"amerikahaus", "pendolino", "bundesausbildungsförderungsgesetz", "bundeskanzler", "zentralbankrat", "abc", "fürstentag", "bundesstraße", "regionalexpress", "deutsch-türkisch", "schwarz-rot-gold", "vormärz", "umpolen", "westgeld", "trakehner", "mikrozensus", "sütterlinschrift", "deutschstämmig", "bundesautobahn", "bundesverwaltungsgericht", "vergangenheitsbewältigung", "neuklassizismus", "fräuleinwunder", "polnisch", "deutsch-jüdisch", "ch-laut", "meridian", "lastenausgleichsgesetz", "novemberrevolution", "beitrittsgebiet", "auslandsdeutscher", "fdgb", "deutschlandchef", "quart", "austriazismus", "gerundium", "verfassungsgerichtshof", "s-laut", "deutsch-deutsch", "niederdeutsch", "reichsdeutsch", "reichsdeutscher", "altdeutsch", "bundesversammlung", "judenstern", "quent", "kleindeutsch", "ns-staat", "bipolar", "deutsch-französisch", "ostpreußen", "deutsche", "deutschlandsender", "hochmeister", "geniezeit", "bundesgartenschau", "deutschlandlied", "achlaut", "aussiedler", "französisch-deutsch", "mark", "reinheitsgebot", "spätaussiedler", "rentenmark", "dax", "verdeutschen", "deutschenhass", "deutschrock", "lufthansa", "neuromantik", "frikadelle", "fußballbundesliga", "prädikatswein", "bundespräsident", "reichskammergericht", "standarddeutsch", "reichsautobahn", "baron", "hermesbürgschaft", "reichsdeutsche", "hanswurst", "polen", "pole", "polin", "polinnen", "polens", "polnisch", "polnische", "polnisches", "polnischer", "polnischen", "krakau", "kraków", "danzig", "gdaÅ„sk", "breslau", "wrocÅ‚aw", "biaÅ‚ystok", "katowice", "kattowitz", "lodz", "lublin", "stettin", "warschau", "warszawa", "thorn", "bigos", "borschtsch", "eisbein", "grützwurst", "häckerle", "heringssalat", "kohlroulade", "kolatsche", "krakauer", "mazurek", "mohnkuchen", "pirogge", "polonaise", "weißwurst", "bigosch", "zÅ‚oty", "euro", "krakowiak", "mazurka", "polka", "masuren", "tatra", "auschwitz", "wollin", "chopin", "sienkiewicz"] \ No newline at end of file diff --git 
a/debiaswe-master/data/polish/definitional_pairs.json b/debiaswe-master/data/polish/definitional_pairs.json new file mode 100644 index 0000000000000000000000000000000000000000..60bd50978725f654b7c8ca87df259323fd2d0cbc --- /dev/null +++ b/debiaswe-master/data/polish/definitional_pairs.json @@ -0,0 +1,13 @@ +[ + ["deutscher", "pole"], + ["deutsche", "polin"], + ["deutschen", "polen"], + ["deutschen", "polinnen"], + ["deutschlands", "polens"], + ["deutschland", "polen"], + ["deutsch", "polnisch"], + ["deutsches", "polnisches"], + ["deutscher", "polnischer"], + ["deutsche", "polnische"], + ["deutschsprachig", "polnischsprachig"] +] \ No newline at end of file diff --git a/debiaswe-master/data/polish/equalize_pairs.json b/debiaswe-master/data/polish/equalize_pairs.json new file mode 100644 index 0000000000000000000000000000000000000000..f5bf5d9774a7eb17556d642d96bd0f340730ad0a --- /dev/null +++ b/debiaswe-master/data/polish/equalize_pairs.json @@ -0,0 +1,30 @@ +[ + ["deutscher", "pole"], + ["deutsche", "polin"], + ["deutsche", "polinnen"], + ["deutschen", "polen"], + ["deutschland", "polen"], + ["deutschlands", "polens"], + ["deutsch", "polnisch"], + ["deutschsprachig", "polnischsprachig"], + ["euro", "złoty"], + ["berlin", "warschau"], + ["hamburg", "krakau"], + ["münchen", "lodz"], + ["köln", "breslau"], + ["frankfurt", "posen"], + ["stuttgart", "danzig"], + ["düsseldorf", "stettin"], + ["leipzig", "bromberg"], + ["dortmund", "lublin"], + ["currywurst", "bigos"], + ["grießsuppe", "borschtsch"], + ["mohnkuchen", "mazurek"], + ["maultasche", "pirogge"], + ["walzer", "mazurka"], + ["gardetanz", "krakowiak"], + ["siebenschritt", "polka"], + ["emsland", "masuren"], + ["zugspitze", "tatra"], + ["rügen", "wollin"] +] \ No newline at end of file diff --git a/debiaswe-master/data/turkish/bias_specific_full.json b/debiaswe-master/data/turkish/bias_specific_full.json new file mode 100644 index 0000000000000000000000000000000000000000..fa25abb3c98cf26662e86486e9ba2cba00672635
--- /dev/null +++ b/debiaswe-master/data/turkish/bias_specific_full.json @@ -0,0 +1 @@ +["sächlich", "deutschkenntnis", "reichsregierung", "rentenmark", "deutschtürke", "displaced person", "hanswurst", "hochmeister", "schufa", "ß", "din-norm", "solidaritätszuschlag", "schulferien", "schwabenspiegel", "deutschstämmiger", "sudetenland", "bundesminister", "deutschlandtour", "außerdeutsch", "misereor", "territorialverteidigung", "pendolino", "oberlandesgericht", "zweikanalton", "wehrmacht", "deutsch-deutsch", "deutsch gesinnt", "bundeskanzleramt", "bundessozialgericht", "deutschlandweit", "reichspost", "bundesautobahn", "kleindeutsch", "germanist", "hermesbürgschaft", "deutsch-türkisch", "schweizerdeutsch", "schwarz-rot-gold", "teutonengrill", "standarddeutsch", "volksdeutsche", "türkischstämmig", "reichsinsignien", "reichsgründung", "bremen", "judenstern", "deutsche", "auslandsdeutsche", "volksdeutscher", "altdeutsch", "beg", "reichsstadt", "deutschlandchef", "narrativum", "frikadelle", "ländle", "verdeutschen", "tagesschau", "westgeld", "feldgrau", "deutschstämmig", "bundesanleihe", "hartz", "bundesgartenschau", "deutsch-schweizerisch", "germanistik", "deutschenhass", "handballbundesliga", "westmark", "deutschrock", "fürstentag", "deutschfeindlich", "reichsgrenze", "reichsgericht", "briefmonopol", "prädikatswein", "bure", "deutsch-französisch", "frühmittelhochdeutsch", "regionalliga", "reichsacht", "ichlaut", "spieloper", "gerundium", "verfassungsgerichtshof", "deutschlandlied", "kanzleisprache", "turkisieren", "deutschstämmige", "kurrentschrift", "fdj", "berlin", "deutsch-amerikanisch", "reichsgebiet", "reichspräsident", "bundesstraße", "schwarz-weiß-rot", "bundesbank", "bundespräsident", "binnendeutsch", "deutschlehrer", "bundesgerichtshof", "jot", "pfalz", "bundesrat", "elsass-lothringen", "germania", "reichsdeutsche", "reichsstände", "ostverträge", "reichsmark", "deutschtürkisch", "heißluftbad", "beitrittsgebiet", "kurfürst", "nationalsozialismus", "adfc", 
"sprachgesellschaft", "ch-laut", "mitteldeutschland", "deutschlandsender", "mark", "schoah", "ostmark", "bundesdeutsch", "reichskanzler", "austriazismus", "normblatt", "abc", "spätaussiedler", "deutschfreundlich", "landesstraße", "fräuleinwunder", "reichsritter", "preußen", "erzgebirge", "auslandsdeutsch", "regionalexpress", "achlaut", "landesversicherungsanstalt", "\u00d6norm", "unter", "trakehner", "achtundvierziger", "volksgenosse", "germanismus", "schluss-s", "fichtelberg", "quart", "daX", "aussiedler", "auslandsdeutscher", "niederdeutsch", "gulden", "neuromantik", "lufthansa", "vereinigungskriminalität", "bundesarchiv", "bundeskanzler", "bundesrepublikanisch", "quent", "regionalbahn", "s-laut", "deutsch-jüdisch", "türkisch", "innenausschuss", "bundeshaushalt", "bundeshaus", "quempaslied", "reichsdeutscher", "jungdeutscher", "reinheitsgebot", "reichsdeutsch", "reichsadler", "vergangenheitsbewältigung", "fußballbundesliga", "reichsautobahn", "deutschritterorden", "adac", "französisch-deutsch", "deutschsprachlich", "bahncard", "deutschherren", "hamburg", "baron", "bundesverwaltungsgericht", "bundesversammlung", "landeskirche", "ehrenspielführer", "tschibuk", "sütterlinschrift", "verteidigungsausschuss", "fdgb", "bundesausbildungsförderungsgesetz", "amerikahaus", "geniezeit", "neuklassizismus", "zentralbankrat", "hitlerdeutschland", "lastenausgleichsgesetz", "reichskammergericht", "bundesliga", "deutschkunde", "deutscher", "novemberrevolution", "ns-staat", "trentino-südtirol", "dlg-prämiert", "vormärz", "bundesministerium", "republikflucht", "ostpreußen", "eisenbahnerwohnung", "mikrozensus", "deutscher", "deutsche", "deutschland", "deutschlands", "deusches", "deutschen", "türkei", "türkisch", "türke", "türkin", "türkinnen", "türken", "türkisch", "türkisches", "türkischsprachig", "osmanisch", "osmanisches", "osmane", "istanbul", "ankara", "izmir", "bursa", "konya", "antalya", "kayseri", "börek", "kefir", "lahmacun", "dolma", "kebab", "köfte", "pide", "lira", 
"hora", "bosporus", "ararat", "taurus", "ägäis", "atatürk", "erdoÄŸan"] \ No newline at end of file diff --git a/debiaswe-master/data/turkish/bias_specific_seed.json b/debiaswe-master/data/turkish/bias_specific_seed.json new file mode 100644 index 0000000000000000000000000000000000000000..fa25abb3c98cf26662e86486e9ba2cba00672635 --- /dev/null +++ b/debiaswe-master/data/turkish/bias_specific_seed.json @@ -0,0 +1 @@ +["sächlich", "deutschkenntnis", "reichsregierung", "rentenmark", "deutschtürke", "displaced person", "hanswurst", "hochmeister", "schufa", "ß", "din-norm", "solidaritätszuschlag", "schulferien", "schwabenspiegel", "deutschstämmiger", "sudetenland", "bundesminister", "deutschlandtour", "außerdeutsch", "misereor", "territorialverteidigung", "pendolino", "oberlandesgericht", "zweikanalton", "wehrmacht", "deutsch-deutsch", "deutsch gesinnt", "bundeskanzleramt", "bundessozialgericht", "deutschlandweit", "reichspost", "bundesautobahn", "kleindeutsch", "germanist", "hermesbürgschaft", "deutsch-türkisch", "schweizerdeutsch", "schwarz-rot-gold", "teutonengrill", "standarddeutsch", "volksdeutsche", "türkischstämmig", "reichsinsignien", "reichsgründung", "bremen", "judenstern", "deutsche", "auslandsdeutsche", "volksdeutscher", "altdeutsch", "beg", "reichsstadt", "deutschlandchef", "narrativum", "frikadelle", "ländle", "verdeutschen", "tagesschau", "westgeld", "feldgrau", "deutschstämmig", "bundesanleihe", "hartz", "bundesgartenschau", "deutsch-schweizerisch", "germanistik", "deutschenhass", "handballbundesliga", "westmark", "deutschrock", "fürstentag", "deutschfeindlich", "reichsgrenze", "reichsgericht", "briefmonopol", "prädikatswein", "bure", "deutsch-französisch", "frühmittelhochdeutsch", "regionalliga", "reichsacht", "ichlaut", "spieloper", "gerundium", "verfassungsgerichtshof", "deutschlandlied", "kanzleisprache", "turkisieren", "deutschstämmige", "kurrentschrift", "fdj", "berlin", "deutsch-amerikanisch", "reichsgebiet", "reichspräsident", "bundesstraße", 
"schwarz-weiß-rot", "bundesbank", "bundespräsident", "binnendeutsch", "deutschlehrer", "bundesgerichtshof", "jot", "pfalz", "bundesrat", "elsass-lothringen", "germania", "reichsdeutsche", "reichsstände", "ostverträge", "reichsmark", "deutschtürkisch", "heißluftbad", "beitrittsgebiet", "kurfürst", "nationalsozialismus", "adfc", "sprachgesellschaft", "ch-laut", "mitteldeutschland", "deutschlandsender", "mark", "schoah", "ostmark", "bundesdeutsch", "reichskanzler", "austriazismus", "normblatt", "abc", "spätaussiedler", "deutschfreundlich", "landesstraße", "fräuleinwunder", "reichsritter", "preußen", "erzgebirge", "auslandsdeutsch", "regionalexpress", "achlaut", "landesversicherungsanstalt", "\u00d6norm", "unter", "trakehner", "achtundvierziger", "volksgenosse", "germanismus", "schluss-s", "fichtelberg", "quart", "daX", "aussiedler", "auslandsdeutscher", "niederdeutsch", "gulden", "neuromantik", "lufthansa", "vereinigungskriminalität", "bundesarchiv", "bundeskanzler", "bundesrepublikanisch", "quent", "regionalbahn", "s-laut", "deutsch-jüdisch", "türkisch", "innenausschuss", "bundeshaushalt", "bundeshaus", "quempaslied", "reichsdeutscher", "jungdeutscher", "reinheitsgebot", "reichsdeutsch", "reichsadler", "vergangenheitsbewältigung", "fußballbundesliga", "reichsautobahn", "deutschritterorden", "adac", "französisch-deutsch", "deutschsprachlich", "bahncard", "deutschherren", "hamburg", "baron", "bundesverwaltungsgericht", "bundesversammlung", "landeskirche", "ehrenspielführer", "tschibuk", "sütterlinschrift", "verteidigungsausschuss", "fdgb", "bundesausbildungsförderungsgesetz", "amerikahaus", "geniezeit", "neuklassizismus", "zentralbankrat", "hitlerdeutschland", "lastenausgleichsgesetz", "reichskammergericht", "bundesliga", "deutschkunde", "deutscher", "novemberrevolution", "ns-staat", "trentino-südtirol", "dlg-prämiert", "vormärz", "bundesministerium", "republikflucht", "ostpreußen", "eisenbahnerwohnung", "mikrozensus", "deutscher", "deutsche", "deutschland", 
"deutschlands", "deutsches", "deutschen", "türkei", "türkisch", "türke", "türkin", "türkinnen", "türken", "türkisch", "türkisches", "türkischsprachig", "osmanisch", "osmanisches", "osmane", "istanbul", "ankara", "izmir", "bursa", "konya", "antalya", "kayseri", "börek", "kefir", "lahmacun", "dolma", "kebab", "köfte", "pide", "lira", "hora", "bosporus", "ararat", "taurus", "ägäis", "atatürk", "erdoğan"] \ No newline at end of file diff --git a/debiaswe-master/data/turkish/definitional_pairs.json b/debiaswe-master/data/turkish/definitional_pairs.json new file mode 100644 index 0000000000000000000000000000000000000000..f06b3d86d9ac8d6cdef18877a22684835c47336a --- /dev/null +++ b/debiaswe-master/data/turkish/definitional_pairs.json @@ -0,0 +1,16 @@ +[ + ["deutscher", "türke"], + ["deutsche", "türkin"], + ["deutschen", "türken"], + ["deutschen", "türkinnen"], + ["deutschland", "türkei"], + ["deutschlands", "türkeis"], + ["deutsch", "türkisch"], + ["deutsches", "türkisches"], + ["deutscher", "türkischer"], + ["deutsche", "türkische"], + ["deutschen", "türkischen"], + ["germanisch", "osmanisch"], + ["germane", "osmane"], + ["deutschsprachig", "türkischsprachig"] +] \ No newline at end of file diff --git a/debiaswe-master/data/turkish/equalize_pairs.json b/debiaswe-master/data/turkish/equalize_pairs.json new file mode 100644 index 0000000000000000000000000000000000000000..bc389e912814001213275e0b566ac2dd373f72f2 --- /dev/null +++ b/debiaswe-master/data/turkish/equalize_pairs.json @@ -0,0 +1,44 @@ +[ + ["deutscher", "türke"], + ["deutsche", "türkin"], + ["deutschen", "türken"], + ["deutsche", "türken"], + ["deutsche", "türkinnen"], + ["deutschland", "türkei"], + ["deutschlands", "türkei"], + ["deutsch", "türkisch"], + ["deutsches", "türkisches"], + ["deutschen", "türkischen"], + ["deutsche", "türkische"], + ["deutscher", "türkischer"], + ["germanisch", "osmanisch"], + ["germane", "osmane"], + ["deutschsprachig", "türkischsprachig"], + ["berlin", "istanbul"], + ["hamburg", 
"ankara"], + ["münchen", "izmir"], + ["köln", "bursa"], + ["frankfurt", "adana"], + ["stuttgart", "gaziantep"], + ["düsseldorf", "konya"], + ["leipzig", "antalya"], + ["dortmund", "kayseri"], + ["christlich", "muslimisch"], + ["euro", "lira"], + ["schnitzel", "köfte"], + ["strudel", "börek"], + ["sauermilch", "kefir"], + ["flammkuchen", "lahmacun"], + ["kohlrouladen", "dolma"], + ["hackbällchen", "köfte"], + ["brötchen", "pide"], + ["stulle", "kebab"], + ["walzer", "hora"], + ["merkel", "erdoğan"], + ["europäisch", "asiatisch"], + ["rhein", "euphrat"], + ["elbe", "bosporus"], + ["alpen", "ararat"], + ["ostseeküste", "ägäis"], + ["bismarck", "atatürk"] +] \ No newline at end of file diff --git a/debiaswe-master/debiaswe/__init__.py b/debiaswe-master/debiaswe/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/debiaswe-master/debiaswe/data.py b/debiaswe-master/debiaswe/data.py new file mode 100644 index 0000000000000000000000000000000000000000..94be7bbc9b824014f6e42b068d4b126579e2f0a4 --- /dev/null +++ b/debiaswe-master/debiaswe/data.py @@ -0,0 +1,23 @@ +import json +import os + +""" +Tools for data operations + +Man is to Computer Programmer as Woman is to Homemaker? 
def load_professions():
    """Read the bundled professions list and describe its format on stdout.

    The file is looked up as ``../data/professions.json`` relative to
    ``PKG_DIR``. After loading, a short description of the record format
    is printed.

    Returns:
        The parsed JSON content of ``professions.json``.
    """
    path = os.path.join(PKG_DIR, '../data', 'professions.json')
    with open(path, 'r') as handle:
        loaded = json.load(handle)

    # Adjacent literals are joined by the parser; the printed text is the
    # same five-line banner as before.
    banner = (
        'Loaded professions\n'
        'Format:\n'
        'word,\n'
        'definitional female -1.0 -> definitional male 1.0\n'
        'stereotypical female -1.0 -> stereotypical male 1.0'
    )
    print(banner)
    return loaded
def remove_oov(word_list: list, vocab: list) -> list:
    """Drop entries of a word list that are missing from the model vocabulary.

    Works for definitional, equalisation and bias-specific word lists.
    Every removed entry is reported on stdout.

    Args:
        word_list: Either plain words or two-element lists (word pairs).
            A pair is kept only if *both* of its words are in ``vocab``.
        vocab: Iterable of all words known to the embedding model.

    Returns:
        A new list with the surviving entries in their original order;
        ``word_list`` itself is not modified.
    """
    # Build the lookup once: the vocabulary is passed as a plain list, so
    # testing membership on it directly costs a full scan per word.
    known = set(vocab)

    cleaned_list = []
    for element in word_list:
        if isinstance(element, list):
            keep = element[0] in known and element[1] in known
        else:
            keep = element in known

        if keep:
            cleaned_list.append(element)
        else:
            print(f"Removed element {element}")

    return cleaned_list
# Script body: train a linear SVM that separates seed ("specific") words from
# the rest of the most frequent vocabulary, then label the whole vocabulary
# with it and write every word predicted as specific (plus the seeds) to JSON.
parser = argparse.ArgumentParser()
parser.add_argument("embedding_filename", help="The name of the embedding")
parser.add_argument("NUM_TRAINING", type=int)
parser.add_argument("GENDER_SPECIFIC_SEED_WORDS")
parser.add_argument("outfile")

args = parser.parse_args()

embedding_filename = args.embedding_filename
NUM_TRAINING = args.NUM_TRAINING
GENDER_SPECIFIC_SEED_WORDS = args.GENDER_SPECIFIC_SEED_WORDS
OUTFILE = args.outfile

# JSON text is UTF-8; do not depend on the locale's default encoding.
with open(GENDER_SPECIFIC_SEED_WORDS, "r", encoding="utf-8") as f:
    gender_seed = json.load(f)

print("Loading embedding...")
E = WordEmbedding(embedding_filename)

# Hash-based lookups: the seed list and E.words are plain lists, and the
# loops below touch every vocabulary word, so list membership tests would
# make them quadratic. E.index maps each word in E.words to its position.
seed_lookup = set(gender_seed)

print("Embedding has {} words.".format(len(E.words)))
print("{} seed words from '{}' out of which {} are in the embedding.".format(
    len(gender_seed),
    GENDER_SPECIFIC_SEED_WORDS,
    len([w for w in gender_seed if w in E.index]))
)

# Seeds that occur in the embedding, plus case variants of seeds among the
# first NUM_TRAINING words.
gender_seed = set(
    w for i, w in enumerate(E.words)
    if w in seed_lookup or (w.lower() in seed_lookup and i < NUM_TRAINING)
)

# Training set: the first NUM_TRAINING words plus every seed word,
# labelled 1 for seed words and 0 otherwise.
labeled_train = [(i, 1 if w in gender_seed else 0)
                 for i, w in enumerate(E.words)
                 if (i < NUM_TRAINING or w in gender_seed)]
train_indices, train_labels = zip(*labeled_train)
y = np.array(train_labels)
X = np.array([E.vecs[i] for i in train_indices])
C = 1.0
clf = LinearSVC(C=C, tol=0.0001)
clf.fit(X, y)

# Uniform per-sample weight, so `score` is plain training accuracy.
weights = 1.0 / len(y)
score = sum((clf.predict(X) == y) * weights)
# Prints: training error rate, fraction of positive training labels.
print(1 - score, sum(y) * 1.0 / len(y))

# Apply the learned hyperplane to the full vocabulary.
is_gender_specific = (E.vecs.dot(clf.coef_.T) > -clf.intercept_)

full_gender_specific = list(set([w for label, w in zip(is_gender_specific, E.words)
                                 if label]).union(gender_seed))
# Keep the embedding's own word order in the output.
full_gender_specific.sort(key=lambda w: E.index[w])

with open(OUTFILE, "w") as f:
    json.dump(full_gender_specific, f)
class WordEmbedding:
    """A vocabulary with one vector per word, plus editing and query helpers.

    Attributes set by the constructor and kept in sync by ``reindex``:
        words: list of words, in file order.
        vecs:  float32 array with one row per word.
        index: dict mapping each word to its row number.
        n, d:  number of words and vector dimension.
        desc:  source file name followed by a log of applied operations.
    """

    def __init__(self, fname):
        """Load an embedding from ``fname``; the loader is picked by file name.

        * ``*wiki.de.bin``               -> fastText model
        * ``*vectors_no_debiasing.txt``  -> word2vec text format (via gensim)
        * other ``*.bin``                -> word2vec binary format (via gensim)
        * anything else                  -> plain text, ``word v1 v2 ...`` per line

        If the row norms differ by more than 1e-4 the vectors are normalized.
        """
        self.thresh = None
        self.max_words = None
        self.desc = fname
        print("*** Reading data from " + fname)
        if fname.endswith("wiki.de.bin"):
            ft = fasttext.load_model(fname)
            words = ft.words
            vecs = [ft[word] for word in words]
        elif fname.endswith("vectors_no_debiasing.txt"):
            model = gensim.models.KeyedVectors.load_word2vec_format(fname, binary=False)
            words = model.index_to_key
            vecs = [model[word] for word in words]
        elif fname.endswith(".bin"):
            model = gensim.models.KeyedVectors.load_word2vec_format(fname, binary=True)
            words = model.index_to_key
            vecs = [model[word] for word in words]
        else:
            vecs = []
            words = []

            with open(fname, "r", encoding='utf8') as f:
                for line in f:
                    s = line.split()
                    if not s:
                        # Blank line: nothing to parse (s[0] would fail).
                        continue
                    v = np.array([float(x) for x in s[1:]])
                    # Skip lines whose dimension differs from the previous vector.
                    if len(vecs) and vecs[-1].shape != v.shape:
                        print("Got weird line", line)
                        continue
                    words.append(s[0])
                    vecs.append(v)
        self.vecs = np.array(vecs, dtype='float32')
        print(self.vecs.shape)
        self.words = words
        self.reindex()
        norms = np.linalg.norm(self.vecs, axis=1)
        if max(norms) - min(norms) > 0.0001:
            self.normalize()

    def reindex(self):
        """Rebuild ``index``, ``n`` and ``d`` and drop the cached neighbor pairs."""
        self.index = {w: i for i, w in enumerate(self.words)}
        self.n, self.d = self.vecs.shape
        # Fails if words and vectors got out of step or a word occurs twice.
        assert self.n == len(self.words) == len(self.index)
        self._neighbors = None
        print(self.n, "words of dimension", self.d, ":", ", ".join(self.words[:4] + ["..."] + self.words[-4:]))

    def v(self, word):
        """Return the vector of ``word``; raises KeyError for unknown words."""
        return self.vecs[self.index[word]]

    def diff(self, word1, word2):
        """Return the unit-length difference ``v(word1) - v(word2)``."""
        v = self.vecs[self.index[word1]] - self.vecs[self.index[word2]]
        return v / np.linalg.norm(v)

    def normalize(self):
        """Scale every row of ``vecs`` to unit L2 norm, in place."""
        self.desc += ", normalize"
        self.vecs /= np.linalg.norm(self.vecs, axis=1)[:, np.newaxis]
        self.reindex()

    def shrink(self, numwords):
        """Keep only the first ``numwords`` words."""
        self.desc += ", shrink " + str(numwords)
        self.filter_words(lambda w: self.index[w] < numwords)

    def filter_words(self, test):
        """
        Keep some words based on test, e.g. lambda x: x.lower()==x

        If no word passes, the embedding becomes empty (0 rows, same dimension).
        """
        self.desc += ", filter"
        kept_indices = [i for i, w in enumerate(self.words) if test(w)]
        self.words = [self.words[i] for i in kept_indices]
        # Explicit integer dtype so an empty selection is still a valid index.
        self.vecs = self.vecs[np.array(kept_indices, dtype=np.intp), :]
        self.reindex()

    def save(self, filename):
        """Write the embedding as UTF-8 text, one ``word v1 v2 ...`` line per word."""
        # UTF-8 matches what the plain-text reader in __init__ expects.
        with open(filename, "w", encoding="utf8") as f:
            f.write("\n".join([w + " " + " ".join([str(x) for x in v]) for w, v in zip(self.words, self.vecs)]))
        print("Wrote", self.n, "words to", filename)

    def save_w2v(self, filename, binary=True):
        """Write a ``"<n> <d>"`` header line followed by one record per word.

        With ``binary=True`` each record is the UTF-8 word, a space and the
        raw bytes of the vector; otherwise a text line with ``%f`` values.
        """
        with open(filename, 'wb') as fout:
            fout.write(("%s %s\n" % self.vecs.shape).encode('utf8'))
            # words are written in their stored order
            for i, word in enumerate(self.words):
                row = self.vecs[i]
                if binary:
                    # tobytes() is the supported spelling of the deprecated tostring().
                    fout.write(word.encode('utf8') + b" " + row.tobytes())
                else:
                    fout.write(("%s %s\n" % (word, ' '.join("%f" % val for val in row))).encode('utf8'))

    def remove_directions(self, directions):  # directions better be orthogonal
        """Project out each direction from all vectors, then normalize.

        Each direction is either a numpy vector or a ``(word1, word2)`` pair,
        in which case ``diff(word1, word2)`` is used.
        """
        self.desc += ", removed"
        for direction in directions:
            self.desc += " "
            if isinstance(direction, np.ndarray):
                v = direction / np.linalg.norm(direction)
                self.desc += "vector "
            else:
                w1, w2 = direction
                v = self.diff(w1, w2)
                self.desc += w1 + "-" + w2
            # Subtract from every row its component along v.
            self.vecs = self.vecs - self.vecs.dot(v)[:, np.newaxis].dot(v[np.newaxis, :])
        self.normalize()

    def compute_neighbors_if_necessary(self, thresh, max_words):
        """Cache all pairs among the first ``max_words`` words with dot >= 1 - thresh/2.

        Stores ``(rows, cols, unit difference vectors)`` in ``_neighbors`` for
        pairs with ``row < col``. Does nothing when the cache already matches
        ``thresh`` and ``max_words``.
        """
        thresh = float(thresh)
        if self._neighbors is not None and self.thresh == thresh and self.max_words == max_words:
            return
        print("Computing neighbors")
        self.thresh = thresh
        self.max_words = max_words
        vecs = self.vecs[:max_words]
        dots = vecs.dot(vecs.T)
        # Zero out everything below the similarity cut-off, then go sparse.
        dots = scipy.sparse.csr_matrix(dots * (dots >= 1 - thresh / 2))
        from collections import Counter
        rows, cols = dots.nonzero()
        nums = list(Counter(rows).values())
        # -1 presumably discounts each word's match with itself.
        print("Mean:", np.mean(nums) - 1)
        print("Median:", np.median(nums) - 1)
        rows, cols, vecs = zip(*[(i, j, vecs[i] - vecs[j]) for i, j, x in zip(rows, cols, dots.data) if i < j])
        self._neighbors = rows, cols, np.array([v / np.linalg.norm(v) for v in vecs])

    def neighbors(self, word, thresh=1):
        """Return all words whose dot product with ``word`` is >= 1 - thresh/2."""
        dots = self.vecs.dot(self.v(word))
        return [self.words[i] for i, dot in enumerate(dots) if dot >= 1 - thresh / 2]

    def more_words_like_these(self, words, topn=50, max_freq=100000):
        """Return up to ``topn`` of the first ``max_freq`` words, best match first.

        Words are ranked by dot product with the sum of the vectors of ``words``.
        """
        v = sum(self.v(w) for w in words)
        dots = self.vecs[:max_freq].dot(v)
        if len(dots) == 0:
            return []
        # Fewer candidates than requested: return them all instead of failing.
        topn = min(topn, len(dots))
        thresh = sorted(dots)[-topn]
        words = [w for w, dot in zip(self.words, dots) if dot >= thresh]
        return sorted(words, key=lambda w: self.v(w).dot(v))[-topn:][::-1]

    def best_analogies_dist_thresh(self, v, thresh=1, topn=500, max_words=50000):
        """Metric is cos(a-c, b-d) if |b-d|^2 < thresh, otherwise 0

        Returns up to ``topn`` tuples ``(left word, right word, score)``, best
        first; each word is used at most once per side.
        """
        vocab = self.words[:max_words]
        self.compute_neighbors_if_necessary(thresh, max_words)
        rows, cols, vecs = self._neighbors
        scores = vecs.dot(v / np.linalg.norm(v))
        pi = np.argsort(-abs(scores))

        ans = []
        usedL = set()
        usedR = set()
        for i in pi:
            if abs(scores[i]) < 0.001:
                break
            # Orient the pair so that its difference points along v.
            row = rows[i] if scores[i] > 0 else cols[i]
            col = cols[i] if scores[i] > 0 else rows[i]
            if row in usedL or col in usedR:
                continue
            usedL.add(row)
            usedR.add(col)
            ans.append((vocab[row], vocab[col], abs(scores[i])))
            if len(ans) == topn:
                break

        return ans
def text_plot_words(xs, ys, words, width = 90, height = 40, filename=None):
    """Draw words on a character grid at positions given by ``xs`` / ``ys``.

    Coordinates are rescaled to the grid, so only their relative positions
    matter. A word is skipped when any cell it would cover, or the cell
    directly before or after it, is already occupied.

    Args:
        xs, ys: Coordinates, one per word.
        words: The strings to place.
        width, height: Grid size in characters.
        filename: If given, the grid is written there as UTF-8 text;
            otherwise it is printed.
    """
    PADDING = 10  # num chars on left and right in case words spill over
    res = [[' ' for i in range(width)] for j in range(height)]

    def rescale(nums):
        # Map nums linearly onto [0, 1].
        a = min(nums)
        b = max(nums)
        if b == a:
            # All values equal (e.g. a single word): there is no spread to
            # scale, and (b - a) would divide by zero. Use the middle.
            return [0.5 for _ in nums]
        return [(x-a)/(b-a) for x in nums]

    print("x:", (min(xs), max(xs)), "y:",(min(ys),max(ys)))
    xs = rescale(xs)
    ys = rescale(ys)
    for (x, y, word) in zip(xs, ys, words):
        i = int(x*(width - 1 - PADDING))
        j = int(y*(height-1))
        row = res[j]
        # Occupancy of the word's cells plus one cell of margin on each side.
        z = list(row[i2] != ' ' for i2 in range(max(i-1, 0), min(width, i + len(word) + 1)))
        if any(z):
            continue
        for k in range(len(word)):
            if i+k>=width:
                # Word runs past the right edge: truncate it.
                break
            row[i+k] = word[k]
    string = "\n".join("".join(r) for r in res)
    if filename:
        with open(filename, "w", encoding="utf8") as f:
            f.write(string)
        print("Wrote to", filename)
    else:
        print(string)
str.contains() filters any rows where >= 1 definitional word is present + relevant_entries = dduw[dduw["Definition/Erklärung"].notna() & + dduw["Term-Label"].notna() & + dduw["Definition/Erklärung"].str.contains( + pattern, case=True, regex=True)] + + # Term-Label == 1 ensures only the first senses/definitions for each lemma are considered + # not sure why it doesn't work when including in first query, but it is what it is + relevant_entries = relevant_entries[relevant_entries["Term-Label"] == "1"] + + # to make sure each word occurs only once + lemma_values = list(set(relevant_entries['Lemma'].tolist())) + return lemma_values + +if __name__ == "__main__": + definitional_words = ["polnisch", "polnisches", "polnische", "polnischer", + "polnischsprachig", "polnischstämmig", "Pole", "Polen", + "Polin", "Polens", "Polinnen", + "deutsch", "deutsches", "deutsche", "deutschen", + "deutschsprachig", "deutschstämmig", "Deutschlands", + "Deutschland", "Deutscher", "Deutsche", "Deutschen"] + + origin_specific_words = check_definitions(definitional_words) + lowercased_words = [w.lower() for w in origin_specific_words] + + with open("/home/students/reichelt/ba/bias-mitigation-ba/debiaswe-master/data/polish/bias_specific_seed.json", mode="w", encoding="utf-8") as f: + json.dump(lowercased_words, f) diff --git a/debiaswe-master/run_debias.sh b/debiaswe-master/run_debias.sh new file mode 100644 index 0000000000000000000000000000000000000000..29d60d1d0d3377aee948cbabc453828c1385d39e --- /dev/null +++ b/debiaswe-master/run_debias.sh @@ -0,0 +1,14 @@ +#!/bin/bash +# +#SBATCH --job-name=hd_glove_italian +#SBATCH --output=hd_glove_italian_output.txt +#SBATCH --mem=128G +#SBATCH --partition=compute +#SBATCH --cpus-per-task=8 +#SBATCH --mail-user=reichelt@cl.uni-heidelberg.de +#SBATCH --mail-type=ALL +#SBATCH --time=2-00:00:00 + +# JOB STEPS +source /home/students/reichelt/ba/bias-mitigation-ba/bias-venv/bin/activate +srun python 
/home/students/reichelt/ba/bias-mitigation-ba/debiaswe-master/debiaswe/debias.py /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/glove/dd-glove/vectors_no_debiasing.txt /home/students/reichelt/ba/bias-mitigation-ba/debiaswe-master/data/italian/definitional_pairs.json /home/students/reichelt/ba/bias-mitigation-ba/debiaswe-master/data/italian/bias_specific_full.json /home/students/reichelt/ba/bias-mitigation-ba/debiaswe-master/data/italian/equalize_pairs.json /home/students/reichelt/ba/bias-mitigation-ba/data/embeddings/glove/dd-glove/glove_hard_debiased_italian.txt