diff --git a/bias_tilt_example.ipynb b/bias_tilt_example.ipynb
index b9e5437d1549e47caea5e5cd2b32d43a9c43a7bf..e820f27d080b6dd44ef7f2ce8b417da8854c7be2 100644
--- a/bias_tilt_example.ipynb
+++ b/bias_tilt_example.ipynb
@@ -1 +1 @@
-{"cells":[{"cell_type":"markdown","metadata":{},"source":["# **Importing Modules**"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2022-09-23T11:18:31.409907Z","iopub.status.busy":"2022-09-23T11:18:31.40606Z","iopub.status.idle":"2022-09-23T11:18:32.31181Z","shell.execute_reply":"2022-09-23T11:18:32.310769Z","shell.execute_reply.started":"2022-09-23T11:18:31.409862Z"},"trusted":true},"outputs":[],"source":["import pandas as pd\n","import nltk\n","from nltk.stem import PorterStemmer\n","from nltk.corpus import stopwords\n","from nltk.tokenize import word_tokenize\n","import re\n","from sklearn import metrics\n","from sklearn.metrics import classification_repor\n","from sklearn.feature_extraction.text import CountVectorizer"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2022-09-23T11:18:32.314827Z","iopub.status.busy":"2022-09-23T11:18:32.314039Z","iopub.status.idle":"2022-09-23T11:18:32.903483Z","shell.execute_reply":"2022-09-23T11:18:32.902502Z","shell.execute_reply.started":"2022-09-23T11:18:32.31479Z"},"trusted":true},"outputs":[],"source":["df = pd.read_csv(\"datasets/Resume/Resume.csv\")\n","df.head()"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2022-09-23T11:18:32.905776Z","iopub.status.busy":"2022-09-23T11:18:32.905085Z","iopub.status.idle":"2022-09-23T11:18:32.919723Z","shell.execute_reply":"2022-09-23T11:18:32.918685Z","shell.execute_reply.started":"2022-09-23T11:18:32.905738Z"},"trusted":true},"outputs":[],"source":["# Remove unsued columns\n","df.drop(columns = ['ID', 'Resume_html'], inplace = True)\n","df"]},{"cell_type":"markdown","metadata":{},"source":["# **Data 
Preprocessing**"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2022-09-23T11:18:32.921581Z","iopub.status.busy":"2022-09-23T11:18:32.921222Z","iopub.status.idle":"2022-09-23T11:18:32.92632Z","shell.execute_reply":"2022-09-23T11:18:32.925252Z","shell.execute_reply.started":"2022-09-23T11:18:32.921545Z"},"trusted":true},"outputs":[],"source":["STEMMER = PorterStemmer()"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2022-09-23T11:18:32.928369Z","iopub.status.busy":"2022-09-23T11:18:32.927868Z","iopub.status.idle":"2022-09-23T11:18:32.937257Z","shell.execute_reply":"2022-09-23T11:18:32.93626Z","shell.execute_reply.started":"2022-09-23T11:18:32.928324Z"},"trusted":true},"outputs":[],"source":["def preprocess(txt):\n","    txt = txt.lower()\n","    txt = re.sub('[^a-zA-Z]', ' ', txt)\n","    txt = word_tokenize(txt)\n","    txt = [w for w in txt if not w in stopwords.words('english')]\n","    txt = [STEMMER.stem(w) for w in txt]\n","\n","    return ' '.join(txt)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2022-09-23T11:18:32.939037Z","iopub.status.busy":"2022-09-23T11:18:32.93862Z","iopub.status.idle":"2022-09-23T11:22:59.508313Z","shell.execute_reply":"2022-09-23T11:22:59.50738Z","shell.execute_reply.started":"2022-09-23T11:18:32.939004Z"},"trusted":true},"outputs":[],"source":["df['Resume'] = df['Resume_str'].apply(lambda w: preprocess(w))\n","df.pop('Resume_str')\n","df"]},{"cell_type":"markdown","metadata":{},"source":["# **Data 
Cleaning**"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2022-09-23T11:23:19.459599Z","iopub.status.busy":"2022-09-23T11:23:19.459214Z","iopub.status.idle":"2022-09-23T11:23:19.811042Z","shell.execute_reply":"2022-09-23T11:23:19.809961Z","shell.execute_reply.started":"2022-09-23T11:23:19.459561Z"},"trusted":true},"outputs":[],"source":["nltk.download(\"punkt\")\n","nltk.download(\"stopwords\")"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2022-09-23T11:23:19.823147Z","iopub.status.busy":"2022-09-23T11:23:19.822326Z","iopub.status.idle":"2022-09-23T11:23:19.830987Z","shell.execute_reply":"2022-09-23T11:23:19.830038Z","shell.execute_reply.started":"2022-09-23T11:23:19.823106Z"},"trusted":true},"outputs":[],"source":["stop_words = stopwords.words('english')"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2022-09-23T11:23:19.832603Z","iopub.status.busy":"2022-09-23T11:23:19.832197Z","iopub.status.idle":"2022-09-23T11:23:19.842623Z","shell.execute_reply":"2022-09-23T11:23:19.841026Z","shell.execute_reply.started":"2022-09-23T11:23:19.832569Z"},"trusted":true},"outputs":[],"source":["def remove_stop_words(text, stop_words):\n","    result = []\n","    for token in text.split():\n","        if token.lower() not in stop_words:\n","            result.append(token)\n","    return result"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2022-09-23T11:23:20.960439Z","iopub.status.busy":"2022-09-23T11:23:20.96007Z","iopub.status.idle":"2022-09-23T11:23:26.590815Z","shell.execute_reply":"2022-09-23T11:23:26.589847Z","shell.execute_reply.started":"2022-09-23T11:23:20.96041Z"},"trusted":true},"outputs":[],"source":["df['clean'] = 
df['Resume'].apply(remove_stop_words).astype(str)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2022-09-23T11:23:28.380762Z","iopub.status.busy":"2022-09-23T11:23:28.380393Z","iopub.status.idle":"2022-09-23T11:23:28.408386Z","shell.execute_reply":"2022-09-23T11:23:28.407311Z","shell.execute_reply.started":"2022-09-23T11:23:28.380729Z"},"trusted":true},"outputs":[],"source":["df"]},{"cell_type":"markdown","metadata":{},"source":["```stratify``` ensures that the split is done in a way that preserves the proportion of each category in both the training and test sets. <br>\n","This gives us a test set that contains 20% of the resumes in each job category and guarantees that each category is represented by at least 4 resumes in the test set. <br>\n","4 is of course 20% (rounded down) of the smallest job category (BPO), with contains 22 resumes."]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2022-09-23T11:23:29.570446Z","iopub.status.busy":"2022-09-23T11:23:29.570104Z","iopub.status.idle":"2022-09-23T11:23:29.577445Z","shell.execute_reply":"2022-09-23T11:23:29.576454Z","shell.execute_reply.started":"2022-09-23T11:23:29.570418Z"},"trusted":true},"outputs":[],"source":["from sklearn.model_selection import train_test_split\n","X_train, X_test, Y_train, Y_test = train_test_split(df['clean'], df['Category'], test_size = 0.2, stratify=df['Category'])"]},{"cell_type":"markdown","metadata":{},"source":["Next, we need to figure out the stemmed versions of the gender-indicating terms that we want to replace as well as their replacements. 
<br>\n","We also need to find the stemmed versions of the terms that will be added to create the gendered test sets."]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# Gender-indicating terms and their replacements\n","preprocess(\"man woman men women human people\")"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# Gender-indicating terms that will be used to create the gendered test sets\n","preprocess(\"gender male female\")"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# Step 1: Replace gendered terms with gender neutral terms\n","X_test_neutral = X_test.replace({\"clean\": {\"male\": \"human\", \n","                                           \"femal\": \"human\",\n","                                           \"man\": \"human\",\n","                                           \"woman\": \"human\",\n","                                           \"men\": \"peopl\",\n","                                           \"women\": \"peopl\"}}, regex=True)\n","\n","# Step 2: Create a copy of the test data and append \"gender male\" to one and \"gender female\" to the other\n","# Also create versions where the gender-indicating term gets inserted multiple times\n","X_test_male_1 = X_test_neutral.copy()\n","X_test_male_1[\"text\"] = X_test_male_1[\"clean\"] + \" gender male\"\n","\n","X_test_male_2 = X_test_neutral.copy()\n","X_test_male_2[\"text\"] = X_test_male_2[\"clean\"] + \" gender male gender male\"\n","\n","X_test_male_3 = X_test_neutral.copy()\n","X_test_male_3[\"text\"] = X_test_male_3[\"clean\"] + \" gender male gender male gender male\"\n","\n","\n","\n","X_test_female_1 = X_test_neutral.copy()\n","X_test_female_1[\"text\"] = X_test_female_1[\"clean\"] + \" gender femal\"\n","\n","X_test_female_2 = X_test_neutral.copy()\n","X_test_female_2[\"text\"] = X_test_female_2[\"clean\"] + \" gender femal gender femal\"\n","\n","X_test_female_3 = 
X_test_neutral.copy()\n","X_test_female_3[\"text\"] = X_test_female_3[\"clean\"] + \" gender femal gender femal gender femal\""]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2022-09-23T11:25:13.34139Z","iopub.status.busy":"2022-09-23T11:25:13.341044Z","iopub.status.idle":"2022-09-23T11:25:14.272244Z","shell.execute_reply":"2022-09-23T11:25:14.271172Z","shell.execute_reply.started":"2022-09-23T11:25:13.341361Z"},"trusted":true},"outputs":[],"source":["vectorizer = CountVectorizer()\n","countvectorizer_train = vectorizer.fit_transform(X_train).astype(float)\n","\n","countvectorizer_test_m1 = vectorizer.transform(X_test_male_1).astype(float)\n","countvectorizer_test_m2 = vectorizer.transform(X_test_male_2).astype(float)\n","countvectorizer_test_m3 = vectorizer.transform(X_test_male_3).astype(float)\n","\n","countvectorizer_test_f1 = vectorizer.transform(X_test_male_1).astype(float)\n","countvectorizer_test_f2 = vectorizer.transform(X_test_male_2).astype(float)\n","countvectorizer_test_f3 = vectorizer.transform(X_test_male_3).astype(float)"]},{"cell_type":"markdown","metadata":{},"source":["# **Model Training**"]},{"cell_type":"markdown","metadata":{},"source":["In the original version, the author used grid search to find the best hyperparameter values for the RF model. 
But since the author has already done that, we can skip that part and jump straight to the training process."]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2022-09-23T11:25:19.890779Z","iopub.status.busy":"2022-09-23T11:25:19.890391Z","iopub.status.idle":"2022-09-23T11:25:21.789228Z","shell.execute_reply":"2022-09-23T11:25:21.788131Z","shell.execute_reply.started":"2022-09-23T11:25:19.890747Z"},"trusted":true},"outputs":[],"source":["from sklearn.ensemble import RandomForestClassifier\n","\n","RF_Model = RandomForestClassifier(random_state=42, max_features='auto', n_estimators= 500, max_depth=8, criterion='gini')\n","RF_Model.fit(countvectorizer_train, Y_train)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2022-09-23T11:25:22.522715Z","iopub.status.busy":"2022-09-23T11:25:22.522322Z","iopub.status.idle":"2022-09-23T11:25:22.846675Z","shell.execute_reply":"2022-09-23T11:25:22.845545Z","shell.execute_reply.started":"2022-09-23T11:25:22.522682Z"},"trusted":true},"outputs":[],"source":["prediction_m1=RF_Model.predict(countvectorizer_test_m1)\n","prediction_m2=RF_Model.predict(countvectorizer_test_m2)\n","prediction_m3=RF_Model.predict(countvectorizer_test_m3)\n","\n","prediction_f1=RF_Model.predict(countvectorizer_test_f1)\n","prediction_f2=RF_Model.predict(countvectorizer_test_f2)\n","prediction_f3=RF_Model.predict(countvectorizer_test_f3)"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["def print_bias_tilt(model, male_data, female_data, gold_labels) -> None:\n","    male_accuracy = model.score(male_data, gold_labels)\n","    female_accuracy = model.score(female_data, gold_labels)\n","    \n","    bt_score = ((male_accuracy/(male_accuracy+female_accuracy))-0.5)/0.5\n","\n","    print(f\"BiasTilt score: {bt_score}, Male Accuracy: {male_accuracy}, Female Accuracy: {female_accuracy} 
\\n\")"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["print(\"Training Score: {:.2f}\".format(RF_Model.score(countvectorizer_train, Y_train)))\n","\n","print(\"Results on M1 and F1:\")\n","print_bias_tilt(RF_Model, countvectorizer_test_m1, countvectorizer_test_f1, Y_test)\n","\n","print(\"Results on M2 and F2:\")\n","print_bias_tilt(RF_Model, countvectorizer_test_m2, countvectorizer_test_f2, Y_test)\n","\n","print(\"Results on M3 and F3:\")\n","print_bias_tilt(RF_Model, countvectorizer_test_m3, countvectorizer_test_f3, Y_test)\n"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2022-09-23T11:25:29.003272Z","iopub.status.busy":"2022-09-23T11:25:29.002585Z","iopub.status.idle":"2022-09-23T11:25:29.027214Z","shell.execute_reply":"2022-09-23T11:25:29.026229Z","shell.execute_reply.started":"2022-09-23T11:25:29.003237Z"},"trusted":true},"outputs":[],"source":["#print(\"model report: %s: \\n %s\\n\" % (RF_Model, metrics.classification_report(Y_test, prediction)))"]}],"metadata":{"kaggle":{"accelerator":"gpu","dataSources":[{"datasetId":1519260,"sourceId":2508632,"sourceType":"datasetVersion"}],"isGpuEnabled":true,"isInternetEnabled":false,"language":"python","sourceType":"notebook"},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.12.1"}},"nbformat":4,"nbformat_minor":4}
+{"cells":[{"cell_type":"markdown","metadata":{},"source":["# **Importing Modules**"]},{"cell_type":"code","execution_count":1,"metadata":{"execution":{"iopub.execute_input":"2022-09-23T11:18:31.409907Z","iopub.status.busy":"2022-09-23T11:18:31.40606Z","iopub.status.idle":"2022-09-23T11:18:32.31181Z","shell.execute_reply":"2022-09-23T11:18:32.310769Z","shell.execute_reply.started":"2022-09-23T11:18:31.409862Z"},"trusted":true},"outputs":[],"source":["import pandas as pd\n","import nltk\n","from nltk.corpus import stopwords\n","import re\n","from sklearn.feature_extraction.text import CountVectorizer\n","from sklearn.model_selection import train_test_split\n","from sklearn.ensemble import RandomForestClassifier\n","from sklearn.naive_bayes import MultinomialNB\n","from sklearn.neural_network import MLPClassifier"]},{"cell_type":"code","execution_count":2,"metadata":{"execution":{"iopub.execute_input":"2022-09-23T11:18:32.314827Z","iopub.status.busy":"2022-09-23T11:18:32.314039Z","iopub.status.idle":"2022-09-23T11:18:32.903483Z","shell.execute_reply":"2022-09-23T11:18:32.902502Z","shell.execute_reply.started":"2022-09-23T11:18:32.31479Z"},"trusted":true},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>ID</th>\n","      <th>Resume_str</th>\n","      <th>Resume_html</th>\n","      <th>Category</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>16852973</td>\n","      <td>HR ADMINISTRATOR/MARKETING ASSOCIATE\\...</td>\n","      <td>&lt;div class=\"fontsize fontface vmargins hmargin...</td>\n","      <td>HR</td>\n","    
</tr>\n","    <tr>\n","      <th>1</th>\n","      <td>22323967</td>\n","      <td>HR SPECIALIST, US HR OPERATIONS      ...</td>\n","      <td>&lt;div class=\"fontsize fontface vmargins hmargin...</td>\n","      <td>HR</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>33176873</td>\n","      <td>HR DIRECTOR       Summary      Over 2...</td>\n","      <td>&lt;div class=\"fontsize fontface vmargins hmargin...</td>\n","      <td>HR</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>27018550</td>\n","      <td>HR SPECIALIST       Summary    Dedica...</td>\n","      <td>&lt;div class=\"fontsize fontface vmargins hmargin...</td>\n","      <td>HR</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>17812897</td>\n","      <td>HR MANAGER         Skill Highlights  ...</td>\n","      <td>&lt;div class=\"fontsize fontface vmargins hmargin...</td>\n","      <td>HR</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>"],"text/plain":["         ID                                         Resume_str  \\\n","0  16852973           HR ADMINISTRATOR/MARKETING ASSOCIATE\\...   \n","1  22323967           HR SPECIALIST, US HR OPERATIONS      ...   \n","2  33176873           HR DIRECTOR       Summary      Over 2...   \n","3  27018550           HR SPECIALIST       Summary    Dedica...   \n","4  17812897           HR MANAGER         Skill Highlights  ...   \n","\n","                                         Resume_html Category  \n","0  <div class=\"fontsize fontface vmargins hmargin...       HR  \n","1  <div class=\"fontsize fontface vmargins hmargin...       HR  \n","2  <div class=\"fontsize fontface vmargins hmargin...       HR  \n","3  <div class=\"fontsize fontface vmargins hmargin...       HR  \n","4  <div class=\"fontsize fontface vmargins hmargin...       
HR  "]},"execution_count":2,"metadata":{},"output_type":"execute_result"}],"source":["df = pd.read_csv(\"datasets/Resume/Resume.csv\")\n","df.head()"]},{"cell_type":"code","execution_count":3,"metadata":{"execution":{"iopub.execute_input":"2022-09-23T11:18:32.905776Z","iopub.status.busy":"2022-09-23T11:18:32.905085Z","iopub.status.idle":"2022-09-23T11:18:32.919723Z","shell.execute_reply":"2022-09-23T11:18:32.918685Z","shell.execute_reply.started":"2022-09-23T11:18:32.905738Z"},"trusted":true},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>Resume_str</th>\n","      <th>Category</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>HR ADMINISTRATOR/MARKETING ASSOCIATE\\...</td>\n","      <td>HR</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>HR SPECIALIST, US HR OPERATIONS      ...</td>\n","      <td>HR</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>HR DIRECTOR       Summary      Over 2...</td>\n","      <td>HR</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>HR SPECIALIST       Summary    Dedica...</td>\n","      <td>HR</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>HR MANAGER         Skill Highlights  ...</td>\n","      <td>HR</td>\n","    </tr>\n","    <tr>\n","      <th>...</th>\n","      <td>...</td>\n","      <td>...</td>\n","    </tr>\n","    <tr>\n","      <th>2479</th>\n","      <td>RANK: SGT/E-5 NON- COMMISSIONED OFFIC...</td>\n","      <td>AVIATION</td>\n","    </tr>\n","    <tr>\n","      <th>2480</th>\n","      <td>GOVERNMENT RELATIONS, 
COMMUNICATIONS ...</td>\n","      <td>AVIATION</td>\n","    </tr>\n","    <tr>\n","      <th>2481</th>\n","      <td>GEEK SQUAD AGENT         Professional...</td>\n","      <td>AVIATION</td>\n","    </tr>\n","    <tr>\n","      <th>2482</th>\n","      <td>PROGRAM DIRECTOR / OFFICE MANAGER    ...</td>\n","      <td>AVIATION</td>\n","    </tr>\n","    <tr>\n","      <th>2483</th>\n","      <td>STOREKEEPER II       Professional Sum...</td>\n","      <td>AVIATION</td>\n","    </tr>\n","  </tbody>\n","</table>\n","<p>2484 rows × 2 columns</p>\n","</div>"],"text/plain":["                                             Resume_str  Category\n","0              HR ADMINISTRATOR/MARKETING ASSOCIATE\\...        HR\n","1              HR SPECIALIST, US HR OPERATIONS      ...        HR\n","2              HR DIRECTOR       Summary      Over 2...        HR\n","3              HR SPECIALIST       Summary    Dedica...        HR\n","4              HR MANAGER         Skill Highlights  ...        HR\n","...                                                 ...       ...\n","2479           RANK: SGT/E-5 NON- COMMISSIONED OFFIC...  AVIATION\n","2480           GOVERNMENT RELATIONS, COMMUNICATIONS ...  AVIATION\n","2481           GEEK SQUAD AGENT         Professional...  AVIATION\n","2482           PROGRAM DIRECTOR / OFFICE MANAGER    ...  AVIATION\n","2483           STOREKEEPER II       Professional Sum...  
AVIATION\n","\n","[2484 rows x 2 columns]"]},"execution_count":3,"metadata":{},"output_type":"execute_result"}],"source":["# Remove unused columns\n","df.drop(columns = ['ID', 'Resume_html'], inplace = True)\n","df"]},{"cell_type":"markdown","metadata":{},"source":["# **Data Preprocessing**"]},{"cell_type":"code","execution_count":4,"metadata":{"execution":{"iopub.execute_input":"2022-09-23T11:18:32.921581Z","iopub.status.busy":"2022-09-23T11:18:32.921222Z","iopub.status.idle":"2022-09-23T11:18:32.92632Z","shell.execute_reply":"2022-09-23T11:18:32.925252Z","shell.execute_reply.started":"2022-09-23T11:18:32.921545Z"},"trusted":true},"outputs":[],"source":["STEMMER = nltk.stem.PorterStemmer()"]},{"cell_type":"code","execution_count":5,"metadata":{"execution":{"iopub.execute_input":"2022-09-23T11:18:32.928369Z","iopub.status.busy":"2022-09-23T11:18:32.927868Z","iopub.status.idle":"2022-09-23T11:18:32.937257Z","shell.execute_reply":"2022-09-23T11:18:32.93626Z","shell.execute_reply.started":"2022-09-23T11:18:32.928324Z"},"trusted":true},"outputs":[],"source":["def preprocess(txt):\n","    txt = txt.lower()\n","    txt = re.sub('[^a-zA-Z]', ' ', txt)\n","    txt = nltk.tokenize.word_tokenize(txt)\n","    txt = [w for w in txt if not w in nltk.corpus.stopwords.words('english')]\n","    txt = [STEMMER.stem(w) for w in txt]\n","\n","    return ' '.join(txt)"]},{"cell_type":"code","execution_count":6,"metadata":{"execution":{"iopub.execute_input":"2022-09-23T11:18:32.939037Z","iopub.status.busy":"2022-09-23T11:18:32.93862Z","iopub.status.idle":"2022-09-23T11:22:59.508313Z","shell.execute_reply":"2022-09-23T11:22:59.50738Z","shell.execute_reply.started":"2022-09-23T11:18:32.939004Z"},"trusted":true},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        
text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>Category</th>\n","      <th>Resume</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>HR</td>\n","      <td>hr administr market associ hr administr summar...</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>HR</td>\n","      <td>hr specialist us hr oper summari versatil medi...</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>HR</td>\n","      <td>hr director summari year experi recruit plu ye...</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>HR</td>\n","      <td>hr specialist summari dedic driven dynam year ...</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>HR</td>\n","      <td>hr manag skill highlight hr skill hr depart st...</td>\n","    </tr>\n","    <tr>\n","      <th>...</th>\n","      <td>...</td>\n","      <td>...</td>\n","    </tr>\n","    <tr>\n","      <th>2479</th>\n","      <td>AVIATION</td>\n","      <td>rank sgt e non commiss offic charg brigad mail...</td>\n","    </tr>\n","    <tr>\n","      <th>2480</th>\n","      <td>AVIATION</td>\n","      <td>govern relat commun organiz develop director p...</td>\n","    </tr>\n","    <tr>\n","      <th>2481</th>\n","      <td>AVIATION</td>\n","      <td>geek squad agent profession profil support spe...</td>\n","    </tr>\n","    <tr>\n","      <th>2482</th>\n","      <td>AVIATION</td>\n","      <td>program director offic manag summari highli pe...</td>\n","    </tr>\n","    <tr>\n","      <th>2483</th>\n","      <td>AVIATION</td>\n","      <td>storekeep ii profession summari purpos documen...</td>\n","    </tr>\n","  </tbody>\n","</table>\n","<p>2484 rows × 2 columns</p>\n","</div>"],"text/plain":["      Category                                             Resume\n","0           HR  hr administr market associ hr 
administr summar...\n","1           HR  hr specialist us hr oper summari versatil medi...\n","2           HR  hr director summari year experi recruit plu ye...\n","3           HR  hr specialist summari dedic driven dynam year ...\n","4           HR  hr manag skill highlight hr skill hr depart st...\n","...        ...                                                ...\n","2479  AVIATION  rank sgt e non commiss offic charg brigad mail...\n","2480  AVIATION  govern relat commun organiz develop director p...\n","2481  AVIATION  geek squad agent profession profil support spe...\n","2482  AVIATION  program director offic manag summari highli pe...\n","2483  AVIATION  storekeep ii profession summari purpos documen...\n","\n","[2484 rows x 2 columns]"]},"execution_count":6,"metadata":{},"output_type":"execute_result"}],"source":["df['Resume'] = df['Resume_str'].apply(lambda w: preprocess(w))\n","df.pop('Resume_str')\n","df"]},{"cell_type":"markdown","metadata":{},"source":["# **Data Cleaning**"]},{"cell_type":"code","execution_count":7,"metadata":{"execution":{"iopub.execute_input":"2022-09-23T11:23:19.459599Z","iopub.status.busy":"2022-09-23T11:23:19.459214Z","iopub.status.idle":"2022-09-23T11:23:19.811042Z","shell.execute_reply":"2022-09-23T11:23:19.809961Z","shell.execute_reply.started":"2022-09-23T11:23:19.459561Z"},"trusted":true},"outputs":[{"name":"stderr","output_type":"stream","text":["[nltk_data] Downloading package punkt to\n","[nltk_data]     C:\\Users\\neu\\AppData\\Roaming\\nltk_data...\n","[nltk_data]   Package punkt is already up-to-date!\n","[nltk_data] Downloading package stopwords to\n","[nltk_data]     C:\\Users\\neu\\AppData\\Roaming\\nltk_data...\n","[nltk_data]   Package stopwords is already 
up-to-date!\n"]},{"data":{"text/plain":["True"]},"execution_count":7,"metadata":{},"output_type":"execute_result"}],"source":["nltk.download(\"punkt\")\n","nltk.download(\"stopwords\")"]},{"cell_type":"code","execution_count":8,"metadata":{"execution":{"iopub.execute_input":"2022-09-23T11:23:19.823147Z","iopub.status.busy":"2022-09-23T11:23:19.822326Z","iopub.status.idle":"2022-09-23T11:23:19.830987Z","shell.execute_reply":"2022-09-23T11:23:19.830038Z","shell.execute_reply.started":"2022-09-23T11:23:19.823106Z"},"trusted":true},"outputs":[],"source":["stop_words = stopwords.words('english')"]},{"cell_type":"code","execution_count":9,"metadata":{"execution":{"iopub.execute_input":"2022-09-23T11:23:19.832603Z","iopub.status.busy":"2022-09-23T11:23:19.832197Z","iopub.status.idle":"2022-09-23T11:23:19.842623Z","shell.execute_reply":"2022-09-23T11:23:19.841026Z","shell.execute_reply.started":"2022-09-23T11:23:19.832569Z"},"trusted":true},"outputs":[],"source":["def remove_stop_words(text, stop_words):\n","    result = []\n","    for token in text.split():\n","        if token.lower() not in stop_words:\n","            result.append(token)\n","    return result"]},{"cell_type":"code","execution_count":10,"metadata":{"execution":{"iopub.execute_input":"2022-09-23T11:23:20.960439Z","iopub.status.busy":"2022-09-23T11:23:20.96007Z","iopub.status.idle":"2022-09-23T11:23:26.590815Z","shell.execute_reply":"2022-09-23T11:23:26.589847Z","shell.execute_reply.started":"2022-09-23T11:23:20.96041Z"},"trusted":true},"outputs":[],"source":["df['clean'] = df['Resume'].apply(remove_stop_words, 
stop_words=stop_words).astype(str)"]},{"cell_type":"code","execution_count":11,"metadata":{"execution":{"iopub.execute_input":"2022-09-23T11:23:28.380762Z","iopub.status.busy":"2022-09-23T11:23:28.380393Z","iopub.status.idle":"2022-09-23T11:23:28.408386Z","shell.execute_reply":"2022-09-23T11:23:28.407311Z","shell.execute_reply.started":"2022-09-23T11:23:28.380729Z"},"trusted":true},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>Category</th>\n","      <th>Resume</th>\n","      <th>clean</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>HR</td>\n","      <td>hr administr market associ hr administr summar...</td>\n","      <td>['hr', 'administr', 'market', 'associ', 'hr', ...</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>HR</td>\n","      <td>hr specialist us hr oper summari versatil medi...</td>\n","      <td>['hr', 'specialist', 'us', 'hr', 'oper', 'summ...</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>HR</td>\n","      <td>hr director summari year experi recruit plu ye...</td>\n","      <td>['hr', 'director', 'summari', 'year', 'experi'...</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>HR</td>\n","      <td>hr specialist summari dedic driven dynam year ...</td>\n","      <td>['hr', 'specialist', 'summari', 'dedic', 'driv...</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>HR</td>\n","      <td>hr manag skill highlight hr skill hr depart st...</td>\n","      <td>['hr', 'manag', 'skill', 'highlight', 'hr', 's...</td>\n","    </tr>\n","    
<tr>\n","      <th>...</th>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","    </tr>\n","    <tr>\n","      <th>2479</th>\n","      <td>AVIATION</td>\n","      <td>rank sgt e non commiss offic charg brigad mail...</td>\n","      <td>['rank', 'sgt', 'e', 'non', 'commiss', 'offic'...</td>\n","    </tr>\n","    <tr>\n","      <th>2480</th>\n","      <td>AVIATION</td>\n","      <td>govern relat commun organiz develop director p...</td>\n","      <td>['govern', 'relat', 'commun', 'organiz', 'deve...</td>\n","    </tr>\n","    <tr>\n","      <th>2481</th>\n","      <td>AVIATION</td>\n","      <td>geek squad agent profession profil support spe...</td>\n","      <td>['geek', 'squad', 'agent', 'profession', 'prof...</td>\n","    </tr>\n","    <tr>\n","      <th>2482</th>\n","      <td>AVIATION</td>\n","      <td>program director offic manag summari highli pe...</td>\n","      <td>['program', 'director', 'offic', 'manag', 'sum...</td>\n","    </tr>\n","    <tr>\n","      <th>2483</th>\n","      <td>AVIATION</td>\n","      <td>storekeep ii profession summari purpos documen...</td>\n","      <td>['storekeep', 'ii', 'profession', 'summari', '...</td>\n","    </tr>\n","  </tbody>\n","</table>\n","<p>2484 rows × 3 columns</p>\n","</div>"],"text/plain":["      Category                                             Resume  \\\n","0           HR  hr administr market associ hr administr summar...   \n","1           HR  hr specialist us hr oper summari versatil medi...   \n","2           HR  hr director summari year experi recruit plu ye...   \n","3           HR  hr specialist summari dedic driven dynam year ...   \n","4           HR  hr manag skill highlight hr skill hr depart st...   \n","...        ...                                                ...   \n","2479  AVIATION  rank sgt e non commiss offic charg brigad mail...   \n","2480  AVIATION  govern relat commun organiz develop director p...   
\n","2481  AVIATION  geek squad agent profession profil support spe...   \n","2482  AVIATION  program director offic manag summari highli pe...   \n","2483  AVIATION  storekeep ii profession summari purpos documen...   \n","\n","                                                  clean  \n","0     ['hr', 'administr', 'market', 'associ', 'hr', ...  \n","1     ['hr', 'specialist', 'us', 'hr', 'oper', 'summ...  \n","2     ['hr', 'director', 'summari', 'year', 'experi'...  \n","3     ['hr', 'specialist', 'summari', 'dedic', 'driv...  \n","4     ['hr', 'manag', 'skill', 'highlight', 'hr', 's...  \n","...                                                 ...  \n","2479  ['rank', 'sgt', 'e', 'non', 'commiss', 'offic'...  \n","2480  ['govern', 'relat', 'commun', 'organiz', 'deve...  \n","2481  ['geek', 'squad', 'agent', 'profession', 'prof...  \n","2482  ['program', 'director', 'offic', 'manag', 'sum...  \n","2483  ['storekeep', 'ii', 'profession', 'summari', '...  \n","\n","[2484 rows x 3 columns]"]},"execution_count":11,"metadata":{},"output_type":"execute_result"}],"source":["df"]},{"cell_type":"markdown","metadata":{},"source":["# **Training set and test set creation**"]},{"cell_type":"markdown","metadata":{},"source":["```stratify``` ensures that the split is done in a way that preserves the proportion of each category in both the training and test sets. 
<br>\n","This gives us a test set that contains 20% of the resumes in each job category and guarantees that each category is represented in the test set."]},{"cell_type":"code","execution_count":12,"metadata":{"execution":{"iopub.execute_input":"2022-09-23T11:23:29.570446Z","iopub.status.busy":"2022-09-23T11:23:29.570104Z","iopub.status.idle":"2022-09-23T11:23:29.577445Z","shell.execute_reply":"2022-09-23T11:23:29.576454Z","shell.execute_reply.started":"2022-09-23T11:23:29.570418Z"},"trusted":true},"outputs":[],"source":["# Fixed seed so that Restart and Run All reproduces the same stratified split\n","X_train, X_test, Y_train, Y_test = train_test_split(df['clean'], df['Category'], test_size = 0.2, stratify=df['Category'], random_state=42)"]},{"cell_type":"markdown","metadata":{},"source":["Next, we need to figure out the stemmed versions of the gender-indicating terms that we want to replace as well as their replacements. <br>\n","We also need to find the stemmed versions of the terms that will be added to create the gendered test sets."]},{"cell_type":"code","execution_count":13,"metadata":{},"outputs":[{"data":{"text/plain":["'man woman men women human peopl'"]},"execution_count":13,"metadata":{},"output_type":"execute_result"}],"source":["# Gender-indicating terms and their replacements\n","preprocess(\"man woman men women human people\")"]},{"cell_type":"code","execution_count":14,"metadata":{},"outputs":[{"data":{"text/plain":["'gender male femal'"]},"execution_count":14,"metadata":{},"output_type":"execute_result"}],"source":["# Gender-indicating terms that will be used to create the gendered test sets\n","preprocess(\"gender male female\")"]},{"cell_type":"code","execution_count":15,"metadata":{},"outputs":[],"source":["# Step 1: Replace gendered terms with gender neutral terms\n","# Each pattern is wrapped in \\b word boundaries so that only whole tokens are rewritten;\n","# a bare \"man\" or \"men\" would also match inside stems such as \"manag\", \"human\" or \"document\"\n","X_test_neutral = X_test.replace({r\"\\bmale\\b\": \"human\",\n","                                 r\"\\bfemal\\b\": \"human\",\n","                                 r\"\\bman\\b\": \"human\",\n","                                 r\"\\bwoman\\b\": \"human\",\n","                                 r\"\\bmen\\b\": 
\"peopl\",\n","                                 r\"\\bwomen\\b\": \"peopl\"}, regex=True)\n","\n","# Step 2: Append the stemmed gender marker (\"gender male\" / \"gender femal\") to the neutral test data\n","# Also create versions where the gender-indicating term gets inserted multiple times\n","# Series + str builds a new Series, so X_test_neutral itself is left untouched\n","X_test_male_1 = X_test_neutral + \" gender male\"\n","X_test_male_2 = X_test_neutral + \" gender male\" * 2\n","X_test_male_3 = X_test_neutral + \" gender male\" * 3\n","X_test_male_1000 = X_test_neutral + \" gender male\" * 1000\n","\n","X_test_female_1 = X_test_neutral + \" gender femal\"\n","X_test_female_2 = X_test_neutral + \" gender femal\" * 2\n","X_test_female_3 = X_test_neutral + \" gender femal\" * 3\n","X_test_female_1000 = X_test_neutral + \" gender femal\" * 1000"]},{"cell_type":"code","execution_count":16,"metadata":{},"outputs":[],"source":["# Fit the vocabulary on the training resumes only; the test sets are transformed with that vocabulary\n","vectorizer = CountVectorizer()\n","countvectorizer_train = vectorizer.fit_transform(X_train).astype(float)\n","\n","countvectorizer_test_m1 = vectorizer.transform(X_test_male_1).astype(float)\n","countvectorizer_test_m2 = vectorizer.transform(X_test_male_2).astype(float)\n","countvectorizer_test_m3 = vectorizer.transform(X_test_male_3).astype(float)\n","countvectorizer_test_m1000 = vectorizer.transform(X_test_male_1000).astype(float)\n","\n","# The female matrices are built from the female test sets so that male and female scores can actually differ\n","countvectorizer_test_f1 = vectorizer.transform(X_test_female_1).astype(float)\n","countvectorizer_test_f2 = vectorizer.transform(X_test_female_2).astype(float)\n","countvectorizer_test_f3 = 
vectorizer.transform(X_test_female_3).astype(float)\n","countvectorizer_test_f1000 = vectorizer.transform(X_test_female_1000).astype(float)"]},{"cell_type":"markdown","metadata":{},"source":["# **Model Training** - RF-based classifier"]},{"cell_type":"markdown","metadata":{},"source":["In the original version, the author used grid search to find the best hyperparameter values for the RF model. But since the author has already done that, we can skip that part and jump straight to the training process. <br>\n","The only thing I had to change was the \"max_features\" parameter, since \"auto\" (the original input) wasn't a valid input value."]},{"cell_type":"code","execution_count":17,"metadata":{"execution":{"iopub.execute_input":"2022-09-23T11:25:19.890779Z","iopub.status.busy":"2022-09-23T11:25:19.890391Z","iopub.status.idle":"2022-09-23T11:25:21.789228Z","shell.execute_reply":"2022-09-23T11:25:21.788131Z","shell.execute_reply.started":"2022-09-23T11:25:19.890747Z"},"trusted":true},"outputs":[{"data":{"text/html":["<style>#sk-container-id-1 {\n","  /* Definition of color scheme common for light and dark mode */\n","  --sklearn-color-text: black;\n","  --sklearn-color-line: gray;\n","  /* Definition of color scheme for unfitted estimators */\n","  --sklearn-color-unfitted-level-0: #fff5e6;\n","  --sklearn-color-unfitted-level-1: #f6e4d2;\n","  --sklearn-color-unfitted-level-2: #ffe0b3;\n","  --sklearn-color-unfitted-level-3: chocolate;\n","  /* Definition of color scheme for fitted estimators */\n","  --sklearn-color-fitted-level-0: #f0f8ff;\n","  --sklearn-color-fitted-level-1: #d4ebff;\n","  --sklearn-color-fitted-level-2: #b3dbfd;\n","  --sklearn-color-fitted-level-3: cornflowerblue;\n","\n","  /* Specific color for light theme */\n","  --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n","  --sklearn-color-background: var(--sg-background-color, var(--theme-background, 
var(--jp-layout-color0, white)));\n","  --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n","  --sklearn-color-icon: #696969;\n","\n","  @media (prefers-color-scheme: dark) {\n","    /* Redefinition of color scheme for dark theme */\n","    --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n","    --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n","    --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n","    --sklearn-color-icon: #878787;\n","  }\n","}\n","\n","#sk-container-id-1 {\n","  color: var(--sklearn-color-text);\n","}\n","\n","#sk-container-id-1 pre {\n","  padding: 0;\n","}\n","\n","#sk-container-id-1 input.sk-hidden--visually {\n","  border: 0;\n","  clip: rect(1px 1px 1px 1px);\n","  clip: rect(1px, 1px, 1px, 1px);\n","  height: 1px;\n","  margin: -1px;\n","  overflow: hidden;\n","  padding: 0;\n","  position: absolute;\n","  width: 1px;\n","}\n","\n","#sk-container-id-1 div.sk-dashed-wrapped {\n","  border: 1px dashed var(--sklearn-color-line);\n","  margin: 0 0.4em 0.5em 0.4em;\n","  box-sizing: border-box;\n","  padding-bottom: 0.4em;\n","  background-color: var(--sklearn-color-background);\n","}\n","\n","#sk-container-id-1 div.sk-container {\n","  /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n","     but bootstrap.min.css set `[hidden] { display: none !important; }`\n","     so we also need the `!important` here to be able to override the\n","     default hidden behavior on the sphinx rendered scikit-learn.org.\n","     See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n","  display: inline-block !important;\n","  position: relative;\n","}\n","\n","#sk-container-id-1 div.sk-text-repr-fallback {\n","  display: 
none;\n","}\n","\n","div.sk-parallel-item,\n","div.sk-serial,\n","div.sk-item {\n","  /* draw centered vertical line to link estimators */\n","  background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n","  background-size: 2px 100%;\n","  background-repeat: no-repeat;\n","  background-position: center center;\n","}\n","\n","/* Parallel-specific style estimator block */\n","\n","#sk-container-id-1 div.sk-parallel-item::after {\n","  content: \"\";\n","  width: 100%;\n","  border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n","  flex-grow: 1;\n","}\n","\n","#sk-container-id-1 div.sk-parallel {\n","  display: flex;\n","  align-items: stretch;\n","  justify-content: center;\n","  background-color: var(--sklearn-color-background);\n","  position: relative;\n","}\n","\n","#sk-container-id-1 div.sk-parallel-item {\n","  display: flex;\n","  flex-direction: column;\n","}\n","\n","#sk-container-id-1 div.sk-parallel-item:first-child::after {\n","  align-self: flex-end;\n","  width: 50%;\n","}\n","\n","#sk-container-id-1 div.sk-parallel-item:last-child::after {\n","  align-self: flex-start;\n","  width: 50%;\n","}\n","\n","#sk-container-id-1 div.sk-parallel-item:only-child::after {\n","  width: 0;\n","}\n","\n","/* Serial-specific style estimator block */\n","\n","#sk-container-id-1 div.sk-serial {\n","  display: flex;\n","  flex-direction: column;\n","  align-items: center;\n","  background-color: var(--sklearn-color-background);\n","  padding-right: 1em;\n","  padding-left: 1em;\n","}\n","\n","\n","/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n","clickable and can be expanded/collapsed.\n","- Pipeline and ColumnTransformer use this feature and define the default style\n","- Estimators will overwrite some part of the style using the `sk-estimator` class\n","*/\n","\n","/* Pipeline and ColumnTransformer style (default) 
*/\n","\n","#sk-container-id-1 div.sk-toggleable {\n","  /* Default theme specific background. It is overwritten whether we have a\n","  specific estimator or a Pipeline/ColumnTransformer */\n","  background-color: var(--sklearn-color-background);\n","}\n","\n","/* Toggleable label */\n","#sk-container-id-1 label.sk-toggleable__label {\n","  cursor: pointer;\n","  display: block;\n","  width: 100%;\n","  margin-bottom: 0;\n","  padding: 0.5em;\n","  box-sizing: border-box;\n","  text-align: center;\n","}\n","\n","#sk-container-id-1 label.sk-toggleable__label-arrow:before {\n","  /* Arrow on the left of the label */\n","  content: \"▸\";\n","  float: left;\n","  margin-right: 0.25em;\n","  color: var(--sklearn-color-icon);\n","}\n","\n","#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {\n","  color: var(--sklearn-color-text);\n","}\n","\n","/* Toggleable content - dropdown */\n","\n","#sk-container-id-1 div.sk-toggleable__content {\n","  max-height: 0;\n","  max-width: 0;\n","  overflow: hidden;\n","  text-align: left;\n","  /* unfitted */\n","  background-color: var(--sklearn-color-unfitted-level-0);\n","}\n","\n","#sk-container-id-1 div.sk-toggleable__content.fitted {\n","  /* fitted */\n","  background-color: var(--sklearn-color-fitted-level-0);\n","}\n","\n","#sk-container-id-1 div.sk-toggleable__content pre {\n","  margin: 0.2em;\n","  border-radius: 0.25em;\n","  color: var(--sklearn-color-text);\n","  /* unfitted */\n","  background-color: var(--sklearn-color-unfitted-level-0);\n","}\n","\n","#sk-container-id-1 div.sk-toggleable__content.fitted pre {\n","  /* unfitted */\n","  background-color: var(--sklearn-color-fitted-level-0);\n","}\n","\n","#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n","  /* Expand drop-down */\n","  max-height: 200px;\n","  max-width: 100%;\n","  overflow: auto;\n","}\n","\n","#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n","  
content: \"▾\";\n","}\n","\n","/* Pipeline/ColumnTransformer-specific style */\n","\n","#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n","  color: var(--sklearn-color-text);\n","  background-color: var(--sklearn-color-unfitted-level-2);\n","}\n","\n","#sk-container-id-1 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n","  background-color: var(--sklearn-color-fitted-level-2);\n","}\n","\n","/* Estimator-specific style */\n","\n","/* Colorize estimator box */\n","#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n","  /* unfitted */\n","  background-color: var(--sklearn-color-unfitted-level-2);\n","}\n","\n","#sk-container-id-1 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n","  /* fitted */\n","  background-color: var(--sklearn-color-fitted-level-2);\n","}\n","\n","#sk-container-id-1 div.sk-label label.sk-toggleable__label,\n","#sk-container-id-1 div.sk-label label {\n","  /* The background is the default theme color */\n","  color: var(--sklearn-color-text-on-default-background);\n","}\n","\n","/* On hover, darken the color of the background */\n","#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label {\n","  color: var(--sklearn-color-text);\n","  background-color: var(--sklearn-color-unfitted-level-2);\n","}\n","\n","/* Label box, darken color on hover, fitted */\n","#sk-container-id-1 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n","  color: var(--sklearn-color-text);\n","  background-color: var(--sklearn-color-fitted-level-2);\n","}\n","\n","/* Estimator label */\n","\n","#sk-container-id-1 div.sk-label label {\n","  font-family: monospace;\n","  font-weight: bold;\n","  display: inline-block;\n","  line-height: 1.2em;\n","}\n","\n","#sk-container-id-1 div.sk-label-container {\n","  text-align: center;\n","}\n","\n","/* Estimator-specific 
*/\n","#sk-container-id-1 div.sk-estimator {\n","  font-family: monospace;\n","  border: 1px dotted var(--sklearn-color-border-box);\n","  border-radius: 0.25em;\n","  box-sizing: border-box;\n","  margin-bottom: 0.5em;\n","  /* unfitted */\n","  background-color: var(--sklearn-color-unfitted-level-0);\n","}\n","\n","#sk-container-id-1 div.sk-estimator.fitted {\n","  /* fitted */\n","  background-color: var(--sklearn-color-fitted-level-0);\n","}\n","\n","/* on hover */\n","#sk-container-id-1 div.sk-estimator:hover {\n","  /* unfitted */\n","  background-color: var(--sklearn-color-unfitted-level-2);\n","}\n","\n","#sk-container-id-1 div.sk-estimator.fitted:hover {\n","  /* fitted */\n","  background-color: var(--sklearn-color-fitted-level-2);\n","}\n","\n","/* Specification for estimator info (e.g. \"i\" and \"?\") */\n","\n","/* Common style for \"i\" and \"?\" */\n","\n",".sk-estimator-doc-link,\n","a:link.sk-estimator-doc-link,\n","a:visited.sk-estimator-doc-link {\n","  float: right;\n","  font-size: smaller;\n","  line-height: 1em;\n","  font-family: monospace;\n","  background-color: var(--sklearn-color-background);\n","  border-radius: 1em;\n","  height: 1em;\n","  width: 1em;\n","  text-decoration: none !important;\n","  margin-left: 1ex;\n","  /* unfitted */\n","  border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n","  color: var(--sklearn-color-unfitted-level-1);\n","}\n","\n",".sk-estimator-doc-link.fitted,\n","a:link.sk-estimator-doc-link.fitted,\n","a:visited.sk-estimator-doc-link.fitted {\n","  /* fitted */\n","  border: var(--sklearn-color-fitted-level-1) 1pt solid;\n","  color: var(--sklearn-color-fitted-level-1);\n","}\n","\n","/* On hover */\n","div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",".sk-estimator-doc-link:hover,\n","div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",".sk-estimator-doc-link:hover {\n","  /* unfitted */\n","  background-color: var(--sklearn-color-unfitted-level-3);\n","  color: 
var(--sklearn-color-background);\n","  text-decoration: none;\n","}\n","\n","div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",".sk-estimator-doc-link.fitted:hover,\n","div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",".sk-estimator-doc-link.fitted:hover {\n","  /* fitted */\n","  background-color: var(--sklearn-color-fitted-level-3);\n","  color: var(--sklearn-color-background);\n","  text-decoration: none;\n","}\n","\n","/* Span, style for the box shown on hovering the info icon */\n",".sk-estimator-doc-link span {\n","  display: none;\n","  z-index: 9999;\n","  position: relative;\n","  font-weight: normal;\n","  right: .2ex;\n","  padding: .5ex;\n","  margin: .5ex;\n","  width: min-content;\n","  min-width: 20ex;\n","  max-width: 50ex;\n","  color: var(--sklearn-color-text);\n","  box-shadow: 2pt 2pt 4pt #999;\n","  /* unfitted */\n","  background: var(--sklearn-color-unfitted-level-0);\n","  border: .5pt solid var(--sklearn-color-unfitted-level-3);\n","}\n","\n",".sk-estimator-doc-link.fitted span {\n","  /* fitted */\n","  background: var(--sklearn-color-fitted-level-0);\n","  border: var(--sklearn-color-fitted-level-3);\n","}\n","\n",".sk-estimator-doc-link:hover span {\n","  display: block;\n","}\n","\n","/* \"?\"-specific style due to the `<a>` HTML tag */\n","\n","#sk-container-id-1 a.estimator_doc_link {\n","  float: right;\n","  font-size: 1rem;\n","  line-height: 1em;\n","  font-family: monospace;\n","  background-color: var(--sklearn-color-background);\n","  border-radius: 1rem;\n","  height: 1rem;\n","  width: 1rem;\n","  text-decoration: none;\n","  /* unfitted */\n","  color: var(--sklearn-color-unfitted-level-1);\n","  border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n","}\n","\n","#sk-container-id-1 a.estimator_doc_link.fitted {\n","  /* fitted */\n","  border: var(--sklearn-color-fitted-level-1) 1pt solid;\n","  color: var(--sklearn-color-fitted-level-1);\n","}\n","\n","/* On hover 
*/\n","#sk-container-id-1 a.estimator_doc_link:hover {\n","  /* unfitted */\n","  background-color: var(--sklearn-color-unfitted-level-3);\n","  color: var(--sklearn-color-background);\n","  text-decoration: none;\n","}\n","\n","#sk-container-id-1 a.estimator_doc_link.fitted:hover {\n","  /* fitted */\n","  background-color: var(--sklearn-color-fitted-level-3);\n","}\n","</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>RandomForestClassifier(max_depth=8, n_estimators=500, random_state=42)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" checked><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;&nbsp;RandomForestClassifier<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.ensemble.RandomForestClassifier.html\">?<span>Documentation for RandomForestClassifier</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></label><div class=\"sk-toggleable__content fitted\"><pre>RandomForestClassifier(max_depth=8, n_estimators=500, random_state=42)</pre></div> </div></div></div></div>"],"text/plain":["RandomForestClassifier(max_depth=8, n_estimators=500, random_state=42)"]},"execution_count":17,"metadata":{},"output_type":"execute_result"}],"source":["RF_Model = RandomForestClassifier(random_state=42, max_features='sqrt', n_estimators= 500, max_depth=8, criterion='gini')\n","RF_Model.fit(countvectorizer_train, 
Y_train)"]},{"cell_type":"code","execution_count":18,"metadata":{"execution":{"iopub.execute_input":"2022-09-23T11:25:22.522715Z","iopub.status.busy":"2022-09-23T11:25:22.522322Z","iopub.status.idle":"2022-09-23T11:25:22.846675Z","shell.execute_reply":"2022-09-23T11:25:22.845545Z","shell.execute_reply.started":"2022-09-23T11:25:22.522682Z"},"trusted":true},"outputs":[],"source":["# NOTE(review): the prediction_* arrays are not read by any later cell; print_bias_tilt re-scores the model itself\n","prediction_m1=RF_Model.predict(countvectorizer_test_m1)\n","prediction_m2=RF_Model.predict(countvectorizer_test_m2)\n","prediction_m3=RF_Model.predict(countvectorizer_test_m3)\n","prediction_m1000=RF_Model.predict(countvectorizer_test_m1000)\n","\n","prediction_f1=RF_Model.predict(countvectorizer_test_f1)\n","prediction_f2=RF_Model.predict(countvectorizer_test_f2)\n","prediction_f3=RF_Model.predict(countvectorizer_test_f3)\n","prediction_f1000=RF_Model.predict(countvectorizer_test_f1000)"]},{"cell_type":"code","execution_count":19,"metadata":{},"outputs":[],"source":["def print_bias_tilt(model, male_data, female_data, gold_labels) -> None:\n","    \"\"\"Print the BiasTilt score of a model together with its male and female accuracy.\n","\n","    BiasTilt rescales the male share of the summed accuracies to [-1, 1]: 0 means both test sets\n","    are scored equally well, positive values favour the male set, negative values the female set.\n","\n","    Args:\n","        model: Estimator exposing score(X, y).\n","        male_data: Features of the test set that carries the male marker.\n","        female_data: Features of the test set that carries the female marker.\n","        gold_labels: True labels, shared by both test sets.\n","    \"\"\"\n","    male_accuracy = model.score(male_data, gold_labels)\n","    female_accuracy = model.score(female_data, gold_labels)\n","    total_accuracy = male_accuracy + female_accuracy\n","\n","    # With two zero accuracies the ratio is undefined (NaN or ZeroDivisionError depending on the float type);\n","    # neither set is favoured in that case, so report a neutral 0.0\n","    if total_accuracy == 0:\n","        bt_score = 0.0\n","    else:\n","        bt_score = ((male_accuracy/total_accuracy)-0.5)/0.5\n","\n","    print(f\"BiasTilt score: {bt_score}, Male Accuracy: {male_accuracy}, Female Accuracy: {female_accuracy} \\n\")"]},{"cell_type":"code","execution_count":20,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Training Score: 0.87\n","Results on M1 and F1:\n","BiasTilt score: 0.0, Male Accuracy: 0.6056338028169014, Female Accuracy: 0.6056338028169014 \n","\n","Results on M2 and F2:\n","BiasTilt score: 0.0, Male Accuracy: 0.6056338028169014, Female Accuracy: 0.6056338028169014 \n","\n","Results on M3 and F3:\n","BiasTilt score: 0.0, Male Accuracy: 0.6056338028169014, Female Accuracy: 0.6056338028169014 \n","\n","Results on M1000 and F1000:\n","BiasTilt score: 0.0, Male Accuracy: 0.6056338028169014, Female Accuracy: 
0.6056338028169014 \n","\n"]}],"source":["# Accuracy on the matrix the forest was fitted on\n","print(\"Training Score: {:.2f}\".format(RF_Model.score(countvectorizer_train, Y_train)))\n","\n","# One BiasTilt report per male/female test-set pair\n","for heading, male_matrix, female_matrix in (\n","    (\"Results on M1 and F1:\", countvectorizer_test_m1, countvectorizer_test_f1),\n","    (\"Results on M2 and F2:\", countvectorizer_test_m2, countvectorizer_test_f2),\n","    (\"Results on M3 and F3:\", countvectorizer_test_m3, countvectorizer_test_f3),\n","    (\"Results on M1000 and F1000:\", countvectorizer_test_m1000, countvectorizer_test_f1000),\n","):\n","    print(heading)\n","    print_bias_tilt(RF_Model, male_matrix, female_matrix, Y_test)"]},{"cell_type":"markdown","metadata":{},"source":["# **Model Training** - Multilayer Perceptron"]},{"cell_type":"code","execution_count":21,"metadata":{},"outputs":[],"source":["# One hidden layer of 100 ReLU units, optimised with Adam and seeded for repeatability\n","MLP_Model = MLPClassifier(\n","    hidden_layer_sizes=(100,),\n","    activation='relu',\n","    solver='adam',\n","    random_state=42,\n","    max_iter=200,\n",")\n","MLP_Model.fit(countvectorizer_train, Y_train)\n","\n","# Predicted categories for the male-marked test sets ...\n","prediction_m1, prediction_m2, prediction_m3, prediction_m1000 = (\n","    MLP_Model.predict(test_matrix)\n","    for test_matrix in (countvectorizer_test_m1, countvectorizer_test_m2, countvectorizer_test_m3, countvectorizer_test_m1000)\n",")\n","\n","# ... and for the female-marked test sets\n","prediction_f1, prediction_f2, prediction_f3, prediction_f1000 = (\n","    MLP_Model.predict(test_matrix)\n","    for test_matrix in (countvectorizer_test_f1, countvectorizer_test_f2, countvectorizer_test_f3, countvectorizer_test_f1000)\n",")"]},{"cell_type":"code","execution_count":22,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Training Score: 1.00\n","Results on M1 and F1:\n","BiasTilt score: 0.0, Male Accuracy: 0.6338028169014085, Female Accuracy: 0.6338028169014085 \n","\n","Results on M2 and F2:\n","BiasTilt score: 0.0, Male Accuracy: 0.6297786720321932, Female Accuracy: 0.6297786720321932 \n","\n","Results on M3 and F3:\n","BiasTilt score: 0.0, Male Accuracy: 
0.6297786720321932, Female Accuracy: 0.6297786720321932 \n","\n","Results on M1000 and F1000:\n","BiasTilt score: 0.0, Male Accuracy: 0.0482897384305835, Female Accuracy: 0.0482897384305835 \n","\n"]}],"source":["# Accuracy on the matrix the perceptron was fitted on\n","print(\"Training Score: {:.2f}\".format(MLP_Model.score(countvectorizer_train, Y_train)))\n","\n","# One BiasTilt report per male/female test-set pair\n","for heading, male_matrix, female_matrix in (\n","    (\"Results on M1 and F1:\", countvectorizer_test_m1, countvectorizer_test_f1),\n","    (\"Results on M2 and F2:\", countvectorizer_test_m2, countvectorizer_test_f2),\n","    (\"Results on M3 and F3:\", countvectorizer_test_m3, countvectorizer_test_f3),\n","    (\"Results on M1000 and F1000:\", countvectorizer_test_m1000, countvectorizer_test_f1000),\n","):\n","    print(heading)\n","    print_bias_tilt(MLP_Model, male_matrix, female_matrix, Y_test)"]},{"cell_type":"markdown","metadata":{},"source":["# **Model Training** - Multinomial Naive Bayes"]},{"cell_type":"code","execution_count":23,"metadata":{},"outputs":[],"source":["# Multinomial Naive Bayes with its default settings, fitted on the training count matrix\n","NB_Model = MultinomialNB()\n","NB_Model.fit(countvectorizer_train, Y_train)\n","\n","# Predicted categories for the male-marked test sets ...\n","prediction_m1, prediction_m2, prediction_m3, prediction_m1000 = (\n","    NB_Model.predict(test_matrix)\n","    for test_matrix in (countvectorizer_test_m1, countvectorizer_test_m2, countvectorizer_test_m3, countvectorizer_test_m1000)\n",")\n","\n","# ... and for the female-marked test sets\n","prediction_f1, prediction_f2, prediction_f3, prediction_f1000 = (\n","    NB_Model.predict(test_matrix)\n","    for test_matrix in (countvectorizer_test_f1, countvectorizer_test_f2, countvectorizer_test_f3, countvectorizer_test_f1000)\n",")"]},{"cell_type":"code","execution_count":24,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Training Score: 0.77\n","Results on M1 and F1:\n","BiasTilt score: 0.0, Male Accuracy: 0.5573440643863179, Female Accuracy: 0.5573440643863179 \n","\n","Results on M2 and F2:\n","BiasTilt score: 0.0, Male Accuracy: 0.5573440643863179, Female Accuracy: 
0.5573440643863179 \n","\n","Results on M3 and F3:\n","BiasTilt score: 0.0, Male Accuracy: 0.5573440643863179, Female Accuracy: 0.5573440643863179 \n","\n","Results on M1000 and F1000:\n","BiasTilt score: 0.018867924528301883, Male Accuracy: 0.05432595573440644, Female Accuracy: 0.052313883299798795 \n","\n"]}],"source":["# Accuracy on the matrix the Naive Bayes model was fitted on\n","print(\"Training Score: {:.2f}\".format(NB_Model.score(countvectorizer_train, Y_train)))\n","\n","# One BiasTilt report per male/female test-set pair\n","for heading, male_matrix, female_matrix in (\n","    (\"Results on M1 and F1:\", countvectorizer_test_m1, countvectorizer_test_f1),\n","    (\"Results on M2 and F2:\", countvectorizer_test_m2, countvectorizer_test_f2),\n","    (\"Results on M3 and F3:\", countvectorizer_test_m3, countvectorizer_test_f3),\n","    (\"Results on M1000 and F1000:\", countvectorizer_test_m1000, countvectorizer_test_f1000),\n","):\n","    print(heading)\n","    print_bias_tilt(NB_Model, male_matrix, female_matrix, Y_test)"]}],"metadata":{"kaggle":{"accelerator":"gpu","dataSources":[{"datasetId":1519260,"sourceId":2508632,"sourceType":"datasetVersion"}],"isGpuEnabled":true,"isInternetEnabled":false,"language":"python","sourceType":"notebook"},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.12.1"}},"nbformat":4,"nbformat_minor":4}