"""Analyze the vocab counts and positions of the name lists
within Deepset's German pretrained GloVe embeddings."""

import csv

import pandas as pd

VOCAB_PATH = "./data/deepset_german_glove_vocab.txt"
NAMES_PATH = "./data/names_nationality.csv"
OUTPUT_PATH = "./data/names_nationality_deepset.csv"


def load_vocab(path=VOCAB_PATH):
    """Read the space-separated ``word count`` vocab file into a DataFrame.

    Args:
        path: File path (or file-like object) with one ``word count`` pair
            per line and no header row.

    Returns:
        DataFrame with columns ``word`` and ``count``; the default integer
        index is the 0-based line number of each entry in the file.
    """
    return pd.read_csv(
        path,
        sep=" ",
        header=None,
        names=["word", "count"],
        dtype={"word": str},
        # Tokens such as "null", "nan" or "na" are real vocabulary entries;
        # pandas' default NA parsing would turn them into NaN and make them
        # unmatchable.
        keep_default_na=False,
        # Treat quote characters as ordinary token text so a lone '"' cannot
        # open a quoted field and swallow following lines (which would shift
        # positions).  Assumes the vocab file is never CSV-quoted -- TODO confirm.
        quoting=csv.QUOTE_NONE,
    )


def annotate_names_with_vocab(names_df, vocab_df):
    """Add vocab count and position columns to ``names_df`` (in place).

    Each value of the ``name`` column is lower-cased and looked up in
    ``vocab_df["word"]``.  For the first matching vocab row, its ``count``
    and its index label are stored in ``deepset_glove_vocab_count`` and
    ``deepset_glove_vocab_position``.  Names without a match (including
    missing / non-string name cells) get ``0`` and ``-1``.

    Args:
        names_df: DataFrame with a ``name`` column.
        vocab_df: DataFrame with ``word`` and ``count`` columns.

    Returns:
        The same ``names_df`` object, with the two columns added.
    """
    # Only the first occurrence of a word counts, mirroring a top-down search.
    first_hits = vocab_df.drop_duplicates(subset="word", keep="first")
    count_by_word = dict(zip(first_hits["word"], first_hits["count"]))
    position_by_word = dict(zip(first_hits["word"], first_hits.index))

    counts = []
    positions = []
    for name in names_df["name"]:
        # Missing name cells are parsed by pandas as float NaN, which has no
        # .lower(); such rows simply keep the "not found" defaults.
        key = name.lower() if isinstance(name, str) else None
        counts.append(count_by_word.get(key, 0))
        positions.append(position_by_word.get(key, -1))

    names_df["deepset_glove_vocab_count"] = counts
    names_df["deepset_glove_vocab_position"] = positions
    return names_df


def main():
    """Annotate the name list with vocab statistics and write it to CSV."""
    vocab_df = load_vocab(VOCAB_PATH)
    names_df = pd.read_csv(NAMES_PATH)
    annotate_names_with_vocab(names_df, vocab_df)
    names_df.to_csv(OUTPUT_PATH, index=False)


if __name__ == "__main__":
    main()