# Source: GitLab snippet "analyze_deepset_glove_counts.py" (892 B).
"""Analyze the vocab counts and positions of the name lists
within Deepset's German pretrained GloVe embeddings."""

import csv

import pandas as pd

VOCAB_PATH = "./data/deepset_german_glove_vocab.txt"
NAMES_PATH = "./data/names_nationality.csv"
OUTPUT_PATH = "./data/names_nationality_deepset.csv"

# Values written for names that do not occur in the vocabulary.
NOT_FOUND_COUNT = 0
NOT_FOUND_POSITION = -1


def load_vocab(path):
    """Read a space-separated ``word count`` vocab file into a DataFrame.

    The file has no header; the resulting columns are ``word`` and ``count``
    and the default integer index is the zero-based row position of each
    entry in the file.

    Args:
        path: File path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        DataFrame with columns ``word`` (str) and ``count``.
    """
    return pd.read_csv(
        path,
        sep=" ",
        header=None,
        names=["word", "count"],
        # Keep tokens such as "123" as text instead of letting pandas
        # infer a numeric type for them.
        dtype={"word": str},
        # A vocabulary token may itself be a quote character; with default
        # quoting it would open a quoted field, merge following lines and
        # shift every later row position.
        quoting=csv.QUOTE_NONE,
        # Tokens like "null", "nan" or "na" are ordinary words here, not
        # missing values.
        keep_default_na=False,
    )


def annotate_names(names_df, vocab_df):
    """Attach vocab count and position to every row of ``names_df``.

    Each value in the ``name`` column is lower-cased and looked up in
    ``vocab_df["word"]``. If a word occurs more than once in the vocab, the
    first occurrence is used. Names that are not found, or that are not
    strings (e.g. missing values), get count ``0`` and position ``-1``.

    Args:
        names_df: DataFrame with a ``name`` column.
        vocab_df: DataFrame with ``word`` and ``count`` columns; its index
            labels are reported as the position.

    Returns:
        A copy of ``names_df`` with the added columns
        ``deepset_glove_vocab_count`` and ``deepset_glove_vocab_position``.
    """
    # Build the lookup tables once instead of scanning the whole vocab for
    # every name.
    first_hits = vocab_df.drop_duplicates(subset="word", keep="first")
    words = first_hits["word"].tolist()
    count_by_word = dict(zip(words, first_hits["count"].tolist()))
    position_by_word = dict(zip(words, first_hits.index.tolist()))

    # Non-string names (NaN from an empty CSV cell) cannot match any word.
    keys = [
        name.lower() if isinstance(name, str) else None
        for name in names_df["name"]
    ]

    annotated = names_df.copy()
    annotated["deepset_glove_vocab_count"] = [
        count_by_word.get(key, NOT_FOUND_COUNT) for key in keys
    ]
    annotated["deepset_glove_vocab_position"] = [
        position_by_word.get(key, NOT_FOUND_POSITION) for key in keys
    ]
    return annotated


def main():
    """Read the vocab and name lists, annotate the names, write the CSV."""
    vocab_df = load_vocab(VOCAB_PATH)
    names_df = pd.read_csv(NAMES_PATH)
    annotate_names(names_df, vocab_df).to_csv(OUTPUT_PATH, index=False)


if __name__ == "__main__":
    main()