Commit 7e9f3739 authored by Aileen Reichelt
Browse files

Add script to count deepset GloVe vocab

parent bb4d93f9
Loading
Loading
Loading
Loading
+23 −0
Original line number Diff line number Diff line
"""Analyze the vocab counts and positions of the name lists
within Deepset's German pretrained GloVe embeddings."""

import csv

import pandas as pd

VOCAB_PATH = "./data/deepset_german_glove_vocab.txt"
NAMES_PATH = "./data/names_nationality.csv"
OUTPUT_PATH = "./data/names_nationality_deepset.csv"


def load_vocab(path=VOCAB_PATH):
    """Read a GloVe vocab file with one ``<word> <count>`` pair per line.

    Args:
        path: File path or file-like object holding the vocab listing.

    Returns:
        A DataFrame with the columns ``word`` and ``count``; the row index is
        the zero-based line position of each word in the file.
    """
    return pd.read_csv(
        path,
        sep=" ",
        header=None,
        names=["word", "count"],
        # Keep every token as text, even ones that look like numbers.
        dtype={"word": str},
        # Vocab tokens may be a bare quote character; with default quoting the
        # parser would merge following lines into one field and shift all
        # subsequent positions.
        quoting=csv.QUOTE_NONE,
        # Tokens such as "nan", "null" or "na" are real words, not missing data.
        keep_default_na=False,
    )


def add_vocab_stats(names_df, vocab_df):
    """Attach vocab count and position to every name.

    Names are lowercased before the lookup. If a word occurs more than once
    in the vocab, its first occurrence is used.

    Args:
        names_df: DataFrame with a ``name`` column.
        vocab_df: DataFrame with ``word`` and ``count`` columns, as returned
            by ``load_vocab``.

    Returns:
        A copy of ``names_df`` with two additional columns:
        ``deepset_glove_vocab_count`` (0 when the name is not in the vocab)
        and ``deepset_glove_vocab_position`` (the vocab row index, or -1 when
        the name is not in the vocab). Missing or non-string names are
        reported as not found.
    """
    keys = [
        name.lower() if isinstance(name, str) else None
        for name in names_df["name"]
    ]
    wanted = {key for key in keys if key is not None}

    # Narrow the vocab to the requested words once instead of scanning the
    # whole vocab for every single name.
    hits = vocab_df[vocab_df["word"].isin(list(wanted))]

    first_hit = {}
    for position, word, count in zip(hits.index, hits["word"], hits["count"]):
        # setdefault keeps the earliest entry for duplicated words.
        first_hit.setdefault(word, (count, position))

    stats = [first_hit.get(key, (0, -1)) for key in keys]
    return names_df.assign(
        deepset_glove_vocab_count=[count for count, _ in stats],
        deepset_glove_vocab_position=[position for _, position in stats],
    )


def main():
    """Annotate the name list with vocab statistics and write it to disk."""
    vocab_df = load_vocab(VOCAB_PATH)
    names_df = pd.read_csv(NAMES_PATH)
    names_df = add_vocab_stats(names_df, vocab_df)
    names_df.to_csv(OUTPUT_PATH, index=False)


if __name__ == "__main__":
    main()