Skip to content
Snippets Groups Projects
Commit a579ff50 authored by chrysanthopoulou's avatar chrysanthopoulou
Browse files

Add some data cleaning and stats

parent 90c62caf
No related branches found
No related tags found
No related merge requests found
Showing
with 67 additions and 38 deletions
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
cosmere/plots/fanfic_overview/fanfic_kudo_freq_dist.png

130 B | W: | H:

cosmere/plots/fanfic_overview/fanfic_kudo_freq_dist.png

130 B | W: | H:

cosmere/plots/fanfic_overview/fanfic_kudo_freq_dist.png
cosmere/plots/fanfic_overview/fanfic_kudo_freq_dist.png
cosmere/plots/fanfic_overview/fanfic_kudo_freq_dist.png
cosmere/plots/fanfic_overview/fanfic_kudo_freq_dist.png
  • 2-up
  • Swipe
  • Onion skin
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
divergent/plots/fanfic_overview/fanfic_kudo_freq_dist.png

130 B | W: | H:

divergent/plots/fanfic_overview/fanfic_kudo_freq_dist.png

130 B | W: | H:

divergent/plots/fanfic_overview/fanfic_kudo_freq_dist.png
divergent/plots/fanfic_overview/fanfic_kudo_freq_dist.png
divergent/plots/fanfic_overview/fanfic_kudo_freq_dist.png
divergent/plots/fanfic_overview/fanfic_kudo_freq_dist.png
  • 2-up
  • Swipe
  • Onion skin
......@@ -27,15 +27,15 @@ def calculate_cum_kudo_distribution(fanfic_pd):
fanfic_pd = fanfic_pd.dropna(subset=["kudos"])
fanfic_kudos = fanfic_pd["kudos"].values.tolist()
fanfic_kudos_freq_dist = FreqDist(fanfic_kudos)
# convert the FreqDist object to a pandas Series for easier processing
dist_panda = pd.Series(dict(fanfic_kudos_freq_dist))
# sort, normalise and round the panda series
new_dist = dist_panda.sort_index()
#new_dist = new_dist.drop(columns=["NaN"])
new_dist = new_dist.drop(columns=["NaN"])
for i in range(0, len(new_dist.index)):
new_dist.iat[i] = round(new_dist.iat[i]/len(fanfic_kudos), 3)
#for i in range(0, len(new_dist.index)):
# new_dist.iat[i] = round(new_dist.iat[i]/len(fanfic_kudos), 3)
#calculate cumulative distribution
cum_dist = np.cumsum(new_dist.values)
......@@ -99,25 +99,30 @@ def clean_fanfic_dataset(file_path):
# Step 1: Delete rows with 'published' as 2023 or NaN
# Removes rows with 'published' as 2023
# published,status,status date
# 2023-04-25,Updated,2023-09-22
for index, row in df.iterrows():
published = pd.to_datetime(row["published"])
if published.year == 2023:
df.drop([index])
#print(row["status date"])
#print(row["status date"][0:4])
#print(type(row["status date"][0:4]))
if row["published"][0:4] == "2023": # the date is saved as a string, so that's why I have to check it against a string
#print(row["status date"][0:4])
#print(row)
df.drop(index, inplace=True)
print(df.shape)
df = df.dropna(subset=['published']) # Removes rows with 'published' as NaN
df.dropna(subset=['published'], inplace=True) # Removes rows with 'published' as NaN
print(df.shape)
# Step 2: Delete rows with missing values (NaN) for 'kudos'
df = df.dropna(subset=['kudos'])
df.dropna(subset=['kudos'], inplace=True)
print(df.shape)
df.to_csv(file_path)
df.to_csv(f"{file_path[:-4]}_new.csv")
def run_functions(file_paths):
for index, file_path in enumerate(file_paths):
#clean_fanfic_dataset(file_path)
clean_fanfic_dataset(file_path)
pattern = r"^[a-zA-Z_]+(?=/)" # get series from directory path
match = re.search(pattern, file_path)
if match:
......@@ -127,7 +132,7 @@ def run_functions(file_paths):
dir_path_for_pic = f"{series_name}/plots/fanfic_overview/"
if os.path.exists(dir_path_for_pic) == False:
os.makedirs(dir_path_for_pic)
file = pd.read_csv(file_path)
file = pd.read_csv(f"{file_path[:-4]}_new.csv", thousands=",")#otherwise thousands are simply not recognised
print(file.shape)
title = series_name.replace("_", " ").title()
new_dist, cum_dist = calculate_cum_kudo_distribution(file)
......@@ -137,12 +142,15 @@ def run_functions(file_paths):
#print(new_dist)
ext_dist.to_csv(f"{dir_path_for_pic}fanfic_kudo_ext_dist.csv")
#cum_dist.tofile(f"{dir_path_for_pic}fanfic_kudo_cum_dist.csv", sep=",")
if __name__ == "__main__":
#clean_fanfic_dataset("grishaverse/data/fanfics/grisha_fanfics.csv")
file_paths = ["cosmere/data/fanfics/cosmere_fanfics.csv", "divergent/data/fanfics/divergent_fanfics.csv", "grishaverse/data/fanfics/grisha_fanfics.csv", "maze_runner/data/fanfics/mazerunner_fanfics.csv", "murderbot/data/fanfics/murderbot_fanfics.csv", "percy/data/fanfics/percy_fanfics.csv", "school_for_good_and_evil/data/fanfics/school_fanfics.csv", "throne_of_glass/data/fanfics/throne_of_glass_fanfics.csv",]
#file_paths = ["cosmere/data/fanfics/cosmere_fanfics.csv"]
#file_paths = ["grishaverse/data/fanfics/grisha_fanfics_new.csv"]
run_functions(file_paths=file_paths)
#separate_fanfics_by_good_medium_bad(grisha_fanfics, "grishaverse")
......
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
grishaverse/plots/fanfic_overview/fanfic_kudo_freq_dist.png

130 B | W: | H:

grishaverse/plots/fanfic_overview/fanfic_kudo_freq_dist.png

130 B | W: | H:

grishaverse/plots/fanfic_overview/fanfic_kudo_freq_dist.png
grishaverse/plots/fanfic_overview/fanfic_kudo_freq_dist.png
grishaverse/plots/fanfic_overview/fanfic_kudo_freq_dist.png
grishaverse/plots/fanfic_overview/fanfic_kudo_freq_dist.png
  • 2-up
  • Swipe
  • Onion skin
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
maze_runner/plots/fanfic_overview/fanfic_kudo_freq_dist.png

130 B | W: | H:

maze_runner/plots/fanfic_overview/fanfic_kudo_freq_dist.png

130 B | W: | H:

maze_runner/plots/fanfic_overview/fanfic_kudo_freq_dist.png
maze_runner/plots/fanfic_overview/fanfic_kudo_freq_dist.png
maze_runner/plots/fanfic_overview/fanfic_kudo_freq_dist.png
maze_runner/plots/fanfic_overview/fanfic_kudo_freq_dist.png
  • 2-up
  • Swipe
  • Onion skin
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
murderbot/plots/fanfic_overview/fanfic_kudo_freq_dist.png

130 B | W: | H:

murderbot/plots/fanfic_overview/fanfic_kudo_freq_dist.png

130 B | W: | H:

murderbot/plots/fanfic_overview/fanfic_kudo_freq_dist.png
murderbot/plots/fanfic_overview/fanfic_kudo_freq_dist.png
murderbot/plots/fanfic_overview/fanfic_kudo_freq_dist.png
murderbot/plots/fanfic_overview/fanfic_kudo_freq_dist.png
  • 2-up
  • Swipe
  • Onion skin
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
percy/plots/fanfic_overview/fanfic_kudo_freq_dist.png

130 B | W: | H:

percy/plots/fanfic_overview/fanfic_kudo_freq_dist.png

130 B | W: | H:

percy/plots/fanfic_overview/fanfic_kudo_freq_dist.png
percy/plots/fanfic_overview/fanfic_kudo_freq_dist.png
percy/plots/fanfic_overview/fanfic_kudo_freq_dist.png
percy/plots/fanfic_overview/fanfic_kudo_freq_dist.png
  • 2-up
  • Swipe
  • Onion skin
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment