# fanfic_preprocessing.py
#
# Dependencies for this project are managed with pip.

import os
import re
import statistics
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from cycler import cycler
from nltk.probability import FreqDist

# Plot styling: named colours plus the default matplotlib colour cycle.

# Individual colours, given as hex RGB strings.
pink = "#d600a7"
light_green = "#55a480"
blue_grey = "#5d9c9c"
purple_grey = "#636273"
CB91_Blue = "#2CBDFE"
CB91_Green = "#47DBCD"
CB91_Pink = "#F3A0F2"
CB91_Purple = "#9D2EC5"
CB91_Violet = "#661D98"
CB91_Amber = "#F5B14C"

# Order in which successive plot elements pick their colour
# (purple_grey is defined above but is not part of the cycle).
color_list = [
    blue_grey,
    CB91_Amber,
    pink,
    light_green,
    CB91_Green,
    CB91_Pink,
    CB91_Blue,
    CB91_Purple,
    CB91_Violet,
]

# Install the list as the colour cycle used by every axes created afterwards.
plt.rcParams["axes.prop_cycle"] = plt.cycler(color=color_list)

# Experimental seaborn cubehelix colormaps; none of them is referenced by the
# functions visible in this file.
cm = sns.cubehelix_palette(start=0.5, rot=-0.75, as_cmap=True)
cm1 = sns.cubehelix_palette(start=0.5, rot=-0.5, as_cmap=True)
cm2 = sns.cubehelix_palette(as_cmap=True)

# Further palettes that were tried, kept disabled for reference:
#palette_1 = sns.color_palette("flare")
#palette_2 = sns.color_palette("mako_r", as_cmap=True)

# actual preprocessing code

#file header:
# work_id,title,author,rating,category,fandom,relationship,character,additional tags,language,published,status,status date,words,chapters,comments,kudos,bookmarks,hits,all_kudos,all_bookmarks,body
# 27852922,Dealing with Our Demons,['ravenyenn19'],Mature,F/M,"Six of Crows Series",Kaz Brekker/Inej Ghafa,"Kaz B","Romance,Kanej - Freeform, Eventual Smut",English,2020-12-03,Updated,2023-03-16,747673,162/?,8573,12204,1373,709212,"['ud4m', 'book_addict_1228', 'ephemeraldelights', 'bluedelilah25', 'sunshinecorsets', 'I_do_not_like_purple_glasses', 'beep_boop_00', 'schleswigholstein', 'moonandstars75', 'ewerythingoes', 'mindfighters', 'rosibunnis', 'Lizie06', 'ghostlatte', 'aguswolman', 'QueenofEnglan', 'JenBoyette04', 'gnitneb_reads', 'gloomysunshine', 'v1ofvs', 'BazzaKrekker', 'BookGeek', 'poppyflower19', 'Cassanibal', 'vanilla_chai_tea', 'Honorthyword', 'mariaarmengol', 'luc1inda', 'zarawrites', 'monmough', 'Guilty__Pleasures', 'Ilyann', 'folieadeux_0_0', 'dragonguard', 'Emeliemarx', 'angrydabee', 'slythxrclaw', 'samaram0215', 'letsgetthisbread69', 'Mintmew', 'biblichour', 'Katloupet', 'Miss_ginger', 'inejsquake', 'Arabella_7833', 'flossy_flo99', 'a_k123', 'hushedwanderer', 'siriuslymichele', 'AnnaAvinaVTDX']",[],"Dear Kaz,


# Both fanfic CSVs are read eagerly at module level, i.e. whenever this file
# is imported or run.
# NOTE(review): the __main__ block reads the Throne of Glass CSV again through
# read_csv_to_pd, so that file is parsed twice per run — confirm whether the
# eager load here is still needed.
grisha_fanfics = pd.read_csv("grishaverse/data/fanfics/grishaverse_fics.csv")
tog_fanfics = pd.read_csv("throne_of_glass/data/fanfics/throne_of_glass_fics.csv")

def read_csv_to_pd(file_path, name_of_file=None) -> pd.DataFrame:
    """Load a CSV file into a pandas DataFrame.

    Args:
        file_path: Path of the CSV file to read.
        name_of_file: Ignored. The value was never used (it was overwritten
            by the freshly read frame); the parameter is kept, now optional,
            only so existing calls that pass it keep working.

    Returns:
        The DataFrame produced by ``pd.read_csv(file_path)``.
    """
    return pd.read_csv(file_path)


def calculate_cum_kudo_distribution(fanfic_pd):
    """Compute the relative and cumulative frequency of kudos counts.

    Args:
        fanfic_pd: DataFrame with a ``"kudos"`` column.

    Returns:
        A ``(new_dist, cum_dist)`` tuple:

        * ``new_dist`` - float Series indexed by the distinct kudos values in
          ascending order; each entry is the share of rows carrying that
          value, rounded to 3 decimals.
        * ``cum_dist`` - NumPy array holding the running sum of ``new_dist``
          (computed from the already-rounded shares).
    """
    fanfic_kudos = fanfic_pd["kudos"].values.tolist()

    # Count how often each kudos value occurs. The Series is created as
    # float64 so the normalised shares below are not written into an
    # integer-typed container.
    kudo_counts = pd.Series(dict(Counter(fanfic_kudos)), dtype="float64")

    # Sort by kudos value, then normalise by the number of rows and round in
    # one vectorised step instead of element-wise ``.iat`` assignment.
    new_dist = (kudo_counts.sort_index() / len(fanfic_kudos)).round(3)

    # Cumulative distribution over the sorted kudos values.
    cum_dist = np.cumsum(new_dist.values)
    return new_dist, cum_dist


def plot_distribution(new_dist, cum_dist, plt_title, file_path_for_pic:str, x_label="Number of Kudos", y_label="Percentage of Occurence", scatter_plt=False, max_ticks=10):
    """Plot a cumulative distribution and save it as an image file.

    Args:
        new_dist: Series whose index supplies the x values.
        cum_dist: Sequence of y values, one per entry of ``new_dist.index``.
        plt_title: Title drawn above the plot.
        file_path_for_pic: Path the figure is saved to.
        x_label: Label of the x axis.
        y_label: Label of the y axis.
        scatter_plt: If true draw a scatter plot, otherwise a line plot.
        max_ticks: Currently unused; accepted so existing calls keep working.

    Returns:
        None. The figure is written to ``file_path_for_pic`` and then closed.
    """
    fig = plt.figure(figsize=(10, 10))
    try:
        # Labels are set before drawing, as in the original statement order.
        plt.title(plt_title)
        plt.xlabel(x_label)
        plt.ylabel(y_label)

        draw = sns.scatterplot if scatter_plt else sns.lineplot
        draw(x=new_dist.index, y=cum_dist)

        fig.savefig(file_path_for_pic)
    finally:
        # Close the figure even when drawing or saving fails, so repeated
        # calls do not accumulate open figures.
        plt.close(fig)

def separate_fanfics_by_good_medium_bad(df, series, few_kudos=100, medium_kudos=1500, excluded_year=2023):
    """Split fanfic bodies into three text files according to their kudos.

    Rows published in ``excluded_year`` are skipped. Of the remaining rows,
    those with ``kudos <= few_kudos`` go to ``bad_fics.txt``, those with
    ``kudos <= medium_kudos`` go to ``medium_fics.txt`` and the rest go to
    ``good_fics.txt``. Rows whose kudos value is missing or not numeric are
    reported on stdout and left out.

    Args:
        df: DataFrame with ``"published"``, ``"kudos"`` and ``"body"`` columns.
        series: Directory prefix; files are written to
            ``{series}/data/split_txt_fanfics/``, which must already exist.
        few_kudos: Upper bound (inclusive) of the "bad" bucket.
        medium_kudos: Upper bound (inclusive) of the "medium" bucket.
        excluded_year: Publication year whose rows are ignored.

    Returns:
        None. The three files are (over)written as UTF-8 text; the bodies of
        each bucket are concatenated without a separator.
    """
    buckets = {"good": [], "bad": [], "medium": []}

    # Coerce first so that non-numeric kudos become NaN and are reported,
    # instead of passing the missing-value check and then failing every
    # comparison silently.
    kudos_values = pd.to_numeric(df["kudos"], errors="coerce")

    rows = zip(df.index, df["published"], kudos_values, df["body"])
    for index, published, kudos, body in rows:
        if pd.to_datetime(published).year == excluded_year:
            continue
        if pd.isna(kudos):
            print(f"Missing kudos value for row {index}")
            continue

        if kudos <= few_kudos:
            buckets["bad"].append(body)
        elif kudos <= medium_kudos:
            buckets["medium"].append(body)
        else:
            buckets["good"].append(body)

    for quality, bodies in buckets.items():
        # Explicit UTF-8 so the output does not depend on the platform's
        # default encoding.
        with open(f"{series}/data/split_txt_fanfics/{quality}_fics.txt", "w", encoding="utf-8") as f:
            f.write(''.join(map(str, bodies)))


if __name__ == "__main__":
    # Grishaverse run, currently disabled:
    #grisha_fanfics = read_csv_to_pd(file_path="grishaverse/data/fanfics/grishaverse_fics.csv", name_of_file=grisha_fanfics)
    #new_dist, cum_dist = calculate_cum_kudo_distribution(grisha_fanfics)
    #plot_distribution(new_dist=new_dist, cum_dist=cum_dist, plt_title="Grishaverse Cumulative Frequency Distribution of All Kudos", file_path_for_pic="grishaverse/freq_distribution/fanfic_kudo_freq_dist.png", scatter_plt=True)

    # Throne of Glass: load the CSV, compute the cumulative kudos
    # distribution and save it as a scatter plot.
    tog_csv_path = "throne_of_glass/data/fanfics/throne_of_glass_fics.csv"
    tog_plot_path = "throne_of_glass/freq_distribution/fanfic_kudo_freq_dist.png"

    tog_fanfics = read_csv_to_pd(file_path=tog_csv_path, name_of_file=tog_fanfics)
    new_dist, cum_dist = calculate_cum_kudo_distribution(tog_fanfics)
    plot_distribution(
        new_dist=new_dist,
        cum_dist=cum_dist,
        plt_title="Throne of Glass Cumulative Frequency Distribution of All Kudos",
        file_path_for_pic=tog_plot_path,
        scatter_plt=True,
    )

    # Splitting the corpora into good/medium/bad text files, currently disabled:
    #separate_fanfics_by_good_medium_bad(grisha_fanfics, "grishaverse")
    #separate_fanfics_by_good_medium_bad(tog_fanfics, "throne_of_glass")