Skip to content
Snippets Groups Projects
Select Git revision
  • 61b39a784ac3afedd53192e64a5b06fb2397bf89
  • master default protected
2 results

requirements.txt

Blame
  • This project manages its dependencies using pip. Learn more
    fanfic_preprocessing.py 6.17 KiB
    import seaborn as sns
    import matplotlib.pyplot as plt
    from cycler import cycler
    import os
    import pandas as pd
    import statistics
    import re
    from nltk.probability import FreqDist
    import numpy as np
    
    # code snippets for prettifying plots
    
    # Named hex colours available for plots in this module.
    pink = '#d600a7'
    light_green = '#55a480'
    blue_grey = '#5d9c9c'
    purple_grey = '#636273'  # defined, but not part of color_list below
    CB91_Blue = '#2CBDFE'
    CB91_Green = '#47DBCD'
    CB91_Pink = '#F3A0F2'
    CB91_Purple = '#9D2EC5'
    CB91_Violet = '#661D98'
    CB91_Amber = '#F5B14C'
    
    # The list order is the order in which successive plot elements pick their colour.
    color_list = [  blue_grey, CB91_Amber, pink, light_green, CB91_Green, CB91_Pink, CB91_Blue, 
                  CB91_Purple, CB91_Violet]
    # Import-time side effect: installs color_list as matplotlib's default axes colour cycle.
    plt.rcParams['axes.prop_cycle'] = plt.cycler(color=color_list)
    
    # Experimental cubehelix colormaps (as_cmap=True returns colormap objects).
    # NOTE(review): cm, cm1 and cm2 are not referenced in the visible part of this file.
    
    cm = sns.cubehelix_palette(start=.5, rot=-.75, as_cmap=True)
    cm1 = sns.cubehelix_palette(start=.5, rot=-.5, as_cmap=True)
    cm2 = sns.cubehelix_palette(as_cmap=True)
    
    # Alternative palettes, currently disabled.
    #palette_1 = sns.color_palette("flare")
    #palette_2 = sns.color_palette("mako_r", as_cmap=True)
    
    # actual preprocessing code
    
    #file header: 
    # work_id,title,author,rating,category,fandom,relationship,character,additional tags,language,published,status,status date,words,chapters,comments,kudos,bookmarks,hits,all_kudos,all_bookmarks,body
    # 27852922,Dealing with Our Demons,['ravenyenn19'],Mature,F/M,"Six of Crows Series",Kaz Brekker/Inej Ghafa,"Kaz B","Romance,Kanej - Freeform, Eventual Smut",English,2020-12-03,Updated,2023-03-16,747673,162/?,8573,12204,1373,709212,"['ud4m', 'book_addict_1228', 'ephemeraldelights', 'bluedelilah25', 'sunshinecorsets', 'I_do_not_like_purple_glasses', 'beep_boop_00', 'schleswigholstein', 'moonandstars75', 'ewerythingoes', 'mindfighters', 'rosibunnis', 'Lizie06', 'ghostlatte', 'aguswolman', 'QueenofEnglan', 'JenBoyette04', 'gnitneb_reads', 'gloomysunshine', 'v1ofvs', 'BazzaKrekker', 'BookGeek', 'poppyflower19', 'Cassanibal', 'vanilla_chai_tea', 'Honorthyword', 'mariaarmengol', 'luc1inda', 'zarawrites', 'monmough', 'Guilty__Pleasures', 'Ilyann', 'folieadeux_0_0', 'dragonguard', 'Emeliemarx', 'angrydabee', 'slythxrclaw', 'samaram0215', 'letsgetthisbread69', 'Mintmew', 'biblichour', 'Katloupet', 'Miss_ginger', 'inejsquake', 'Arabella_7833', 'flossy_flo99', 'a_k123', 'hushedwanderer', 'siriuslymichele', 'AnnaAvinaVTDX']",[],"Dear Kaz,
    
    
    # Both fanfic CSV exports are read at module level, i.e. whenever this file is
    # imported or run; the relative paths are resolved against the current working directory.
    # NOTE(review): the __main__ block passes tog_fanfics as an argument before
    # reassigning it, so these names must exist before that block runs.
    grisha_fanfics = pd.read_csv("grishaverse/data/fanfics/grishaverse_fics.csv")
    tog_fanfics = pd.read_csv("throne_of_glass/data/fanfics/throne_of_glass_fics.csv")
    
    def read_csv_to_pd(file_path, name_of_file=None) -> pd.DataFrame:
        """Load a CSV file into a pandas DataFrame.

        Args:
            file_path: Path of the CSV file to read.
            name_of_file: Ignored. The value was never read (it used to be
                overwritten immediately), so it is kept, now optional, only
                to stay compatible with callers that still pass it.

        Returns:
            The DataFrame produced by ``pd.read_csv(file_path)``.
        """
        return pd.read_csv(file_path)
    
    
    def calculate_cum_kudo_distribution(fanfic_pd):
        """Compute the relative and cumulative frequency of kudos values.

        Args:
            fanfic_pd: DataFrame with a "kudos" column. Rows whose kudos value
                is missing (NaN/None) are left out of the distribution.

        Returns:
            A tuple ``(new_dist, cum_dist)``:

            * ``new_dist`` - pandas Series indexed by the distinct kudos
              values in ascending order; each entry is the share of fics with
              that kudos value, rounded to 3 decimals.
            * ``cum_dist`` - NumPy array aligned with ``new_dist.index``
              holding the running sum of the shares.
        """
        kudos = fanfic_pd["kudos"].dropna()

        # Occurrences per distinct kudos value, ordered by kudos value.
        counts = kudos.value_counts(sort=False).sort_index()
        # Return a plain, unnamed Series/index.
        counts.name = None
        counts.index.name = None

        # Vectorised division yields a float Series directly. Writing floats
        # element by element into the integer count Series is not reliable:
        # depending on the pandas version it truncates, warns or raises.
        exact_share = counts / len(kudos)
        new_dist = exact_share.round(3)

        # Accumulate the exact shares, not the rounded ones: with many distinct
        # kudos values most shares are below 0.0005 and would round to 0, so the
        # cumulative curve would never reach 1.
        cum_dist = np.cumsum(exact_share.to_numpy())
        return new_dist, cum_dist
    
    
    def plot_distribution(
        new_dist,
        cum_dist,
        plt_title,
        file_path_for_pic: str,
        x_label="Number of Kudos",
        y_label="Percentage of Occurence",
        scatter_plt=False,
        max_ticks=10,
    ):
        """Plot ``cum_dist`` against ``new_dist.index`` and save the figure.

        Args:
            new_dist: Object whose ``.index`` supplies the x values.
            cum_dist: y values, one per entry of ``new_dist.index``.
            plt_title: Title of the figure.
            file_path_for_pic: Path the figure is saved to.
            x_label: Label of the x axis.
            y_label: Label of the y axis.
            scatter_plt: Draw a scatter plot if truthy, otherwise a line plot.
            max_ticks: Accepted but not used by this function.
        """
        plt.figure(figsize=(10, 10))
        plt.title(plt_title)
        plt.xlabel(x_label)
        plt.ylabel(y_label)

        # Both seaborn functions take the same x/y keywords, so pick one and call it.
        draw = sns.scatterplot if scatter_plt else sns.lineplot
        draw(x=new_dist.index, y=cum_dist)

        plt.savefig(file_path_for_pic)
        # Close the current figure once it has been written.
        plt.close()
    
    def separate_fanfics_by_good_medium_bad(df, series, *, few_kudos=100, medium_kudos=1500, excluded_year=2023):
        """Split fanfic bodies into bad/medium/good by kudos and write one file each.

        A fic is "bad" if ``kudos <= few_kudos``, "medium" if
        ``few_kudos < kudos <= medium_kudos`` and "good" above that. The bodies
        of each group are concatenated without a separator and written to
        ``{series}/data/split_txt_fanfics/{good,medium,bad}_fics.txt``.

        Args:
            df: DataFrame with "published", "kudos" and "body" columns.
            series: Directory prefix of the output files (e.g. "grishaverse").
            few_kudos: Inclusive upper kudos bound of the "bad" group.
            medium_kudos: Inclusive upper kudos bound of the "medium" group.
            excluded_year: Fics published in this year are skipped entirely.

        Returns:
            None. The result is the three text files.
        """
        buckets = {"good": [], "medium": [], "bad": []}

        # Zipping the columns avoids building a Series per row as iterrows does.
        rows = zip(df.index, df["published"], df["kudos"], df["body"])
        for index, published, raw_kudos, body in rows:
            if pd.to_datetime(published).year == excluded_year:
                continue

            # A kudos value that cannot be parsed as a number is coerced to NaN
            # and reported like a missing one instead of vanishing silently.
            kudos = raw_kudos if pd.isna(raw_kudos) else pd.to_numeric(raw_kudos, errors="coerce")
            if pd.isna(kudos):
                print(f"Missing kudos value for row {index}")
                continue

            # A missing body would otherwise end up in the corpus as the text "nan".
            if pd.isna(body):
                continue

            if kudos <= few_kudos:
                buckets["bad"].append(str(body))
            elif kudos <= medium_kudos:
                buckets["medium"].append(str(body))
            else:
                buckets["good"].append(str(body))

        out_dir = f"{series}/data/split_txt_fanfics"
        os.makedirs(out_dir, exist_ok=True)

        for label, bodies in buckets.items():
            # Explicit UTF-8: the platform default encoding may be unable to
            # represent characters that occur in the fanfic text.
            with open(f"{out_dir}/{label}_fics.txt", "w", encoding="utf-8") as f:
                f.write("".join(bodies))
    
    
    if __name__ == "__main__":
        # Script entry point: only the Throne of Glass kudos plot is active;
        # the remaining steps are kept as commented-out toggles.

        # grishaverse (disabled)
        #grisha_fanfics = read_csv_to_pd(file_path="grishaverse/data/fanfics/grishaverse_fics.csv", name_of_file=grisha_fanfics)
        #new_dist, cum_dist = calculate_cum_kudo_distribution(grisha_fanfics)
        #plot_distribution(new_dist=new_dist, cum_dist=cum_dist, plt_title="Grishaverse Cumulative Frequency Distribution of All Kudos", file_path_for_pic="grishaverse/freq_distribution/fanfic_kudo_freq_dist.png", scatter_plt=True)

        # throne of glass
        tog_fanfics = read_csv_to_pd(
            file_path="throne_of_glass/data/fanfics/throne_of_glass_fics.csv",
            name_of_file=tog_fanfics,
        )
        new_dist, cum_dist = calculate_cum_kudo_distribution(tog_fanfics)
        plot_distribution(
            new_dist=new_dist,
            cum_dist=cum_dist,
            plt_title="Throne of Glass Cumulative Frequency Distribution of All Kudos",
            file_path_for_pic="throne_of_glass/freq_distribution/fanfic_kudo_freq_dist.png",
            scatter_plt=True,
        )

        # splitting the fics into good/medium/bad text files (disabled)
        #separate_fanfics_by_good_medium_bad(grisha_fanfics, "grishaverse")
        #separate_fanfics_by_good_medium_bad(tog_fanfics, "throne_of_glass")